# Linear Regression

This notebook uses the numeric `satisfaction score` column, rather than the categorical `satisfaction` column, as the target variable.

## Import libraries and load data

In [95]:
import pandas as pd
from sklearn.model_selection import KFold
import statsmodels.api as sm

In [96]:
# Read the survey CSV, use the 'id' column as the row index, and discard the
# 'Unnamed: 0' column (presumably a row counter written by an earlier export;
# TODO confirm).
df = (
    pd.read_csv("Airline Passenger Satisfaction.csv")
    .set_index('id')
    .drop('Unnamed: 0', axis=1)
)

# Preview the first rows.
df.head()

Unnamed: 0_level_0,Gender,Customer Type,Age,Type of Travel,Class,Flight Distance,Inflight wifi service,Departure/Arrival time convenient,Ease of Online booking,Gate location,...,On-board service,Leg room service,Baggage handling,Checkin service,Inflight service,Cleanliness,Departure Delay in Minutes,Arrival Delay in Minutes,satisfaction,satisfaction score
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
70172,Male,Loyal Customer,13,Personal Travel,Eco Plus,460,3,4,3,1,...,4,3,4,4,5,5,25,18,neutral or dissatisfied,7
5047,Male,disloyal Customer,25,Business travel,Business,235,3,2,3,3,...,1,5,3,1,4,1,1,6,neutral or dissatisfied,3
110028,Female,Loyal Customer,26,Business travel,Business,1142,2,2,2,2,...,4,3,4,4,4,5,0,0,satisfied,9
24026,Female,Loyal Customer,25,Business travel,Business,562,2,5,5,5,...,2,5,3,1,4,2,11,9,neutral or dissatisfied,6
119299,Male,Loyal Customer,61,Business travel,Business,214,3,3,3,3,...,3,4,4,3,3,3,0,0,satisfied,10


## K-fold linear regression

In [97]:
# One-hot encode categorical data.
# dtype=int keeps the dummy columns numeric: pandas >= 2.0 emits boolean
# dummies by default, and statsmodels rejects the resulting mixed frame
# because it is cast to object dtype.
# NOTE(review): every level of each categorical is kept AND an intercept is
# added below, so the design matrix is rank-deficient (e.g. Gender_Female +
# Gender_Male equals the constant column). Predictions and MSE are unaffected,
# but the individual dummy/intercept coefficients and their p-values are not
# uniquely identified; consider drop_first=True before interpreting them.
# NOTE(review): this rebinds `df`, so re-running the cell without reloading
# the data fails (the categorical columns no longer exist).
df = pd.get_dummies(
    df,
    columns=['Gender', 'Customer Type', 'Type of Travel', 'Class'],
    dtype=int,
)

# Isolate target variable from independent variables
X = df.drop(['satisfaction', 'satisfaction score'], axis=1)
y = df['satisfaction score']

# Set up KFold (fixed seed so the fold assignment is reproducible)
k = 10
kf = KFold(n_splits=k, shuffle=True, random_state=1)

# Track the fold model with the lowest held-out MSE
best_mse = float('inf')
best_model = None

for train_index, test_index in kf.split(X):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # has_constant='add' always inserts the intercept column. The default
    # ('skip') silently leaves it out when it detects an existing constant
    # column in the fold, which would make the train and test design
    # matrices disagree.
    X_train_with_const = sm.add_constant(X_train, has_constant='add')
    X_test_with_const = sm.add_constant(X_test, has_constant='add')

    # Fit an ordinary least squares model using statsmodels
    model = sm.OLS(y_train, X_train_with_const)
    results = model.fit()

    # Mean squared error (MSE) on the held-out fold
    y_pred = results.predict(X_test_with_const)
    mse = ((y_test - y_pred) ** 2).mean()

    # Keep this fold's model if it beats the best one so far
    if mse < best_mse:
        best_mse = mse
        best_model = results

# A NaN MSE never compares below best_mse, so best_model can remain None;
# fail with a clear message instead of an AttributeError further down.
if best_model is None:
    raise RuntimeError("No fold produced a finite MSE; check the data for missing values.")

# Look the intercept up by its label ('const', as named by sm.add_constant)
# rather than by integer position, which pandas deprecates for
# label-indexed Series.
coefficients = best_model.params.drop('const')
p_values = best_model.pvalues.drop('const')

# Display the results of the best fit model
print("Best Fit Model Coefficients:\n")
for feature, coefficient, p_value in zip(coefficients.index, coefficients, p_values):
    print(feature, ":", coefficient, "(p-value:", p_value, ")")
print("\nIntercept:", best_model.params['const'])

Best Fit Model Coefficients:

Age : -0.004766522028753011 (p-value: 8.75170297486186e-57 )
Flight Distance : -1.9059158329072792e-05 (p-value: 0.00011217739036813655 )
Inflight wifi service : 0.2140910466964094 (p-value: 0.0 )
Departure/Arrival time convenient : 0.025800482375027186 (p-value: 4.0798219038929553e-13 )
Ease of Online booking : -0.06135181573342661 (p-value: 1.3510638698058603e-35 )
Gate location : 0.04225932984303359 (p-value: 1.3579052164319763e-25 )
Food and drink : 0.09158152600345516 (p-value: 1.0338245215731735e-85 )
Online boarding : 0.3751214580263409 (p-value: 0.0 )
Seat comfort : 0.10988084704823428 (p-value: 2.5977316624998327e-109 )
Inflight entertainment : 0.16742235594553548 (p-value: 8.925349367629145e-159 )
On-board service : 0.2018166620485985 (p-value: 0.0 )
Leg room service : 0.18532850039759785 (p-value: 0.0 )
Baggage handling : 0.12910034760453215 (p-value: 1.2342940247993385e-150 )
Checkin service : 0.2214435032343046 (p-value: 0.0 )
Inflight service

In [98]:
# Collect the per-feature estimates of the selected model in one table.
# Position 0 of params/pvalues is the intercept, so it is sliced off to line
# up with the feature columns in X.
feature_coefs = best_model.params.iloc[1:]
feature_pvals = best_model.pvalues.iloc[1:]

results_df = pd.DataFrame(
    {
        'Feature': X.columns,
        'Coefficient': feature_coefs,
        'P-Value': feature_pvals,
    }
)

results_df

Unnamed: 0,Feature,Coefficient,P-Value
Age,Age,-0.004767,8.751703e-57
Flight Distance,Flight Distance,-1.9e-05,0.0001121774
Inflight wifi service,Inflight wifi service,0.214091,0.0
Departure/Arrival time convenient,Departure/Arrival time convenient,0.0258,4.079822e-13
Ease of Online booking,Ease of Online booking,-0.061352,1.351064e-35
Gate location,Gate location,0.042259,1.357905e-25
Food and drink,Food and drink,0.091582,1.033825e-85
Online boarding,Online boarding,0.375121,0.0
Seat comfort,Seat comfort,0.109881,2.597732e-109
Inflight entertainment,Inflight entertainment,0.167422,8.925349000000001e-159


In [99]:
# Significance threshold: a feature is kept when its p-value is below this.
# NOTE(review): this cell previously compared against a hard-coded 0.05 and
# left crit_val unused; confirm 0.001 is the intended level.
crit_val = 0.001

# Vectorised filter: names of all features whose p-value is below the
# threshold, in the same order as the rows of results_df.
sig_feat = results_df.loc[results_df['P-Value'] < crit_val, 'Feature'].tolist()

sig_feat

['Age',
 'Flight Distance',
 'Inflight wifi service',
 'Departure/Arrival time convenient',
 'Ease of Online booking',
 'Gate location',
 'Food and drink',
 'Online boarding',
 'Seat comfort',
 'Inflight entertainment',
 'On-board service',
 'Leg room service',
 'Baggage handling',
 'Checkin service',
 'Inflight service',
 'Cleanliness',
 'Departure Delay in Minutes',
 'Arrival Delay in Minutes',
 'Gender_Female',
 'Gender_Male',
 'Customer Type_Loyal Customer',
 'Customer Type_disloyal Customer',
 'Type of Travel_Business travel',
 'Type of Travel_Personal Travel',
 'Class_Business',
 'Class_Eco',
 'Class_Eco Plus']