In [12]:
# Notebook setup: working directory, warning filter, and the libraries used below.
import os
# NOTE(review): absolute, machine-specific path. The relative CSV path used by
# pd.read_csv below is resolved against this directory, so the notebook only
# runs where this folder exists.
os.chdir("/Users/mammoc/Desktop/Python")
import warnings
# Suppresses every warning category from this point on, including warnings
# emitted while the libraries below are imported and any pandas/statsmodels
# warnings raised in later cells.
warnings.filterwarnings("ignore")
import pandas as pd
import statsmodels.api as sm
import numpy as np

In [3]:
# Load the semicolon-delimited CSV (path is relative to the current working
# directory); row 0 supplies the column names.
airport = pd.read_csv("airport_original1.csv", header=0, sep=";")

In [4]:
# Exclude every record whose 'Destination' code is 4. Rows with a missing
# Destination pass this filter (NaN != 4 evaluates to True).
# .copy() makes the result an independent frame, so the .loc assignments in the
# following cells write to it directly rather than to a slice that pandas may
# flag as a view of the raw data (SettingWithCopyWarning, which the global
# warning filter would otherwise hide).
airport = airport[airport['Destination'] != 4].copy()

In [5]:
###-------------- STEP 1: FILLING UP NAs ----------------

# Destination: rows without a Destination are removed outright.
airport = airport.dropna(subset=['Destination'])

# Age: the record with ID 214 is set to 40 (noted by the author as the average age;
# the value is hard-coded, not recomputed from the data).
airport.loc[airport['ID'] == 214, 'Age'] = 40

# Gender: IDs 179, 199 and 259 are set to 2.0
# (noted by the author as the most frequent category, female).
airport.loc[airport['ID'].isin([179, 199, 259]), 'Gender'] = 2.0

# SeatClass: the four listed IDs are set to 1.0
# (noted by the author as the most frequent category).
airport.loc[airport['ID'].isin([245, 255, 450, 475]), 'SeatClass'] = 1.0

# AccessTime: fill each gap with the mean AccessTime of the same ProvinceResidence.
# The per-province lookup table is kept under its original name for inspection.
AccessTime_by_ProvinceResidence = airport.groupby('ProvinceResidence')['AccessTime'].mean()
# transform('mean') returns the group mean aligned to every row, so fillna only
# touches the missing entries. A row whose ProvinceResidence is itself missing
# stays NaN instead of raising a KeyError during a per-row lookup.
airport['AccessTime'] = airport['AccessTime'].fillna(
    airport.groupby('ProvinceResidence')['AccessTime'].transform('mean'))

# Airline[Legacy]: 1 when Airline is coded 1 or 2, otherwise 0 ('Legacy' vs 'Other').
# Author's note: Airfare differs noticeably between legacy airlines and the others,
# which is why this flag is one of the grouping keys for the Airfare fill below.
airport['Airline[Legacy]'] = airport['Airline'].isin([1, 2]).astype(int)

# Airfare: fill each gap with the mean Airfare of the same
# (Airline[Legacy], Destination) combination.
airfare_group_keys = ['Airline[Legacy]', 'Destination']
Airfare_by_Airline_Destination = airport.groupby(airfare_group_keys)['Airfare'].mean()
# Same vectorised pattern as AccessTime; a group with no observed Airfare
# leaves its missing entries as NaN.
airport['Airfare'] = airport['Airfare'].fillna(
    airport.groupby(airfare_group_keys)['Airfare'].transform('mean'))

In [6]:
##----------- STEP 2: RE-CATEGORIZING -------------
# Each indicator below is 1 when the stated condition holds and 0 otherwise
# (a missing value never satisfies a condition, so it is coded 0).

# Constant column of ones.
airport["Intercept"] = 1.0

# Binary target: Airport coded 1 -> 1, anything else -> 0.
airport['Airport_binary'] = airport['Airport'].eq(1).astype(int)

# Nationality indicators (reference group is Korean).
airport['Nationality[Japan]'] = airport['Nationality'].eq(3).astype(int)
airport['Nationality[SE]'] = airport['Nationality'].eq(4).astype(int)
airport['Nationality[China]'] = airport['Nationality'].eq(2).astype(int)

# Trip duration below 4 (reference group is >=4).
airport['TripDuration[<4]'] = airport['TripDuration'].lt(4).astype(int)

# Residence coded 2 or 3 (reference group is other provinces).
lives_in_incheon_or_kyungki = airport['ProvinceResidence'].isin([2, 3])
airport['ProvinceResidence[Incheon,Kyungki-do]'] = lives_in_incheon_or_kyungki.astype(int)

# Destination indicators (reference group is China).
airport['Destination[Japan]'] = airport['Destination'].eq(2).astype(int)
airport['Destination[SE]'] = airport['Destination'].eq(3).astype(int)

# Departure-time indicators (reference group is morning).
airport['DepartureTime[noon]'] = airport['DepartureTime'].eq(2).astype(int)
airport['DepartureTime[night]'] = airport['DepartureTime'].eq(3).astype(int)
airport['DepartureTime[midnight]'] = airport['DepartureTime'].eq(4).astype(int)

# Access time of at most 30 (reference group is >30).
airport['AccessTime[<=30]'] = airport['AccessTime'].le(30).astype(int)

In [8]:
## ----------------------- LOGIT - AIRPORT ----------------------

# Design matrix: the constant column first, then the raw predictors and the
# indicator columns, in the order they should appear in the summary table.
X = airport[
    ["Intercept", 'NoTransport', 'Airfare', 'FlyingCompanion']
    + ['Nationality[Japan]', 'Nationality[China]', 'Nationality[SE]']
    + ['TripDuration[<4]', 'ProvinceResidence[Incheon,Kyungki-do]']
    + ['Destination[Japan]', 'Destination[SE]']
    + ['DepartureTime[noon]', 'DepartureTime[night]', 'DepartureTime[midnight]']
    + ['AccessTime[<=30]']
]

# Response: the 0/1 airport indicator.
y = airport['Airport_binary']

# Fit the logit model on all rows and print the coefficient table.
logit_model1 = sm.Logit(endog=y, exog=X).fit()
print(logit_model1.summary())

Optimization terminated successfully.
         Current function value: 0.328834
         Iterations 8
                           Logit Regression Results                           
Dep. Variable:         Airport_binary   No. Observations:                  459
Model:                          Logit   Df Residuals:                      444
Method:                           MLE   Df Model:                           14
Date:                Fri, 08 Mar 2024   Pseudo R-squ.:                  0.5247
Time:                        21:33:03   Log-Likelihood:                -150.93
converged:                       True   LL-Null:                       -317.58
Covariance Type:            nonrobust   LLR p-value:                 1.309e-62
                                            coef    std err          z      P>|z|      [0.025      0.975]
---------------------------------------------------------------------------------------------------------
Intercept                                -4.7653      

In [13]:
import numpy as np
from sklearn.model_selection import KFold
from sklearn.metrics import confusion_matrix, accuracy_score, recall_score, precision_score

# 10-fold cross-validation of the logit model; shuffling with a fixed
# random_state keeps the fold assignment reproducible.
kf = KFold(n_splits=10, shuffle=True, random_state=1)

# One score per fold, appended in fold order.
accuracies, recalls, precisions = [], [], []

for train_index, test_index in kf.split(X):
    # Positional slices of the predictors and the response for this fold.
    X_train, y_train = X.iloc[train_index], y.iloc[train_index]
    X_test, y_test = X.iloc[test_index], y.iloc[test_index]

    # Refit on the training rows only; disp=0 silences the optimizer printout.
    logit_model = sm.Logit(y_train, X_train).fit(disp=0)

    # Predicted probabilities become class 1 only when they exceed 0.5.
    y_pred = logit_model.predict(X_test)
    y_pred_class = (y_pred > 0.5).astype(int)

    # Confusion matrix of the held-out rows (computed but not printed).
    cm = confusion_matrix(y_test, y_pred_class)

    # Held-out scores for this fold.
    accuracy = accuracy_score(y_test, y_pred_class)
    recall = recall_score(y_test, y_pred_class)
    precision = precision_score(y_test, y_pred_class)

    accuracies.append(accuracy)
    recalls.append(recall)
    precisions.append(precision)

# Unweighted mean of the per-fold scores.
print(f"Average Accuracy: {np.mean(accuracies)}")
print(f"Average Recall: {np.mean(recalls)}")
print(f"Average Precision: {np.mean(precisions)}")

Average Accuracy: 0.8757487922705314
Average Recall: 0.8259798772293051
Average Precision: 0.9062753667900727
