In [2]:
import os
import warnings

import numpy as np
import pandas as pd
import statsmodels.api as sm

# Directory that holds the input CSV. The default is the original author's
# folder; set the AIRPORT_DATA_DIR environment variable to point elsewhere.
DATA_DIR = os.environ.get("AIRPORT_DATA_DIR", "/Users/mammoc/Desktop/Python")

# Only change directory when the folder exists, so the notebook still starts on
# a machine without that path (the CSV is then read from the current directory).
if os.path.isdir(DATA_DIR):
    os.chdir(DATA_DIR)

# NOTE(review): this silences every warning for the whole session, including
# pandas and statsmodels diagnostics — consider narrowing it to specific categories.
warnings.filterwarnings("ignore")

In [3]:
# Load the raw airport dataset: semicolon-delimited, column names on the first row.
airport = pd.read_csv(
    "airport_original1.csv",
    delimiter=';',
    header=0,
)

In [4]:
# Exclude every record whose 'Destination' code equals 4.
# Rows with a missing 'Destination' compare as "not equal" and are kept here.
is_not_destination_4 = airport['Destination'].ne(4)
airport = airport[is_not_destination_4]

In [5]:
###-------------- STEP 1: FILLING UP NAs ----------------

# Drop rows with no 'Destination'. The explicit copy makes `airport` an
# independent frame, so the assignments below never write into a slice.
airport = airport.dropna(subset=['Destination']).copy()

# Age: record ID 214 is set to 40 (the average age, per the original note).
airport.loc[airport['ID'] == 214, 'Age'] = 40

# Gender: records 179, 199 and 259 get the most frequent category (female = 2.0).
airport.loc[airport['ID'].isin([179, 199, 259]), 'Gender'] = 2.0

# SeatClass: the listed records get the most frequent category (1.0).
# NOTE(review): the original note said 3 missing values but four IDs are listed — confirm.
airport.loc[airport['ID'].isin([245, 255, 450, 475]), 'SeatClass'] = 1.0

# AccessTime: fill each missing value with the mean AccessTime of the row's
# 'ProvinceResidence'. The per-province means are kept under their original name.
AccessTime_by_ProvinceResidence = airport.groupby('ProvinceResidence')['AccessTime'].mean()
# Vectorised lookup: a row whose province has no mean simply stays missing
# instead of raising a KeyError.
airport['AccessTime'] = airport['AccessTime'].fillna(
    airport['ProvinceResidence'].map(AccessTime_by_ProvinceResidence)
)

# Recategorize 'Airline' into a binary flag: 1 = codes 1 and 2 ("Legacy"), 0 = anything else.
# NOTE(review): 'Airline' is only filled in for specific IDs further down, so any
# row whose 'Airline' is still missing here is flagged 0 and grouped with the
# non-legacy airlines for the airfare means below — confirm this is intended.
airport['Airline[Legacy]'] = airport['Airline'].isin([1, 2]).astype(int)

# Airfare: mean 'Airfare' for each ('Airline[Legacy]', 'Destination') combination,
# kept under its original name as a lookup table.
Airfare_by_Airline_Destination = airport.groupby(['Airline[Legacy]', 'Destination'])['Airfare'].mean()
# Fill missing fares with the mean of the row's own group (same values as the
# table above, broadcast back onto the rows).
airport['Airfare'] = airport['Airfare'].fillna(
    airport.groupby(['Airline[Legacy]', 'Destination'])['Airfare'].transform('mean')
)

# Airline:
# For Incheon: the listed records are spread evenly over the four airline codes.
airport.loc[airport['ID'].isin([69, 72]), 'Airline'] = 1.0
airport.loc[airport['ID'].isin([82, 83]), 'Airline'] = 2.0
airport.loc[airport['ID'].isin([89, 188]), 'Airline'] = 3.0
airport.loc[airport['ID'].isin([212, 214]), 'Airline'] = 4.0

# For Gimpo: the listed records get the most frequent airline code (4.0).
airport.loc[airport['ID'].isin([266, 314]), 'Airline'] = 4.0

In [6]:
## --------- STEP 2: RE-CATEGORIZING Y ----------
# Rebuild the binary target now that 'Airline' has been filled in:
# 1 for airline codes 1 and 2, 0 otherwise.
is_legacy = airport['Airline'].isin([1, 2])
airport['Airline[Legacy]'] = is_legacy.astype(int)

# Reason for the split: the two groups differ in their mean Airfare.
mean_airfare_by_group = airport.groupby('Airline[Legacy]')['Airfare'].mean()
print(mean_airfare_by_group)

# Check the sample size of each group to make sure they are relatively even.
is_other = airport['Airline'].isin([3,4])
airport['Airline[Other]'] = is_other.astype(int)
for group_flag in ('Airline[Legacy]', 'Airline[Other]'):
    print(airport[group_flag].sum())

Airline[Legacy]
0    45.496782
1    51.008293
Name: Airfare, dtype: float64
249
210


In [7]:
##----------- STEP 3: RE-CATEGORIZING -------------
# Constant column, included in the design matrix as the intercept term.
airport["Intercept"] = 1.0

# 1 when 'FlyingCompanion' is 3 or more; the reference group is < 3.
airport['FlyingCompanion[>=3]'] = airport['FlyingCompanion'].ge(3).astype(int)

# 1 for 'ProvinceResidence' codes 1 and 3; the reference group is all other provinces.
seoul_kyungki_codes = [1, 3]
airport['ProvinceResidence[Seoul,Kyungki-do]'] = (
    airport['ProvinceResidence'].isin(seoul_kyungki_codes).astype(int)
)

# One dummy per destination code (2 -> Japan, 1 -> China);
# the reference group is South East.
for dummy_name, destination_code in (('Destination[Japan]', 2), ('Destination[China]', 1)):
    airport[dummy_name] = airport['Destination'].eq(destination_code).astype(int)

In [8]:
# Predictors for the logistic regression, in the order they appear in the summary.
# "Intercept" is the constant column, so no separate constant is added.
predictor_columns = [
    "Intercept",
    'AccessTime',
    'Airfare',
    'NoTripsLastYear',
    'GroupTravel',
    'Age',
    'TripDuration',
    'ProvinceResidence[Seoul,Kyungki-do]',
    'FlyingCompanion[>=3]',
    'Destination[China]',
    'Destination[Japan]',
]
X = airport[predictor_columns]

# Binary target: 1 = legacy airline, 0 = other.
y = airport['Airline[Legacy]']

# Fit on the full sample and print the coefficient table.
logit_model1 = sm.Logit(y, X).fit()
print(logit_model1.summary())

Optimization terminated successfully.
         Current function value: 0.607284
         Iterations 7
                           Logit Regression Results                           
Dep. Variable:        Airline[Legacy]   No. Observations:                  459
Model:                          Logit   Df Residuals:                      448
Method:                           MLE   Df Model:                           10
Date:                Sat, 09 Mar 2024   Pseudo R-squ.:                  0.1193
Time:                        11:49:47   Log-Likelihood:                -278.74
converged:                       True   LL-Null:                       -316.50
Covariance Type:            nonrobust   LLR p-value:                 3.794e-12
                                          coef    std err          z      P>|z|      [0.025      0.975]
-------------------------------------------------------------------------------------------------------
Intercept                              -0.8887      0.819 

In [28]:
import numpy as np
from sklearn.model_selection import KFold
from sklearn.metrics import confusion_matrix, accuracy_score, recall_score, precision_score

# 10-fold cross-validation of the logit model; the fixed random_state keeps the
# shuffled fold assignment reproducible.
kf = KFold(n_splits=10, shuffle=True, random_state=1)

# Per-fold scores, one entry appended per fold.
accuracies, recalls, precisions = [], [], []

for train_index, test_index in kf.split(X):
    X_train, y_train = X.iloc[train_index], y.iloc[train_index]
    X_test, y_test = X.iloc[test_index], y.iloc[test_index]

    # Refit on the training folds only; disp=0 suppresses the optimizer output.
    logit_model = sm.Logit(y_train, X_train).fit(disp=0)

    # Predicted probabilities on the held-out fold, cut at 0.5 to get class labels.
    y_pred = logit_model.predict(X_test)
    y_pred_class = (y_pred > 0.5).astype(int)

    # Confusion matrix of the current fold (computed but not printed).
    cm = confusion_matrix(y_test, y_pred_class)

    # Score the fold and store each metric in its own list.
    for fold_scores, scorer in (
        (accuracies, accuracy_score),
        (recalls, recall_score),
        (precisions, precision_score),
    ):
        fold_scores.append(scorer(y_test, y_pred_class))

# Report each metric averaged over the folds.
print(f"Average Accuracy: {np.mean(accuracies)}")
print(f"Average Recall: {np.mean(recalls)}")
print(f"Average Precision: {np.mean(precisions)}")

Average Accuracy: 0.6730434782608695
Average Recall: 0.7456937598933969
Average Precision: 0.6855636469119639
