In [142]:
import numpy as np
import pandas as pd

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, PredefinedSplit, GridSearchCV
from sklearn.metrics import f1_score, accuracy_score, recall_score, precision_score

In [143]:
air_data = pd.read_csv('../datasets/Invistico_Airline.csv')
air_data.head()

Unnamed: 0,satisfaction,Customer Type,Age,Type of Travel,Class,Flight Distance,Seat comfort,Departure/Arrival time convenient,Food and drink,Gate location,...,Online support,Ease of Online booking,On-board service,Leg room service,Baggage handling,Checkin service,Cleanliness,Online boarding,Departure Delay in Minutes,Arrival Delay in Minutes
0,satisfied,Loyal Customer,65,Personal Travel,Eco,265,0,0,0,2,...,2,3,3,0,3,5,3,2,0,0.0
1,satisfied,Loyal Customer,47,Personal Travel,Business,2464,0,0,0,3,...,2,3,4,4,4,2,3,2,310,305.0
2,satisfied,Loyal Customer,15,Personal Travel,Eco,2138,0,0,0,3,...,2,2,3,3,4,4,4,2,0,0.0
3,satisfied,Loyal Customer,60,Personal Travel,Eco,623,0,0,0,3,...,3,1,1,0,1,4,1,3,0,0.0
4,satisfied,Loyal Customer,70,Personal Travel,Eco,354,0,0,0,3,...,4,2,2,0,2,4,2,5,0,0.0


In [144]:
air_data.dtypes

satisfaction                          object
Customer Type                         object
Age                                    int64
Type of Travel                        object
Class                                 object
Flight Distance                        int64
Seat comfort                           int64
Departure/Arrival time convenient      int64
Food and drink                         int64
Gate location                          int64
Inflight wifi service                  int64
Inflight entertainment                 int64
Online support                         int64
Ease of Online booking                 int64
On-board service                       int64
Leg room service                       int64
Baggage handling                       int64
Checkin service                        int64
Cleanliness                            int64
Online boarding                        int64
Departure Delay in Minutes             int64
Arrival Delay in Minutes             float64
dtype: obj

In [145]:
air_data.shape

(129880, 22)

In [146]:
air_data.isnull().any(axis=1).sum()

np.int64(393)

In [147]:
air_data_subset = air_data.dropna(axis=0)

In [148]:
air_data_subset.isnull().any(axis=1).sum()

np.int64(0)

In [149]:
air_data_subset_dummies = pd.get_dummies(air_data_subset, columns=['Customer Type','Type of Travel','Class'], drop_first=True)

In [150]:
air_data_subset_dummies.dtypes

satisfaction                          object
Age                                    int64
Flight Distance                        int64
Seat comfort                           int64
Departure/Arrival time convenient      int64
Food and drink                         int64
Gate location                          int64
Inflight wifi service                  int64
Inflight entertainment                 int64
Online support                         int64
Ease of Online booking                 int64
On-board service                       int64
Leg room service                       int64
Baggage handling                       int64
Checkin service                        int64
Cleanliness                            int64
Online boarding                        int64
Departure Delay in Minutes             int64
Arrival Delay in Minutes             float64
Customer Type_disloyal Customer         bool
Type of Travel_Personal Travel          bool
Class_Eco                               bool
Class_Eco 

In [151]:
air_data_subset_dummies.head()

Unnamed: 0,satisfaction,Age,Flight Distance,Seat comfort,Departure/Arrival time convenient,Food and drink,Gate location,Inflight wifi service,Inflight entertainment,Online support,...,Baggage handling,Checkin service,Cleanliness,Online boarding,Departure Delay in Minutes,Arrival Delay in Minutes,Customer Type_disloyal Customer,Type of Travel_Personal Travel,Class_Eco,Class_Eco Plus
0,satisfied,65,265,0,0,0,2,2,4,2,...,3,5,3,2,0,0.0,False,True,True,False
1,satisfied,47,2464,0,0,0,3,0,2,2,...,4,2,3,2,310,305.0,False,True,False,False
2,satisfied,15,2138,0,0,0,3,2,0,2,...,4,4,4,2,0,0.0,False,True,True,False
3,satisfied,60,623,0,0,0,3,3,4,3,...,1,4,1,3,0,0.0,False,True,True,False
4,satisfied,70,354,0,0,0,3,4,3,4,...,2,4,2,5,0,0.0,False,True,True,False


In [152]:
y = air_data_subset.satisfaction
X = air_data_subset_dummies.drop('satisfaction', axis=1)

In [153]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_tr, X_val, y_tr, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

In [154]:
# cv_params = {
#     'n_estimators' : [50, 100],
#     'max_depth' : [10, 50],
#     'min_samples_leaf' : [1, 2, 5],
#     'min_samples_split' : [2, 5, 10],
#     'max_features' : ["sqrt"],
#     'max_samples' : [0.5, 0.9]
# }
cv_params = {'n_estimators' : [50,100],
              'max_depth' : [10,50],
              'min_samples_leaf' : [0.5,1],
              'min_samples_split' : [0.001, 0.01],
              'max_features' : ["sqrt"],
              'max_samples' : [.5,.9]}

In [155]:
split_index = [0 if x in X_val.index else -1 for x in X_train.index]
custom_split = PredefinedSplit(test_fold=split_index)

In [156]:
split_df = pd.Series(split_index)
split_df

0        -1
1        -1
2        -1
3        -1
4        -1
         ..
103584   -1
103585   -1
103586   -1
103587   -1
103588    0
Length: 103589, dtype: int64

In [157]:
rf = RandomForestClassifier(random_state=42)

In [158]:
rf_val = GridSearchCV(estimator=rf, param_grid=cv_params, cv=custom_split, refit='f1', verbose=1, n_jobs=-1)

In [159]:
rf_val.fit(X_train, y_train)

Fitting 1 folds for each of 32 candidates, totalling 32 fits


0,1,2
,estimator,RandomForestC...ndom_state=42)
,param_grid,"{'max_depth': [10, 50], 'max_features': ['sqrt'], 'max_samples': [0.5, 0.9], 'min_samples_leaf': [0.5, 1], ...}"
,scoring,
,n_jobs,-1
,refit,'f1'
,cv,"PredefinedSpl...pe=(103589,)))"
,verbose,1
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,n_estimators,50
,criterion,'gini'
,max_depth,50
,min_samples_split,0.001
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [161]:
rf_val.best_params_

{'max_depth': 50,
 'max_features': 'sqrt',
 'max_samples': 0.9,
 'min_samples_leaf': 1,
 'min_samples_split': 0.001,
 'n_estimators': 50}

In [162]:
rf_optimal = RandomForestClassifier(
        random_state=42,
        max_depth= 50,
        max_features = 'sqrt',
        max_samples=0.9,
        min_samples_leaf=1,
        min_samples_split=0.001,
        n_estimators=50)

In [163]:
rf_optimal.fit(X_train, y_train)

0,1,2
,n_estimators,50
,criterion,'gini'
,max_depth,50
,min_samples_split,0.001
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [165]:
y_pred = rf_optimal.predict(X_test)

In [168]:
precision = precision_score(y_test, y_pred, pos_label='satisfied')
accuracy = accuracy_score(y_test, y_pred)
recall = recall_score(y_test, y_pred, pos_label='satisfied')
f1 = f1_score(y_test, y_pred, pos_label='satisfied')

In [169]:
print("Precision: %.3f" % precision)
print("Accuracy: %.3f" % accuracy)
print("Recall: %.3f" % recall)
print("F1: %.3f" % f1)

Precision: 0.949
Accuracy: 0.944
Recall: 0.947
F1: 0.948
