In [9]:
import os
import pickle as pkl

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, make_scorer, precision_score, recall_score
from sklearn.model_selection import GridSearchCV, PredefinedSplit, train_test_split
# Location of the airline survey CSV. Set the AIRLINE_DATA_PATH environment
# variable to point at your own copy of the file; when it is not set, the
# original local path is used so existing runs keep working.
AIRLINE_DATA_PATH = os.environ.get(
    "AIRLINE_DATA_PATH", "C:/Users/vkyc2/Downloads/airline_data.csv"
)

# Load the raw data and preview the first five rows.
air_data = pd.read_csv(AIRLINE_DATA_PATH)
air_data.head()

Unnamed: 0,satisfaction,Customer Type,Age,Type of Travel,Class,Flight Distance,Seat comfort,Departure/Arrival time convenient,Food and drink,Gate location,...,Online support,Ease of Online booking,On-board service,Leg room service,Baggage handling,Checkin service,Cleanliness,Online boarding,Departure Delay in Minutes,Arrival Delay in Minutes
0,satisfied,Loyal Customer,65,Personal Travel,Eco,265,0,0,0,2,...,2,3,3,0,3,5,3,2,0,0.0
1,satisfied,Loyal Customer,47,Personal Travel,Business,2464,0,0,0,3,...,2,3,4,4,4,2,3,2,310,305.0
2,satisfied,Loyal Customer,15,Personal Travel,Eco,2138,0,0,0,3,...,2,2,3,3,4,4,4,2,0,0.0
3,satisfied,Loyal Customer,60,Personal Travel,Eco,623,0,0,0,3,...,3,1,1,0,1,4,1,3,0,0.0
4,satisfied,Loyal Customer,70,Personal Travel,Eco,354,0,0,0,3,...,4,2,2,0,2,4,2,5,0,0.0


## Data cleaning 

In [10]:
# Preview the first ten rows of the raw data.
air_data.head(10)

Unnamed: 0,satisfaction,Customer Type,Age,Type of Travel,Class,Flight Distance,Seat comfort,Departure/Arrival time convenient,Food and drink,Gate location,...,Online support,Ease of Online booking,On-board service,Leg room service,Baggage handling,Checkin service,Cleanliness,Online boarding,Departure Delay in Minutes,Arrival Delay in Minutes
0,satisfied,Loyal Customer,65,Personal Travel,Eco,265,0,0,0,2,...,2,3,3,0,3,5,3,2,0,0.0
1,satisfied,Loyal Customer,47,Personal Travel,Business,2464,0,0,0,3,...,2,3,4,4,4,2,3,2,310,305.0
2,satisfied,Loyal Customer,15,Personal Travel,Eco,2138,0,0,0,3,...,2,2,3,3,4,4,4,2,0,0.0
3,satisfied,Loyal Customer,60,Personal Travel,Eco,623,0,0,0,3,...,3,1,1,0,1,4,1,3,0,0.0
4,satisfied,Loyal Customer,70,Personal Travel,Eco,354,0,0,0,3,...,4,2,2,0,2,4,2,5,0,0.0
5,satisfied,Loyal Customer,30,Personal Travel,Eco,1894,0,0,0,3,...,2,2,5,4,5,5,4,2,0,0.0
6,satisfied,Loyal Customer,66,Personal Travel,Eco,227,0,0,0,3,...,5,5,5,0,5,5,5,3,17,15.0
7,satisfied,Loyal Customer,10,Personal Travel,Eco,1812,0,0,0,3,...,2,2,3,3,4,5,4,2,0,0.0
8,satisfied,Loyal Customer,56,Personal Travel,Business,73,0,0,0,3,...,5,4,4,0,1,5,4,4,0,0.0
9,satisfied,Loyal Customer,22,Personal Travel,Eco,1556,0,0,0,3,...,2,2,2,4,5,3,4,2,30,26.0


In [11]:
# List the data type of every column in the raw data.
air_data.dtypes

satisfaction                          object
Customer Type                         object
Age                                    int64
Type of Travel                        object
Class                                 object
Flight Distance                        int64
Seat comfort                           int64
Departure/Arrival time convenient      int64
Food and drink                         int64
Gate location                          int64
Inflight wifi service                  int64
Inflight entertainment                 int64
Online support                         int64
Ease of Online booking                 int64
On-board service                       int64
Leg room service                       int64
Baggage handling                       int64
Checkin service                        int64
Cleanliness                            int64
Online boarding                        int64
Departure Delay in Minutes             int64
Arrival Delay in Minutes             float64
dtype: obj

From the result above, we can see that there are 3 types of variables included in the data: we have integer, float, and object. 
The object variables are satisfaction, customer type, type of travel, and class. 

### Next, we want to understand the size of the dataset, so we identify the number of rows and columns. 

In [12]:
# (number of rows, number of columns) of the raw data.
air_data.shape

(129880, 22)

### Now, we check for missing values. 

In [18]:
# Count the rows that contain at least one missing value.
air_data.isna().any(axis=1).sum()

# The output below shows that 393 rows contain at least one missing value.



393

In [None]:
# Keep only the rows that have no missing value in any column, then preview
# the first ten rows of the cleaned data.
air_data_subset = air_data.dropna(axis="index", how="any")
air_data_subset.head(n=10)

In [16]:
# Confirm that no missing values remain: the per-column counts should all be 0.
air_data_subset.isna().sum() 

satisfaction                         0
Customer Type                        0
Age                                  0
Type of Travel                       0
Class                                0
Flight Distance                      0
Seat comfort                         0
Departure/Arrival time convenient    0
Food and drink                       0
Gate location                        0
Inflight wifi service                0
Inflight entertainment               0
Online support                       0
Ease of Online booking               0
On-board service                     0
Leg room service                     0
Baggage handling                     0
Checkin service                      0
Cleanliness                          0
Online boarding                      0
Departure Delay in Minutes           0
Arrival Delay in Minutes             0
dtype: int64

### Convert the categorical features to one-hot encoded/indicator features. 
The drop_first argument can be kept as default (False) during one-hot encoding for random forest models, so it does not need to be specified. 
The target variable, satisfaction, does not need to be encoded and will be extracted later. 

It is necessary to convert categorical data into dummy variables because the sklearn implementation of RandomForestClassifier() requires categorical features to be encoded as numeric values, which can be done using dummy variables or one-hot encoding. 

In [None]:
# One-hot encode the three categorical feature columns. Columns that are not
# listed (including the `satisfaction` target) are carried over unchanged.
air_data_subset_dummies = pd.get_dummies(
    data=air_data_subset,
    columns=['Customer Type', 'Type of Travel', 'Class'],
)



In [24]:
# Preview the first 30 rows of the one-hot encoded data.
air_data_subset_dummies.head(30)

Unnamed: 0,satisfaction,Age,Flight Distance,Seat comfort,Departure/Arrival time convenient,Food and drink,Gate location,Inflight wifi service,Inflight entertainment,Online support,...,Online boarding,Departure Delay in Minutes,Arrival Delay in Minutes,Customer Type_Loyal Customer,Customer Type_disloyal Customer,Type of Travel_Business travel,Type of Travel_Personal Travel,Class_Business,Class_Eco,Class_Eco Plus
0,satisfied,65,265,0,0,0,2,2,4,2,...,2,0,0.0,True,False,False,True,False,True,False
1,satisfied,47,2464,0,0,0,3,0,2,2,...,2,310,305.0,True,False,False,True,True,False,False
2,satisfied,15,2138,0,0,0,3,2,0,2,...,2,0,0.0,True,False,False,True,False,True,False
3,satisfied,60,623,0,0,0,3,3,4,3,...,3,0,0.0,True,False,False,True,False,True,False
4,satisfied,70,354,0,0,0,3,4,3,4,...,5,0,0.0,True,False,False,True,False,True,False
5,satisfied,30,1894,0,0,0,3,2,0,2,...,2,0,0.0,True,False,False,True,False,True,False
6,satisfied,66,227,0,0,0,3,2,5,5,...,3,17,15.0,True,False,False,True,False,True,False
7,satisfied,10,1812,0,0,0,3,2,0,2,...,2,0,0.0,True,False,False,True,False,True,False
8,satisfied,56,73,0,0,0,3,5,3,5,...,4,0,0.0,True,False,False,True,True,False,False
9,satisfied,22,1556,0,0,0,3,2,0,2,...,2,30,26.0,True,False,False,True,False,True,False


In [23]:
# Check the variables of the new dataset: the one-hot indicator columns created
# by get_dummies should appear alongside the original numeric columns.
air_data_subset_dummies.dtypes

satisfaction                          object
Age                                    int64
Flight Distance                        int64
Seat comfort                           int64
Departure/Arrival time convenient      int64
Food and drink                         int64
Gate location                          int64
Inflight wifi service                  int64
Inflight entertainment                 int64
Online support                         int64
Ease of Online booking                 int64
On-board service                       int64
Leg room service                       int64
Baggage handling                       int64
Checkin service                        int64
Cleanliness                            int64
Online boarding                        int64
Departure Delay in Minutes             int64
Arrival Delay in Minutes             float64
Customer Type_Loyal Customer            bool
Customer Type_disloyal Customer         bool
Type of Travel_Business travel          bool
Type of Tr

# Model building 
### The first step to building your model is separating the labels (y) from the features (X). 

In [25]:
# Target: the satisfaction label. Features: every remaining column.
y = air_data_subset_dummies['satisfaction']
X = air_data_subset_dummies.drop(columns="satisfaction")

# Hold out 25% of the rows as the test set, then hold out 25% of the remaining
# rows as the validation set. Both splits use a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=0
)
X_tr, X_val, y_tr, y_val = train_test_split(
    X_train, y_train, test_size=0.25, random_state=0
)


# Tune the model 
### Fit and tune a random forest model with a separate validation set. We begin by determining a set of hyperparameters for tuning the model using GridSearchCV. 

We create a dictionary that maps each hyperparameter name to a list of values. 
The GridSearch conducted will set the hyperparameter to each possible value and determine which value is optimal. 


In [26]:
# Hyperparameter grid for the search: each keyword is a RandomForestClassifier
# argument and each list holds the candidate values to try
# (2 * 2 * 2 * 2 * 1 * 2 = 32 combinations).
# NOTE(review): scikit-learn interprets a float min_samples_leaf as a fraction
# of the training samples, so 0.5 is a very restrictive candidate -- confirm
# that it is intended.
cv_params = dict(
    n_estimators=[50, 100],
    max_depth=[10, 50],
    min_samples_leaf=[0.5, 1],
    min_samples_split=[0.001, 0.01],
    max_features=['sqrt'],
    max_samples=[0.5, 0.9],
)

### Now, we create a list of split indices. 
### Using list comprehension, we iterate over the indices of X_train. The list can consist of 0s to indicate data points that should be treated as validation data, and -1 to indicate data points that should be treated as training data. 

Next, we pass the split index to PredefinedSplit(). 
This will serve as a custom split that will identify which data points from the train set should be treated as validation data during GridSearch. 

In [27]:
# Label every row of X_train for PredefinedSplit: 0 marks a row whose index
# label also appears in X_val (validation data) and -1 marks a row that is
# only ever used for training.
split_index = np.where(X_train.index.isin(X_val.index), 0, -1).tolist()
custom_split = PredefinedSplit(test_fold=split_index)

We can finally instantiate a random forest model. 
We use RandomForestClassifier() and specify the random_state argument for reproducible results. 
Next, GridSearchCV is used to search over the specified parameters. 

In [29]:
# Base estimator; random_state is fixed so that results are reproducible.
rf = RandomForestClassifier(random_state = 0)

# Metrics computed for every candidate. The target labels are strings, so the
# positive class has to be named explicitly for precision, recall and F1.
scoring = {
    'accuracy': 'accuracy',
    'precision': make_scorer(precision_score, pos_label='satisfied'),
    'recall': make_scorer(recall_score, pos_label='satisfied'),
    'f1': make_scorer(f1_score, pos_label='satisfied'),
}

# refit='f1' only selects the best candidate by F1 when `scoring` defines a
# metric with that name. Without `scoring`, GridSearchCV ranks candidates by the
# estimator's default score (accuracy) and the refit string merely acts as a
# truthy flag, so the metrics are passed explicitly here.
rf_val = GridSearchCV(
    rf,
    cv_params,
    scoring=scoring,
    cv=custom_split,
    refit='f1',
    n_jobs=-1,
    verbose=1,
)

In [30]:
%%time 
# Run the grid search on X_train; custom_split decides which of its rows are
# used for validation, so every candidate is evaluated on that single fold.
rf_val.fit(X_train, y_train)

Fitting 1 folds for each of 32 candidates, totalling 32 fits
CPU times: total: 4.47 s
Wall time: 34.9 s


In [31]:
# Hyperparameter combination selected by the grid search.
rf_val.best_params_

{'max_depth': 50,
 'max_features': 'sqrt',
 'max_samples': 0.9,
 'min_samples_leaf': 1,
 'min_samples_split': 0.001,
 'n_estimators': 50}

### We now use the selected model to predict on the test data, using the optimal parameters found via GridSearchCV. 


In [32]:
# Build the final model from the hyperparameters selected by the grid search
# rather than retyping them, so this cell cannot drift out of sync with rf_val
# if the grid or the data changes. random_state is not part of the grid, so it
# is supplied separately for reproducibility.
rf_opt = RandomForestClassifier(**rf_val.best_params_, random_state=0)

In [35]:
# Fit the optimal model on the full training data (X_train, y_train) using the fit() method.
rf_opt.fit(X_train, y_train)

# Predict on the test set using the optimal model.
# The predict() function is called to make predictions on X_test using rf_opt.
# The predictions are saved as y_pred to compare to the true labels.
y_pred = rf_opt.predict(X_test)


# Obtain performance scores. 

In [None]:
# Get precision score. The precision_score function is from scikit-learn.
# 'satisfied' is treated as the positive class.
pc_test = precision_score(y_test, y_pred, pos_label='satisfied')
print(pc_test)
print(f"The precision score is {pc_test:.3f}.")

0.9501276595744681
The precision score is 0.950.


In [41]:
# Collect the recall score on the test set, with 'satisfied' treated as the positive class.
rc_test = recall_score(y_test, y_pred, pos_label='satisfied')
print(f"The recall score is {rc_test:.3f}.")

The recall score is 0.945.


In [42]:
# Collect accuracy score on the test set.
ac_test = accuracy_score(y_test, y_pred)
# The format spec has no space flag, so the value is not printed with an extra
# leading blank and the message matches the precision and recall messages.
print(f"The accuracy score is {ac_test:.3f}.")

The accuracy score is  0.942.


In [44]:
# Get F1 score on the test set, with 'satisfied' treated as the positive class.
f1_test = f1_score(y_test, y_pred, pos_label='satisfied')
# The format spec has no space flag, so the value is not printed with an extra
# leading blank and the message matches the precision and recall messages.
print(f"The F1 score is {f1_test:.3f}.")

The F1 score is  0.947.


In [None]:
## Creating a table of results that can be used to evaluate the performance of the model. 
# One row per model, one column per test-set metric computed in the cells above.
results_table = pd.DataFrame(
    {
        "Model": ["Tuned random forest"],
        "Precision": [pc_test],
        "Recall": [rc_test],
        "F1": [f1_test],
        "Accuracy": [ac_test],
    }
)
results_table


## A short summary to provide to stakeholders. 
The tuned random forest model predicted customer satisfaction on the held-out test set with approximately 94.2% accuracy. 
Precision is approximately 95.0%, recall is approximately 94.5%, and the F1 score is approximately 94.7%. 
Stakeholders were interested in learning which factors are most important to customer satisfaction; as a next step, these could be identified and shared using the feature importances of the tuned random forest. 