### **Import the necessary libraries**

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_theme()

from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression

from sklearn import metrics
from sklearn.metrics import confusion_matrix, classification_report,accuracy_score,precision_score,recall_score,f1_score

from sklearn import tree
from sklearn.tree import DecisionTreeClassifier

from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import RandomForestClassifier

import scipy.stats as stats

from sklearn.model_selection import GridSearchCV


import warnings
warnings.filterwarnings('ignore')

## Travel Datasets & Treatment of missing values

In [2]:
df1 = pd.read_csv("Traveldata_train.csv")
df1['dataset_type'] = 'train'

In [3]:
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 94379 entries, 0 to 94378
Data columns (total 10 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   ID                       94379 non-null  int64  
 1   Gender                   94302 non-null  object 
 2   Customer_Type            85428 non-null  object 
 3   Age                      94346 non-null  float64
 4   Type_Travel              85153 non-null  object 
 5   Travel_Class             94379 non-null  object 
 6   Travel_Distance          94379 non-null  int64  
 7   Departure_Delay_in_Mins  94322 non-null  float64
 8   Arrival_Delay_in_Mins    94022 non-null  float64
 9   dataset_type             94379 non-null  object 
dtypes: float64(3), int64(2), object(5)
memory usage: 7.2+ MB


In [4]:
df2 = pd.read_csv('Traveldata_test.csv')
df2['dataset_type'] = 'test'

In [5]:
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 35602 entries, 0 to 35601
Data columns (total 10 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   ID                       35602 non-null  int64  
 1   Gender                   35572 non-null  object 
 2   Customer_Type            32219 non-null  object 
 3   Age                      35591 non-null  float64
 4   Type_Travel              32154 non-null  object 
 5   Travel_Class             35602 non-null  object 
 6   Travel_Distance          35602 non-null  int64  
 7   Departure_Delay_in_Mins  35573 non-null  float64
 8   Arrival_Delay_in_Mins    35479 non-null  float64
 9   dataset_type             35602 non-null  object 
dtypes: float64(3), int64(2), object(5)
memory usage: 2.7+ MB


## Union the two tables

In [6]:
df_travel = pd.concat([df1, df2])

In [7]:
df_travel.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 129981 entries, 0 to 35601
Data columns (total 10 columns):
 #   Column                   Non-Null Count   Dtype  
---  ------                   --------------   -----  
 0   ID                       129981 non-null  int64  
 1   Gender                   129874 non-null  object 
 2   Customer_Type            117647 non-null  object 
 3   Age                      129937 non-null  float64
 4   Type_Travel              117307 non-null  object 
 5   Travel_Class             129981 non-null  object 
 6   Travel_Distance          129981 non-null  int64  
 7   Departure_Delay_in_Mins  129895 non-null  float64
 8   Arrival_Delay_in_Mins    129501 non-null  float64
 9   dataset_type             129981 non-null  object 
dtypes: float64(3), int64(2), object(5)
memory usage: 10.9+ MB


In [8]:
travel = df_travel.copy()

In [9]:
travel.head()

Unnamed: 0,ID,Gender,Customer_Type,Age,Type_Travel,Travel_Class,Travel_Distance,Departure_Delay_in_Mins,Arrival_Delay_in_Mins,dataset_type
0,98800001,Female,Loyal Customer,52.0,,Business,272,0.0,5.0,train
1,98800002,Male,Loyal Customer,48.0,Personal Travel,Eco,2200,9.0,0.0,train
2,98800003,Female,Loyal Customer,43.0,Business Travel,Business,1061,77.0,119.0,train
3,98800004,Female,Loyal Customer,44.0,Business Travel,Business,780,13.0,18.0,train
4,98800005,Female,Loyal Customer,50.0,Business Travel,Business,1981,0.0,0.0,train


In [10]:
travel.tail()

Unnamed: 0,ID,Gender,Customer_Type,Age,Type_Travel,Travel_Class,Travel_Distance,Departure_Delay_in_Mins,Arrival_Delay_in_Mins,dataset_type
35597,99935598,Male,Loyal Customer,8.0,Personal Travel,Eco,1334,0.0,0.0,test
35598,99935599,Female,Loyal Customer,53.0,Business Travel,Business,1772,0.0,0.0,test
35599,99935600,Male,Disloyal Customer,22.0,Business Travel,Eco,1180,0.0,0.0,test
35600,99935601,Female,Loyal Customer,67.0,Personal Travel,Eco,420,23.0,16.0,test
35601,99935602,Male,,20.0,Personal Travel,Eco,1680,0.0,0.0,test


### **Checking for missing values**

In [11]:
missing_values_travel = travel.isnull().sum()
missing_values_travel.sort_values(ascending=False)

Type_Travel                12674
Customer_Type              12334
Arrival_Delay_in_Mins        480
Gender                       107
Departure_Delay_in_Mins       86
Age                           44
ID                             0
Travel_Class                   0
Travel_Distance                0
dataset_type                   0
dtype: int64

In [12]:
share_missing_values_travel = missing_values_travel/travel.isnull().count()
share_missing_values_travel.sort_values(ascending=False)

Type_Travel                0.097507
Customer_Type              0.094891
Arrival_Delay_in_Mins      0.003693
Gender                     0.000823
Departure_Delay_in_Mins    0.000662
Age                        0.000339
ID                         0.000000
Travel_Class               0.000000
Travel_Distance            0.000000
dataset_type               0.000000
dtype: float64

## Treating missing values of travel data

In [13]:
def clean_travel(df):
    # Separating categories and numerics 
    cat_list = df.select_dtypes(['object']).columns.tolist()
    num_list = df.select_dtypes(['number']).columns.tolist()

    # Replacing NaN values in the data
    #10% of values for cust type and travel type are unknown - this could be significant
    #df.loc[df['Customer_Type'].isnull(),'Customer_Type'] =  df['Customer_Type'].mode()[0]
    df.loc[df['Customer_Type'].isnull(),'Customer_Type'] =  "Unknown"
    #df.loc[df['Type_Travel'].isnull(),'Type_Travel'] =  df['Type_Travel'].mode()[0]
    df.loc[df['Type_Travel'].isnull(),'Type_Travel'] =  "Unknown"
    # added this 
    df.loc[df['Gender'].isnull(),'Gender'] =  "Unknown"

    df.loc[df['Age'].isnull(),'Age'] =  df['Age'].mean()
    
    df.loc[df['Departure_Delay_in_Mins'].isnull(),'Departure_Delay_in_Mins'] = df['Departure_Delay_in_Mins'].median() 
    df.loc[df['Arrival_Delay_in_Mins'].isnull(),'Arrival_Delay_in_Mins'] = df['Arrival_Delay_in_Mins'].median() 

    df.loc[df['Gender'].isnull(),'Gender'] = df['Gender'].mode() 
    #dropping Gender Na would lose too much info however less than 0.1% of samples are unknown.  we will just use mode
    
    for col in cat_list:
        df[col] = df[col].astype('category')
    
    return df

In [14]:
cleaned_travel = clean_travel(travel)

In [15]:
cleaned_travel.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 129981 entries, 0 to 35601
Data columns (total 10 columns):
 #   Column                   Non-Null Count   Dtype   
---  ------                   --------------   -----   
 0   ID                       129981 non-null  int64   
 1   Gender                   129981 non-null  category
 2   Customer_Type            129981 non-null  category
 3   Age                      129981 non-null  float64 
 4   Type_Travel              129981 non-null  category
 5   Travel_Class             129981 non-null  category
 6   Travel_Distance          129981 non-null  int64   
 7   Departure_Delay_in_Mins  129981 non-null  float64 
 8   Arrival_Delay_in_Mins    129981 non-null  float64 
 9   dataset_type             129981 non-null  category
dtypes: category(5), float64(3), int64(2)
memory usage: 6.6 MB


## Survey Datasets & Treatment of missing values

In [16]:
df3 =pd.read_csv("Surveydata_train.csv")
df3['dataset_type'] = 'train'

In [17]:
df3.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 94379 entries, 0 to 94378
Data columns (total 18 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   ID                       94379 non-null  int64 
 1   Overall_Experience       94379 non-null  int64 
 2   Seat_Comfort             94318 non-null  object
 3   Seat_Class               94379 non-null  object
 4   Arrival_Time_Convenient  85449 non-null  object
 5   Catering                 85638 non-null  object
 6   Platform_Location        94349 non-null  object
 7   Onboard_Wifi_Service     94349 non-null  object
 8   Onboard_Entertainment    94361 non-null  object
 9   Online_Support           94288 non-null  object
 10  Ease_of_Online_Booking   94306 non-null  object
 11  Onboard_Service          86778 non-null  object
 12  Legroom                  94289 non-null  object
 13  Baggage_Handling         94237 non-null  object
 14  CheckIn_Service          94302 non-nul

In [18]:
# it's not a 50/50 split balanced but it means that in general in our training data we have more customers who were satisfied (54%)
df3.groupby(['Overall_Experience'])['Overall_Experience'].count()

Overall_Experience
0    42786
1    51593
Name: Overall_Experience, dtype: int64

In [19]:
df4 = pd.read_csv('Surveydata_test.csv')
df4['dataset_type'] = 'test'

In [20]:
# Notice that in the survery data TEST set we don't have the column Overall_Experience. It's normal as our models will need to predict it. 
# However, we will need to exclude a treatment of missing vsalues for that column when we merge the two datasets.
df4.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 35602 entries, 0 to 35601
Data columns (total 17 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   ID                       35602 non-null  int64 
 1   Seat_Comfort             35580 non-null  object
 2   Seat_Class               35602 non-null  object
 3   Arrival_Time_Convenient  32277 non-null  object
 4   Catering                 32245 non-null  object
 5   Platform_Location        35590 non-null  object
 6   Onboard_Wifi_Service     35590 non-null  object
 7   Onboard_Entertainment    35594 non-null  object
 8   Online_Support           35576 non-null  object
 9   Ease_of_Online_Booking   35584 non-null  object
 10  Onboard_Service          32730 non-null  object
 11  Legroom                  35577 non-null  object
 12  Baggage_Handling         35562 non-null  object
 13  CheckIn_Service          35580 non-null  object
 14  Cleanliness              35600 non-nul

## Union the two tables

In [21]:
df_survey = pd.concat([df3, df4])

In [22]:
df_survey.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 129981 entries, 0 to 35601
Data columns (total 18 columns):
 #   Column                   Non-Null Count   Dtype  
---  ------                   --------------   -----  
 0   ID                       129981 non-null  int64  
 1   Overall_Experience       94379 non-null   float64
 2   Seat_Comfort             129898 non-null  object 
 3   Seat_Class               129981 non-null  object 
 4   Arrival_Time_Convenient  117726 non-null  object 
 5   Catering                 117883 non-null  object 
 6   Platform_Location        129939 non-null  object 
 7   Onboard_Wifi_Service     129939 non-null  object 
 8   Onboard_Entertainment    129955 non-null  object 
 9   Online_Support           129864 non-null  object 
 10  Ease_of_Online_Booking   129890 non-null  object 
 11  Onboard_Service          119508 non-null  object 
 12  Legroom                  129866 non-null  object 
 13  Baggage_Handling         129799 non-null  object 
 14  Check

In [23]:
survey =df_survey.copy()

In [24]:
survey.head()

Unnamed: 0,ID,Overall_Experience,Seat_Comfort,Seat_Class,Arrival_Time_Convenient,Catering,Platform_Location,Onboard_Wifi_Service,Onboard_Entertainment,Online_Support,Ease_of_Online_Booking,Onboard_Service,Legroom,Baggage_Handling,CheckIn_Service,Cleanliness,Online_Boarding,dataset_type
0,98800001,0.0,Needs Improvement,Green Car,Excellent,Excellent,Very Convenient,Good,Needs Improvement,Acceptable,Needs Improvement,Needs Improvement,Acceptable,Needs Improvement,Good,Needs Improvement,Poor,train
1,98800002,0.0,Poor,Ordinary,Excellent,Poor,Needs Improvement,Good,Poor,Good,Good,Excellent,Needs Improvement,Poor,Needs Improvement,Good,Good,train
2,98800003,1.0,Needs Improvement,Green Car,Needs Improvement,Needs Improvement,Needs Improvement,Needs Improvement,Good,Excellent,Excellent,Excellent,Excellent,Excellent,Good,Excellent,Excellent,train
3,98800004,0.0,Acceptable,Ordinary,Needs Improvement,,Needs Improvement,Acceptable,Needs Improvement,Acceptable,Acceptable,Acceptable,Acceptable,Acceptable,Good,Acceptable,Acceptable,train
4,98800005,1.0,Acceptable,Ordinary,Acceptable,Acceptable,Manageable,Needs Improvement,Good,Excellent,Good,Good,Good,Good,Good,Good,Good,train


In [25]:
survey.tail()

Unnamed: 0,ID,Overall_Experience,Seat_Comfort,Seat_Class,Arrival_Time_Convenient,Catering,Platform_Location,Onboard_Wifi_Service,Onboard_Entertainment,Online_Support,Ease_of_Online_Booking,Onboard_Service,Legroom,Baggage_Handling,CheckIn_Service,Cleanliness,Online_Boarding,dataset_type
35597,99935598,,Needs Improvement,Green Car,Excellent,Needs Improvement,Manageable,Acceptable,Needs Improvement,Acceptable,Acceptable,Good,Excellent,Good,Acceptable,Good,Acceptable,test
35598,99935599,,Needs Improvement,Ordinary,Needs Improvement,Good,Needs Improvement,Acceptable,Excellent,Excellent,Good,Good,Good,Good,Acceptable,Good,Good,test
35599,99935600,,Good,Green Car,Extremely Poor,Good,Needs Improvement,Needs Improvement,Good,Poor,Needs Improvement,Poor,Acceptable,Poor,Poor,Excellent,Needs Improvement,test
35600,99935601,,Excellent,Ordinary,Excellent,Excellent,Inconvenient,Acceptable,Excellent,Good,Excellent,Excellent,Excellent,Excellent,Acceptable,Excellent,Good,test
35601,99935602,,Good,Ordinary,Acceptable,Good,Manageable,Poor,Good,Poor,Poor,Acceptable,Good,Good,Needs Improvement,Good,Poor,test


## Checking for missing values

In [26]:
missing_values_survey = survey.isnull().sum()
missing_values_survey.sort_values(ascending=False)

Overall_Experience         35602
Arrival_Time_Convenient    12255
Catering                   12098
Onboard_Service            10473
Baggage_Handling             182
Online_Support               117
Legroom                      115
CheckIn_Service               99
Ease_of_Online_Booking        91
Seat_Comfort                  83
Platform_Location             42
Onboard_Wifi_Service          42
Onboard_Entertainment         26
Cleanliness                    8
Online_Boarding                8
ID                             0
Seat_Class                     0
dataset_type                   0
dtype: int64

In [27]:
share_missing_values_survey = missing_values_survey/survey.isnull().count()
share_missing_values_survey.sort_values(ascending=False)

Overall_Experience         0.273902
Arrival_Time_Convenient    0.094283
Catering                   0.093075
Onboard_Service            0.080573
Baggage_Handling           0.001400
Online_Support             0.000900
Legroom                    0.000885
CheckIn_Service            0.000762
Ease_of_Online_Booking     0.000700
Seat_Comfort               0.000639
Platform_Location          0.000323
Onboard_Wifi_Service       0.000323
Onboard_Entertainment      0.000200
Cleanliness                0.000062
Online_Boarding            0.000062
ID                         0.000000
Seat_Class                 0.000000
dataset_type               0.000000
dtype: float64

## Treating missing values of Survey Data

In [28]:
# to not treat the values for the test set, we don't have missing values in the train set
survey.drop('Overall_Experience',axis=1,inplace=True)

In [29]:
# I had to do it separately otherwise missing values remain but by doing it before hand it works
survey['Platform_Location'].fillna('no answer', inplace = True)

In [30]:
#this one cell cleans all the data for the survey train/test
def clean_survey(df):
    #create a list of ratings columns. Exclude ID and Overall experience
    #remove seat class - it is a classifier
    #reomve Platform location for separate treatment
    cols_rating = df.columns[2:]
    cols_rating = cols_rating.drop(['Seat_Class','Platform_Location'])
    #fill NA in both train and test
    for i in cols_rating:
        df[i].fillna('no answer', inplace = True)
    #***NOTE - I moved no answer to the middle value instead of the low value.  may want to test both.
    #***NOTE - What happens if we increase the weights on thses numbers doubled? squared? etc
    #set values to replace
    names = [
        'Excellent',
        'Good',
        'Acceptable',
        'no answer',
        'Needs Improvement',
        'Poor',
        'Extremely Poor'
    ]
    #set values to replace for Platform_Location
    names_plat = [
        'Very Convenient',
        'Convenient',
        'Manageable',
        'no answer',
        'Needs Improvement', 
        'Inconvenient',
        'Very Inconvenient',
    ]
    #numbers that will replace inputs
    numbers = [6,5,4,3,2,1,0]
    #replace the values in the dataframes
    for i in df[cols_rating]:
        df[i].replace(to_replace = names, value = numbers, inplace=True)
    #Also replace 'Platform_Location' column with numbers
    df['Platform_Location'].replace(to_replace = names_plat, value = numbers, inplace=True)
    #Set seat class as a category
    df['Seat_Class'].astype('category')
    df['Platform_Locatio'].astype('category')
    for i in cols_rating:
        df[i].astype('category')
    
    return df

In [31]:
cleaned_survey = clean_survey(survey)

KeyError: 'Platform_Locatio'

In [None]:
cleaned_survey.info()

In [None]:
survey_train_set = cleaned_survey[cleaned_survey['dataset_type']=='train']

In [None]:
survey_train_set.info()

In [None]:
df3_overall_experience = df3[['ID','Overall_Experience']]

In [None]:
survey_train_set = survey_train_set.merge(df3_overall_experience,on='ID',how='inner')

In [None]:
survey_train_set.info()

In [None]:
survey_train_set.head().T

In [None]:
survey_test_set = survey[survey['dataset_type']=='test']

In [None]:
cleaned_survey = pd.concat([survey_train_set, survey_test_set])

In [None]:
cleaned_survey.info()

In [None]:
cleaned_survey.head().T

# Joining Travel + Survey treated DFs

In [None]:
df = cleaned_travel.merge(cleaned_survey,on='ID',how='inner')

In [None]:
df.info()

In [None]:
cols = df.select_dtypes(['object']).columns.tolist()
cols.append('Overall_Experience')
for i in cols:
    df[i] = df[i].astype('category')

In [None]:
df.info()

In [None]:
df.head().T

In [None]:
df.tail().T

# Descriptive Analysis

### **Analyzing Summary Statistics of the dataset**

In [None]:
df.drop(columns=['ID','dataset_type_x','dataset_type_y'],axis=1,inplace=True)

In [None]:
# Analyzing the summary statistics for numerical variables
df.describe().T

In [None]:
df.describe(include=['category']).T

In [None]:
# Checking the count of unique values in each categorical column 
cols_cat= df.select_dtypes(['category'])

for i in cols_cat.columns:
    print('Unique values in',i, 'are :')
    print(cols_cat[i])
    print('Nbr of missing values',cols_cat[i].isnull().sum())    
    print(cols_cat[i].value_counts())    
    print('*'*40)

In [None]:
def histogram_boxplot(feature, figsize=(15,10), bins = None):
    """ Boxplot and histogram combined
    feature: 1-d feature array
    figsize: size of fig (default (9,8))
    bins: number of bins (default None / auto)
    """
    f2, (ax_box2, ax_hist2) = plt.subplots(nrows = 2, # Number of rows of the subplot grid= 2
                                           sharex = True, # x-axis will be shared among all subplots                                        
                                           gridspec_kw = {"height_ratios": (.25, .75)}, 
                                           figsize = figsize,                                        
                                           ) # creating the 2 subplots
    print(col)
    print('Skew :', round(cols_not_cat[col].skew(), 2))
    sns.boxplot(feature, ax=ax_box2, showmeans=True, color='violet') # boxplot will be created and a star will indicate the mean value of the column
    sns.distplot(feature, kde=F, ax=ax_hist2, bins=bins,palette="winter") if bins else sns.distplot(feature, kde=False, ax=ax_hist2) # For histogram
    ax_hist2.axvline(np.mean(feature), color='green', linestyle='--') # Add mean to the histogram
    ax_hist2.axvline(np.median(feature), color='black', linestyle='-') # Add median to the histogram
    plt.show()

In [None]:
cols_not_cat = df.select_dtypes(exclude=['category'])

In [None]:
cols_not_cat.columns

In [None]:
for col in cols_not_cat.columns:
    histogram_boxplot(cols_not_cat[col])

# Bivariate Analysis

In [None]:
independent_variables = df.loc[:,df.columns != 'Overall_Experience']

In [None]:
for col in cols_not_cat.columns:
    sns.boxplot(df["Overall_Experience"],cols_not_cat[col],palette="PuBu")
    plt.show()

In [None]:
def stacked_plot(x):
    sns.set(palette='nipy_spectral')
    tab1 = pd.crosstab(x,df['Overall_Experience'],margins=True)
    tab2 = pd.crosstab(x,df['Overall_Experience'],margins=True,normalize='index')
    print(tab1)
    print(tab2)    
    print('-'*120)
    tab = pd.crosstab(x,df['Overall_Experience'],normalize='index')
    tab.plot(kind='bar',stacked=True,figsize=(10,5))
    plt.legend(loc='lower left', frameon=False)
    plt.legend(loc="upper left", bbox_to_anchor=(1,1))
    plt.show()

In [None]:
for col in independent_variables.columns:
    stacked_plot(independent_variables[col])
    plt.show()

In [None]:
# Separating numerical variables
numerical_col = df.select_dtypes(include=np.number).columns.tolist()

# Building correlation matrix for numerical columns
corr = df[numerical_col].corr()

# ploting the heatmap
plt.figure(figsize=(12,8))
sns.heatmap(corr,cmap='coolwarm',vmax=1,vmin=-1, annot = True,
        fmt=".2f",
        xticklabels=corr.columns,
        yticklabels=corr.columns);

In [None]:
# sns.pairplot(df, hue='Overall_Experience')
# df[numerical_col]

In [None]:
# Outlier columns 
outline_col = ['Travel_Distance', 'Departure_Delay_in_Mins', 'Arrival_Delay_in_Mins',     
       'Online_Support', 'CheckIn_Service', 'Cleanliness']

for col in outline_col:
    
    print('Skew :', round(df[col].skew(), 2))
    
    plt.figure(figsize = (15, 4))
    
    plt.subplot(1,2,1)
   
    plt.title(col + ' Histogram')
    
    df[col].hist(bins = 10, grid = False)
    
#     plt.tite
    plt.xlabel(col)
    
    plt.ylabel('count')
    
    plt.subplot(1, 2, 2)
    
    plt.title(col + ' Boxplot')
    sns.boxplot(x = df[col])
    
    plt.show()

# Treatment of Outliers

We really need to be carefull here as we have a lot of features that are highly right skewed and some of them could be potential signal for our target variable:

- Departure/Arrival delays 
-> typically, if we go on a trip and the delays are very long we would tend to be disatisfied so we need to think about it, maybe we can cap it or create a new column where we could segment (less than 60 min, 60 min to 120 min, 120 min to 180 min, more than 180 min)



## Replacing Outliers with Median Values
    In this technique, we replace the extreme values with median values. It is advised to not use mean values as they are affected by outliers. The first line of code below prints the 50th percentile value, or the median, which comes out to be 140. The second line prints the 95th percentile value, which comes out to be around 326. The third line of code below replaces all those values in the 'Loan_amount' variable, which are greater than the 95th percentile, with the median value. Finally, the fourth line prints summary statistics after all these techniques have been employed for outlier treatment. (Ref: https://www.pluralsight.com/guides/cleaning-up-data-from-outliers)

In [None]:
def outlier_treament(df,col):
    middle_num = (df[col].quantile(0.50)) 
    quartile_95 = (df[col].quantile(0.95)) 
    print('Median:',middle_num)
    treated_col = np.where(df[col] > quartile_95, middle_num, df[col])
    return treated_col
# df.describe()

In [None]:
# Treating the cols with the median values. 

for col_names in outline_col:
#     print(col_names)
    df[col_names] = outlier_treament(df,col_names)


In [None]:
# Numerical columns 
for col in outline_col:
    
    print('Skew :', round(df[col].skew(), 2))
    
    plt.figure(figsize = (15, 4))
    
    plt.subplot(1,2,1)
   
    plt.title(col + ' Histogram')
    
    df[col].hist(bins = 10, grid = False)
    
#     plt.tite
    plt.xlabel(col)
    
    plt.ylabel('count')
    
    plt.subplot(1, 2, 2)
    
    plt.title(col + ' Boxplot')
    sns.boxplot(x = df[col])
    
    plt.show()

In [None]:
# Rechecking the number of cats for the category variables
for col in df.columns:
#     print(trav_train[col].dtypes) #dtypes
    if df[col].dtypes == 'category':
        print(col,' Categories:',df[col].unique())
        print(round((df[col].value_counts()/df.shape[0])*100,3),'\n')

# Suggestions 
## 5.2 Quantile based flooring and capping
In this technique, the outlier is capped at a certain value above the 90th percentile value or floored at a factor below the 10th percentile value.
    - Reference: https://www.analyticsvidhya.com/blog/2021/05/detecting-and-treating-outliers-treating-the-odd-one-out/


In [None]:
# def treat_outliers(df,col):
#     '''
#     treats outliers in a variable
#     col: str, name of the numerical varaible
#     df: data frame
#     col: name of the column
#     '''
    
#     Q1=df[col].quantile(0.25) # 25th quantile
#     Q3=df[col].quantile(0.75)  # 75th quantile
#     IQR=Q3-Q1   # IQR Range
#     Lower_Whisker = Q1-IQR*1.5  #define lower whisker
#     Upper_Whisker = Q3+IQR*1.5  # define upper Whisker
#     df[col] = np.clip(df[col], Lower_Whisker, Upper_Whisker) # all the values samller than Lower_Whisker will be assigned value of Lower_whisker 
#                                                              # and all the values above upper_whishker will be assigned value of upper_Whisker 
#     return df

# def treat_outliers_all(df, col_list):
#     '''
#     treat outlier in all numerical varaibles
#     col_list: list of numerical varaibles
#     df: data frame
#     '''
#     for c in col_list:
#         df = treat_outliers(df,c)
        
#     return df

In [None]:
# Drop the dependent variable from the dataframe and create the X(independent variable) matrix

X = df.drop(columns = 'Overall_Experience')
# reminder 1 = satisfied, 0 = unsatisfied 

# Create dummy variables for the categorical variables - Hint: use the get_dummies() function
X = pd.get_dummies(X, drop_first=True)

y = df['Overall_Experience']

In [None]:
#scaler = StandardScaler()
#scaler = MinMaxScaler()
scaler = RobustScaler()

# Applying fit_transform on the training features data
X_scale = scaler.fit_transform(X)

# The above scaler returns the data in array format, below we are converting it back to pandas DataFrame
X_scale = pd.DataFrame(X_scale, index = X.index, columns = X.columns)

#change the X data to the Xscale data
X = X_scale

In [None]:
#split out the test and train data sets
X = X.loc[df['dataset_type'] = 'train']
solution_test = X.loc[df['dataset_type'] = 'test']
#drop the test data from y
y = y.drop(col)

In [None]:
# Split the data into training and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.30, random_state = 1)

In [None]:
#creating metric function 
def metrics_score(actual, predicted):
    print(classification_report(actual, predicted))
    cm = confusion_matrix(actual, predicted)
    plt.figure(figsize=(8,5))
    sns.heatmap(cm, annot=True,  fmt='.2f', xticklabels=['Not Eligible', 'Eligible'], yticklabels=['Not Eligible', 'Eligible'])
    plt.ylabel('Actual')
    plt.xlabel('Predicted')
    plt.show()

In [None]:
# Define Logistic Regression model 
log_reg= LogisticRegression(random_state=1)

# Fit the model
model = log_reg.fit(X_train, y_train)
model

In [None]:
pd.Series(log_reg.coef_[0], index = X_train.columns).sort_values(ascending = False)

In [None]:
#Predict for train set
y_pred_train = model.predict(X_train)

#checking the performance on the test dataset
metrics_score(y_train, y_pred_train)

In [None]:
#Predict for test set using the optimal threshold
y_pred_test = log_reg.predict(X_test)

#checking the performance on the test dataset
metrics_score(y_test, y_pred_test)

In [None]:
# predict_proba gives the probability of each observation belonging to each class

y_scores = log_reg.predict_proba(X_train) 

precisions, recalls, thresholds = precision_recall_curve(y_train, y_scores[:,1])

# Plotting values of precisions, recalls, and thresholds
plt.figure(figsize = (10, 7))

plt.plot(thresholds, precisions[:-1], 'b--', label = 'precision')

plt.plot(thresholds, recalls[:-1], 'g--', label = 'recall')

plt.xlabel('Threshold')

plt.legend(loc = 'upper left')

plt.ylim([0, 1])

plt.show()

In [None]:
# Calculating the exact threshold where precision and recall are equal
for i in np.arange(len(thresholds)):
    if precisions[i] == recalls[i]:
        optimal_threshold = thresholds[i]
        print("optimal_threshold is ", optimal_threshold,)

metrics_score(y_train, y_scores[:, 1] > optimal_threshold)

In [None]:
#Predict for test set using the optimal threshold
y_pred_test = log_reg.predict_proba(X_test)

#checking the performance on the test dataset
metrics_score(y_test, y_pred_test[:, 1] > optimal_threshold)

In [None]:
# Printing the coefficients of logistic regression
# Finding the odds
odds = np.exp(log_reg.coef_[0]) 

# Adding the odds to a dataframe and sorting the values
pd.DataFrame(odds, X_train.columns, columns = ['odds']).sort_values(by = 'odds', ascending = False) 

In [None]:
#Defining Decision tree model with class weights class_weight={0: 0.2, 1: 0.8}
dt = DecisionTreeClassifier(class_weight = {0: 0.2, 1: 0.8}, random_state = 1)

In [None]:
#fitting Decision tree model
dt.fit(X_train, y_train)

In [None]:
# Checking performance on the training data

y_train_pred_dt = dt.predict(X_train)

metrics_score(y_train, y_train_pred_dt)

In [None]:
# Checking performance on the testing data
y_test_pred_dt = dt.predict(X_test)

metrics_score(y_test, y_test_pred_dt)

In [None]:
# Choose the type of classifier
dtree_estimator = DecisionTreeClassifier(class_weight = {0: 0.6, 1: 0.4}, random_state = 1,)
#this class weight indicates a preference for identifying defaulters
# Grid of parameters to choose from
parameters = {'max_depth': np.arange(2, 5), 
              'criterion': ['gini', 'entropy'],
              'min_samples_leaf': list(range(5,20,5))
             }

# Type of scoring used to compare parameter combinations
scorer = metrics.make_scorer(recall_score, pos_label = 1)

# Run the grid search
gridCV = GridSearchCV(dtree_estimator, parameters, scoring = scorer, cv = 10)

# Fitting the grid search on the train data
gridCV = gridCV.fit(X_train, y_train)

# Set the classifier to the best combination of parameters
dtree_estimator = gridCV.best_estimator_

# Fit the best estimator to the data
dtree_estimator.fit(X_train, y_train)

In [None]:
# Checking performance on the training data based on the tuned model
y_train_pred_dt = dtree_estimator.predict(X_train)

metrics_score(y_train, y_train_pred_dt)

In [None]:
# Checking performance on the testing data based on the tuned model
y_test_pred_dt = dtree_estimator.predict(X_test)

metrics_score(y_test, y_test_pred_dt)

In [None]:
importances = dtree_estimator.feature_importances_

columns = X.columns

importance_df = pd.DataFrame(importances, index = columns, columns = ['Importance']).sort_values(by = 'Importance', ascending = False)

plt.figure(figsize = (13, 13))

sns.barplot(importance_df.Importance, importance_df.index)

In [None]:
# Plot the decision  tree and analyze it to build the decision rule

features = list(X.columns)

plt.figure(figsize = (30, 20))

tree.plot_tree(dtree_estimator, feature_names = features, filled = True, fontsize = 12, node_ids = True, class_names = class_names)

plt.show()

In [None]:
text_representation = tree.export_text(dtree_estimator,feature_names = features, show_weights=True)
print(text_representation)

In [None]:
!pip install dtreeviz
#this is a cool visual I used in my capstone

In [None]:
from dtreeviz.trees import dtreeviz # remember to load the package

viz = dtreeviz(dtree_estimator, X, y,
                target_name="BAD",
                feature_names=features,
                class_names=class_names,
                #orientation='LR'
               )
viz

RANDOM forest

In [None]:
# Checking performance on the training data
y_train_pred_dt = dt.predict(X_train)
metrics_score(y_train, y_train_pred_dt)

In [None]:
# Checking performance on the testing data
y_test_pred_dt = dt.predict(X_test)
metrics_score(y_test, y_test_pred_dt)

In [None]:
# Choose the type of classifier
dtree_estimator = DecisionTreeClassifier(class_weight = {0: 0.6, 1: 0.4}, random_state = 1,)
#this class weight indicates a preference for identifying defaulters
# Grid of parameters to choose from
parameters = {'max_depth': np.arange(2, 5), 
              'criterion': ['gini', 'entropy'],
              'min_samples_leaf': list(range(5,20,5))
             }

# Type of scoring used to compare parameter combinations
scorer = metrics.make_scorer(recall_score, pos_label = 1)

# Run the grid search
gridCV = GridSearchCV(dtree_estimator, parameters, scoring = scorer, cv = 10)

# Fitting the grid search on the train data
gridCV = gridCV.fit(X_train, y_train)

# Set the classifier to the best combination of parameters
dtree_estimator = gridCV.best_estimator_

# Fit the best estimator to the data
dtree_estimator.fit(X_train, y_train)

In [None]:
# Checking performance on the training data based on the tuned model
y_train_pred_dt = dtree_estimator.predict(X_train)
metrics_score(y_train, y_train_pred_dt)

In [None]:
# Checking performance on the testing data based on the tuned model
y_test_pred_dt = dtree_estimator.predict(X_test)
metrics_score(y_test, y_test_pred_dt)

In [None]:
importances = dtree_estimator.feature_importances_

columns = X.columns

importance_df = pd.DataFrame(importances, index = columns, columns = ['Importance']).sort_values(by = 'Importance', ascending = False)

plt.figure(figsize = (13, 13))

sns.barplot(importance_df.Importance, importance_df.index)

In [None]:
# Plot the decision  tree and analyze it to build the decision rule

features = list(X.columns)

plt.figure(figsize = (30, 20))

tree.plot_tree(dtree_estimator, feature_names = features, filled = True, fontsize = 12, node_ids = True, class_names = class_names)

plt.show()

In [None]:
!pip install dtreeviz

In [None]:
from dtreeviz.trees import dtreeviz # remember to load the package

viz = dtreeviz(dtree_estimator, X, y,
                target_name="BAD",
                feature_names=features,
                class_names=class_names,
                #orientation='LR'
               )
viz

Random Forest

In [None]:
# Defining Random forest CLassifier
rf_estimator = RandomForestClassifier(random_state = 1)
rf_estimator.fit(X_train, y_train)

In [None]:
#Checking performance on the training data
y_pred_train_rf = rf_estimator.predict(X_train)
metrics_score(y_train, y_pred_train_rf)

In [None]:
# Checking performance on the test data
y_pred_test_rf = rf_estimator.predict(X_test)
metrics_score(y_test, y_pred_test_rf)

In [None]:
# Defining Random Forest model with class weights class_weight={0: 0.2, 1: 0.8}
rf_estimator_weighted = RandomForestClassifier(class_weight = {0: 0.2, 1: 0.8}, random_state = 1)
# Fitting the Random Forest model
rf_estimator_weighted.fit(X_train, y_train)

In [None]:
# Checking performance on the train data
y_pred_train_rf = rf_estimator_weighted.predict(X_train)
metrics_score(y_train, y_pred_train_rf)

In [None]:
# Checking performance on the test data
y_pred_test_rf = rf_estimator_weighted.predict(X_test)
metrics_score(y_test, y_pred_test_rf)

In [None]:
params_rf = {  
        "n_estimators": [100, 250, 500],
        "min_samples_leaf": np.arange(1, 4, 1),
        "max_features": [0.7, 0.9, 'auto'],
}

In [None]:
# Choose the type of classifier
rf_estimator_tuned = RandomForestClassifier(class_weight = {0: 0.8, 1: 0.2}, random_state = 1)

# Grid of parameters to choose from
params_rf = {  
        "n_estimators": [100, 250, 500],
        "min_samples_leaf": np.arange(1, 4, 1),
        "max_features": [0.7, 0.9, 'auto'],
}


# Type of scoring used to compare parameter combinations - recall score for class 1
scorer = metrics.make_scorer(recall_score, pos_label = 1)

# Run the grid search
grid_obj = GridSearchCV(rf_estimator_tuned, params_rf, scoring = scorer, cv = 5)

grid_obj = grid_obj.fit(X_train, y_train)

# Set the classifier to the best combination of parameters
rf_estimator_tuned = grid_obj.best_estimator_

In [None]:
# Checking performance on the train data
y_pred_train_rf = rf_estimator_tuned.predict(X_train)

metrics_score(y_train, y_pred_train_rf)

In [None]:
# Checking performance on the test data
y_pred_test_rf = rf_estimator_tuned.predict(X_test)

metrics_score(y_test, y_pred_test_rf)

In [None]:
# importance of features in the tree building ( The importance of a feature is computed as the 
#(normalized) total reduction of the criterion brought by that feature. It is also known as the Gini importance )
# Checking performace on test dataset
importances = rf_estimator_tuned.feature_importances_

columns = X.columns

importance_df = pd.DataFrame(importances, index = columns, columns = ['Importance']).sort_values(by = 'Importance', ascending = False)

plt.figure(figsize = (13, 13))

sns.barplot(importance_df.Importance, importance_df.index)

Comparing model results

In [None]:
def get_recall_score(model,flag=True,X_train=X_train,X_test=X_test):
    '''
    model : classifier to predict values of X

    '''
    a = [] # defining an empty list to store train and test results
    pred_train = model.predict(X_train)
    pred_test = model.predict(X_test)
    train_recall = metrics.recall_score(y_train,pred_train)
    test_recall = metrics.recall_score(y_test,pred_test)
    a.append(train_recall) # adding train recall to list 
    a.append(test_recall) # adding test recall to list
    
    # If the flag is set to True then only the following print statements will be dispayed. The default value is set to True.
    if flag == True: 
        print("Recall on training set : ",metrics.recall_score(y_train,pred_train))
        print("Recall on test set : ",metrics.recall_score(y_test,pred_test))
    
    return a # returning the list with train and test scores

In [None]:
##  Function to calculate precision score
def get_precision_score(model,flag=True,X_train=X_train,X_test=X_test):
    '''
    model : classifier to predict values of X

    '''
    b = []  # defining an empty list to store train and test results
    pred_train = model.predict(X_train)
    pred_test = model.predict(X_test)
    train_precision = metrics.precision_score(y_train,pred_train)
    test_precision = metrics.precision_score(y_test,pred_test)
    b.append(train_precision) # adding train precision to list
    b.append(test_precision) # adding test precision to list
    
    # If the flag is set to True then only the following print statements will be dispayed. The default value is set to True.
    if flag == True: 
        print("Precision on training set : ",metrics.precision_score(y_train,pred_train))
        print("Precision on test set : ",metrics.precision_score(y_test,pred_test))

    return b # returning the list with train and test scores

In [None]:
##  Function to calculate accuracy score
def get_accuracy_score(model,flag=True,X_train=X_train,X_test=X_test):
    '''
    model : classifier to predict values of X

    '''
    c = [] # defining an empty list to store train and test results
    train_acc = model.score(X_train,y_train)
    test_acc = model.score(X_test,y_test)
    c.append(train_acc) # adding train accuracy to list
    c.append(test_acc) # adding test accuracy to list
    
    # If the flag is set to True then only the following print statements will be dispayed. The default value is set to True.
    if flag == True:
        print("Accuracy on training set : ",model.score(X_train,y_train))
        print("Accuracy on test set : ",model.score(X_test,y_test))
    
    return c # returning the list with train and test scores

In [None]:
# Make the list of all the model names 

models = [rf_estimator_tuned, dtree_estimator, log_reg]


# defining empty lists to add train and test results
acc_train = []
acc_test = []
recall_train = []
recall_test = []
precision_train = []
precision_test = []

# looping through all the models to get the accuracy,recall and precision scores
for model in models:
     # accuracy score
    j = get_accuracy_score(model,False)
    acc_train.append(j[0])
    acc_test.append(j[1])

    # recall score
    k = get_recall_score(model,False)
    recall_train.append(k[0])
    recall_test.append(k[1])

    # precision score
    l = get_precision_score(model,False)
    precision_train.append(l[0])
    precision_test.append(l[1])

In [None]:
# Mention the Model names in the list. for example 'Model': ['Decision Tree', 'Tuned Decision Tree'..... write tht names of all model built]


comparison_frame = pd.DataFrame({'Model':['Random Forest', 'Decision Tree', 'Logistic Regression'], 
                                          'Train_Accuracy': acc_train,
                                          'Test_Accuracy': acc_test,
                                          'Train_Recall': recall_train,
                                          'Test_Recall': recall_test,
                                          'Train_Precision': precision_train,
                                          'Test_Precision': precision_test}) 
comparison_frame