In [251]:
import pandas as pd
import seaborn as sns
import numpy as np

from category_encoders.target_encoder import TargetEncoder
from category_encoders.binary import BinaryEncoder
from category_encoders.one_hot import OneHotEncoder
from sklearn.preprocessing import StandardScaler

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [226]:
data = pd.read_csv('data.csv')

In [227]:
data.columns

Index(['Gender', 'Customer Type', 'Age', 'Type of Travel', 'Class',
       'Flight Distance', 'Inflight wifi service',
       'Departure/Arrival time convenient', 'Ease of Online booking',
       'Gate location', 'Food and drink', 'Online boarding', 'Seat comfort',
       'Inflight entertainment', 'On-board service', 'Leg room service',
       'Baggage handling', 'Checkin service', 'Cleanliness',
       'Departure Delay in Minutes', 'Arrival Delay in Minutes',
       'satisfaction'],
      dtype='object')

In [228]:
gender_encoder = BinaryEncoder()
gender_encoder.fit(data['Gender'])
gender_encoded = gender_encoder.transform(data['Gender'])

for i in gender_encoded:
    data[i] = gender_encoded[i].astype('category')

In [229]:
type_of_travel_encoder = BinaryEncoder()
type_of_travel_encoder.fit(data['Type of Travel'])
type_of_travel_encoded = type_of_travel_encoder.transform(data['Type of Travel'])
for i in type_of_travel_encoded:
    data[i] = type_of_travel_encoded[i].astype('category')

In [230]:
data.drop('Gender', axis=1, inplace=True)
data.drop('Type of Travel', axis=1, inplace=True)

In [231]:
target_encoder_customer_type = TargetEncoder()
target_encoder_customer_type.fit(data['Customer Type'], data['satisfaction'])
target_encoded_customer_type = target_encoder_customer_type.transform(data['Customer Type'])
data['Customer Type'] = target_encoded_customer_type.astype('category')

In [232]:
target_encoder_class = TargetEncoder()
target_encoder_class.fit(data['Class'], data['satisfaction'])
target_encoded_class = target_encoder_class.transform(data['Class'])
data['Class'] = target_encoded_class.astype('category')
data

Unnamed: 0,Customer Type,Age,Class,Flight Distance,Inflight wifi service,Departure/Arrival time convenient,Ease of Online booking,Gate location,Food and drink,Online boarding,...,Baggage handling,Checkin service,Cleanliness,Departure Delay in Minutes,Arrival Delay in Minutes,satisfaction,Gender_0,Gender_1,Type of Travel_0,Type of Travel_1
0,0.478115,13,0.246414,460,3,4,3,1,5,3,...,4,4,5,25,18.0,False,0,1,0,1
1,0.239697,25,0.694434,235,3,2,3,3,1,3,...,3,1,1,1,6.0,False,0,1,1,0
2,0.478115,26,0.694434,1142,2,2,2,2,5,5,...,4,4,5,0,0.0,True,1,0,1,0
3,0.478115,25,0.694434,562,2,5,5,5,2,2,...,3,1,2,11,9.0,False,1,0,1,0
4,0.478115,61,0.694434,214,3,3,3,3,4,5,...,4,3,3,0,0.0,True,0,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
129875,0.239697,34,0.694434,526,3,3,3,1,4,3,...,4,4,4,0,0.0,False,0,1,1,0
129876,0.478115,23,0.694434,646,4,4,4,4,4,4,...,5,5,4,0,0.0,True,0,1,1,0
129877,0.478115,17,0.187673,828,2,5,1,5,2,1,...,4,5,2,0,0.0,False,1,0,0,1
129878,0.478115,14,0.694434,1127,3,3,3,3,4,4,...,5,4,4,0,0.0,True,0,1,1,0


In [233]:
Y = data['satisfaction']
data = data[['Customer Type', 'Age', 'Class', 'Flight Distance', 'Inflight wifi service',
             'Online boarding', 'Seat comfort', 'Inflight entertainment',
            'On-board service', 'Leg room service', 'Baggage handling', 'Checkin service',
            'Type of Travel_0', 'Type of Travel_1']].astype('category')
data

Unnamed: 0,Customer Type,Age,Class,Flight Distance,Inflight wifi service,Online boarding,Seat comfort,Inflight entertainment,On-board service,Leg room service,Baggage handling,Checkin service,Type of Travel_0,Type of Travel_1
0,0.478115,13,0.246414,460,3,3,5,5,4,3,4,4,0,1
1,0.239697,25,0.694434,235,3,3,1,1,1,5,3,1,1,0
2,0.478115,26,0.694434,1142,2,5,5,5,4,3,4,4,1,0
3,0.478115,25,0.694434,562,2,2,2,2,2,5,3,1,1,0
4,0.478115,61,0.694434,214,3,5,5,3,3,4,4,3,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
129875,0.239697,34,0.694434,526,3,3,4,4,3,2,4,4,1,0
129876,0.478115,23,0.694434,646,4,4,4,4,4,5,5,5,1,0
129877,0.478115,17,0.187673,828,2,1,2,2,4,3,4,5,0,1
129878,0.478115,14,0.694434,1127,3,4,4,4,3,2,5,4,1,0


In [234]:
data.dtypes

Customer Type             category
Age                       category
Class                     category
Flight Distance           category
Inflight wifi service     category
Online boarding           category
Seat comfort              category
Inflight entertainment    category
On-board service          category
Leg room service          category
Baggage handling          category
Checkin service           category
Type of Travel_0          category
Type of Travel_1          category
dtype: object

In [235]:
target_encoder_class = TargetEncoder()
target_encoder_class.fit(data[['Inflight wifi service',
             'Online boarding', 'Seat comfort', 'Inflight entertainment',
            'On-board service', 'Leg room service', 'Baggage handling', 'Checkin service']], 
                         Y)
target_encoded_class = target_encoder_class.transform(data[['Inflight wifi service',
             'Online boarding', 'Seat comfort', 'Inflight entertainment',
            'On-board service', 'Leg room service', 'Baggage handling', 'Checkin service']])
target_encoded_class

Unnamed: 0,Inflight wifi service,Online boarding,Seat comfort,Inflight entertainment,On-board service,Leg room service,Baggage handling,Checkin service
0,0.251825,0.137626,0.651336,0.650615,0.534584,0.276062,0.480636,0.460023
1,0.251825,0.137626,0.223325,0.141946,0.196659,0.614561,0.237979,0.239570
2,0.247215,0.870561,0.651336,0.650615,0.534584,0.276062,0.480636,0.460023
3,0.247215,0.114525,0.226024,0.212673,0.255463,0.614561,0.237979,0.239570
4,0.251825,0.870561,0.651336,0.273154,0.318093,0.583096,0.480636,0.450794
...,...,...,...,...,...,...,...,...
129875,0.251825,0.137626,0.560293,0.611128,0.318093,0.275265,0.480636,0.460023
129876,0.600767,0.623011,0.560293,0.611128,0.534584,0.614561,0.615916,0.611714
129877,0.247215,0.137772,0.226024,0.212673,0.534584,0.276062,0.480636,0.611714
129878,0.251825,0.623011,0.560293,0.611128,0.318093,0.275265,0.615916,0.460023


In [236]:
for col in target_encoded_class:
    data[col] = target_encoded_class[col].astype('category')
    
data

Unnamed: 0,Customer Type,Age,Class,Flight Distance,Inflight wifi service,Online boarding,Seat comfort,Inflight entertainment,On-board service,Leg room service,Baggage handling,Checkin service,Type of Travel_0,Type of Travel_1
0,0.478115,13,0.246414,460,0.251825,0.137626,0.651336,0.650615,0.534584,0.276062,0.480636,0.460023,0,1
1,0.239697,25,0.694434,235,0.251825,0.137626,0.223325,0.141946,0.196659,0.614561,0.237979,0.239570,1,0
2,0.478115,26,0.694434,1142,0.247215,0.870561,0.651336,0.650615,0.534584,0.276062,0.480636,0.460023,1,0
3,0.478115,25,0.694434,562,0.247215,0.114525,0.226024,0.212673,0.255463,0.614561,0.237979,0.239570,1,0
4,0.478115,61,0.694434,214,0.251825,0.870561,0.651336,0.273154,0.318093,0.583096,0.480636,0.450794,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
129875,0.239697,34,0.694434,526,0.251825,0.137626,0.560293,0.611128,0.318093,0.275265,0.480636,0.460023,1,0
129876,0.478115,23,0.694434,646,0.600767,0.623011,0.560293,0.611128,0.534584,0.614561,0.615916,0.611714,1,0
129877,0.478115,17,0.187673,828,0.247215,0.137772,0.226024,0.212673,0.534584,0.276062,0.480636,0.611714,0,1
129878,0.478115,14,0.694434,1127,0.251825,0.623011,0.560293,0.611128,0.318093,0.275265,0.615916,0.460023,1,0


In [237]:
standart = StandardScaler()
standart.fit(data)
standarted = standart.transform(data)

data = pd.DataFrame(data=standarted)
data

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
0,0.473422,-1.747961,-0.753696,-0.732184,-0.700903,-0.969711,1.128041,1.034720,0.607817,-0.934091,0.322726,0.205913,-1.493951,1.493951
1,-2.112281,-0.954274,1.041959,-0.957760,-0.700903,-0.969711,-1.098235,-1.400649,-1.443938,1.062039,-1.373321,-1.570263,0.669366,-0.669366
2,0.473422,-0.888133,1.041959,-0.048440,-0.718595,1.424654,1.128041,1.034720,0.607817,-0.934091,0.322726,0.205913,0.669366,-0.669366
3,0.473422,-0.954274,1.041959,-0.629924,-0.718595,-1.045176,-1.084198,-1.062026,-1.086904,1.062039,-1.373321,-1.570263,0.669366,-0.669366
4,0.473422,1.426788,1.041959,-0.978814,-0.700903,1.424654,1.128041,-0.772462,-0.706639,0.876495,0.322726,0.131558,0.669366,-0.669366
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
129875,-2.112281,-0.359008,1.041959,-0.666016,-0.700903,-0.969711,0.654484,0.845666,-0.706639,-0.938789,0.322726,0.205913,0.669366,-0.669366
129876,0.473422,-1.086555,1.041959,-0.545709,0.638223,0.615954,0.654484,0.845666,0.607817,1.062039,1.268269,1.428087,0.669366,-0.669366
129877,0.473422,-1.483399,-0.989130,-0.363243,-0.718595,-0.969232,-1.084198,-1.062026,0.607817,-0.934091,0.322726,1.428087,-1.493951,1.493951
129878,0.473422,-1.681821,1.041959,-0.063478,-0.700903,0.615954,0.654484,0.845666,-0.706639,-0.938789,1.268269,0.205913,0.669366,-0.669366


In [238]:
data.columns = ['Customer Type', 'Age', 'Class', 'Flight Distance', 'Inflight wifi service',
             'Online boarding', 'Seat comfort', 'Inflight entertainment',
            'On-board service', 'Leg room service', 'Baggage handling', 'Checkin service',
            'Type of Travel_0', 'Type of Travel_1']
data

Unnamed: 0,Customer Type,Age,Class,Flight Distance,Inflight wifi service,Online boarding,Seat comfort,Inflight entertainment,On-board service,Leg room service,Baggage handling,Checkin service,Type of Travel_0,Type of Travel_1
0,0.473422,-1.747961,-0.753696,-0.732184,-0.700903,-0.969711,1.128041,1.034720,0.607817,-0.934091,0.322726,0.205913,-1.493951,1.493951
1,-2.112281,-0.954274,1.041959,-0.957760,-0.700903,-0.969711,-1.098235,-1.400649,-1.443938,1.062039,-1.373321,-1.570263,0.669366,-0.669366
2,0.473422,-0.888133,1.041959,-0.048440,-0.718595,1.424654,1.128041,1.034720,0.607817,-0.934091,0.322726,0.205913,0.669366,-0.669366
3,0.473422,-0.954274,1.041959,-0.629924,-0.718595,-1.045176,-1.084198,-1.062026,-1.086904,1.062039,-1.373321,-1.570263,0.669366,-0.669366
4,0.473422,1.426788,1.041959,-0.978814,-0.700903,1.424654,1.128041,-0.772462,-0.706639,0.876495,0.322726,0.131558,0.669366,-0.669366
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
129875,-2.112281,-0.359008,1.041959,-0.666016,-0.700903,-0.969711,0.654484,0.845666,-0.706639,-0.938789,0.322726,0.205913,0.669366,-0.669366
129876,0.473422,-1.086555,1.041959,-0.545709,0.638223,0.615954,0.654484,0.845666,0.607817,1.062039,1.268269,1.428087,0.669366,-0.669366
129877,0.473422,-1.483399,-0.989130,-0.363243,-0.718595,-0.969232,-1.084198,-1.062026,0.607817,-0.934091,0.322726,1.428087,-1.493951,1.493951
129878,0.473422,-1.681821,1.041959,-0.063478,-0.700903,0.615954,0.654484,0.845666,-0.706639,-0.938789,1.268269,0.205913,0.669366,-0.669366


In [239]:
X_y = data.copy()
X_y['satisfaction'] = Y.copy()
X_y

Unnamed: 0,Customer Type,Age,Class,Flight Distance,Inflight wifi service,Online boarding,Seat comfort,Inflight entertainment,On-board service,Leg room service,Baggage handling,Checkin service,Type of Travel_0,Type of Travel_1,satisfaction
0,0.473422,-1.747961,-0.753696,-0.732184,-0.700903,-0.969711,1.128041,1.034720,0.607817,-0.934091,0.322726,0.205913,-1.493951,1.493951,False
1,-2.112281,-0.954274,1.041959,-0.957760,-0.700903,-0.969711,-1.098235,-1.400649,-1.443938,1.062039,-1.373321,-1.570263,0.669366,-0.669366,False
2,0.473422,-0.888133,1.041959,-0.048440,-0.718595,1.424654,1.128041,1.034720,0.607817,-0.934091,0.322726,0.205913,0.669366,-0.669366,True
3,0.473422,-0.954274,1.041959,-0.629924,-0.718595,-1.045176,-1.084198,-1.062026,-1.086904,1.062039,-1.373321,-1.570263,0.669366,-0.669366,False
4,0.473422,1.426788,1.041959,-0.978814,-0.700903,1.424654,1.128041,-0.772462,-0.706639,0.876495,0.322726,0.131558,0.669366,-0.669366,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
129875,-2.112281,-0.359008,1.041959,-0.666016,-0.700903,-0.969711,0.654484,0.845666,-0.706639,-0.938789,0.322726,0.205913,0.669366,-0.669366,False
129876,0.473422,-1.086555,1.041959,-0.545709,0.638223,0.615954,0.654484,0.845666,0.607817,1.062039,1.268269,1.428087,0.669366,-0.669366,True
129877,0.473422,-1.483399,-0.989130,-0.363243,-0.718595,-0.969232,-1.084198,-1.062026,0.607817,-0.934091,0.322726,1.428087,-1.493951,1.493951,False
129878,0.473422,-1.681821,1.041959,-0.063478,-0.700903,0.615954,0.654484,0.845666,-0.706639,-0.938789,1.268269,0.205913,0.669366,-0.669366,True


In [240]:
corr = X_y.corr()['satisfaction']
corr

Customer Type             0.186017
Age                       0.134091
Class                     0.503347
Flight Distance           0.298085
Inflight wifi service     0.525683
Online boarding           0.617545
Seat comfort              0.387856
Inflight entertainment    0.421433
On-board service          0.332280
Leg room service          0.342107
Baggage handling          0.288634
Checkin service           0.250395
Type of Travel_0          0.449861
Type of Travel_1         -0.449861
satisfaction              1.000000
Name: satisfaction, dtype: float64

In [243]:
X_train, X_test, y_train, y_test = train_test_split(data, Y, random_state=0)
X_train, X_val, Y_train, Y_val = train_test_split(X_train, y_train, random_state=0)

In [245]:
model = LogisticRegression(penalty='none')
model.fit(X_train, Y_train)



In [247]:
model.predict_proba(X_train)

array([[0.05546102, 0.94453898],
       [0.99559123, 0.00440877],
       [0.80947131, 0.19052869],
       ...,
       [0.10549247, 0.89450753],
       [0.99745067, 0.00254933],
       [0.92938199, 0.07061801]])

In [250]:
model.predict(X_train)

array([ True, False, False, ...,  True, False, False])

In [249]:
pred_arr = model.predict_proba(X_train)[:, 1] >= 0.5
pred_arr

array([ True, False, False, ...,  True, False, False])

In [253]:
print(recall_score(Y_train, model.predict(X_train)))

0.9074394191784266


In [255]:
print(accuracy_score(Y_train, model.predict(X_train)))

0.9230874522633012


In [256]:
print(precision_score(Y_train, model.predict(X_train)))

0.9152348950738604


In [257]:
print(f1_score(Y_train, model.predict(X_train)))

0.9113204867193787


In [258]:
print(f1_score(Y_val, model.predict(X_val)))

0.9106729086868114


In [267]:
best_threshold = None
best_f1 = -float('inf')

probabilities = model.predict_proba(X_val)[:, 1]
for i in range(101):
    threshold = 0.1 * i
    
    y_val_pred = (probabilities > threshold)
    f1 = f1_score(Y_val, y_val_pred)
    
    if f1 > best_f1:
        best_f1 = f1
        best_threshold = threshold

In [268]:
best_f1

0.9106729086868114

In [269]:
best_threshold

0.5

In [271]:
print(f1_score(y_test, model.predict(X_test)))

0.9098767637078049
