In [21]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn import tree
from sklearn import model_selection
from sklearn import metrics
from sklearn import *
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, roc_auc_score, f1_score, make_scorer
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.ensemble import RandomForestRegressor, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.model_selection import train_test_split, GridSearchCV


In [2]:
path = r'C:\Users\troyd\OneDrive\Desktop\data\AirPass.csv'
AirPass = pd.read_csv(path)

In [3]:
AirPass.drop('Unnamed: 0', axis=1, inplace=True)

In [4]:
AirPass.head(3)

Unnamed: 0,id,Gender,Customer Type,Age,Type of Travel,Class,Flight Distance,Inflight wifi service,Departure/Arrival time convenient,Ease of Online booking,...,Inflight entertainment,On-board service,Leg room service,Baggage handling,Checkin service,Inflight service,Cleanliness,Departure Delay in Minutes,Arrival Delay in Minutes,satisfaction
0,70172,Male,Loyal Customer,13,Personal Travel,Eco Plus,460,3,4,3,...,5,4,3,4,4,5,5,25,18.0,neutral or dissatisfied
1,5047,Male,disloyal Customer,25,Business travel,Business,235,3,2,3,...,1,1,5,3,1,4,1,1,6.0,neutral or dissatisfied
2,110028,Female,Loyal Customer,26,Business travel,Business,1142,2,2,2,...,5,4,3,4,4,4,5,0,0.0,satisfied


In [5]:
AirPass.isna().sum()

id                                     0
Gender                                 0
Customer Type                          0
Age                                    0
Type of Travel                         0
Class                                  0
Flight Distance                        0
Inflight wifi service                  0
Departure/Arrival time convenient      0
Ease of Online booking                 0
Gate location                          0
Food and drink                         0
Online boarding                        0
Seat comfort                           0
Inflight entertainment                 0
On-board service                       0
Leg room service                       0
Baggage handling                       0
Checkin service                        0
Inflight service                       0
Cleanliness                            0
Departure Delay in Minutes             0
Arrival Delay in Minutes             310
satisfaction                           0
dtype: int64

In [6]:
AirPass['Arrival Delay in Minutes'] = AirPass['Arrival Delay in Minutes'].fillna(AirPass['Arrival Delay in Minutes'].median())
AirPass['Arrival Delay in Minutes'].mean()

np.float64(15.133392362180475)

In [7]:
satisfy = AirPass.groupby(['Gender'])
satisfy['satisfaction'].value_counts(normalize=True)

Gender  satisfaction           
Female  neutral or dissatisfied    0.572629
        satisfied                  0.427371
Male    neutral or dissatisfied    0.560525
        satisfied                  0.439475
Name: proportion, dtype: float64

In [8]:
satisfy = AirPass.groupby(['Type of Travel'])
satisfy['satisfaction'].value_counts(normalize=True)

Type of Travel   satisfaction           
Business travel  satisfied                  0.582597
                 neutral or dissatisfied    0.417403
Personal Travel  neutral or dissatisfied    0.898322
                 satisfied                  0.101678
Name: proportion, dtype: float64

In [9]:
satisfy = AirPass.groupby(['Class'])
satisfy['satisfaction'].value_counts(normalize=True)

Class     satisfaction           
Business  satisfied                  0.694251
          neutral or dissatisfied    0.305749
Eco       neutral or dissatisfied    0.813862
          satisfied                  0.186138
Eco Plus  neutral or dissatisfied    0.753936
          satisfied                  0.246064
Name: proportion, dtype: float64

In [10]:
AirPass['satisfaction'] = AirPass['satisfaction'].map({'neutral or dissatisfied':0 , 'satisfied':1})
AirPass['Customer Type'] = AirPass['Customer Type'].map({'Loyal Customer':1, 'disloyal Customer':0})
AirPass['Type of Travel'] = AirPass['Type of Travel'].map({'Personal Travel':0, 'Business travel':1})
AirPass['Gender'] = AirPass['Gender'].map({'Male': 0, 'Female': 1})

In [11]:

AirPass=pd.get_dummies(AirPass)
AirPass.shape

(103904, 26)

In [12]:
Y = AirPass['satisfaction'] 
X = AirPass.drop('satisfaction', axis = 1)
 
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state = 26)
X_test.shape

(20781, 25)

In [13]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(X_train) 
X_train = scaler.transform(X_train) 
X_test = scaler.transform(X_test) 
X_test[0][0]

np.float64(0.9408251379303)

In [14]:
from sklearn.linear_model import LogisticRegression
model_lr = LogisticRegression()
model_lr.fit(X_train, Y_train)
preds_test = model_lr.predict(X_test)
f1_score(preds_test, Y_test)

0.8547046934650797

In [17]:
model_ada = AdaBoostClassifier(DecisionTreeClassifier(random_state=26),random_state=26,learning_rate=0.01)

model_ada.fit(X_train, Y_train)
preds_test = model_ada.predict(X_test)
f1_score(preds_test, Y_test)

0.9404794558121674

In [22]:
model_for_gs = GradientBoostingClassifier()
params = {"n_estimators":2**np.arange(8), "learning_rate":0.1**np.arange(3)}
gs = GridSearchCV(model_for_gs, 
                  params, 
                  cv=3, 
                  scoring=make_scorer(f1_score),
                  verbose=5)
 
gs.fit(X_train, Y_train)
 
print("Лучшие гиперпараметры:", gs.best_params_)
print("Лучшее значение метрики:", gs.best_score_)

Fitting 3 folds for each of 24 candidates, totalling 72 fits
[CV 1/3] END .learning_rate=1.0, n_estimators=1;, score=0.873 total time=   0.2s
[CV 2/3] END .learning_rate=1.0, n_estimators=1;, score=0.870 total time=   0.2s
[CV 3/3] END .learning_rate=1.0, n_estimators=1;, score=0.871 total time=   0.2s
[CV 1/3] END .learning_rate=1.0, n_estimators=2;, score=0.880 total time=   0.4s
[CV 2/3] END .learning_rate=1.0, n_estimators=2;, score=0.878 total time=   0.5s
[CV 3/3] END .learning_rate=1.0, n_estimators=2;, score=0.875 total time=   0.6s
[CV 1/3] END .learning_rate=1.0, n_estimators=4;, score=0.901 total time=   1.3s
[CV 2/3] END .learning_rate=1.0, n_estimators=4;, score=0.896 total time=   1.4s
[CV 3/3] END .learning_rate=1.0, n_estimators=4;, score=0.897 total time=   1.2s
[CV 1/3] END .learning_rate=1.0, n_estimators=8;, score=0.920 total time=   2.2s
[CV 2/3] END .learning_rate=1.0, n_estimators=8;, score=0.920 total time=   2.1s
[CV 3/3] END .learning_rate=1.0, n_estimators=8;

In [23]:
from xgboost import XGBClassifier
model_xgb = XGBClassifier(random_state=26)
model_xgb.fit(X_train,Y_train)
preds_test = model_xgb.predict(X_test)
f1_score(preds_test, Y_test)

0.9573723114544546

In [29]:
model = CatBoostClassifier(random_state=26)
model.fit(X_train, y_train)
preds_class = model.predict(X_test)
f1_score(preds_class, y_test)

NameError: name 'CatBoostClassifier' is not defined

In [28]:
pd.DataFrame(
    {
        "feature_importance": model_ada.get_feature_importance(),
        "feature_names": AirPass.drop(columns="satisfaction").columns,
    }
).sort_values(by=["feature_importance"], ascending=False)

AttributeError: 'AdaBoostClassifier' object has no attribute 'get_feature_importance'