In [2]:
# !pip install xgboost
# !pip install catboost

In [3]:
import pandas as pd 

df = pd.read_csv('AirPass.zip')

df.head()

Unnamed: 0.1,Unnamed: 0,id,Gender,Customer Type,Age,Type of Travel,Class,Flight Distance,Inflight wifi service,Departure/Arrival time convenient,...,Inflight entertainment,On-board service,Leg room service,Baggage handling,Checkin service,Inflight service,Cleanliness,Departure Delay in Minutes,Arrival Delay in Minutes,satisfaction
0,0,70172,Male,Loyal Customer,13,Personal Travel,Eco Plus,460,3,4,...,5,4,3,4,4,5,5,25,18.0,neutral or dissatisfied
1,1,5047,Male,disloyal Customer,25,Business travel,Business,235,3,2,...,1,1,5,3,1,4,1,1,6.0,neutral or dissatisfied
2,2,110028,Female,Loyal Customer,26,Business travel,Business,1142,2,2,...,5,4,3,4,4,4,5,0,0.0,satisfied
3,3,24026,Female,Loyal Customer,25,Business travel,Business,562,2,5,...,2,2,5,3,1,4,2,11,9.0,neutral or dissatisfied
4,4,119299,Male,Loyal Customer,61,Business travel,Business,214,3,3,...,3,3,4,4,3,3,3,0,0.0,satisfied


In [4]:
# Удалим неиспользуемую колонку
df.drop(['Unnamed: 0'], axis=1, inplace=True)

In [5]:
# 6.1
# Сколько всего в данных пропущенных значений?
df.isnull().sum().sum()

310

In [6]:
# 6.2 
# Заполним пропуски медианным значением 
df['Arrival Delay in Minutes'] = df['Arrival Delay in Minutes'].fillna(df['Arrival Delay in Minutes'].median())
round(df['Arrival Delay in Minutes'].mean(),2)

15.13

In [7]:
# 6.3.1
df.groupby('Gender')['satisfaction'].value_counts()*100/df.shape[0]

Gender  satisfaction           
Female  neutral or dissatisfied    29.058554
        satisfied                  21.687327
Male    neutral or dissatisfied    27.608177
        satisfied                  21.645942
Name: satisfaction, dtype: float64

In [8]:
# 6.3.2
df.groupby('Type of Travel')['satisfaction'].value_counts()*100/df.shape[0]

Type of Travel   satisfaction           
Business travel  satisfied                  40.177472
                 neutral or dissatisfied    28.785225
Personal Travel  neutral or dissatisfied    27.881506
                 satisfied                   3.155798
Name: satisfaction, dtype: float64

In [9]:
# 6.3.3
df.groupby('Class')['satisfaction'].value_counts()*100/df.shape[0]

Class     satisfaction           
Business  satisfied                  33.184478
          neutral or dissatisfied    14.614452
Eco       neutral or dissatisfied    36.614567
          satisfied                   8.374076
Eco Plus  neutral or dissatisfied     5.437712
          satisfied                   1.774715
Name: satisfaction, dtype: float64

In [10]:
# Перекодируем часть бинарных признаков, чтобы использовать их при обучении:
df['satisfaction'] = df['satisfaction'].map({'neutral or dissatisfied':0 , 'satisfied':1})
df['Customer Type'] = df['Customer Type'].map({'Loyal Customer':1, 'disloyal Customer':0})
df['Type of Travel'] = df['Type of Travel'].map({'Personal Travel':0, 'Business travel':1})

In [11]:
# 6.4 
df=pd.get_dummies(df)
df.shape

(103904, 27)

In [12]:
# 6.5 
from sklearn.model_selection import train_test_split

y = df['satisfaction']
X = df.drop(['satisfaction'], axis=1)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=26)
y_test.shape

(20781,)

In [13]:
# 6.6 
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
scaler.fit(X_train) 
X_train = scaler.transform(X_train) 
X_test = scaler.transform(X_test) 
round(X_test[0][0], 2)

0.94

In [14]:
# 6.7 
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score

model_lr = LogisticRegression()
model_lr.fit(X_train, y_train)
preds_test = model_lr.predict(X_test)
round(f1_score(preds_test, y_test), 3)

0.855

In [15]:
# 
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

model_ada = AdaBoostClassifier(DecisionTreeClassifier(random_state=26),random_state=26,learning_rate=0.01)

model_ada.fit(X_train, y_train)
preds_test = model_ada.predict(X_test)
round(f1_score(preds_test, y_test), 3)

0.94

In [16]:
# 6.9
from sklearn.ensemble import GradientBoostingClassifier
import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer

model_for_gs = GradientBoostingClassifier()
params = {"n_estimators":2**np.arange(8), "learning_rate":0.1**np.arange(3)}
gs = GridSearchCV(model_for_gs, 
                  params, 
                  cv=3, 
                  scoring=make_scorer(f1_score),
                  verbose=5,
                  n_jobs=-1)
 
gs.fit(X_train, y_train)

Fitting 3 folds for each of 24 candidates, totalling 72 fits
[CV 2/3] END .learning_rate=1.0, n_estimators=1;, score=0.870 total time=   0.4s
[CV 3/3] END .learning_rate=1.0, n_estimators=1;, score=0.871 total time=   0.4s
[CV 1/3] END .learning_rate=1.0, n_estimators=1;, score=0.873 total time=   0.5s
[CV 1/3] END .learning_rate=0.1, n_estimators=1;, score=0.000 total time=   0.4s
[CV 3/3] END .learning_rate=0.1, n_estimators=1;, score=0.000 total time=   0.4s
[CV 2/3] END .learning_rate=0.1, n_estimators=1;, score=0.000 total time=   0.5s
[CV 2/3] END .learning_rate=1.0, n_estimators=2;, score=0.878 total time=   0.7s
[CV 1/3] END .learning_rate=1.0, n_estimators=2;, score=0.880 total time=   0.7s
[CV 3/3] END .learning_rate=1.0, n_estimators=2;, score=0.875 total time=   0.8s
[CV 2/3] END .learning_rate=0.1, n_estimators=2;, score=0.802 total time=   0.7s
[CV 1/3] END .learning_rate=0.1, n_estimators=2;, score=0.765 total time=   0.7s
[CV 3/3] END .learning_rate=0.1, n_estimators=2;

## <p style='color:red'>Ошибка!!! №1</p>
Исправить значение метрики на 0.949 (добавить значение 0.949) в качестве правильного ответа

https://sfdatasciencepro20.slack.com/archives/C027098F9SM/p1663304204766919 

Создан тикет DST-2582

In [17]:
print("Лучшие гиперпараметры:", gs.best_params_)
print("Лучшее значение метрики:", gs.best_score_)

Лучшие гиперпараметры: {'learning_rate': 1.0, 'n_estimators': 128}
Лучшее значение метрики: 0.94914048012694


In [19]:
# 6.10
from xgboost import XGBClassifier

model_xgb = XGBClassifier(random_state = 26, n_jobs=-1)
model_xgb.fit(X_train,y_train)
preds_test = model_xgb.predict(X_test)
f1_score(preds_test, y_test)



0.9579785161685312

In [20]:
!pip show xgboost

Name: xgboost
Version: 1.6.2
Summary: XGBoost Python Package
Home-page: https://github.com/dmlc/xgboost
Author: 
Author-email: 
License: Apache-2.0
Location: /home/vova/.local/lib/python3.10/site-packages
Requires: numpy, scipy
Required-by: 


In [None]:
# Задать random_state = 26 - создан тикет

In [27]:
# 6.11
from catboost import CatBoostClassifier, Pool

model = CatBoostClassifier(random_state =26)
model.fit(X_train, y_train)
preds_class = model.predict(X_test)
f1_score(preds_class, y_test)

Learning rate set to 0.068023
0:	learn: 0.6008085	total: 5.89ms	remaining: 5.88s
1:	learn: 0.5274377	total: 10.4ms	remaining: 5.2s
2:	learn: 0.4526901	total: 15.2ms	remaining: 5.05s
3:	learn: 0.4085058	total: 20.2ms	remaining: 5.04s
4:	learn: 0.3754316	total: 25.2ms	remaining: 5.02s
5:	learn: 0.3316909	total: 29.9ms	remaining: 4.95s
6:	learn: 0.3110782	total: 35.1ms	remaining: 4.97s
7:	learn: 0.2929223	total: 39.7ms	remaining: 4.92s
8:	learn: 0.2731868	total: 44.2ms	remaining: 4.87s
9:	learn: 0.2584990	total: 48.8ms	remaining: 4.83s
10:	learn: 0.2433966	total: 53.3ms	remaining: 4.79s
11:	learn: 0.2327963	total: 57.7ms	remaining: 4.75s
12:	learn: 0.2245691	total: 62.1ms	remaining: 4.71s
13:	learn: 0.2158969	total: 66.4ms	remaining: 4.67s
14:	learn: 0.2073525	total: 71.2ms	remaining: 4.67s
15:	learn: 0.1986439	total: 75.5ms	remaining: 4.64s
16:	learn: 0.1921241	total: 79.9ms	remaining: 4.62s
17:	learn: 0.1881226	total: 84.3ms	remaining: 4.6s
18:	learn: 0.1841774	total: 89.4ms	remaining: 

0.9606535511837279

In [35]:
# 6.12
from catboost.utils import get_confusion_matrix

cm = get_confusion_matrix(model, Pool(X_train, y_train))
print(cm)

# изменить значения матрицы для Cat_Boost при random_state = 26 - создан тикет

[[46665.   541.]
 [ 1269. 34648.]]


In [39]:
model.get_feature_importance()

array([1.95623795e+00, 6.77713701e+00, 3.36314800e+00, 1.91135093e+01,
       1.66668114e+00, 2.54089279e+01, 1.76945653e+00, 1.65527442e+00,
       2.84054992e+00, 2.65456758e-01, 7.31579282e+00, 3.34593227e+00,
       3.03888456e+00, 1.70166504e+00, 1.09832313e+00, 3.20961159e+00,
       3.91532924e+00, 2.97111634e+00, 1.91912876e+00, 4.87910672e-01,
       9.15555380e-01, 4.61787888e-02, 6.66907841e-03, 4.87568770e+00,
       1.80455864e-01, 1.55379809e-01])

In [40]:
# 6.13
pd.DataFrame({'feature_importance': model.get_feature_importance(), 
              'feature_names':X.columns}).sort_values(by=['feature_importance'], 
                                                           ascending=False)

Unnamed: 0,feature_importance,feature_names
5,25.408928,Inflight wifi service
3,19.113509,Type of Travel
10,7.315793,Online boarding
1,6.777137,Customer Type
23,4.875688,Class_Business
16,3.915329,Checkin service
2,3.363148,Age
11,3.345932,Seat comfort
15,3.209612,Baggage handling
12,3.038885,Inflight entertainment
