## <center>Домашнее задание по XGboost Базанов Дмитрий Б03-903</center>

In [68]:
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
import pandas as pd
from sklearn.preprocessing import LabelEncoder 

In [69]:
data_train = pd.read_csv('flight_delays_train.csv')
data_test = pd.read_csv('flight_delays_test.csv')

In [70]:
print('data_train:', data_train.shape)
print('data_test:', data_test.shape)

data_train: (100000, 9)
data_test: (100000, 8)


In [71]:
data_train.head()

Unnamed: 0,Month,DayofMonth,DayOfWeek,DepTime,UniqueCarrier,Origin,Dest,Distance,dep_delayed_15min
0,c-8,c-21,c-7,1934,AA,ATL,DFW,732,N
1,c-4,c-20,c-3,1548,US,PIT,MCO,834,N
2,c-9,c-2,c-5,1422,XE,RDU,CLE,416,N
3,c-11,c-25,c-6,1015,OO,DEN,MEM,872,N
4,c-10,c-7,c-6,1828,WN,MDW,OMA,423,Y


In [72]:
from sklearn.calibration import CalibratedClassifierCV

Возьмём сначала все признаки, кроме UniqueCarrie, Origin и Dest. Дни, месяца, недели преобразуем с помощью LabelEncoder. 

In [73]:
le = LabelEncoder()
data_train['Month_encoded'] = le.fit_transform(data_train['Month'])
data_test['Month_encoded'] = le.fit_transform(data_test['Month'])
data_test['DayofMonth_encoded']= le.fit_transform(data_test['DayofMonth'])
data_train['DayofMonth_encoded']= le.fit_transform(data_train['DayofMonth'])
data_train['DayofWeek_encoded']= le.fit_transform(data_train['DayOfWeek'])
data_test['DayofWeek_encoded']= le.fit_transform(data_test['DayOfWeek'])

In [74]:
data_train.head()

Unnamed: 0,Month,DayofMonth,DayOfWeek,DepTime,UniqueCarrier,Origin,Dest,Distance,dep_delayed_15min,Month_encoded,DayofMonth_encoded,DayofWeek_encoded
0,c-8,c-21,c-7,1934,AA,ATL,DFW,732,N,10,13,6
1,c-4,c-20,c-3,1548,US,PIT,MCO,834,N,6,12,2
2,c-9,c-2,c-5,1422,XE,RDU,CLE,416,N,11,11,4
3,c-11,c-25,c-6,1015,OO,DEN,MEM,872,N,2,17,5
4,c-10,c-7,c-6,1828,WN,MDW,OMA,423,Y,1,28,5


In [75]:
X_train, y_train = data_train[['Distance', 'DepTime', 'Month_encoded', 'DayofMonth_encoded', 'DayofWeek_encoded']].values, data_train['dep_delayed_15min'].map({'Y': 1, 'N': 0}).values
X_test = data_test[['Distance', 'DepTime', 'Month_encoded', 'DayofMonth_encoded', 'DayofWeek_encoded']].values

X_train_part, X_valid, y_train_part, y_valid = train_test_split(X_train, y_train, test_size=0.3, random_state = 42)

Обучимся без задания параметров:

In [76]:
xgb = XGBClassifier()
xgb.fit(X_train_part, y_train_part)
testPredictions = xgb.predict(X_valid)
print('ROC-AUC:', round(roc_auc_score(testPredictions, y_valid), 5))

ROC-AUC: 0.71379


Попробуем повысить метрику с помощью сетки:

In [77]:
param = {
        'min_child_weight': [5, 10],
        'colsample_bytree': [0.6, 0.8],
        'eta': [0.05, 0.1],
        'max_depth': [6, 9]
        }

In [78]:
from sklearn.model_selection import GridSearchCV

In [80]:
xgb2 = XGBClassifier()
metLearn2=GridSearchCV(xgb2,cv=2, param_grid = param)
metLearn2.fit(X_train_part, y_train_part)
testPredictions2 = metLearn2.predict(X_valid)
print('ROC-AUC:', round(roc_auc_score(testPredictions2, y_valid), 5))

ROC-AUC: 0.74327


Уменьшим колличество признаков. То есть уберём признаки, связанные с датой и временем.

In [81]:
X_train1, y_train1 = data_train[['Distance', 'DepTime']].values, data_train['dep_delayed_15min'].map({'Y': 1, 'N': 0}).values
X_test1 = data_test[['Distance', 'DepTime']].values

X_train_part1, X_valid1, y_train_part1, y_valid1 = train_test_split(X_train1, y_train1, test_size=0.3, random_state = 42)

In [82]:
xgb0 = XGBClassifier()
metLearn0=GridSearchCV(xgb0,cv=2, param_grid = param)
metLearn0.fit(X_train_part1, y_train_part1)
testPredictions0 = metLearn0.predict(X_valid1)
print('ROC-AUC:', round(roc_auc_score(testPredictions0, y_valid1), 5))

ROC-AUC: 0.71903


Ожидаемо получили метрику хуже. Значит эти признаки так же важны. Применим теперь CalibratedClassifierCV. Он должен еще сильнее повысить метрику.

In [83]:
xgb = XGBClassifier()
metLearn=CalibratedClassifierCV(xgb, method='isotonic', cv=2)
metLearn.fit(X_train_part, y_train_part)
testPredictions = metLearn.predict(X_valid)
print('ROC-AUC:', round(roc_auc_score(testPredictions, y_valid), 5))

ROC-AUC: 0.77079


Получили метрику еще лучше. Попробуем теперь на меньшем колличесвте признаков.

In [85]:
xgb1 = XGBClassifier(param = param)
xgb1.set_params()
metLearn1=CalibratedClassifierCV(xgb1, method='isotonic', cv=2)
metLearn1.fit(X_train_part1, y_train_part1)
testPredictions1 = metLearn.predict(X_valid1)
print('ROC-AUC:', round(roc_auc_score(testPredictions1, y_valid1), 5))

Parameters: { param } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { param } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


ROC-AUC: 0.80156


Метрика получилась еще лучше, несмотря на то, что мы взяли меньше признаков. Остановимся на ней.

<b>!</b> При тестировании на разных машинах, метрики несколько рознились. Значения, полученные, непосредственно мной, если вдруг у Вас они выйдут другими: 

1) ROC-AUC: 0.71379 

2) ROC-AUC: 0.74327 

3) ROC-AUC: 0.71903 

4) ROC-AUC: 0.77079

5) ROC-AUC: 0.80156