# In [None]:  (Jupyter notebook cell marker)
import pandas as pd
import numpy as np
from ml_metrics import rmsle
from sklearn.model_selection import cross_val_score
from sklearn.metrics import make_scorer
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.model_selection import KFold, StratifiedKFold
import xgboost as xgb


# Load the training set (HDF5) and the test set (CSV) and stack them into a
# single frame so that the categorical encodings below are shared by both.
train = pd.read_hdf('../input/train_airline_delay.h5')
test = pd.read_csv('../input/test_airline_delay.csv')
df_all = pd.concat([train, test], sort=False)


# Integer-encode every object-dtype column into a companion "<name>_cat" column.
# The dtype is given as the string 'object' because the `np.object` alias was
# removed in NumPy 1.24 and raises AttributeError there.
cat_feats = df_all.select_dtypes(include=['object']).columns
for cat_feat in cat_feats:
    df_all['{0}_cat'.format(cat_feat)] = pd.factorize(df_all[cat_feat])[0]


# Rows that carry a target value form the training frame; rows whose target is
# missing form the frame that gets predicted at the end of the script.
df_train = df_all[~df_all.target_delay.isnull()].copy()
df_test = df_all[df_all.target_delay.isnull()].copy()


# Predict the cancellation probability ("is_cancelled_pred"); it is later used
# as one of the input features of the delay model.
clf_feats = ['year', 'month', 'day_of_month', 'day_of_week',
       'scheduled_departure_time', 'scheduled_arrival_time', 'carrier_id',
       'flight_number', 'tail_number', 'scheduled_elapsed_time', 'origin_cat',
       'destination_cat', 'distance']

X_clf = df_train[clf_feats].values
y_clf = df_train['is_cancelled'].astype('int8').values

# ===== train =====
xgb_clf_params = { 'max_depth':5, 'n_estimators':20, 'learning_rate':0.40, 'n_jobs':8, 'random_state':2019 }

# Out-of-fold probabilities, stored by ROW POSITION in df_train.
# cv.split() yields positional indices, so the predictions are written into a
# positional array rather than looked up through df_train.index: index labels
# need not equal positions (and may repeat) after the train/test concat.
is_cancelled_pred = np.full(len(df_train), np.nan)

# No random_state here: the folds are not shuffled, so it would have no effect,
# and scikit-learn >= 0.24 raises ValueError when random_state is combined
# with shuffle=False.
cv = StratifiedKFold(n_splits=3)
for train_idx, test_idx in cv.split(X_clf, y_clf):
    X_clf_train, y_clf_train = X_clf[train_idx], y_clf[train_idx]
    X_clf_test, y_clf_test = X_clf[test_idx], y_clf[test_idx]

    model_clf = xgb.XGBClassifier(**xgb_clf_params)
    model_clf.fit(X_clf_train, y_clf_train)
    # Probability of the positive class (is_cancelled == 1).
    y_pred = model_clf.predict_proba(X_clf_test)[:, 1]

    is_cancelled_pred[test_idx] = y_pred

# Every row belongs to exactly one test fold, so the array is fully populated.
df_train['is_cancelled_pred'] = is_cancelled_pred


# ===== test =====
# Refit the cancellation classifier on all training rows and score the test
# rows. The label is cast to int8 exactly as in the cross-validation step so
# both classifiers are trained on the same label dtype.
X_clf_train = df_train[clf_feats].values
y_clf_train = df_train['is_cancelled'].astype('int8').values
X_clf_test = df_test[clf_feats].values

model = xgb.XGBClassifier(**xgb_clf_params)
model.fit(X_clf_train, y_clf_train)
# Probability of the positive class (is_cancelled == 1).
df_test['is_cancelled_pred'] = model.predict_proba(X_clf_test)[:, 1]


# Hand-crafted features. The most important one is the day counter
# "datetime_full"; "is_winter" and "is_holiday" are binary month flags.
df_all['tail_number_cat'] = df_all['tail_number'].factorize()[0]
df_all['tail_dist'] = df_all['tail_number_cat'] * df_all['distance']
# Approximate day count: 365 days per year and 30 days per month.
# NOTE(review): the "2000 +" offset presumes `year` is stored relative to 2000 - confirm.
df_all['datetime'] = (2000 + df_all['year'].astype(int)) * 365 + df_all['month'].astype(int) * 30
df_all['datetime_full'] = df_all['datetime'].astype(int) + df_all['day_of_month'].astype(int)
df_all['delta_scheduled'] = df_all['scheduled_arrival_time'] - df_all['scheduled_departure_time']
df_all['is_holiday'] = df_all['month'].isin((7, 8)).astype(int)
df_all['is_winter'] = df_all['month'].isin((12, 1, 2)).astype(int)

# df_train and df_test are copies taken from df_all before these columns
# existed, so the new columns must be copied across or the later
# df_train[feats] / df_test[feats] selections raise KeyError.
# The split kept row order, so values are assigned positionally (.values);
# index labels may repeat after the concat and cannot be used for alignment.
engineered = ['tail_number_cat', 'tail_dist', 'datetime', 'datetime_full',
              'delta_scheduled', 'is_holiday', 'is_winter']
is_test_row = df_all['target_delay'].isnull().values
for col in engineered:
    col_values = df_all[col].values
    df_train[col] = col_values[~is_test_row]
    df_test[col] = col_values[is_test_row]


# Start from every numeric/boolean column of the raw test set.
# 'bool' is passed as a string because the `np.bool` alias was removed in
# NumPy 1.24 and raises AttributeError there.
feats = test.select_dtypes(include=[np.number, 'bool']).columns
# Columns that must not be used as model inputs (the id, the target itself and
# the columns listed below).
# NOTE(review): 'depature_delay' looks misspelled - confirm it matches the
# actual column name in the data before changing it.
black_list = ['id', 'air_time', 'cancellation_code', 'actual_departure_time', 
              'security_delay', 'arrival_delay', 'actual_arrival_time', 
              'carrier_delay', 'actual_elapsed_time', 'target_delay', 
              'late_aircraft_delay', 'depature_delay', 'weather_delay', 'nas_delay'
             ]
feats = [feat for feat in feats if feat not in black_list]
# Add the predicted cancellation probability, the encoded airports and the
# hand-crafted features.
feats += ['is_cancelled_pred', 'origin_cat', 'destination_cat',
          'datetime_full', 'delta_scheduled', 'is_holiday', 'is_winter', 'tail_dist']

# Plot of the feature importances of a given model.
def draw_feature_importances(model, features):
    """Draw a bar chart of the model's feature importances, highest first.

    Args:
        model: fitted estimator exposing ``feature_importances_``.
        features: sequence of feature names; ``features[i]`` is used as the
            label of the model's i-th importance value.
    """
    importances = np.asarray(model.feature_importances_)
    indices = np.argsort(importances)[::-1]
    # Size the plot from the model's own importances instead of the global
    # X_train, which may be undefined or belong to a different model.
    n_features = len(importances)
    positions = range(n_features)

    plt.figure(figsize=(10, 5))
    plt.title("Feature importances")
    plt.bar(positions, importances[indices], color="b", align="center")
    plt.xticks(positions, [features[x] for x in indices], rotation=90)
    plt.xlim([-1, n_features])
    plt.show()

# At this point `model` is the cancellation classifier, which was fitted on
# the clf_feats columns, so the bars must be labelled with those names.
draw_feature_importances(model, clf_feats)


X_train = df_train[feats].values
y_train = df_train['target_delay'].values
# The regressor is trained on log1p(target); predictions are mapped back with expm1.
y_train_log = np.log1p(y_train)

# Two unshuffled folds. random_state is omitted: it has no effect without
# shuffling, and scikit-learn >= 0.24 raises ValueError when random_state is
# combined with shuffle=False.
cv = KFold(n_splits=2, shuffle=False)

# Author's note: this model scores RMSLE 1.559 in Jupyter and 1.533 on Kaggle.
model = xgb.XGBRegressor(max_depth=8, n_estimators=1000, learning_rate=0.16, n_jobs=8, random_state=13)

# Manual cross-validation loop: fit on one fold, print the RMSLE on the other.
for train_idx, test_idx in cv.split(X_train):
    model.fit(X_train[train_idx], y_train_log[train_idx])
    y_pred_log = model.predict(X_train[test_idx])
    y_pred = np.expm1(y_pred_log)

    print(rmsle(y_train[test_idx], y_pred))

# Train the final model on all training rows and save the predictions to a file.
X_test = df_test[feats].values
model.fit(X_train, y_train_log)
y_test_pred_log = model.predict(X_test)
# Undo the log1p transform applied to the training target.
y_pred = np.expm1(y_test_pred_log)
df_test['target_delay'] = y_pred
df_test['id'] = df_test['id'].astype('int')
# Only the id and the predicted delay go into the submission file.
df_test.to_csv('../output/submit_xgb_log.csv', columns=['id', 'target_delay'], index=False)
