In [107]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
%matplotlib inline

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score

from tqdm import tqdm_notebook
import lightgbm as lgb
from catboost import Pool, CatBoostRegressor

import warnings
warnings.filterwarnings("ignore")

%matplotlib inline
import seaborn as sns



# Matplotlib 3.6 renamed its bundled seaborn style sheets to "seaborn-v0_8*"
# and the plain "seaborn" alias was removed in later releases, so pick
# whichever name this installation actually ships instead of hard-coding one.
_seaborn_style = 'seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn'
plt.style.use(_seaborn_style)
sns.set(font_scale=1)

import gc



In [108]:
# Read the three competition files (training rows, test rows, submission
# template) in one pass; the order of the paths matches the unpacking order.
train, test, sample = [
    pd.read_csv(csv_path)
    for csv_path in (
        "../input/flight-delay-prediction-challenge/Train (8).csv",
        "../input/flight-delay-prediction-challenge/Test (9).csv",
        '../input/flight-delay-prediction-challenge/SampleSubmission (5).csv',
    )
]

In [109]:
# Display the training frame exactly as it was read from the CSV.
train

Unnamed: 0,ID,DATOP,FLTID,DEPSTN,ARRSTN,STD,STA,STATUS,AC,target
0,train_id_0,2016-01-03,TU 0712,CMN,TUN,2016-01-03 10:30:00,2016-01-03 12.55.00,ATA,TU 32AIMN,260.0
1,train_id_1,2016-01-13,TU 0757,MXP,TUN,2016-01-13 15:05:00,2016-01-13 16.55.00,ATA,TU 31BIMO,20.0
2,train_id_2,2016-01-16,TU 0214,TUN,IST,2016-01-16 04:10:00,2016-01-16 06.45.00,ATA,TU 32AIMN,0.0
3,train_id_3,2016-01-17,TU 0480,DJE,NTE,2016-01-17 14:10:00,2016-01-17 17.00.00,ATA,TU 736IOK,0.0
4,train_id_4,2016-01-17,TU 0338,TUN,ALG,2016-01-17 14:30:00,2016-01-17 15.50.00,ATA,TU 320IMU,22.0
...,...,...,...,...,...,...,...,...,...,...
107828,train_id_107828,2018-07-05,WKL 0000,TUN,TUN,2018-07-05 23:00:00,2018-07-06 02.00.00,SCH,TU 32AIML,0.0
107829,train_id_107829,2018-01-13,UG 0003,DJE,TUN,2018-01-13 08:00:00,2018-01-13 09.00.00,SCH,UG AT7AT7,0.0
107830,train_id_107830,2018-11-07,SGT 0000,TUN,TUN,2018-11-07 05:00:00,2018-11-07 12.50.00,SCH,TU 736IOK,0.0
107831,train_id_107831,2018-01-23,UG 0010,TUN,DJE,2018-01-23 18:00:00,2018-01-23 18.45.00,ATA,TU CR9ISA,0.0


In [110]:
def reduce_mem_usage(df, verbose=True):
    """Downcast the numeric columns of ``df`` to the narrowest dtype that fits.

    Integer columns are narrowed to int8/int16/int32/int64 and floating
    columns to float16/float32/float64, chosen from each column's observed
    minimum and maximum. Columns whose dtype is not in the numeric list
    below are left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. It is modified in place.
    verbose : bool, default True
        When true, print the memory footprint after conversion and the
        percentage saved. When false, nothing is printed.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, after downcasting.

    Notes
    -----
    The dtype is chosen from the value *range* only. A float16 column keeps
    far fewer significant digits than float64, so values may be rounded.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    # Candidate dtypes, narrowest first; the first one whose limits contain
    # the column's [min, max] wins.
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)

    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                for candidate in int_candidates:
                    limits = np.iinfo(candidate)
                    # Inclusive bounds: a value equal to the dtype's own
                    # limit is still representable in that dtype.
                    if limits.min <= c_min and c_max <= limits.max:
                        df[col] = df[col].astype(candidate)
                        break
            else:
                for candidate in float_candidates:
                    limits = np.finfo(candidate)
                    if limits.min <= c_min and c_max <= limits.max:
                        df[col] = df[col].astype(candidate)
                        break
                else:
                    # Range exceeds float32, or min/max is NaN (comparisons
                    # with NaN are false): stay at full precision.
                    df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
        print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))

    return df

In [111]:
# Downcast each loaded frame in turn; every call prints its own memory report.
train, test, sample = [reduce_mem_usage(frame) for frame in (train, test, sample)]

Memory usage after optimization is: 7.61 MB
Decreased by 7.5%
Memory usage after optimization is: 0.64 MB
Decreased by 0.0%
Memory usage after optimization is: 0.09 MB
Decreased by 37.5%


In [112]:
# Run a full garbage-collection pass after the dtype conversions; the cell
# output is the value returned by gc.collect().
gc.collect()

117

In [113]:
# Preview the first rows of the training frame after downcasting.
train.head()

Unnamed: 0,ID,DATOP,FLTID,DEPSTN,ARRSTN,STD,STA,STATUS,AC,target
0,train_id_0,2016-01-03,TU 0712,CMN,TUN,2016-01-03 10:30:00,2016-01-03 12.55.00,ATA,TU 32AIMN,260.0
1,train_id_1,2016-01-13,TU 0757,MXP,TUN,2016-01-13 15:05:00,2016-01-13 16.55.00,ATA,TU 31BIMO,20.0
2,train_id_2,2016-01-16,TU 0214,TUN,IST,2016-01-16 04:10:00,2016-01-16 06.45.00,ATA,TU 32AIMN,0.0
3,train_id_3,2016-01-17,TU 0480,DJE,NTE,2016-01-17 14:10:00,2016-01-17 17.00.00,ATA,TU 736IOK,0.0
4,train_id_4,2016-01-17,TU 0338,TUN,ALG,2016-01-17 14:30:00,2016-01-17 15.50.00,ATA,TU 320IMU,22.0


In [114]:
import datetime as dt

# Split the DATOP date of each training row into year / month / day columns,
# then discard the raw DATOP column.
train_datop = pd.to_datetime(train["DATOP"])
for feature_name, feature_values in (
    ('Date_year', train_datop.dt.year),
    ('Date_month', train_datop.dt.month),
    ('Date_day', train_datop.dt.day),
):
    train[feature_name] = feature_values
train.drop(columns=["DATOP"], inplace=True)


In [154]:
# Split the DATOP date of each test row into year / month / day columns,
# then discard the raw DATOP column.
test_datop = pd.to_datetime(test["DATOP"])
for feature_name, feature_values in (
    ('Date_year', test_datop.dt.year),
    ('Date_month', test_datop.dt.month),
    ('Date_day', test_datop.dt.day),
):
    test[feature_name] = feature_values
test.drop(columns=["DATOP"], inplace=True)

In [155]:
import datetime as dt

# Break the STD timestamp into calendar and clock features.
#
# This step has to run on BOTH frames: the model is fitted on `train` and
# later predicts on `test[col]`, and the feature list taken from `train`
# contains the STD_Date_* columns. Previously only `test` was processed
# here, so a fresh top-to-bottom run left a raw string `STD` column in
# `train` and never created its STD_Date_* features.
for frame in (train, test):
    std_stamp = pd.to_datetime(frame["STD"])
    frame['STD_Date_year'] = std_stamp.dt.year
    frame['STD_Date_month'] = std_stamp.dt.month
    frame['STD_Date_day'] = std_stamp.dt.day
    frame['STD_Date_hour'] = std_stamp.dt.hour
    frame['STD_Date_minute'] = std_stamp.dt.minute
    frame['STD_Date_second'] = std_stamp.dt.second
    # The raw column is no longer needed once its parts are extracted.
    frame.drop("STD", axis=1, inplace=True)


In [156]:
# Break the STA timestamp into calendar and clock features.
#
# STA uses dots between the time fields (e.g. "2016-01-03 12.55.00"), hence
# the explicit format string.
#
# As with STD, the step must run on BOTH frames so that `train` and `test`
# end up with the same STA_Date_* columns; previously only `test` was
# processed here, which left a raw string `STA` column in `train` on a
# fresh top-to-bottom run.
for frame in (train, test):
    sta_stamp = pd.to_datetime(frame["STA"], format='%Y-%m-%d %H.%M.%S')
    frame['STA_Date_year'] = sta_stamp.dt.year
    frame['STA_Date_month'] = sta_stamp.dt.month
    frame['STA_Date_day'] = sta_stamp.dt.day
    frame['STA_Date_hour'] = sta_stamp.dt.hour
    frame['STA_Date_minute'] = sta_stamp.dt.minute
    frame['STA_Date_second'] = sta_stamp.dt.second
    # The raw column is no longer needed once its parts are extracted.
    frame.drop("STA", axis=1, inplace=True)



In [121]:
from category_encoders import CountEncoder
enc = CountEncoder(normalize=True, cols=['DEPSTN', 'ARRSTN','STATUS','AC',"FLTID"])
train = enc.fit_transform(train)

In [159]:
enc = CountEncoder(normalize=True, cols=['DEPSTN', 'ARRSTN','STATUS','AC',"FLTID"])
test=enc.fit_transform(test)

In [122]:
# Display the training frame after date expansion and count encoding.
train

Unnamed: 0,ID,FLTID,DEPSTN,ARRSTN,STATUS,AC,target,Date_year,Date_month,Date_day,...,STD_Date_day,STD_Date_hour,STD_Date_minute,STD_Date_second,STA_Date_year,STA_Date_month,STA_Date_day,STA_Date_hour,STA_Date_minute,STA_Date_second
0,train_id_0,0.009274,0.012510,0.394796,0.868741,0.035323,260.0,2016,1,3,...,3,10,30,0,2016,1,3,12,55,0
1,train_id_1,0.008949,0.014226,0.394796,0.868741,0.027033,20.0,2016,1,13,...,13,15,5,0,2016,1,13,16,55,0
2,train_id_2,0.004238,0.394332,0.013373,0.868741,0.035323,0.0,2016,1,16,...,16,4,10,0,2016,1,16,6,45,0
3,train_id_3,0.001595,0.095073,0.005008,0.868741,0.025836,0.0,2016,1,17,...,17,14,10,0,2016,1,17,17,0,0
4,train_id_4,0.005230,0.394332,0.014587,0.868741,0.043808,22.0,2016,1,17,...,17,14,30,0,2016,1,17,15,50,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
107828,train_id_107828,0.028795,0.394332,0.394796,0.122801,0.038226,0.0,2018,7,5,...,5,23,0,0,2018,7,6,2,0,0
107829,train_id_107829,0.008606,0.095073,0.394796,0.122801,0.013363,0.0,2018,1,13,...,13,8,0,0,2018,1,13,9,0,0
107830,train_id_107830,0.002105,0.394332,0.394796,0.122801,0.025836,0.0,2018,11,7,...,7,5,0,0,2018,11,7,12,50,0
107831,train_id_107831,0.007261,0.394332,0.094572,0.868741,0.032541,0.0,2018,1,23,...,23,18,0,0,2018,1,23,18,45,0


In [129]:
# Remove the ID column from the training frame (done in place).
train.drop(columns=["ID"], inplace=True)

In [130]:
# Copy of the training frame without the label column.
# Note: despite its name, `cols` is a DataFrame, not a list of column names.
cols = train.drop(columns=["target"])

In [149]:
# Column labels of the label-free frame; used below to select the model
# inputs from both `train` and `test`.
col = cols.columns

In [150]:
# Display the feature column labels.
col

Index(['FLTID', 'DEPSTN', 'ARRSTN', 'STATUS', 'AC', 'Date_year', 'Date_month',
       'Date_day', 'STD_Date_year', 'STD_Date_month', 'STD_Date_day',
       'STD_Date_hour', 'STD_Date_minute', 'STD_Date_second', 'STA_Date_year',
       'STA_Date_month', 'STA_Date_day', 'STA_Date_hour', 'STA_Date_minute',
       'STA_Date_second'],
      dtype='object')

In [152]:
# Hold-out split: only the first of the five shuffled folds is used; the
# remaining folds are never trained on.
# NOTE(review): the folds are stratified on the raw `target` values although
# the model below is a regressor — confirm this is intended.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=5168)
fold, (trn_idx, val_idx) = next(enumerate(skf.split(train, train['target'])))

train_part, valid_part = train.iloc[trn_idx], train.iloc[val_idx]
X_train, y_train = train_part[col], train_part['target']
X_valid, y_valid = valid_part[col], valid_part['target']

# Constructor arguments gathered in one place so they are easy to scan.
catboost_params = dict(
    loss_function="RMSE",
    eval_metric="RMSE",
    random_seed=123,
    use_best_model=True,
    learning_rate=0.1,
    iterations=15000,
    verbose=100,
    bootstrap_type="Poisson",
    task_type="GPU",
)
clf = CatBoostRegressor(**catboost_params)

print("Model training")
# The validation part is passed as the eval set that `use_best_model` and
# `early_stopping_rounds` refer to.
clf.fit(
    X_train,
    y_train,
    eval_set=(X_valid, y_valid),
    early_stopping_rounds=2000,
    verbose=100,
)


Model training
0:	learn: 115.6112675	test: 119.9362685	best: 119.9362685 (0)	total: 11.5ms	remaining: 2m 52s
100:	learn: 108.2631350	test: 115.0948323	best: 115.0948323 (100)	total: 616ms	remaining: 1m 30s
200:	learn: 105.8797467	test: 114.5198908	best: 114.5198908 (200)	total: 1.21s	remaining: 1m 29s
300:	learn: 103.8180775	test: 113.8509428	best: 113.8509428 (300)	total: 2.07s	remaining: 1m 40s
400:	learn: 102.0628827	test: 113.4438918	best: 113.4420934 (393)	total: 2.89s	remaining: 1m 45s
500:	learn: 100.6704053	test: 113.1139419	best: 113.1122694 (499)	total: 3.48s	remaining: 1m 40s
600:	learn: 99.4204857	test: 112.8649444	best: 112.8649444 (600)	total: 4.08s	remaining: 1m 37s
700:	learn: 98.2664287	test: 112.8316861	best: 112.8095852 (694)	total: 4.68s	remaining: 1m 35s
800:	learn: 97.1396698	test: 112.7631335	best: 112.7601992 (789)	total: 5.28s	remaining: 1m 33s
900:	learn: 96.2404426	test: 112.7650480	best: 112.7145828 (850)	total: 5.87s	remaining: 1m 31s
1000:	learn: 95.339344

<catboost.core.CatBoostRegressor at 0x7f2f639d7190>

In [161]:
# Score the test rows using exactly the feature columns (and column order)
# that were taken from the training frame.
predict = clf.predict(test.loc[:, col])

In [166]:
# Write the predictions into the submission template.
# NOTE(review): this assumes the rows of `sample` are in the same order as
# the rows of `test` — confirm against the ID columns.
sample["target"] = predict[:]

In [167]:
# Display the filled-in submission frame.
sample

Unnamed: 0,ID,target
0,test_id_0,20.791259
1,test_id_1,84.406938
2,test_id_2,18.819778
3,test_id_3,36.830350
4,test_id_4,26.148385
...,...,...
9328,test_id_9328,121.163676
9329,test_id_9329,3.893393
9330,test_id_9330,25.313250
9331,test_id_9331,35.321401


In [172]:
from IPython.display import FileLink
def create_submission(submission_file, submission_name):
    """Save a submission frame as CSV and return a link to the file.

    ``submission_file`` is written with ``to_csv`` (index omitted) to
    ``submission_name`` with ``.csv`` appended. The return value is a
    ``FileLink`` built from that same path.
    """
    csv_path = submission_name + ".csv"
    submission_file.to_csv(csv_path, index=False)
    return FileLink(csv_path)

In [174]:
# Write the submission to "sub_c_15k_simple.csv"; the returned link is the
# cell's output.
create_submission(sample, "sub_c_15k_simple")