In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
%matplotlib inline

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score

from tqdm import tqdm_notebook
import lightgbm as lgb
from catboost import Pool, CatBoostRegressor

import warnings
warnings.filterwarnings("ignore")

%matplotlib inline
import seaborn as sns



# The bundled 'seaborn' style sheet was renamed to 'seaborn-v0_8' in
# Matplotlib 3.6 and the old name was removed afterwards, so pick whichever
# name this Matplotlib installation actually ships.
plt.style.use('seaborn' if 'seaborn' in plt.style.available else 'seaborn-v0_8')
sns.set(font_scale=1)

import gc



In [2]:
# Read the three competition files (training set, test set, submission template).
train, test, sample = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../input/flight-delay-prediction-challenge/Train (8).csv",
        "../input/flight-delay-prediction-challenge/Test (9).csv",
        '../input/flight-delay-prediction-challenge/SampleSubmission (5).csv',
    )
)

In [3]:
train

Unnamed: 0,ID,DATOP,FLTID,DEPSTN,ARRSTN,STD,STA,STATUS,AC,target
0,train_id_0,2016-01-03,TU 0712,CMN,TUN,2016-01-03 10:30:00,2016-01-03 12.55.00,ATA,TU 32AIMN,260.0
1,train_id_1,2016-01-13,TU 0757,MXP,TUN,2016-01-13 15:05:00,2016-01-13 16.55.00,ATA,TU 31BIMO,20.0
2,train_id_2,2016-01-16,TU 0214,TUN,IST,2016-01-16 04:10:00,2016-01-16 06.45.00,ATA,TU 32AIMN,0.0
3,train_id_3,2016-01-17,TU 0480,DJE,NTE,2016-01-17 14:10:00,2016-01-17 17.00.00,ATA,TU 736IOK,0.0
4,train_id_4,2016-01-17,TU 0338,TUN,ALG,2016-01-17 14:30:00,2016-01-17 15.50.00,ATA,TU 320IMU,22.0
...,...,...,...,...,...,...,...,...,...,...
107828,train_id_107828,2018-07-05,WKL 0000,TUN,TUN,2018-07-05 23:00:00,2018-07-06 02.00.00,SCH,TU 32AIML,0.0
107829,train_id_107829,2018-01-13,UG 0003,DJE,TUN,2018-01-13 08:00:00,2018-01-13 09.00.00,SCH,UG AT7AT7,0.0
107830,train_id_107830,2018-11-07,SGT 0000,TUN,TUN,2018-11-07 05:00:00,2018-11-07 12.50.00,SCH,TU 736IOK,0.0
107831,train_id_107831,2018-01-23,UG 0010,TUN,DJE,2018-01-23 18:00:00,2018-01-23 18.45.00,ATA,TU CR9ISA,0.0


In [4]:
def reduce_mem_usage(df, verbose=True):
    """Downcast numeric columns of ``df`` to the narrowest dtype holding their range.

    Columns whose dtype is one of int16/int32/int64/float16/float32/float64
    are inspected; every other column (objects, datetimes, int8, ...) is left
    untouched. Integer columns are cast to the first of int8/int16/int32/int64
    whose range contains the column's min and max (bounds inclusive). Float
    columns are cast to float16 or float32 when their range fits, otherwise
    float64.

    NOTE: the float downcast is chosen from the value *range* only, so it is
    lossy -- float16 keeps roughly three significant decimal digits.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. It is modified in place.
    verbose : bool, default True
        When true, print the memory usage after optimisation and the relative
        reduction.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with downcast columns.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)

    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue

        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            # Inclusive bounds: a value exactly at a dtype's limit still fits.
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            # Falls back to float64 when the range fits neither narrower type,
            # or when min/max are NaN/inf (all comparisons are then False).
            target_type = np.float64
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    target_type = candidate
                    break
            df[col] = df[col].astype(target_type)

    if verbose:
        end_mem = df.memory_usage().sum() / 1024**2
        # Guard the ratio so a frame reporting zero bytes cannot divide by zero.
        decrease = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
        print('Decreased by {:.1f}%'.format(decrease))

    return df

In [5]:
# Shrink all three frames (processed in the order train, test, sample).
train, test, sample = (
    reduce_mem_usage(frame) for frame in (train, test, sample)
)

Memory usage after optimization is: 7.61 MB
Decreased by 7.5%
Memory usage after optimization is: 0.64 MB
Decreased by 0.0%
Memory usage after optimization is: 0.09 MB
Decreased by 37.5%


In [6]:
train.columns

Index(['ID', 'DATOP', 'FLTID', 'DEPSTN', 'ARRSTN', 'STD', 'STA', 'STATUS',
       'AC', 'target'],
      dtype='object')

In [7]:
test.columns

Index(['ID', 'DATOP', 'FLTID', 'DEPSTN', 'ARRSTN', 'STD', 'STA', 'STATUS',
       'AC'],
      dtype='object')

In [8]:
# Remove the row identifier from both frames in place.
for frame in (train, test):
    frame.drop(columns="ID", inplace=True)

In [9]:
gc.collect()

73

In [10]:
import datetime as dt

# Expand the DATOP column into year / month / day columns, then drop it.
datop_parts = pd.to_datetime(train["DATOP"]).dt
for new_name, part in (('Date_year', "year"), ('Date_month', "month"), ('Date_day', "day")):
    train[new_name] = getattr(datop_parts, part)
train.drop(columns="DATOP", inplace=True)


In [11]:
# Expand the DATOP column into year / month / day columns, then drop it.
datop_parts = pd.to_datetime(test["DATOP"]).dt
for new_name, part in (('Date_year', "year"), ('Date_month', "month"), ('Date_day', "day")):
    test[new_name] = getattr(datop_parts, part)
test.drop(columns="DATOP", inplace=True)

In [12]:
import datetime as dt

# Expand the STD timestamp into date and time-of-day columns, then drop it.
std_parts = pd.to_datetime(train["STD"]).dt
for new_name, part in (
    ('STD_Date_year', "year"),
    ('STD_Date_month', "month"),
    ('STD_Date_day', "day"),
    ('STD_Date_hour', "hour"),
    ('STD_Date_minute', "minute"),
    ('STD_Date_second', "second"),
):
    train[new_name] = getattr(std_parts, part)

train.drop(columns="STD", inplace=True)

In [13]:
import datetime as dt

# Expand the STD timestamp into date and time-of-day columns, then drop it.
std_parts = pd.to_datetime(test["STD"]).dt
for new_name, part in (
    ('STD_Date_year', "year"),
    ('STD_Date_month', "month"),
    ('STD_Date_day', "day"),
    ('STD_Date_hour', "hour"),
    ('STD_Date_minute', "minute"),
    ('STD_Date_second', "second"),
):
    test[new_name] = getattr(std_parts, part)

test.drop(columns="STD", inplace=True)


In [14]:
# STA uses dots as the time separator, so it needs an explicit format.
# Parse into a local accessor (no temporary column), expand into date and
# time-of-day columns, then drop the raw STA column.
sta_parts = pd.to_datetime(train["STA"], format='%Y-%m-%d %H.%M.%S').dt
for new_name, part in (
    ('STA_Date_year', "year"),
    ('STA_Date_month', "month"),
    ('STA_Date_day', "day"),
    ('STA_Date_hour', "hour"),
    ('STA_Date_minute', "minute"),
    ('STA_Date_second', "second"),
):
    train[new_name] = getattr(sta_parts, part)
train.drop(columns="STA", inplace=True)

In [15]:
# STA uses dots as the time separator, so it needs an explicit format.
# Parse into a local accessor (no temporary column), expand into date and
# time-of-day columns, then drop the raw STA column.
sta_parts = pd.to_datetime(test["STA"], format='%Y-%m-%d %H.%M.%S').dt
for new_name, part in (
    ('STA_Date_year', "year"),
    ('STA_Date_month', "month"),
    ('STA_Date_day', "day"),
    ('STA_Date_hour', "hour"),
    ('STA_Date_minute', "minute"),
    ('STA_Date_second', "second"),
):
    test[new_name] = getattr(sta_parts, part)
test.drop(columns="STA", inplace=True)



In [16]:
# Feature matrix: every training column except the regression target.
train_df = train.drop(columns="target")

In [17]:
from category_encoders import CountEncoder

# Replace each categorical value by its normalised count. The encoder is
# fitted on the training features only and then applied to the test frame.
# NOTE: this cell overwrites train_df / test, so it is not safe to re-run.
categorical_columns = ['DEPSTN', 'ARRSTN', 'STATUS', 'AC', "FLTID"]
enc = CountEncoder(cols=categorical_columns, normalize=True)
train_df = enc.fit_transform(train_df)
test = enc.transform(test)

In [18]:
train_df

Unnamed: 0,FLTID,DEPSTN,ARRSTN,STATUS,AC,Date_year,Date_month,Date_day,STD_Date_year,STD_Date_month,STD_Date_day,STD_Date_hour,STD_Date_minute,STD_Date_second,STA_Date_year,STA_Date_month,STA_Date_day,STA_Date_hour,STA_Date_minute,STA_Date_second
0,0.009274,0.012510,0.394796,0.868741,0.035323,2016,1,3,2016,1,3,10,30,0,2016,1,3,12,55,0
1,0.008949,0.014226,0.394796,0.868741,0.027033,2016,1,13,2016,1,13,15,5,0,2016,1,13,16,55,0
2,0.004238,0.394332,0.013373,0.868741,0.035323,2016,1,16,2016,1,16,4,10,0,2016,1,16,6,45,0
3,0.001595,0.095073,0.005008,0.868741,0.025836,2016,1,17,2016,1,17,14,10,0,2016,1,17,17,0,0
4,0.005230,0.394332,0.014587,0.868741,0.043808,2016,1,17,2016,1,17,14,30,0,2016,1,17,15,50,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
107828,0.028795,0.394332,0.394796,0.122801,0.038226,2018,7,5,2018,7,5,23,0,0,2018,7,6,2,0,0
107829,0.008606,0.095073,0.394796,0.122801,0.013363,2018,1,13,2018,1,13,8,0,0,2018,1,13,9,0,0
107830,0.002105,0.394332,0.394796,0.122801,0.025836,2018,11,7,2018,11,7,5,0,0,2018,11,7,12,50,0
107831,0.007261,0.394332,0.094572,0.868741,0.032541,2018,1,23,2018,1,23,18,0,0,2018,1,23,18,45,0


In [19]:
col = train_df.columns

In [20]:
col

Index(['FLTID', 'DEPSTN', 'ARRSTN', 'STATUS', 'AC', 'Date_year', 'Date_month',
       'Date_day', 'STD_Date_year', 'STD_Date_month', 'STD_Date_day',
       'STD_Date_hour', 'STD_Date_minute', 'STD_Date_second', 'STA_Date_year',
       'STA_Date_month', 'STA_Date_day', 'STA_Date_hour', 'STA_Date_minute',
       'STA_Date_second'],
      dtype='object')

In [21]:
# from sklearn.model_selection import KFold
# import xgboost as xgb
# from sklearn.metrics import mean_squared_error


# kf = KFold(n_splits =5,shuffle=True,random_state=160)
# feats = pd.DataFrame({'features': train_df.columns}) 
# gbm_predictions = []
# cv_score_ = 0
# oof_preds = np.zeros((train.shape[0],))

# for i, (tr_index, test_index) in enumerate(kf.split(train, train['target'])):
#     X_train, y_train = train_df.iloc[tr_index][col], train.iloc[tr_index]['target']
#     X_valid, y_valid = train_df.iloc[test_index][col], train.iloc[test_index]['target']
  
#     print()
#     print(f'######### FOLD {i+1} / {kf.n_splits} ')
  
#     X_train,y_train = train_df.iloc[tr_index,:],train[tr_index]
#     X_test,y_test = train_df.iloc[test_index,:],train[test_index]
  
#     gbm = xgb.XGBRegressor(eval_metric = 'rmse',n_estimators = 2000,learning_rate = 0.01,seed=162,random_state = 162,colsample_bytree=0.65)

#     gbm.fit(X_train,y_train,eval_set = [(X_test, y_test)],early_stopping_rounds  = 200,verbose=100)
  
#     cv_score_ += mean_squared_error(y_test, gbm.predict(X_test), squared=True) / kf.n_splits
#     oof_preds[test_index] = gbm.predict(X_test)
  
#     preds = gbm.predict(final_test[X_train.columns])
#     gbm_predictions.append(preds)

#     feats[f'Fold {i}'] = gbm.feature_importances_

# feats['Importances'] = feats.mean(axis=1)
# print( ' CV RMSE : ',cv_score_)
# preds_xgb = np.average(gbm_predictions, axis=0)
# print(preds_xgb.shape)

In [22]:
# from sklearn import  ensemble
# skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=5168)

# # oof = df_train[['ID_code', 'target']]
# # oof['predict'] = 0
# # predictions = test[['ID_code']]
# # feature_importance_df = pd.DataFrame()
# # val_aucs = []
# for fold, (trn_idx, val_idx) in enumerate(skf.split(train, train['target'])):
#     X_train, y_train = train_df.iloc[trn_idx][col], train.iloc[trn_idx]['target']
#     X_valid, y_valid = train_df.iloc[val_idx][col], train.iloc[val_idx]['target']
#     break
    

# params = {
#     "n_estimators": 1000,
#     "max_depth": 8,
#     "min_samples_split": 5,
#     "learning_rate": 0.01,
#     "loss": "squared_error",
# }
    
# # clf =  ensemble.GradientBoostingRegressor(**params 
# # #                              l2_leaf_reg= 16.5056753964314982, depth= 3.0,
# # #                              fold_len_multiplier= 2.9772639036842174, 
# # #                              scale_pos_weight= 3.542962442406767, 
# # #                              fold_permutation_block_size=16.0, subsample= 0.46893530376570957
# # #                              fold_len_multiplier=3.2685541035861747, 
# # #                              scale_pos_weight= 2.6496926337120916, 
# # #                              fold_permutation_block_size= 6.0, eval_set=(X_valid, y_valid)
# #                           )
# gbm = xgb.XGBRegressor(eval_metric = 'rmse',n_estimators = 2000,learning_rate = 0.01,seed=162,random_state = 162,colsample_bytree=0.65)

# gbm.fit(X_train,y_train,eval_set = [(X_valid, y_valid)],early_stopping_rounds  = 200,verbose=100)

  
# preds = gbm.predict(test[col])
# print("Model training")
# # clf.fit(X_train, y_train )


In [23]:
from lightgbm import LGBMRegressor
from sklearn.model_selection import KFold,StratifiedKFold ,GroupKFold
from sklearn.metrics import mean_squared_error
from sklearn import  ensemble
def get_model(Name='lgbm') :
    """Build a fresh, unfitted regressor for the given model name.

    Parameters
    ----------
    Name : str, default 'lgbm'
        Identifier of the model to build. Only 'lgbm' is supported.

    Returns
    -------
    LGBMRegressor
        A new LightGBM regressor configured with this notebook's
        hyper-parameters.

    Raises
    ------
    ValueError
        If ``Name`` is not a supported model identifier. Previously the
        function fell through and returned ``None``, which only failed later
        when ``.fit`` was called on it.
    """
    if Name == 'lgbm':
        return LGBMRegressor(**{'objective' :'regression','boosting_type' : 'gbdt','metric': 'rmse' ,
                                'learning_rate' : 0.05,'num_iterations': 1500,'max_depth' :8 ,'num_leaves' : 150,
                                'max_bins': 85,'min_data_in_leaf':30,'reg_lambda' :75})
    raise ValueError(f"Unknown model name {Name!r}; supported names: 'lgbm'")

In [24]:
# 10-fold cross-validated training: collects out-of-fold predictions for the
# training set and averages the per-fold predictions for the test set.
test_ = test[col]
Model_Name = "lgbm"
# NOTE(review): StratifiedKFold is meant for classification targets; it is
# applied here to the regression target, which presumably only works because
# the delay values are integer-valued -- consider KFold. Left unchanged so the
# fold assignment stays the same.
folds = StratifiedKFold(n_splits=10, shuffle=True, random_state=5168)
oofs = np.zeros(len(train_df))
test_predictions = np.zeros(len(test))
y = train['target']

# `log_evaluation` is the current name of the periodic-logging callback;
# older LightGBM releases expose it as `print_evaluation`.
_log_evaluation = getattr(lgb, "log_evaluation", None) or lgb.print_evaluation

for fold_, (trn_idx, val_idx) in enumerate(folds.split(train, y)):
    X_train, y_train = train_df.iloc[trn_idx][col], y.iloc[trn_idx]
    X_valid, y_valid = train_df.iloc[val_idx][col], y.iloc[val_idx]

    clf = get_model(Name=Model_Name)
    # Early stopping / logging are passed as callbacks: the `verbose` and
    # `early_stopping_rounds` keyword arguments of fit() were removed in
    # LightGBM 4.0.
    clf.fit(X_train, y_train, eval_set=[(X_valid, y_valid)],
            callbacks=[lgb.early_stopping(200), _log_evaluation(100)])

    vp = clf.predict(X_valid)
    oofs[val_idx] = vp
    # This is the MSE (LightGBM's own log lines above report RMSE). The
    # `squared=` keyword is omitted because it was deprecated and then removed
    # from scikit-learn; the default already returns the MSE.
    val_score = mean_squared_error(y_valid, vp)
    print(4*'-- -- -- --')
    print(f'Fold {fold_+1} Val score: {val_score}')
    print(4*'-- -- -- --')

    # Each fold contributes an equal share to the final test prediction.
    tp = clf.predict(test_)
    test_predictions += tp / folds.n_splits


print()
print(3*'###',10*"^",3*'###')
# Overall out-of-fold MSE across all folds.
print(mean_squared_error(y, oofs))
print("Model training")


[100]	valid_0's rmse: 118.458
[200]	valid_0's rmse: 117.362
[300]	valid_0's rmse: 116.602
[400]	valid_0's rmse: 116.336
[500]	valid_0's rmse: 116.043
[600]	valid_0's rmse: 115.921
[700]	valid_0's rmse: 115.746
[800]	valid_0's rmse: 115.393
[900]	valid_0's rmse: 115.138
[1000]	valid_0's rmse: 114.966
[1100]	valid_0's rmse: 114.723
[1200]	valid_0's rmse: 114.6
[1300]	valid_0's rmse: 114.58
[1400]	valid_0's rmse: 114.536
[1500]	valid_0's rmse: 114.521
-- -- -- ---- -- -- ---- -- -- ---- -- -- --
Fold 1 Val score: 13105.06199160736
-- -- -- ---- -- -- ---- -- -- ---- -- -- --
[100]	valid_0's rmse: 117.493
[200]	valid_0's rmse: 116.2
[300]	valid_0's rmse: 115.38
[400]	valid_0's rmse: 115.024
[500]	valid_0's rmse: 114.684
[600]	valid_0's rmse: 114.332
[700]	valid_0's rmse: 114.071
[800]	valid_0's rmse: 113.988
[900]	valid_0's rmse: 113.813
[1000]	valid_0's rmse: 113.668
[1100]	valid_0's rmse: 113.295
[1200]	valid_0's rmse: 113.091
[1300]	valid_0's rmse: 112.945
[1400]	valid_0's rmse: 112.773

In [31]:
len(test_predictions)

9333

In [26]:
# predict = clf.predict(test[col])

In [27]:
# Put the fold-averaged test predictions into the submission template.
sample["target"] = test_predictions

In [28]:
sample

Unnamed: 0,ID,target
0,test_id_0,13.081758
1,test_id_1,127.289061
2,test_id_2,-3.028895
3,test_id_3,19.614055
4,test_id_4,25.410660
...,...,...
9328,test_id_9328,104.845872
9329,test_id_9329,4.252107
9330,test_id_9330,-2.498498
9331,test_id_9331,-0.465632


In [29]:
from IPython.display import FileLink
def create_submission(submission_file, submission_name):
    """Save a submission frame as CSV and return a download link to it.

    ``submission_file`` is written without its index to
    ``submission_name + ".csv"``; the returned ``FileLink`` points at that
    same path.
    """
    csv_path = submission_name + ".csv"
    submission_file.to_csv(csv_path, index=False)
    return FileLink(csv_path)

In [32]:
create_submission(sample, "sub5")