In [189]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
%matplotlib inline

from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GroupKFold
from sklearn.metrics import roc_auc_score

from tqdm import tqdm_notebook
import lightgbm as lgb
from catboost import Pool, CatBoostRegressor

import warnings
warnings.filterwarnings("ignore")
import datetime as dt
%matplotlib inline
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
LE = LabelEncoder()


plt.style.use('seaborn')
sns.set(font_scale=1)

import gc

from sklearn.metrics import accuracy_score

In [190]:
def reduce_mem_usage(df, verbose=True):
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))

    return df

In [191]:
train = pd.read_csv("../input/flight-delay-prediction-challenge/Train (8).csv")
test = pd.read_csv("../input/flight-delay-prediction-challenge/Test (9).csv")
sample = pd.read_csv('../input/flight-delay-prediction-challenge/SampleSubmission (5).csv')

In [192]:
train = reduce_mem_usage(train)
test = reduce_mem_usage(test)
sample = reduce_mem_usage(sample)
gc.collect()

Memory usage after optimization is: 7.61 MB
Decreased by 7.5%
Memory usage after optimization is: 0.64 MB
Decreased by 0.0%
Memory usage after optimization is: 0.09 MB
Decreased by 37.5%


83

In [193]:
train["STAA"]=pd.to_datetime(train["STA"], format='%Y-%m-%d %H.%M.%S')
train['STA_Date_year'] =train["STAA"].dt.year
train['STA_Date_month'] = train["STAA"].dt.month
train['STA_Date_day'] = train["STAA"].dt.day
train['STA_Date_hour'] = train["STAA"].dt.hour
train['STA_Date_minute'] =train["STAA"].dt.minute
train['STA_Date_second'] = train["STAA"].dt.second
train['sta_day_hour']  = train['STA_Date_day'].astype(str) + '-' + train['STA_Date_hour'].astype(str)
train['sta_month_day'] = train['STA_Date_month'].astype(str) + '-' + train['STA_Date_day'].astype(str)
train.drop("STA", axis=1, inplace=True)
train.drop("STAA", axis=1, inplace=True)
train.drop(["STA_Date_day","STA_Date_month"], axis=1, inplace=True)

test["STAA"]=pd.to_datetime(test["STA"], format='%Y-%m-%d %H.%M.%S')
test['STA_Date_year'] =test["STAA"].dt.year
test['STA_Date_month'] = test["STAA"].dt.month
test['STA_Date_day'] = test["STAA"].dt.day
test['STA_Date_hour'] = test["STAA"].dt.hour
test['STA_Date_minute'] =test["STAA"].dt.minute
test['STA_Date_second'] = test["STAA"].dt.second
test['sta_day_hour']  = test['STA_Date_day'].astype(str) + '-' + test['STA_Date_hour'].astype(str)
test['sta_month_day'] = test['STA_Date_month'].astype(str) + '-' + test['STA_Date_day'].astype(str)

test.drop("STA", axis=1, inplace=True)
test.drop("STAA", axis=1, inplace=True)

for le_col in ["sta_month_day","sta_day_hour"] :
    train[le_col] = LE.fit_transform(train[le_col])
    test[le_col] = LE.transform(test[le_col])

KeyError: 'STA_Date_day'

In [None]:
train["STD"]=pd.to_datetime(train["STD"])
train['STD_Date_year'] =train["STD"].dt.year
# train['STD_Date_month'] = train["STD"].dt.month
# train['STD_Date_day'] = train["STD"].dt.day
train['STD_Date_hour'] = train["STD"].dt.hour
train['STD_Date_minute'] =train["STD"].dt.minute
train['STD_Date_second'] = train["STD"].dt.second
train['std_day_hour']  = train['STD_Date_day'].astype(str) + '-' + train['STD_Date_hour'].astype(str)
train['std_month_day'] = train['STD_Date_month'].astype(str) + '-' + train['STD_Date_day'].astype(str)
train.drop("STD", axis=1, inplace=True)

test["STD"]=pd.to_datetime(test["STD"])
test['STD_Date_year'] =test["STD"].dt.year
# test['STD_Date_month'] = test["STD"].dt.month
# test['STD_Date_day'] = test["STD"].dt.day
test['STD_Date_hour'] = test["STD"].dt.hour
test['STD_Date_minute'] =test["STD"].dt.minute
test['STD_Date_second'] = test["STD"].dt.second
test['std_day_hour']  = test['STD_Date_day'].astype(str) + '-' + test['STD_Date_hour'].astype(str)
test['std_month_day'] = test['STD_Date_month'].astype(str) + '-' + test['STD_Date_day'].astype(str)

test.drop("STD", axis=1, inplace=True)

for le_col in ["std_day_hour","std_month_day"] :
    train[le_col] = LE.fit_transform(train[le_col])
    test[le_col] = LE.transform(test[le_col])

In [None]:
train["DATOP"]=pd.to_datetime(train["DATOP"])
train['Date_year'] =train["DATOP"].dt.year
train['Date_month'] = train["DATOP"].dt.month
train['Date_day'] = train["DATOP"].dt.day
train['day_month'] = train['Date_month'].astype(str) + '-' + train['Date_day'].astype(str)
train['SplitBy']   = train['Date_year'].astype(int).astype(str) + '-' + train['Date_month'].astype(str) + '-' + train['Date_day'].astype(str)
train = train.sort_values('SplitBy').reset_index(drop=True)
train.drop("DATOP", axis=1, inplace=True)

test["DATOP"]=pd.to_datetime(test["DATOP"])
test['Date_year'] =test["DATOP"].dt.year
test['Date_month'] = test["DATOP"].dt.month
test['Date_day'] = test["DATOP"].dt.day
test['day_month'] = test['Date_month'].astype(str) + '-' + test['Date_day'].astype(str)
test['SplitBy']   = test['STD_Date_year'].astype(int).astype(str) + '-' + test['STD_Date_month'].astype(str) + '-' + test['STD_Date_day'].astype(str)
test = test.sort_values('SplitBy').reset_index(drop=True)
test.drop("DATOP", axis=1, inplace=True)


for le_col in ["day_month"] :
    train[le_col] = LE.fit_transform(train[le_col])
    test[le_col] = LE.transform(test[le_col])

In [None]:
train.drop("ID", axis=1, inplace=True)
test.drop("ID", axis=1, inplace=True)

In [None]:
train

In [None]:
from category_encoders import CountEncoder
enc = CountEncoder(normalize=True, cols=['DEPSTN', 'ARRSTN','STATUS','AC',"FLTID"])
train = enc.fit_transform(train)
test=enc.fit_transform(test)

In [None]:
class CFG :
  SEED = 42
  n_splits = 5

  lgb_params = {'boosting_type': 'gbdt','objective': 'regression','metric': 'rmse',
                'n_estimators': 2500,'reg_lambda' :50,'num_leaves' : 81,
                'seed': SEED,'silent':True,'early_stopping_rounds': 200,
               }
  remove_features = ['SplitBy' , 'folds']
  TARGET_COL = 'target'
class CostumSplit :
  def __init__(self,) :
    self.n_splits = CFG.n_splits

  def Split(self,Train_) :
    kf = GroupKFold(n_splits=self.n_splits)

    Train = Train_.copy()
    Train = Train.drop_duplicates('SplitBy').reset_index(drop=True)
    
    groups = Train['SplitBy']
    Train["folds"]=-1   
    for fold, (_, val_index) in enumerate(kf.split(Train,Train['target'],groups)):
          Train.loc[val_index, "folds"] = fold
    return Train

  def apply(self,train) :
    mapper = dict(zip(self.Split(train)['SplitBy'].tolist(),
                      self.Split(train)['folds'].tolist()))

    train['folds'] = train['SplitBy'].map(mapper)
    return train

In [None]:
train

In [None]:
split = CostumSplit() 

train = split.apply(train)

In [None]:
train.columns

In [None]:
test.columns

In [None]:
features_columns = [col for col in train.columns if col not in CFG.remove_features]
len(features_columns)

In [None]:
train_df = train.drop(["target",'SplitBy' , 'folds'], axis=1)
col = train_df.columns


In [None]:
# from sklearn.model_selection import KFold
# import xgboost as xgb
# from sklearn.metrics import mean_squared_error


# kf = KFold(n_splits =5,shuffle=True,random_state=160)
# # feats = pd.DataFrame({'features': train.columns}) 
# feats = pd.DataFrame({'features': train_df.columns}) 
# gbm_predictions = []
# cv_score_ = 0
# oof_preds = np.zeros((train.shape[0],))

# for i, (tr_index, test_index) in enumerate(kf.split(train, train['target'])):
#     X_train, y_train = train_df.iloc[tr_index][col], train.iloc[tr_index]['target']
#     X_valid, y_valid = train_df.iloc[test_index][col], train.iloc[test_index]['target']
  
#     print()
#     print(f'######### FOLD {i+1} / {kf.n_splits} ')
  
#     X_train,y_train = train_df.iloc[tr_index,:],train[tr_index]
#     X_test,y_test = train_df.iloc[test_index,:],train[test_index]
  
#     gbm = xgb.XGBRegressor(eval_metric = 'rmse',n_estimators = 2000,learning_rate = 0.01,seed=162,random_state = 162,colsample_bytree=0.65)

#     gbm.fit(X_train,y_train,eval_set = [(X_test, y_test)],early_stopping_rounds  = 200,verbose=100)
  
#     cv_score_ += mean_squared_error(y_test, gbm.predict(X_test), squared=True) / kf.n_splits
#     oof_preds[test_index] = gbm.predict(X_test)
  
#     preds = gbm.predict(final_test[X_train.columns])
#     gbm_predictions.append(preds)

#     feats[f'Fold {i}'] = gbm.feature_importances_

# feats['Importances'] = feats.mean(axis=1)
# print( ' CV RMSE : ',cv_score_)
# preds_xgb = np.average(gbm_predictions, axis=0)
# print(preds_xgb.shape)

In [None]:
from lightgbm import LGBMRegressor
from sklearn.model_selection import KFold,StratifiedKFold ,GroupKFold
from sklearn.metrics import mean_squared_error
from sklearn import  ensemble
def get_model(Name='lgbm') :
    if Name=='lgbm' :
      return LGBMRegressor(**{'objective' :'regression','boosting_type' : 'gbdt','metric': 'rmse' ,
                              'learning_rate' : 0.05,'num_iterations': 2500,'max_depth' :8 ,'num_leaves' : 150,
                              'max_bins': 85,'min_data_in_leaf':30,'reg_lambda' :75})
test_ = test[col]
Model_Name = "lgbm"
folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=5168)
oofs  = np.zeros((len(train_df[col])))
test_predictions = np.zeros((len(test)))

for fold_, (trn_idx, val_idx) in enumerate(folds.split(train, train['target'])):
    X_train, y_train = train_df.iloc[trn_idx][col], train.iloc[trn_idx]['target']
    X_valid, y_valid = train_df.iloc[val_idx][col], train.iloc[val_idx]['target']
#     X_trn, y_trn = X.iloc[trn_idx], y.iloc[trn_idx]
#     X_val, y_val = X.iloc[val_idx], y.iloc[val_idx]
    
    clf = get_model(Name=Model_Name)
    clf.fit(X_train, y_train, eval_set = [(X_valid, y_valid)],
            verbose =100, early_stopping_rounds = 200)
    
    vp = clf.predict(X_valid)
    oofs[val_idx] = vp
    val_score = mean_squared_error((vp), (y_valid),squared=True)
    print(4*'-- -- -- --')
    print(f'Fold {fold_+1} Val score: {val_score}')
    print(4*'-- -- -- --')
    
    tp = clf.predict(test_)
    test_predictions += tp / folds.n_splits

  
print()
print(3*'###',10*"^",3*'###')
print(mean_squared_error(train["target"], oofs,squared=True))
print("Model training")

In [None]:
# from sklearn import  ensemble
# skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=5168)

# # oof = df_train[['ID_code', 'target']]
# # oof['predict'] = 0
# # predictions = test[['ID_code']]
# # feature_importance_df = pd.DataFrame()
# # val_aucs = []
# for fold, (trn_idx, val_idx) in enumerate(skf.split(train, train['target'])):
#     X_train, y_train = train_df.iloc[trn_idx][col], train.iloc[trn_idx]['target']
#     X_valid, y_valid = train_df.iloc[val_idx][col], train.iloc[val_idx]['target']
#     break
    

# params = {
#     "n_estimators": 1000,
#     "max_depth": 8,
#     "min_samples_split": 5,
#     "learning_rate": 0.01,
#     "loss": "squared_error",
# }
    
# # clf =  ensemble.GradientBoostingRegressor(**params 
# # #                              l2_leaf_reg= 16.5056753964314982, depth= 3.0,
# # #                              fold_len_multiplier= 2.9772639036842174, 
# # #                              scale_pos_weight= 3.542962442406767, 
# # #                              fold_permutation_block_size=16.0, subsample= 0.46893530376570957
# # #                              fold_len_multiplier=3.2685541035861747, 
# # #                              scale_pos_weight= 2.6496926337120916, 
# # #                              fold_permutation_block_size= 6.0, eval_set=(X_valid, y_valid)
# #                           )
# gbm = xgb.XGBRegressor(eval_metric = 'rmse',n_estimators = 5000,learning_rate = 0.01,seed=162,random_state = 162,colsample_bytree=0.65)

# gbm.fit(X_train,y_train,eval_set = [(X_valid, y_valid)],early_stopping_rounds  = 200,verbose=100)

  
# preds = gbm.predict(test[col])
# print("Model training")
# # clf.fit(X_train, y_train )


In [None]:
sample.target = test_predictions[:]

In [None]:
# from lightgbm import LGBMRegressor
# from sklearn.model_selection import KFold,StratifiedKFold ,GroupKFold
# from sklearn.metrics import mean_squared_error
# from sklearn import  ensemble
# def get_model(Name='lgbm') :
#     if Name=='lgbm' :
#       return LGBMRegressor(**{'objective' :'regression','boosting_type' : 'gbdt','metric': 'rmse' ,
#                               'learning_rate' : 0.05,'num_iterations': 1500,'max_depth' :8 ,'num_leaves' : 150,
#                               'max_bins': 85,'min_data_in_leaf':30,'reg_lambda' :75})
# test_ = test[col]
# Model_Name = "lgbm"
# folds = StratifiedKFold(n_splits=10, shuffle=True, random_state=5168)
# oofs  = np.zeros((len(train_df[col])))
# test_predictions = np.zeros((len(test)))

# for fold_, (trn_idx, val_idx) in enumerate(folds.split(train, train['target'])):
#     X_train, y_train = train_df.iloc[trn_idx][col], train.iloc[trn_idx]['target']
#     X_valid, y_valid = train_df.iloc[val_idx][col], train.iloc[val_idx]['target']
# #     X_trn, y_trn = X.iloc[trn_idx], y.iloc[trn_idx]
# #     X_val, y_val = X.iloc[val_idx], y.iloc[val_idx]
    
#     clf = get_model(Name=Model_Name)
#     clf.fit(X_train, y_train, eval_set = [(X_valid, y_valid)],
#             verbose =100, early_stopping_rounds = 200)
    
#     vp = clf.predict(X_valid)
#     oofs[val_idx] = vp
#     val_score = mean_squared_error((vp), (y_valid),squared=True)
#     print(4*'-- -- -- --')
#     print(f'Fold {fold_+1} Val score: {val_score}')
#     print(4*'-- -- -- --')
    
#     tp = clf.predict(test_)
#     test_predictions += tp / folds.n_splits

  
# print()
# print(3*'###',10*"^",3*'###')
# print(mean_squared_error(train["target"], oofs,squared=True))
# print("Model training")
# # clf.fit(X_train, y_train )


In [194]:
# len(preds)

In [195]:
# predict = clf.predict(test[col])

In [196]:
# sample.target = preds[:]

In [197]:
sample

Unnamed: 0,ID,target
0,test_id_0,2470
1,test_id_1,2944
2,test_id_2,2585
3,test_id_3,3264
4,test_id_4,1369
...,...,...
9328,test_id_9328,2155
9329,test_id_9329,3444
9330,test_id_9330,921
9331,test_id_9331,130


In [198]:
from IPython.display import FileLink
def create_submission(submission_file, submission_name):
    submission_file.to_csv(submission_name+".csv",index=False)
    return FileLink(submission_name+".csv")

In [199]:
create_submission(sample, "sub7")