Applying PCA and investigating its effect on the CV score. 

Started kernel by fabiendaniel: https://www.kaggle.com/fabiendaniel/elo-world

In [1]:
import pandas as pd
import numpy as np
import datetime
import time
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold, train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.decomposition import PCA
import lightgbm as lgb
import matplotlib.pyplot as plt
%matplotlib inline

  from numpy.core.umath_tests import inner1d


In [None]:
df_train = pd.read_csv('../Tareq Mufaddi/Elo/train.csv')
df_history = pd.read_csv("../Tareq Mufaddi/Elo/historical_transactions.csv")

In [None]:
df_history.loc[:, 'purchase_date'] = pd.DatetimeIndex(df_history['purchase_date']).\
                                      astype(np.int64) * 1e-9

In [3]:
df_history['authorized_flag'] = df_history['authorized_flag'].map({'Y':1, 'N':0})
df_history['category_1'] = df_history['category_1'].map({'Y':1, 'N':0})
df_history['purchase_date'] = pd.to_datetime(df_history['purchase_date'])
last_date_hist = datetime.datetime(2018, 2, 28)
df_history['time_since_purchase_date'] = ((last_date_hist - df_history['purchase_date']).dt.days)
df_history.loc[:, 'purchase_date'] = pd.DatetimeIndex(df_history['purchase_date']).\
                                      astype(np.int64) * 1e-9

df_history['installments'] = df_history['installments'].replace(999,-1)
cols_with_nulls = ['city_id', 'state_id', 'subsector_id', 'installments']
for col in cols_with_nulls:
    df_history[col] = df_history[col].replace(-1, np.nan)

In [4]:
agg_func = {
        'authorized_flag': ['mean'],
        'city_id': ['nunique'], 
        'category_1': ['sum', 'mean'],
        'installments': ['median', 'max'],
        'category_3': ['nunique'],
        'merchant_category_id': ['nunique'], 
        'merchant_id': ['nunique'],
        'month_lag': ['min', 'max'],
        'purchase_amount': ['sum', 'median', 'max', 'min'],
        'purchase_date': ['min', 'max'],
        'time_since_purchase_date': ['min', 'max', 'mean'],
        'category_2': ['nunique'], 
        'state_id': ['nunique'], 
        'subsector_id': ['nunique']
        }


agg_history = df_history.groupby(['card_id']).agg(agg_func)
agg_history.columns = ['hist_' + '_'.join(col).strip() for col in agg_history.columns.values]
agg_history.reset_index(inplace=True)

In [7]:
df_train_all = pd.merge(df_train, agg_history, on='card_id', how='left')

In [18]:
df_test = pd.read_csv("../Tareq Mufaddi/Elo//test.csv")
df_test_all = pd.merge(df_test, agg_history, on='card_id', how='left')

In [8]:
y_label_regr = df_train_all['target']

df_train_all = df_train_all.drop(['target',
                                    'first_active_month', 
                                    'card_id'
                                    ],
                                     axis = 1)

train_x, test_x, train_y, test_y = train_test_split(df_train_all, y_label_regr, test_size=0.7, random_state=42)

train_x.reset_index(inplace=True, drop = True)
test_x.reset_index(inplace=True, drop = True)
train_y.reset_index(inplace=True, drop = True)
test_y.reset_index(inplace=True, drop = True)

In [10]:
param = {'num_leaves': 111,
         'min_data_in_leaf': 149,
         'objective':'regression',
         'max_depth': 9,
         'learning_rate': 0.005,
         "boosting": "gbdt",
         "feature_fraction": 0.7522,
         "bagging_freq": 1,
         "bagging_fraction": 0.7083 ,
         "bagging_seed": 11,
         "metric": 'rmse',
         "lambda_l1": 0.3134,
         "random_state": 133,
         "verbosity": -1}

features = train_x.columns
folds = KFold(n_splits=5, shuffle=True, random_state=15)
oof_normal = np.zeros(len(df_train_all))
predictions_normal = np.zeros(len(test_x))
start = time.time()
feature_importance_df = pd.DataFrame()

for fold_, (trn_idx, val_idx) in enumerate(folds.split(train_x.values, train_y)):
    print("fold n°{}".format(fold_))
    trn_data = lgb.Dataset(train_x.iloc[trn_idx][features],
                           label=train_y[trn_idx],
                           #categorical_feature=cat_feats
                           )
    val_data = lgb.Dataset(train_x.iloc[val_idx][features],
                           label=train_y[val_idx],
                           #categorical_feature=cat_feats
                           )

    num_round = 10000
    clf = lgb.train(param,
                    trn_data,
                    num_round,
                    valid_sets=[trn_data, val_data],
                    verbose_eval=100,
                    early_stopping_rounds=200)

    oof_normal[val_idx] = clf.predict(train_x.iloc[val_idx][features], num_iteration=clf.best_iteration)

    fold_importance_df = pd.DataFrame()
    fold_importance_df["feature"] = features
    fold_importance_df["importance"] = clf.feature_importance()
    fold_importance_df["fold"] = fold_ + 1
    feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)

    predictions_normal += clf.predict(test_x[features], num_iteration=clf.best_iteration) / folds.n_splits

fold n°0
Training until validation scores don't improve for 200 rounds.
[100]	training's rmse: 3.71439	valid_1's rmse: 3.6248
[200]	training's rmse: 3.67049	valid_1's rmse: 3.59701
[300]	training's rmse: 3.64182	valid_1's rmse: 3.58443
[400]	training's rmse: 3.61954	valid_1's rmse: 3.57837
[500]	training's rmse: 3.60202	valid_1's rmse: 3.57675
[600]	training's rmse: 3.58724	valid_1's rmse: 3.57594
[700]	training's rmse: 3.57252	valid_1's rmse: 3.57596
[800]	training's rmse: 3.55977	valid_1's rmse: 3.57679
Early stopping, best iteration is:
[671]	training's rmse: 3.57661	valid_1's rmse: 3.57571
fold n°1
Training until validation scores don't improve for 200 rounds.
[100]	training's rmse: 3.66766	valid_1's rmse: 3.80748
[200]	training's rmse: 3.62326	valid_1's rmse: 3.78337
[300]	training's rmse: 3.59436	valid_1's rmse: 3.77188
[400]	training's rmse: 3.57218	valid_1's rmse: 3.76749
[500]	training's rmse: 3.55393	valid_1's rmse: 3.76566
[600]	training's rmse: 3.53874	valid_1's rmse: 3.764

### PCA:

In [12]:
pca = PCA()
pca.fit(train_x)
pca.transform(train_x)

array([[ 1.94079041e+07,  3.53346406e+06,  5.75364883e+00, ...,
         6.93976390e-02, -4.95092076e-02,  7.91549702e-02],
       [-1.49087073e+07,  1.34960314e+07, -4.29062754e+01, ...,
        -4.82598962e-02,  4.45909873e-02, -4.99463743e-02],
       [ 1.27527676e+07,  1.14614689e+06, -7.53711112e+01, ...,
         1.30902118e-02, -4.98023302e-02,  9.96072240e-02],
       ...,
       [-1.25273645e+07,  2.83561452e+06, -3.13058347e+01, ...,
         6.91810644e-02, -1.90406090e-02,  8.50344585e-02],
       [-9.34452857e+06, -7.01095651e+06, -1.92966316e+02, ...,
        -3.08044730e-02, -9.48346930e-03,  1.66133066e-02],
       [ 1.54771096e+07,  5.39813040e+05, -2.70075557e+01, ...,
         5.04061334e-02,  2.29999281e-03, -2.03523120e-02]])

In [43]:
test = pd.read_csv("../Tareq Mufaddi/Elo//test.csv")

In [13]:
pca_train_x= pca.transform(train_x)
pca_train_x = pca_train_x[:,:3]
pca_train_x = pd.DataFrame(pca_train_x, columns=['comp1', 'comp2', 'comp3'])

pca_test_x= pca.transform(test_x)
pca_test_x = pca_test_x[:,:3]
pca_test_x = pd.DataFrame(pca_test_x, columns=['comp1', 'comp2', 'comp3'])

In [16]:
print("RMSE test normal: {:<8.5f}".format(mean_squared_error(predictions_normal, test_y) ** 0.5))
print("RMSE test PCA: {:<8.5f}".format(mean_squared_error(predictions_pca, test_y) ** 0.5))

RMSE test normal: 3.78331 
RMSE test PCA: 3.84118 


In [17]:
train_x_2 = pd.concat([train_x, pca_train_x], axis = 1)
test_x_2 = pd.concat([test_x, pca_test_x], axis = 1)

del train_x
del test_x
del pca_train_x
del pca_test_x

#Train LightGBM model
param = {'num_leaves': 111,
         'min_data_in_leaf': 149,
         'objective':'regression',
         'max_depth': 9,
         'learning_rate': 0.005,
         "boosting": "gbdt",
         "feature_fraction": 0.7522,
         "bagging_freq": 1,
         "bagging_fraction": 0.7083 ,
         "bagging_seed": 11,
         "metric": 'rmse',
         "lambda_l1": 0.3134,
         "random_state": 133,
         "verbosity": -1}

features = train_x_2.columns
folds = KFold(n_splits=5, shuffle=True, random_state=15)
oof_pca = np.zeros(len(train_x_2))
predictions_all = np.zeros(len(test_x_2))
start = time.time()
feature_importance_df = pd.DataFrame()

for fold_, (trn_idx, val_idx) in enumerate(folds.split(train_x_2.values, train_y)):
    print("fold n°{}".format(fold_))
    trn_data = lgb.Dataset(train_x_2.iloc[trn_idx][features],
                           label=train_y[trn_idx],
                           #categorical_feature=cat_feats
                           )
    val_data = lgb.Dataset(train_x_2.iloc[val_idx][features],
                           label=train_y[val_idx],
                           #categorical_feature=cat_feats
                           )

    num_round = 10000
    clf = lgb.train(param,
                    trn_data,
                    num_round,
                    valid_sets=[trn_data, val_data],
                    verbose_eval=100,
                    early_stopping_rounds=200)

    oof_pca[val_idx] = clf.predict(train_x_2.iloc[val_idx][features], num_iteration=clf.best_iteration)

    fold_importance_df = pd.DataFrame()
    fold_importance_df["feature"] = features
    fold_importance_df["importance"] = clf.feature_importance()
    fold_importance_df["fold"] = fold_ + 1
    feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)

    predictions_all += clf.predict(test_x_2[features], num_iteration=clf.best_iteration) / folds.n_splits

print("RMSE test PCA: {:<8.5f}".format(mean_squared_error(predictions_all, test_y) ** 0.5))

fold n°0
Training until validation scores don't improve for 200 rounds.
[100]	training's rmse: 3.71241	valid_1's rmse: 3.62394
[200]	training's rmse: 3.66753	valid_1's rmse: 3.5964
[300]	training's rmse: 3.63827	valid_1's rmse: 3.58314
[400]	training's rmse: 3.61477	valid_1's rmse: 3.5783
[500]	training's rmse: 3.59641	valid_1's rmse: 3.57712
[600]	training's rmse: 3.5812	valid_1's rmse: 3.57641
[700]	training's rmse: 3.56633	valid_1's rmse: 3.57684
Early stopping, best iteration is:
[594]	training's rmse: 3.58207	valid_1's rmse: 3.57616
fold n°1
Training until validation scores don't improve for 200 rounds.
[100]	training's rmse: 3.66543	valid_1's rmse: 3.8044
[200]	training's rmse: 3.62048	valid_1's rmse: 3.77922
[300]	training's rmse: 3.59116	valid_1's rmse: 3.76829
[400]	training's rmse: 3.56913	valid_1's rmse: 3.76313
[500]	training's rmse: 3.55061	valid_1's rmse: 3.76089
[600]	training's rmse: 3.53491	valid_1's rmse: 3.76041
[700]	training's rmse: 3.52104	valid_1's rmse: 3.76055


### For submission:

In [20]:
pca = PCA()
pca.fit(df_train_all)
pca.transform(df_train_all)

array([[ 4.73388816e+06, -2.42371268e+06, -1.21410025e+02, ...,
        -4.48191542e-02,  1.71906462e-02,  8.69832983e-03],
       [-1.00890361e+07, -4.73788019e+06, -1.53266397e+02, ...,
         8.95252473e-02, -5.20158917e-02,  1.53639367e-02],
       [-9.02651208e+06, -6.83991761e+06,  7.92498502e+00, ...,
         4.86408469e-02,  3.28881060e-02,  1.33499943e-03],
       ...,
       [ 9.07711004e+06,  2.14835763e+05, -5.57310736e+01, ...,
        -1.20515054e-02, -1.22536071e-02,  2.14535760e-03],
       [-1.06181691e+07,  3.45372450e+06, -1.54452253e+02, ...,
         6.76793833e-02,  2.77244920e-02,  5.35228829e-04],
       [ 5.17197133e+06, -1.40530722e+06, -7.90317097e+01, ...,
        -1.71331636e-01,  1.35018932e-01,  2.79289974e-03]])

In [21]:
var_exp_3 = sum(pca.explained_variance_ratio_[:3])
print('Variance explained by the first 3 PCA components:', var_exp_3)

Variance explained by the first 3 PCA components: 0.999999999956983


In [26]:
df_test_all = df_test_all.drop(['first_active_month','card_id'], axis = 1)

In [27]:
pca_train_x= pca.transform(df_train_all)
pca_train_x = pca_train_x[:,:3]
pca_train_x = pd.DataFrame(pca_train_x, columns=['comp1', 'comp2', 'comp3'])

pca_test_x= pca.transform(df_test_all)
pca_test_x = pca_test_x[:,:3]
pca_test_x = pd.DataFrame(pca_test_x, columns=['comp1', 'comp2', 'comp3'])

In [None]:
train_x.reset_index(inplace=True, drop = True)
test_x.reset_index(inplace=True, drop = True)
train_y.reset_index(inplace=True, drop = True)
test_y.reset_index(inplace=True, drop = True)

In [40]:
#Train LightGBM model on these 3 PCA components only
param = {#'num_leaves': 21,
         #'min_data_in_leaf': 49,
         'objective':'regression',
         'max_depth': 8,
         'learning_rate': 0.001,
         #"boosting": "gbdt",
         #"feature_fraction": 0.5,
         #"bagging_freq": 1,
         #"bagging_fraction": 0.5 ,
         #"bagging_seed": 11,
         "metric": 'rmse',
         #"lambda_l1": 0.3134,
         "random_state": 133,
         #"is_unbalance": True,
         "verbosity": -1}

features = pca_train_x.columns
folds = KFold(n_splits=5, shuffle=True, random_state=15)
oof_pca = np.zeros(len(pca_train_x))
predictions_pca = np.zeros(len(pca_test_x))
start = time.time()
feature_importance_df = pd.DataFrame()

for fold_, (trn_idx, val_idx) in enumerate(folds.split(pca_train_x.values, y_label_regr)):
    print("fold n°{}".format(fold_))
    trn_data = lgb.Dataset(pca_train_x.iloc[trn_idx][features],
                           label=y_label_regr[trn_idx],
                           #categorical_feature=cat_feats
                           )
    val_data = lgb.Dataset(pca_train_x.iloc[val_idx][features],
                           label=y_label_regr[val_idx],
                           #categorical_feature=cat_feats
                           )

    num_round = 10000
    clf = lgb.train(param,
                    trn_data,
                    num_round,
                    valid_sets=[trn_data, val_data],
                    verbose_eval=100,
                    early_stopping_rounds=200)
    
    oof_pca[val_idx] = clf.predict(pca_train_x.iloc[val_idx][features], num_iteration=clf.best_iteration)

    fold_importance_df = pd.DataFrame()
    fold_importance_df["feature"] = features
    fold_importance_df["importance"] = clf.feature_importance()
    fold_importance_df["fold"] = fold_ + 1
    feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)

    predictions_pca += clf.predict(pca_test_x[features], num_iteration=clf.best_iteration) / folds.n_splits

fold n°0
Training until validation scores don't improve for 200 rounds.
[100]	training's rmse: 3.83255	valid_1's rmse: 3.87503
[200]	training's rmse: 3.82455	valid_1's rmse: 3.86802
[300]	training's rmse: 3.8179	valid_1's rmse: 3.86236
[400]	training's rmse: 3.8122	valid_1's rmse: 3.85772
[500]	training's rmse: 3.80727	valid_1's rmse: 3.85374
[600]	training's rmse: 3.80313	valid_1's rmse: 3.85047
[700]	training's rmse: 3.79962	valid_1's rmse: 3.84761
[800]	training's rmse: 3.79652	valid_1's rmse: 3.84522
[900]	training's rmse: 3.79391	valid_1's rmse: 3.84335
[1000]	training's rmse: 3.79163	valid_1's rmse: 3.84176
[1100]	training's rmse: 3.78951	valid_1's rmse: 3.84051
[1200]	training's rmse: 3.78749	valid_1's rmse: 3.83947
[1300]	training's rmse: 3.78573	valid_1's rmse: 3.83858
[1400]	training's rmse: 3.78417	valid_1's rmse: 3.83781
[1500]	training's rmse: 3.78274	valid_1's rmse: 3.83716
[1600]	training's rmse: 3.78151	valid_1's rmse: 3.83656
[1700]	training's rmse: 3.78037	valid_1's r

In [45]:
sub_df = pd.DataFrame({"card_id":test["card_id"].values})
sub_df["target"] = predictions_pca
sub_df.to_csv("submission_PCA.csv", index=False)

### Features + PCA:

In [None]:
train_x_2 = pd.concat([df_train_all, pca_train_x], axis = 1)
test_x_2 = pd.concat([df_test_all, pca_test_x], axis = 1)

In [49]:
param = {'num_leaves': 111,
         'min_data_in_leaf': 149,
         'objective':'regression',
         'max_depth': 9,
         'learning_rate': 0.005,
         "boosting": "gbdt",
         "feature_fraction": 0.7522,
         "bagging_freq": 1,
         "bagging_fraction": 0.7083 ,
         "bagging_seed": 11,
         "metric": 'rmse',
         "lambda_l1": 0.3134,
         "random_state": 133,
         "verbosity": -1}

features = train_x_2.columns
folds = KFold(n_splits=5, shuffle=True, random_state=15)
oof_pca = np.zeros(len(train_x_2))
predictions_all = np.zeros(len(test_x_2))
start = time.time()
feature_importance_df = pd.DataFrame()

for fold_, (trn_idx, val_idx) in enumerate(folds.split(train_x_2.values, y_label_regr)):
    print("fold n°{}".format(fold_))
    trn_data = lgb.Dataset(train_x_2.iloc[trn_idx][features],
                           label=y_label_regr[trn_idx],
                           #categorical_feature=cat_feats
                           )
    val_data = lgb.Dataset(train_x_2.iloc[val_idx][features],
                           label=y_label_regr[val_idx],
                           #categorical_feature=cat_feats
                           )

    num_round = 10000
    clf = lgb.train(param,
                    trn_data,
                    num_round,
                    valid_sets=[trn_data, val_data],
                    verbose_eval=100,
                    early_stopping_rounds=200)

    oof_pca[val_idx] = clf.predict(train_x_2.iloc[val_idx][features], num_iteration=clf.best_iteration)

    fold_importance_df = pd.DataFrame()
    fold_importance_df["feature"] = features
    fold_importance_df["importance"] = clf.feature_importance()
    fold_importance_df["fold"] = fold_ + 1
    feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)

    predictions_all += clf.predict(test_x_2[features], num_iteration=clf.best_iteration) / folds.n_splits

fold n°0
Training until validation scores don't improve for 200 rounds.
[100]	training's rmse: 3.76001	valid_1's rmse: 3.8175
[200]	training's rmse: 3.7173	valid_1's rmse: 3.79009
[300]	training's rmse: 3.69135	valid_1's rmse: 3.77899
[400]	training's rmse: 3.67324	valid_1's rmse: 3.77391
[500]	training's rmse: 3.65939	valid_1's rmse: 3.772
[600]	training's rmse: 3.64774	valid_1's rmse: 3.77135
[700]	training's rmse: 3.63816	valid_1's rmse: 3.77108
[800]	training's rmse: 3.62946	valid_1's rmse: 3.77119
Early stopping, best iteration is:
[691]	training's rmse: 3.63896	valid_1's rmse: 3.77105
fold n°1
Training until validation scores don't improve for 200 rounds.
[100]	training's rmse: 3.77843	valid_1's rmse: 3.74504
[200]	training's rmse: 3.73521	valid_1's rmse: 3.72108
[300]	training's rmse: 3.70912	valid_1's rmse: 3.71138
[400]	training's rmse: 3.69107	valid_1's rmse: 3.70722
[500]	training's rmse: 3.67722	valid_1's rmse: 3.70546
[600]	training's rmse: 3.66574	valid_1's rmse: 3.70473


In [50]:
sub_df = pd.DataFrame({"card_id":test["card_id"].values})
sub_df["target"] = predictions_all
sub_df.to_csv("submission_PCA_all.csv", index=False)