##Loading Data


In [3]:
import pickle
with open('/content/drive/MyDrive/amz_dataset/dataset.pkl', 'rb') as f:
  train_set = pickle.load(f)
  test_set = pickle.load(f)
  cate_list = pickle.load(f)
  user_count, item_count, cate_count = pickle.load(f)


In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
import pandas as pd
df=pd.DataFrame()
for i in range(10000):
  ii=train_set[i]
  # for j in ii[1]
  df=df.append([[ii[0],ii[1][0],ii[2],ii[3]]])

In [5]:
df.columns=["C1","C2","C3","Label"]

In [6]:
df.head()

Unnamed: 0,C1,C2,C3,Label
0,103944,17704,53346,0
0,126219,15082,48620,0
0,21022,11725,2179,0
0,134145,4952,55798,1
0,121815,9906,45235,1


In [7]:
df2=pd.DataFrame()
for i in range(1000):
  ii=test_set[i]
  df2=df2.append([[ii[0],ii[1][0],ii[2][0]]])

In [8]:
df2.head()

Unnamed: 0,0,1,2
0,114981,24265,56743
0,50580,42354,43604
0,172652,28839,49205
0,98577,299,17961
0,147658,4932,20257


##Model Training

In [11]:
import gc
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
pos_trainDf = df[df['Label'] == 1]
neg_trainDf = df[df['Label'] == 0].sample(n=1000, random_state=2018)
trainDf = pd.concat([pos_trainDf, neg_trainDf], axis=0).sample(frac=1.0, random_state=2018)

bin_feats = []
cat_feats = ['C2','C3']
con_feats = ['C1']

print(trainDf.shape, trainDf['Label'].mean())

trainDf, testDf, _, _ = train_test_split(trainDf, trainDf['Label'], test_size=0.25, random_state=2018)

print(trainDf['Label'].mean(), trainDf.shape)
print(testDf['Label'].mean(), testDf.shape)


# 2. Feature Processing
trainDf = trainDf.fillna(0)
testDf = testDf.fillna(0)

train_sz = trainDf.shape[0]
print(trainDf.head())
combineDf = pd.concat([trainDf, testDf], axis=0)


# 2.1 Continuous features are all normalized
from sklearn.preprocessing import MinMaxScaler

for col in con_feats:
    scaler = MinMaxScaler()
    combineDf[col] = scaler.fit_transform(np.array(combineDf[col].values.tolist()).reshape(-1, 1))

# 2.2 discrete features one-hot
for col in bin_feats + cat_feats:
    onehotret = pd.get_dummies(combineDf[col], prefix=col)
    combineDf = pd.concat([combineDf, onehotret], axis=1)

# 3. Training model
label = 'Label'
onehot_feats = [col for col in combineDf.columns if col not in ['Label'] + con_feats + cat_feats + bin_feats]
train = combineDf[:train_sz]
test = combineDf[train_sz:]
print("Train.shape: {0}, Test.shape: {0}".format(train.shape, test.shape))

# 3.1 LR model
lr_feats = con_feats + onehot_feats
lr = LogisticRegression(penalty='l2', C=1)
lr.fit(train[lr_feats], train[label].values)


def do_model_metric(y_true, y_pred, y_pred_prob):
    # print("Predict 1 percent: {0}".format(np.mean(y_pred)))
    # print("Label 1 percent: {0}".format(train[label].mean()))
    from sklearn.metrics import roc_auc_score, accuracy_score
    print("AUC: {0:.3}".format(roc_auc_score(y_true=y_true, y_score=y_pred_prob[:, 1])))
    print("Accuracy: {0}".format(accuracy_score(y_true=y_true, y_pred=y_pred)))


print("Train............")
do_model_metric(y_true=train[label], y_pred=lr.predict(train[lr_feats]), y_pred_prob=lr.predict_proba(train[lr_feats]))

print("\n\n")
print("Test.............")
do_model_metric(y_true=test[label], y_pred=lr.predict(test[lr_feats]), y_pred_prob=lr.predict_proba(test[lr_feats]))

# 3.2 GBDT
lgb_feats = con_feats + cat_feats + bin_feats
categorical_feature_list = cat_feats + bin_feats

import lightgbm as lgb

lgb_params = {
    'objective': 'binary',
    'boosting_type': 'gbdt',
    'metric': 'auc',
    'learning_rate': 0.01,
    'num_leaves': 5,
    'max_depth': 4,
    'min_data_in_leaf': 100,
    'bagging_fraction': 0.8,
    'feature_fraction': 0.8,
    'bagging_freq': 10,
    'lambda_l1': 0.2,
    'lambda_l2': 0.2,
    'scale_pos_weight': 1,
}

lgbtrain = lgb.Dataset(train[lgb_feats].values, label=train[label].values,
                       feature_name=lgb_feats,
                       categorical_feature=categorical_feature_list
                       )
lgbvalid = lgb.Dataset(test[lgb_feats].values, label=test[label].values,
                       feature_name=lgb_feats,
                       categorical_feature=categorical_feature_list
                       )

evals_results = {}

lgb_model = lgb.train(lgb_params,
                      lgbtrain,
                      valid_sets=lgbvalid,
                      evals_result=evals_results,
                      num_boost_round=1000,
                      early_stopping_rounds=60,
                      verbose_eval=50,
                      categorical_feature=categorical_feature_list
                      )

# 3.3 LR + GBDT
train_sz = train.shape[0]
combineDf = pd.concat([train, test], axis=0, ignore_index=True)

# Get leaf node number Feature Transformation
gbdt_feats_vals = lgb_model.predict(combineDf[lgb_feats], pred_leaf=True)
gbdt_columns = ["gbdt_leaf_indices_" + str(i) for i in range(0, gbdt_feats_vals.shape[1])]

combineDf = pd.concat(
    [combineDf, pd.DataFrame(data=gbdt_feats_vals, index=range(0, gbdt_feats_vals.shape[0]), columns=gbdt_columns)],
    axis=1)

# onehotencoder(gbdt_feats)
origin_columns = combineDf.columns
for col in gbdt_columns:
    combineDf = pd.concat([combineDf, pd.get_dummies(combineDf[col], prefix=col)], axis=1)
gbdt_onehot_feats = [col for col in combineDf.columns if col not in origin_columns]

# train, test
train = combineDf[:train_sz]
test = combineDf[train_sz:]
del combineDf;
gc.collect();

lr_gbdt_feats = lr_feats + gbdt_onehot_feats

lr_gbdt_model = LogisticRegression(penalty='l2', C=1,max_iter=80)
lr_gbdt_model.fit(train[lr_gbdt_feats], train[label])

print("Train................")
do_model_metric(y_true=train[label], y_pred=lr_gbdt_model.predict(train[lr_gbdt_feats]),
                y_pred_prob=lr_gbdt_model.predict_proba(train[lr_gbdt_feats]))

print("Test..................")
do_model_metric(y_true=test[label], y_pred=lr_gbdt_model.predict(test[lr_gbdt_feats]),
                y_pred_prob=lr_gbdt_model.predict_proba(test[lr_gbdt_feats]))

(6044, 4) 0.8345466578424884
0.8310169865431282 (4533, 4)
0.8451356717405691 (1511, 4)
       C1     C2     C3  Label
0  185562   6306  27329      0
0  119301  23733  46556      1
0  175807  23587  27222      1
0  120587   8511  14109      1
0   30474   2270  43627      1
Train.shape: (4533, 9647), Test.shape: (4533, 9647)
Train............
AUC: 1.0
Accuracy: 0.8323406132803882



Test.............
AUC: 0.577
Accuracy: 0.8451356717405691
Training until validation scores don't improve for 60 rounds.




[50]	valid_0's auc: 0.507685
[100]	valid_0's auc: 0.498874
Early stopping, best iteration is:
[57]	valid_0's auc: 0.51063


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Train................
AUC: 0.998
Accuracy: 0.8407235826163688
Test..................
AUC: 0.55
Accuracy: 0.8451356717405691
