In [1]:
import warnings

## Plotting libraries
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

from matplotlib.pylab import rcParams
rcParams['figure.figsize']= 15,6

import datetime
import numpy as np
import pandas as pd
from tqdm import tqdm

## Sklearn Libraries
from sklearn.utils import shuffle
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer
from sklearn.metrics import f1_score, confusion_matrix, roc_curve, auc, \
            classification_report, recall_score, precision_recall_curve, roc_auc_score

#from catboost import CatBoostClassifier
import xgboost as xgb


# Define random state
random_state = 2018
np.random.seed(random_state)
warnings.filterwarnings('ignore')

In [2]:
data_train = pd.read_csv('checkpoint3.csv')

In [3]:
train = data_train.copy(deep=True)

In [4]:
train.head()

Unnamed: 0,impression_id,user_id,app_code,is_4g,os_version_intermediate,os_version_latest,os_version_old,is_click,visited,impression_time,item_id,item_price,ptype
0,c4ca4238a0b923820dcc509a6f75849b,87862,422,0,0,0,1,0,1,2018-11-15 00:00:00,43886,2350,5622
1,45c48cce2e2d7fbdea1afc51c7c6ad26,63410,467,1,0,1,0,1,7,2018-11-15 00:01:00,43209,3421,577
2,70efdf2ec9b086079795c442636b55fb,71748,259,1,1,0,0,0,1,2018-11-15 00:02:00,122348,973,2874
3,8e296a067a37563370ded05f5a3bf3ec,69209,244,1,0,1,0,0,11,2018-11-15 00:02:00,104035,819,2944
4,182be0c5cdcd5072bb1864cdee4d3d6e,62873,473,0,0,1,0,0,9,2018-11-15 00:02:00,4069,4556,9215


In [5]:
train_sub = train.drop(['impression_id','impression_time','user_id','item_id'],axis=1)

In [6]:
train_sub.head()

Unnamed: 0,app_code,is_4g,os_version_intermediate,os_version_latest,os_version_old,is_click,visited,item_price,ptype
0,422,0,0,0,1,0,1,2350,5622
1,467,1,0,1,0,1,7,3421,577
2,259,1,1,0,0,0,1,973,2874
3,244,1,0,1,0,0,11,819,2944
4,473,0,0,1,0,0,9,4556,9215


In [13]:
train.impression_time = pd.to_datetime(train.impression_time)

In [16]:
log = pd.read_csv('data/view_log.csv')

In [8]:
test = pd.read_csv('data/test.csv')

In [9]:
test_sub = pd.get_dummies(test.drop(['impression_id','impression_time','user_id'],axis=1))

In [16]:
test_sub['impression_id']

Unnamed: 0,app_code,is_4G,os_version_intermediate,os_version_latest,os_version_old
0,127,1,0,1,0
1,44,0,0,1,0
2,296,1,0,1,0
3,207,1,0,1,0
4,242,1,0,1,0


In [30]:
visited=[]

In [31]:
with tqdm(total=len(test)) as pbar:
    for index, row in test.iterrows():
        user_id = row['user_id']
        if len(train[train.user_id==user_id]['visited'])!=0:
            visited.append(train[train.user_id==user_id]['visited'].iloc[0])
            
        else:
            visited.append(0)
        
        pbar.update(1)

100%|██████████| 90675/90675 [03:20<00:00, 451.41it/s]


In [35]:
test_sub['visited']=visited

In [33]:
test.to_csv('checkpoint_test1.csv')

In [12]:
test = pd.read_csv('checkpoint_test1.csv',index_col=0)
test_sub = pd.get_dummies(test.drop(['impression_id','impression_time','user_id'],axis=1))

In [13]:
test_sub.head()

Unnamed: 0,app_code,is_4G,visited,os_version_intermediate,os_version_latest,os_version_old
0,127,1,0,0,1,0
1,44,0,49,0,1,0
2,296,1,0,0,1,0
3,207,1,32,0,1,0
4,242,1,22,0,1,0


In [17]:
items=[]

In [15]:
train[train.user_id==63410]['item_price'].iloc[0]

3421

In [18]:
price=[]
ptype=[]

In [19]:
with tqdm(total=len(test)) as pbar:
    for index, row in test.iterrows():
        time = row['impression_time']
        item_id = log[(log.server_time<time) & (log.user_id==row['user_id'])]
        
        if len(item_id)>0:
            items.append((row['impression_id'],item_id['item_id'].iloc[-1]))

        else:
            items.append(0)
            
        pbar.update(1)

100%|██████████| 90675/90675 [3:33:26<00:00,  7.36it/s]  


In [22]:
len(items)

90675

In [23]:
test['item_id']=0

In [24]:
with tqdm(total=len(items)) as pbar:
    for i in range(len(items)):
        if items[i]!=0:
            test.loc[test.impression_id==items[i][0],'item_id']=items[i][1]
        pbar.update(1)

100%|██████████| 90675/90675 [11:19<00:00, 133.45it/s]


In [25]:
test.to_csv('checkpoint_test2.csv')

In [26]:
test.head()

Unnamed: 0,impression_id,impression_time,user_id,app_code,os_version,is_4G,visited,item_id
0,a9e7126a585a69a32bc7414e9d0c0ada,2018-12-13 07:44:00,44754,127,latest,1,0,128995
1,caac14a5bf2ba283db7708bb34855760,2018-12-13 07:45:00,29656,44,latest,0,49,98581
2,13f10ba306a19ce7bec2f3cae507b698,2018-12-13 07:46:00,25234,296,latest,1,0,94782
3,39c4b4dc0e9701b55a0a4f072008fb3f,2018-12-13 07:47:00,22988,207,latest,1,32,28927
4,bf5a572cca75f5fc67f4b14e58b11d70,2018-12-13 07:48:00,35431,242,latest,1,22,127251


In [27]:
item = pd.read_csv('data/item_data.csv')

In [28]:
tmp=[]

In [29]:
with tqdm(total=len(test)) as pbar:
    for index, row in test.iterrows():
        item_id = row['item_id']
        
        if len(item[item.item_id==item_id]['item_price'])!=0:
            price = item[item.item_id==item_id]['item_price'].iloc[0]
            
        else:
            price=0
            
        if len(item[item.item_id==item_id]['product_type'])!=0:
            ptype = item[item.item_id==item_id]['product_type'].iloc[0]
        
        else:
            ptype=0
            
        tmp.append((row['impression_id'],price,ptype))
            
        pbar.update(1)

100%|██████████| 90675/90675 [05:22<00:00, 281.56it/s]


In [31]:
test['item_price']=0
test['ptype']=0

In [33]:
with tqdm(total=len(tmp)) as pbar:
    for i in range(len(tmp)):
        test.loc[test.impression_id==tmp[i][0],'item_price']=tmp[i][1]
        test.loc[test.impression_id==tmp[i][0],'ptype']=tmp[i][2]
        pbar.update(1)

100%|██████████| 90675/90675 [24:20<00:00, 62.09it/s]


In [34]:
test.to_csv('checkpoint_test3.csv')

In [7]:
test = pd.read_csv('checkpoint_test3.csv',index_col=0)

In [8]:
test.head()

Unnamed: 0,impression_id,impression_time,user_id,app_code,os_version,is_4G,visited,item_id,item_price,ptype
0,a9e7126a585a69a32bc7414e9d0c0ada,2018-12-13 07:44:00,44754,127,latest,1,0,128995,23424,1617
1,caac14a5bf2ba283db7708bb34855760,2018-12-13 07:45:00,29656,44,latest,0,49,98581,289,2637
2,13f10ba306a19ce7bec2f3cae507b698,2018-12-13 07:46:00,25234,296,latest,1,0,94782,2116,2111
3,39c4b4dc0e9701b55a0a4f072008fb3f,2018-12-13 07:47:00,22988,207,latest,1,32,28927,4145,4924
4,bf5a572cca75f5fc67f4b14e58b11d70,2018-12-13 07:48:00,35431,242,latest,1,22,127251,11504,10058


In [9]:
test_sub = pd.get_dummies(test.drop(['impression_id','impression_time','user_id','item_id'],axis=1))

In [10]:
test_sub.head()

Unnamed: 0,app_code,is_4G,visited,item_price,ptype,os_version_intermediate,os_version_latest,os_version_old
0,127,1,0,23424,1617,0,1,0
1,44,0,49,289,2637,0,1,0
2,296,1,0,2116,2111,0,1,0
3,207,1,32,4145,4924,0,1,0
4,242,1,22,11504,10058,0,1,0


In [11]:
train_sub.shape

(208303, 9)

In [12]:
test_sub.shape

(90675, 8)

In [13]:
class Create_ensemble(object):
    def __init__(self, n_splits, base_models):
        self.n_splits = n_splits
        self.base_models = base_models

    def predict(self, X, y, T):
        X = np.array(X)
        y = np.array(y)
        T = np.array(T)
        no_class = len(np.unique(y))

        folds = list(StratifiedKFold(n_splits=self.n_splits, shuffle=True, 
                                     random_state = random_state).split(X, y))

        train_proba = np.zeros((X.shape[0], no_class))
        test_proba = np.zeros((T.shape[0], no_class))
        
        train_pred = np.zeros((X.shape[0], len(self.base_models)))
        test_pred = np.zeros((T.shape[0], len(self.base_models)* self.n_splits))
        auc_scores = np.zeros((len(self.base_models), self.n_splits))
        recall_scores = np.zeros((len(self.base_models), self.n_splits))
        
        test_col = 0
        for i, clf in enumerate(self.base_models):
            
            for j, (train_idx, valid_idx) in enumerate(folds):
                
                X_train = X[train_idx]
                Y_train = y[train_idx]
                X_valid = X[valid_idx]
                Y_valid = y[valid_idx]
                
                clf.fit(X_train, Y_train)
                
                valid_pred = clf.predict(X_valid)
                recall  = recall_score(Y_valid, valid_pred, average='macro')
                auc = roc_auc_score(Y_valid, valid_pred, average='macro')
                
                recall_scores[i][j] = recall
                auc_scores[i][j] = auc
                
                train_pred[valid_idx, i] = valid_pred
                test_pred[:, test_col] = clf.predict(T)
                test_col += 1
                
                ## Probabilities
                valid_proba = clf.predict_proba(X_valid)
                train_proba[valid_idx, :] = valid_proba
                test_proba  += clf.predict_proba(T)
                
                print( "Model- {} and CV- {} recall: {}, auc_score: {}".format(i, j, recall, auc))
                
            test_proba /= self.n_splits
            
        return train_proba, test_proba, train_pred, test_pred

In [24]:
class_weight = dict({0:1.5, 1:50})
xtrain = train_sub.drop(['is_click'], axis=1)
ytrain = train_sub['is_click'].values

In [25]:
ss = StandardScaler()

In [26]:
xtrain = ss.fit_transform(xtrain)

In [27]:
test_sub = ss.fit_transform(test_sub)

In [54]:
rdf = RandomForestClassifier(bootstrap=True, class_weight=class_weight, criterion='entropy',
            max_depth=75, max_features='auto', max_leaf_nodes=1200,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=8, min_samples_split=10,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=-1,
            oob_score=False,
            random_state=random_state,
            verbose=0, warm_start=False)

In [61]:
num_pos_samples = len(train[train.is_click==1])
total_samples = len(train)

In [62]:
scale_pos_weight = 100 - ( (num_pos_samples /total_samples) * 100 )

In [63]:
xg = xgb.XGBClassifier(
                        learning_rate =0.15,
                        n_estimators=500,
                        max_depth = 65,
                        min_child_weight = 5,
                        gamma=0.4,
                        subsample=0.55,
                        #colsample_bytree=0.85,
                        objective= 'binary:logistic',
                        nthread=4,
                        reg_lambda=1,
                        scale_pos_weight=scale_pos_weight
                    )

In [55]:
base_models = [rdf
              #xg
              ]
n_splits = 5
lgb_stack = Create_ensemble(n_splits = n_splits, base_models = base_models)    

In [56]:
train_proba, test_proba, train_pred, test_pred = lgb_stack.predict(xtrain, ytrain, test_sub)

Model- 0 and CV- 0 recall: 0.6378938010861419, auc_score: 0.6378938010861419
Model- 0 and CV- 1 recall: 0.6216892200376509, auc_score: 0.6216892200376509
Model- 0 and CV- 2 recall: 0.636514699386027, auc_score: 0.636514699386027
Model- 0 and CV- 3 recall: 0.6354968598055105, auc_score: 0.6354968598055104
Model- 0 and CV- 4 recall: 0.6354562152457534, auc_score: 0.6354562152457535


In [31]:
print('1. The auc score of the model:{}\n'.format(roc_auc_score(ytrain, train_pred, average='macro')))
print('2. The f1 score of the model:{}\n'.format(f1_score(ytrain, train_pred, average='macro')))
print('3. Classification report \n:{} \n'.format(classification_report(ytrain, train_pred)))
print('4. Confusion matrix \n {} \n'.format(confusion_matrix(ytrain, train_pred)))

1. The auc score of the model:0.63203064819969

2. The f1 score of the model:0.3847848238049716

3. Classification report 
:              precision    recall  f1-score   support

           0       0.98      0.47      0.63    197441
           1       0.08      0.80      0.14     10862

    accuracy                           0.48    208303
   macro avg       0.53      0.63      0.38    208303
weighted avg       0.93      0.48      0.61    208303
 

4. Confusion matrix 
 [[ 91981 105460]
 [  2192   8670]] 



In [32]:
class custom(object):
    def __init__(self,base_models):
        self.base_models = base_models

    def predict(self, X, y, T):
        X = np.array(X)
        y = np.array(y)
        T = np.array(T)
        no_class = len(np.unique(y))

        train_proba = []
        test_proba = []
        valid_proba=[]
        
        train_pred = []
        test_pred = []
        auc_scores = []
        
        test_col = 0
        for i, clf in enumerate(self.base_models):
                
            X_train = X
            Y_train = y

            clf.fit(X_train, Y_train)

            valid_pred = clf.predict(X_train)
            auc = roc_auc_score(Y_train, valid_pred, average='macro')

            auc_scores.append(auc)

            train_pred.append(valid_pred)
            test_pred.append(clf.predict(T))

            ## Probabilities
            valid_proba.append(clf.predict_proba(X_train))
            train_proba.append(valid_proba)
            test_proba.append(clf.predict_proba(T))

            print( "Model- {}- , auc_score: {}".format(i,auc))
            
        return train_proba, test_proba, train_pred, test_pred

In [57]:
custom_class = custom(base_models=[RandomForestClassifier(bootstrap=True, class_weight=class_weight, criterion='entropy',
            max_depth=75, max_features='log2', max_leaf_nodes=1200,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=12, min_samples_split=13,
            min_weight_fraction_leaf=0.0, n_estimators=500, n_jobs=-1,
            oob_score=False,
            random_state=random_state,
            verbose=0, warm_start=False)])

In [58]:
train_proba, test_proba, train_pred, test_pred = custom_class.predict(xtrain, ytrain, test_sub)

Model- 0- , auc_score: 0.7329436974947333


In [96]:
test_pred

[array([1, 0, 1, ..., 1, 1, 1])]

In [130]:
submission = pd.DataFrame({
    'impression_id':test.impression_id,
    'is_click':test_pred[0]
})

In [131]:
submission.head()

Unnamed: 0,impression_id,is_click
0,a9e7126a585a69a32bc7414e9d0c0ada,1
1,caac14a5bf2ba283db7708bb34855760,0
2,13f10ba306a19ce7bec2f3cae507b698,1
3,39c4b4dc0e9701b55a0a4f072008fb3f,0
4,bf5a572cca75f5fc67f4b14e58b11d70,1


In [88]:
submission.is_click = submission.is_click.astype(int)

In [132]:
submission.to_csv('sub12.csv',index=False)