In [46]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn
from sklearn.preprocessing import StandardScaler
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import log_loss
from sklearn.metrics import classification_report,confusion_matrix
from sklearn.ensemble import RandomForestClassifier

In [2]:
df_train = pd.read_csv('train.csv')
df_val = pd.read_csv('validation.csv')

In [3]:
x_train = df_train.drop(['click'],axis=1)
y_train = df_train['click']

In [4]:
y_val = df_val['click']
x_val = df_val.drop(['click'],axis=1)

In [5]:
def preprocess_dataframe(x):
    
    # Drop unrequired features
    x = x.drop(['bidid','userid','IP','domain','url','urlid','slotid','slotprice','creative','bidprice','payprice','keypage'],axis=1)
    
    # Split user agent into two features, platform and browser
    x['platform'] = x['useragent'].map(lambda x: str(x).split('_')[0])   
    x['browser'] = x['useragent'].map(lambda x: str(x).split('_')[1])
    
    # Drop User Agent
    x = x.drop(['useragent'],axis=1)
    
    x = x.join(x['usertag'].str.get_dummies(','))
    
    # Drop User Tag
    x = x.drop(['usertag'],axis=1)
    
    x.loc[x['slotvisibility'] == 'FirstView', 'slotvisibility'] = 1
    x.loc[x['slotvisibility'] == 'SecondView', 'slotvisibility'] = 2
    x.loc[x['slotvisibility'] == 'ThirdView', 'slotvisibility'] = 3
    x.loc[x['slotvisibility'] == 'FourthView', 'slotvisibility'] = 4
    x.loc[x['slotvisibility'] == 'FifthView', 'slotvisibility'] = 5
    x.loc[x['slotvisibility'] == 'OtherView', 'slotvisibility'] = 6
    x.loc[x['slotvisibility'] == 'Na', 'slotvisibility'] = 0
    x.loc[x['slotvisibility'] == '255', 'slotvisibility'] = 255
    
    x['adexchange'].fillna(0,inplace=True)
    
    x = x.join(pd.get_dummies(x['weekday'],prefix='weekday'))
    x = x.join(pd.get_dummies(x['hour'],prefix='hour'))
    x = x.join(pd.get_dummies(x['region'],prefix='region'))
    x = x.join(pd.get_dummies(x['city'],prefix='city'))
    x = x.join(pd.get_dummies(x['adexchange'],prefix='adexchange'))
    x = x.join(pd.get_dummies(x['slotwidth'],prefix='slotwidth'))
    x = x.join(pd.get_dummies(x['slotheight'],prefix='slotheight'))
    x = x.join(pd.get_dummies(x['slotvisibility'],prefix='slotvisibility'))
    x = x.join(pd.get_dummies(x['slotformat'],prefix='slotformat'))
    x = x.join(pd.get_dummies(x['advertiser'],prefix='advertiser'))
    x = x.join(pd.get_dummies(x['platform'],prefix='platform'))
    x = x.join(pd.get_dummies(x['browser'],prefix='browser'))
    
    
    # Drop unrequired features
    x = x.drop(['weekday','hour','region','city','adexchange','slotwidth','slotheight','slotvisibility','slotformat','advertiser','platform','browser'],axis=1)
    
    
#     x = x.join(x['weekday'].get_dummies())
#     x = x.join(x['hour'].get_dummies())
#     x = x.join(x['region'].get_dummies())
#     x = x.join(x['city'].get_dummies())
#     x = x.join(x['adexchange'].str.get_dummies())
#     x = x.join(x['slotwidth'].get_dummies())
#     x = x.join(x['slotheight'].get_dummies())
#     x = x.join(x['slotvisibility'].str.get_dummies())
#     x = x.join(x['slotformat'].str.get_dummies())
#     x = x.join(x['advertiser'].get_dummies())
#     x = x.join(x['platform'].str.get_dummies())
#     x = x.join(x['browser'].str.get_dummies())
    
    print('Data Preprocessed')

    return x

In [6]:
X_train = preprocess_dataframe(x_train)

Data Preprocessed


In [7]:
X_val = preprocess_dataframe(x_val)

Data Preprocessed


## Undersampled RF

In [8]:
from imblearn.under_sampling import RandomUnderSampler

rus = RandomUnderSampler(return_indices=True)
X_resampled, y_resampled, idx_resampled = rus.fit_sample(X_train,y_train)

In [9]:
rfclassifier = RandomForestClassifier(n_estimators=200,max_depth=5,criterion='entropy',bootstrap=False,min_samples_split=10,warm_start=False,class_weight='balanced')

In [10]:
rfclassifier.fit(X_resampled, y_resampled)

RandomForestClassifier(bootstrap=False, class_weight='balanced',
            criterion='entropy', max_depth=5, max_features='auto',
            max_leaf_nodes=None, min_impurity_decrease=0.0,
            min_impurity_split=None, min_samples_leaf=1,
            min_samples_split=10, min_weight_fraction_leaf=0.0,
            n_estimators=200, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)

In [11]:
usy_pred = rfclassifier.predict(X_val)

In [12]:
usy_pred

array([0, 0, 0, ..., 0, 0, 0])

In [14]:
def evaluate(y_true, y_pred):
    print("AUC: " + str(roc_auc_score(y_true, y_pred)))
    print("Cross Entropy " + str(log_loss(y_true, y_pred)))
    return print('Completed.')

evaluate(y_val, usy_pred)

AUC: 0.758365230395
Cross Entropy 2.50769175777
Completed.


In [15]:
from sklearn.metrics import mean_squared_error
from math import sqrt

rms = sqrt(mean_squared_error(y_val, usy_pred))
print('RMSE:',rms)

RMSE: 0.26945025208427015


In [248]:
from sklearn.metrics import confusion_matrix

confusion_matrix = pd.DataFrame(
    confusion_matrix(y_val,usy_pred), 
    columns=["Predicted False", "Predicted True"], 
    index=["Actual False", "Actual True"]
)
display(confusion_matrix)

Unnamed: 0,Predicted False,Predicted True
Actual False,284904,18819
Actual True,108,94


## No sampling

In [25]:
classifier = RandomForestClassifier(n_estimators=20,max_depth=100,class_weight='balanced',criterion='entropy',bootstrap=False,min_samples_split=10)

In [26]:
classifier.fit(X_train, y_train)

RandomForestClassifier(bootstrap=False, class_weight='balanced',
            criterion='entropy', max_depth=100, max_features='auto',
            max_leaf_nodes=None, min_impurity_decrease=0.0,
            min_impurity_split=None, min_samples_leaf=1,
            min_samples_split=10, min_weight_fraction_leaf=0.0,
            n_estimators=20, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)

In [27]:
y_pred = classifier.predict(X_val)

In [28]:
y_pred

array([0, 0, 0, ..., 0, 0, 0])

In [29]:
evaluate(y_val,y_pred)

AUC: 0.553270155
Cross Entropy 0.102280083789
Completed.


In [30]:
from sklearn.metrics import mean_squared_error
from math import sqrt

rms = sqrt(mean_squared_error(y_val, y_pred))
print('RMSE:',rms)

RMSE: 0.05441743184895006


In [31]:
from sklearn.metrics import confusion_matrix

confusion_matrix = pd.DataFrame(
    confusion_matrix(y_val,y_pred), 
    columns=["Predicted False", "Predicted True"], 
    index=["Actual False", "Actual True"]
)
display(confusion_matrix)

Unnamed: 0,Predicted False,Predicted True
Actual False,303003,720
Actual True,180,22


## Prediction on test set

In [35]:
#Preprocessing x_test data
df_test = pd.read_csv('test.csv')

In [36]:
x_test = df_test

In [38]:
def preprocess_test(x):
    
    # Drop unrequired features
    x = x.drop(['bidid','userid','IP','domain','url','urlid','slotid','slotprice','creative','keypage'],axis=1)
    
    # Split user agent into two features, platform and browser
    x['platform'] = x['useragent'].map(lambda x: str(x).split('_')[0])   
    x['browser'] = x['useragent'].map(lambda x: str(x).split('_')[1])
    
    # Drop User Agent
    x = x.drop(['useragent'],axis=1)
    
    x = x.join(x['usertag'].str.get_dummies(','))
    
    # Drop User Tag
    x = x.drop(['usertag'],axis=1)
    
    x.loc[x['slotvisibility'] == 'FirstView', 'slotvisibility'] = 1
    x.loc[x['slotvisibility'] == 'SecondView', 'slotvisibility'] = 2
    x.loc[x['slotvisibility'] == 'ThirdView', 'slotvisibility'] = 3
    x.loc[x['slotvisibility'] == 'FourthView', 'slotvisibility'] = 4
    x.loc[x['slotvisibility'] == 'FifthView', 'slotvisibility'] = 5
    x.loc[x['slotvisibility'] == 'OtherView', 'slotvisibility'] = 6
    x.loc[x['slotvisibility'] == 'Na', 'slotvisibility'] = 0
    x.loc[x['slotvisibility'] == '255', 'slotvisibility'] = 255
    
    x['adexchange'].fillna(0,inplace=True)
    
    x = x.join(pd.get_dummies(x['weekday'],prefix='weekday'))
    x = x.join(pd.get_dummies(x['hour'],prefix='hour'))
    x = x.join(pd.get_dummies(x['region'],prefix='region'))
    x = x.join(pd.get_dummies(x['city'],prefix='city'))
    x = x.join(pd.get_dummies(x['adexchange'],prefix='adexchange'))
    x = x.join(pd.get_dummies(x['slotwidth'],prefix='slotwidth'))
    x = x.join(pd.get_dummies(x['slotheight'],prefix='slotheight'))
    x = x.join(pd.get_dummies(x['slotvisibility'],prefix='slotvisibility'))
    x = x.join(pd.get_dummies(x['slotformat'],prefix='slotformat'))
    x = x.join(pd.get_dummies(x['advertiser'],prefix='advertiser'))
    x = x.join(pd.get_dummies(x['platform'],prefix='platform'))
    x = x.join(pd.get_dummies(x['browser'],prefix='browser'))
    
    
    # Drop unrequired features
    x = x.drop(['weekday','hour','region','city','adexchange','slotwidth','slotheight','slotvisibility','slotformat','advertiser','platform','browser'],axis=1)
    
    
#     x = x.join(x['weekday'].get_dummies())
#     x = x.join(x['hour'].get_dummies())
#     x = x.join(x['region'].get_dummies())
#     x = x.join(x['city'].get_dummies())
#     x = x.join(x['adexchange'].str.get_dummies())
#     x = x.join(x['slotwidth'].get_dummies())
#     x = x.join(x['slotheight'].get_dummies())
#     x = x.join(x['slotvisibility'].str.get_dummies())
#     x = x.join(x['slotformat'].str.get_dummies())
#     x = x.join(x['advertiser'].get_dummies())
#     x = x.join(x['platform'].str.get_dummies())
#     x = x.join(x['browser'].str.get_dummies())
    
    print('Data Preprocessed')

    return x

In [39]:
X_test = preprocess_test(x_test)

Data Preprocessed


In [42]:
y_test = classifier.predict(X_test)

In [47]:
y_test

array([0, 1, 0, ..., 0, 0, 0])

In [49]:
df_results = pd.DataFrame(y_test)
df_results.to_csv('testing_output.csv',header=None)