In [28]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn
from sklearn.preprocessing import StandardScaler
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import log_loss
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import classification_report,confusion_matrix

In [3]:
df_train = pd.read_csv('train.csv')
df_val = pd.read_csv('validation.csv')

In [16]:
sc = StandardScaler()

In [4]:
y_val = df_val['click']
x_val = df_val.drop(['click'],axis=1)

In [5]:
x_train = df_train.drop(['click'],axis=1)
y_train = df_train['click']

In [6]:
def preprocess_dataframe(x):
    
    # Drop unrequired features
    x = x.drop(['bidid','userid','IP','domain','url','urlid','slotid','slotprice','creative','bidprice','payprice','keypage'],axis=1)
    
    # Split user agent into two features, platform and browser
    x['platform'] = x['useragent'].map(lambda x: str(x).split('_')[0])   
    x['browser'] = x['useragent'].map(lambda x: str(x).split('_')[1])
    
    # Drop User Agent
    x = x.drop(['useragent'],axis=1)
    
    x = x.join(x['usertag'].str.get_dummies(','))
    
    # Drop User Tag
    x = x.drop(['usertag'],axis=1)
    
    x.loc[x['slotvisibility'] == 'FirstView', 'slotvisibility'] = 1
    x.loc[x['slotvisibility'] == 'SecondView', 'slotvisibility'] = 2
    x.loc[x['slotvisibility'] == 'ThirdView', 'slotvisibility'] = 3
    x.loc[x['slotvisibility'] == 'FourthView', 'slotvisibility'] = 4
    x.loc[x['slotvisibility'] == 'FifthView', 'slotvisibility'] = 5
    x.loc[x['slotvisibility'] == 'OtherView', 'slotvisibility'] = 6
    x.loc[x['slotvisibility'] == 'Na', 'slotvisibility'] = 0
    x.loc[x['slotvisibility'] == '255', 'slotvisibility'] = 255
    
    x['adexchange'].fillna(0,inplace=True)
    
    x = x.join(pd.get_dummies(x['weekday'],prefix='weekday'))
    x = x.join(pd.get_dummies(x['hour'],prefix='hour'))
    x = x.join(pd.get_dummies(x['region'],prefix='region'))
    x = x.join(pd.get_dummies(x['city'],prefix='city'))
    x = x.join(pd.get_dummies(x['adexchange'],prefix='adexchange'))
    x = x.join(pd.get_dummies(x['slotwidth'],prefix='slotwidth'))
    x = x.join(pd.get_dummies(x['slotheight'],prefix='slotheight'))
    x = x.join(pd.get_dummies(x['slotvisibility'],prefix='slotvisibility'))
    x = x.join(pd.get_dummies(x['slotformat'],prefix='slotformat'))
    x = x.join(pd.get_dummies(x['advertiser'],prefix='advertiser'))
    x = x.join(pd.get_dummies(x['platform'],prefix='platform'))
    x = x.join(pd.get_dummies(x['browser'],prefix='browser'))
    
    
    # Drop unrequired features
    x = x.drop(['weekday','hour','region','city','adexchange','slotwidth','slotheight','slotvisibility','slotformat','advertiser','platform','browser'],axis=1)
    
    
#     x = x.join(x['weekday'].get_dummies())
#     x = x.join(x['hour'].get_dummies())
#     x = x.join(x['region'].get_dummies())
#     x = x.join(x['city'].get_dummies())
#     x = x.join(x['adexchange'].str.get_dummies())
#     x = x.join(x['slotwidth'].get_dummies())
#     x = x.join(x['slotheight'].get_dummies())
#     x = x.join(x['slotvisibility'].str.get_dummies())
#     x = x.join(x['slotformat'].str.get_dummies())
#     x = x.join(x['advertiser'].get_dummies())
#     x = x.join(x['platform'].str.get_dummies())
#     x = x.join(x['browser'].str.get_dummies())
    
    print('Data Preprocessed')

    return x
    

In [8]:
X_train = preprocess_dataframe(x_train)

Data Preprocessed


In [7]:
X_val = preprocess_dataframe(x_val)

Data Preprocessed


In [9]:
classifier = GaussianNB()

#X_train = X_train.to_dict(orient='records')
#X_train = v.fit_transform(X_train)

In [10]:
classifier.fit(X_train,y_train)

GaussianNB(priors=None)

In [11]:
y_pred = classifier.predict(X_val)

In [12]:
print(y_pred)

[1 1 1 ..., 1 1 1]


In [13]:
y_pred[y_pred==1].shape[0]

283234

In [37]:
X_val.shape[0]

303925

In [14]:
def evaluate(y_true, y_pred):
    print("AUC: " + str(roc_auc_score(y_true, y_pred)))
    print("Cross Entropy " + str(log_loss(y_true, y_pred)))
    return print('Completed.')

evaluate(y_val, y_pred)

AUC: 0.509293349402
Cross Entropy 32.1674625509
Completed.


In [15]:
confusion_matrix = pd.DataFrame(
    confusion_matrix(y_val,y_pred), 
    columns=["Predicted False", "Predicted True"], 
    index=["Actual False", "Actual True"]
)
display(confusion_matrix)

Unnamed: 0,Predicted False,Predicted True
Actual False,20681,283042
Actual True,10,192


## Normalised data

In [18]:
sX_train = sc.fit_transform(X_train)

In [19]:
sX_val = sc.transform(X_val)

In [20]:
classifier2 = GaussianNB()

In [22]:
classifier2.fit(sX_train, y_train)

GaussianNB(priors=None)

In [23]:
sy_pred = classifier2.predict(sX_val)

In [24]:
sy_pred

array([1, 1, 1, ..., 1, 1, 1])

In [25]:
scores(y_val, sy_pred)

AUC: 0.503463592722
Cross Entropy 33.0821916767
Completed.


In [29]:
from sklearn.metrics import confusion_matrix

confusion_matrix = pd.DataFrame(
    confusion_matrix(y_val,sy_pred), 
    columns=["Predicted False", "Predicted True"], 
    index=["Actual False", "Actual True"]
)
display(confusion_matrix)

Unnamed: 0,Predicted False,Predicted True
Actual False,12629,291094
Actual True,7,195


## Undersampled data

In [30]:
from imblearn.under_sampling import RandomUnderSampler

In [32]:
rus = RandomUnderSampler(return_indices=True)
X_resampled, y_resampled, idx_resampled = rus.fit_sample(X_train,y_train)

In [33]:
classifier3 = GaussianNB()

In [34]:
classifier3.fit(X_resampled, y_resampled)

GaussianNB(priors=None)

In [36]:
usy_pred = classifier3.predict(X_val)

In [37]:
usy_pred

array([1, 1, 1, ..., 1, 0, 1])

In [38]:
scores(y_val,usy_pred)

AUC: 0.542164445502
Cross Entropy 26.4830500559
Completed.


In [41]:
from sklearn.metrics import confusion_matrix

confusion_matrix = pd.DataFrame(
    confusion_matrix(y_val,usy_pred), 
    columns=["Predicted False", "Predicted True"], 
    index=["Actual False", "Actual True"]
)
display(confusion_matrix)

Unnamed: 0,Predicted False,Predicted True
Actual False,70720,233003
Actual True,30,172


In [42]:
from sklearn.metrics import mean_squared_error
from math import sqrt

rms = sqrt(mean_squared_error(y_val, usy_pred))
print('RMSE:',rms)

RMSE: 0.8756398147275715


In [43]:
print (metrics.classification_report(y_val, usy_pred))

             precision    recall  f1-score   support

          0       1.00      0.23      0.38    303723
          1       0.00      0.85      0.00       202

avg / total       1.00      0.23      0.38    303925

