## Predict Ad Clicks
https://www.hackerearth.com/challenge/competitive/machine-learning-challenge-3/problems/
Leaderboard: 10th position

## Importing required modules

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score as ruc
import matplotlib.pyplot as plt
%matplotlib inline
plt.rcParams['figure.figsize'] = (10.0, 8.0)
import seaborn as sns

## Imputing missing values and Handling timeseries data

In [None]:
# Load the raw data (expects train.csv and test.csv in the working directory).
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

# imputing missing values
# Missing siteid becomes the sentinel -999; missing browserid / devid become
# the literal string "None".
# The filled column is assigned back rather than calling
# fillna(inplace=True) on the column selection: that chained in-place form is
# deprecated and leaves the parent frame unchanged once pandas Copy-on-Write
# is active.
fill_values = {'siteid': -999, 'browserid': "None", 'devid': "None"}
for frame in (train, test):
    for column, sentinel in fill_values.items():
        frame[column] = frame[column].fillna(sentinel)

# create timebased features
# Parse the 'datetime' column once per frame, then expose its weekday, hour
# and minute as separate columns (added in that order).
for frame in (train, test):
    frame['datetime'] = pd.to_datetime(frame['datetime'])
    timestamp = frame['datetime'].dt
    frame['tweekday'] = timestamp.weekday
    frame['thour'] = timestamp.hour
    frame['tminute'] = timestamp.minute

# create aggregate features
# For each of offerid / category / merchant, count how many rows share the
# same (siteid, <column>) pair. Train and test are counted independently.
def _pair_counts(frame, other, count_name):
    """Return a frame with columns ['siteid', other, count_name] holding group sizes."""
    counts = frame.groupby(['siteid', other]).size().reset_index()
    counts.columns = ['siteid', other, count_name]
    return counts


site_offer_count = _pair_counts(train, 'offerid', 'site_offer_count')
site_offer_count_test = _pair_counts(test, 'offerid', 'site_offer_count')

site_cat_count = _pair_counts(train, 'category', 'site_cat_count')
site_cat_count_test = _pair_counts(test, 'category', 'site_cat_count')

site_mcht_count = _pair_counts(train, 'merchant', 'site_mcht_count')
site_mcht_count_test = _pair_counts(test, 'merchant', 'site_mcht_count')

# joining all files
# merge() is called without arguments, so each count table is joined on the
# columns it shares with the frame (its two key columns).
agg_df = [site_offer_count, site_cat_count, site_mcht_count]
agg_df_test = [site_offer_count_test, site_cat_count_test, site_mcht_count_test]

for counts in agg_df:
    train = train.merge(counts)

for counts in agg_df_test:
    test = test.merge(counts)

In [None]:
# Work on a random subset of at most 1,500,000 training rows.
# - random_state=0 makes the draw repeatable, matching the fixed seeds used
#   for the train/validation split and the model later in the notebook.
# - The size is capped at len(train) because DataFrame.sample raises
#   ValueError when asked for more rows than exist (sampling without
#   replacement).
train = train.sample(n=min(1500000, len(train)), random_state=0)
print (train.shape)

## Converting dtype of columns into object

In [None]:
# Report how many numeric vs. non-numeric columns the training frame has.
numeric_data = train.select_dtypes(include=[np.number])
cat_data = train.select_dtypes(exclude=[np.number])
print("There are {} numeric and {} categorical columns in train data".format(numeric_data.shape[1],cat_data.shape[1]))

# The count and time-part columns stay numeric. Every other numeric column is
# cast to object dtype (note: this includes the target column if it is numeric).
keep_numeric = ['site_offer_count', 'site_cat_count', 'tweekday', 'thour', 'tminute', 'site_mcht_count']
numeric_data = numeric_data.drop(keep_numeric, axis=1)
for col in list(numeric_data):
    train[col] = train[col].astype(object)

# Refresh the categorical view now that more columns are object-typed.
cat_data = train.select_dtypes(include=['object'])

In [None]:
# Label Encoding
from sklearn.preprocessing import LabelEncoder

# Every object-typed training column except 'ID' and 'click' is replaced by
# integer codes. The encoder is fitted on the train and test values together,
# so both frames share one code table per column.
object_columns = train.select_dtypes(include=['object']).columns
for c in [name for name in object_columns if name not in ('ID', 'click')]:
    lbl = LabelEncoder().fit(list(train[c].values) + list(test[c].values))
    train[c] = lbl.transform(list(train[c].values))
    test[c] = lbl.transform(list(test[c].values))

In [None]:
# Stack train on top of test so the engineered features below are computed on
# both at once (test rows end up with NaN in 'click').
# - DataFrame.append was removed in pandas 2.0; pd.concat is its replacement.
# - Index labels are kept (no ignore_index) because the one-hot join further
#   down matches rows by index.
# - sort=True orders the columns alphabetically, which is what append did in
#   older pandas when the two column sets differed. The positional
#   cat_features list passed to model.fit appears to assume that order.
#   NOTE(review): confirm against the raw CSV schema.
all_data = pd.concat([train, test], sort=True)

## Analysing data

In [None]:
# Split the training rows by target value (click == 0 / click == 1) and remove
# the target column, which is constant within each part.
train_0 = train.loc[train['click'] == 0].drop('click', axis=1)
train_1 = train.loc[train['click'] == 1].drop('click', axis=1)

In [None]:
# Number of click == 1 rows per (countrycode, merchant) pair.
gp_by = ['countrycode','merchant']
count = train_1.groupby(gp_by).size().to_frame('count')

# Look at encoded country 0 only (swap the 0 for another code, e.g. 4, to
# inspect a different country). Merchants are ranked by count, and positions
# 1..9 of that ranking are kept, i.e. the top-ranked merchant is skipped.
country_counts = count.loc[0, 'count']
count = country_counts.sort_values(ascending=False).iloc[1:10].to_frame()

# Expose the merchant code as a regular 'index' column for plotting.
count.index.names = ['index']
count['index'] = count.index

In [None]:
# Bar chart of the `count` frame: one bar per 'index' value, height = 'count'.
sns.set(color_codes=True, style="whitegrid")
sns.barplot(data=count, x='index', y='count')
plt.show()

In [None]:
# Row count per merchant code among the click == 0 training rows.
# (A swarmplot of devid vs. browserid coloured by click was tried here and
# left disabled.)
sns.set(color_codes=True, style="whitegrid")
sns.countplot(data=train_0, x="merchant", palette="Greens_d")
plt.xticks(rotation=90)  # rotate the tick labels to vertical
plt.show()

## Adding Features

In [None]:
# Hand-picked 0/1 indicator features.
# The integer literals are compared against label-encoded columns, so they are
# encoder codes rather than raw values.
# NOTE(review): the codes depend on which rows were fed to the encoder;
# re-check them if the sample changes.
def _flag(mask):
    """Turn a boolean mask into 0/1 integers."""
    return mask * 1


_category = all_data['category']
_country = all_data['countrycode']
_browser = all_data['browserid']
_device = all_data['devid']
_site = all_data['siteid']
_hour = all_data['thour']

# category
all_data['is_cat_20'] = _flag(_category == 20)
all_data['is_not_cat_117_121'] = _flag(~_category.isin([117, 121]))

# country, alone and combined with browser
all_data['is_country_2_3'] = _flag(_country.isin([2, 3]))
all_data['is_country_not_0_1'] = _flag(~_country.isin([0, 1]))
all_data['is_country_2_browser_3'] = _flag((_country == 2) & (_browser == 3))
all_data['is_country_2_browser_not_2'] = _flag((_country == 2) & (_browser != 2))

# device and browser
all_data['is_devid_not_3'] = _flag(_device != 3)
all_data['is_devid_0'] = _flag(_device == 0)
all_data['is_browserid_not_7_8'] = _flag(~_browser.isin([7, 8]))
all_data['is_devid_3_browser_5_11'] = _flag((_device == 3) & _browser.isin([5, 11]))
all_data['is_browser_not_1_2'] = _flag(~_browser.isin([1, 2]))
all_data['is_browser_3_6'] = _flag(_browser.isin([3, 6]))

# site
all_data['is_siteid_not_137632_22767_1466'] = _flag(~_site.isin([137632, 22767, 1466]))
all_data['is_siteid_4055_40767_43672'] = _flag(_site.isin([43672, 4055, 40767]))
all_data['is_siteid_124622_country_5'] = _flag((_site == 124622) & (_country == 5))

# hour of day
all_data['is_thour_not_17'] = _flag(_hour != 17)
all_data['is_country_4_hour_not_13'] = _flag((_country == 4) & (_hour != 13))

In [None]:
# Bucket merchants by how many click == 1 rows they have in train_1.
gp_by = 'merchant'
count = train_1.groupby(gp_by).size().to_frame('count')
count.index.names = ['index']
count['index'] = count.index

# The 50 merchants with the highest counts and the 50 with the lowest counts.
count2 = count['count'].sort_values(ascending=False).head(50).index
count0 = count['count'].sort_values().head(50).index


def label(merch):
    """Return 2 for a top-only merchant, 0 for a bottom-only one, otherwise 1.

    Values found in both lists, or in neither, get 1.
    """
    in_top = merch in count2
    in_bottom = merch in count0
    if in_top == in_bottom:
        return 1
    return 2 if in_top else 0


all_data['is_merch_bins'] = all_data[gp_by].map(label)
all_data['is_merch_bins'].unique()

In [None]:
# Bucket categories by how many click == 1 rows they have in train_1.
gp_by = 'category'
count = train_1.groupby(gp_by).size().to_frame('count')
count.index.names = ['index']
count['index'] = count.index

# The 50 categories with the highest counts and the 50 with the lowest counts.
count2 = count['count'].sort_values(ascending=False).head(50).index
count0 = count['count'].sort_values().head(50).index


def label(merch):
    """Return 2 for a top-only category, 0 for a bottom-only one, otherwise 1.

    Values found in both lists, or in neither, get 1.
    """
    in_top = merch in count2
    in_bottom = merch in count0
    if in_top == in_bottom:
        return 1
    return 2 if in_top else 0


all_data['is_category_bins'] = all_data[gp_by].map(label)
all_data['is_category_bins'].unique()

In [None]:
# Bucket sites by how many click == 1 rows they have in train_1.
gp_by = 'siteid'
count = train_1.groupby(gp_by).size().to_frame('count')
count.index.names = ['index']
count['index'] = count.index

# The 10000 sites with the highest counts and the 10000 with the lowest counts.
count2 = count['count'].sort_values(ascending=False).head(10000).index
count0 = count['count'].sort_values().head(10000).index


def label(merch):
    """Return 2 for a top-only site, 0 for a bottom-only one, otherwise 1.

    Values found in both lists, or in neither, get 1.
    """
    in_top = merch in count2
    in_bottom = merch in count0
    if in_top == in_bottom:
        return 1
    return 2 if in_top else 0


all_data['is_site_bins'] = all_data[gp_by].map(label)
all_data['is_site_bins'].unique()

In [None]:
# Bucket offers by how many click == 1 rows they have in train_1.
gp_by = 'offerid'
count = train_1.groupby(gp_by).size().to_frame('count')
count.index.names = ['index']
count['index'] = count.index

# The 10000 offers with the highest counts and the 10000 with the lowest counts.
count2 = count['count'].sort_values(ascending=False).head(10000).index
count0 = count['count'].sort_values().head(10000).index


def label(merch):
    """Return 2 for a top-only offer, 0 for a bottom-only one, otherwise 1.

    Values found in both lists, or in neither, get 1.
    """
    in_top = merch in count2
    in_bottom = merch in count0
    if in_top == in_bottom:
        return 1
    return 2 if in_top else 0


all_data['is_offer_bins'] = all_data[gp_by].map(label)
all_data['is_offer_bins'].unique()

## Splitting Data and one-hot encoding

In [None]:
# Separate the combined frame again: rows with a 'click' value go to
# train_new, rows where 'click' is missing go to test_new.
has_click = all_data['click'].notnull()
train_new = all_data[has_click]
test_new = all_data[~has_click]

In [None]:

def onehot(onehot_df, df, column_name):
    """Return ``onehot_df`` extended with dummy columns for ``df[column_name]``.

    The dummy columns are named ``_<column_name>_<value>``, one per distinct
    value. The frame passed in as ``onehot_df`` is left untouched; earlier
    versions added the raw column to it as a side effect.

    Parameters:
        onehot_df: frame that receives the dummy columns; its index defines
            the rows of the result.
        df: frame holding the source column.
        column_name: name of the column in ``df`` to encode.

    Returns:
        A new DataFrame with the columns of ``onehot_df`` (minus any column
        already named ``column_name``) followed by the dummy columns.
    """
    column = df[column_name]
    # Line the values up with the target frame's rows, as column assignment
    # would do, without writing into the caller's frame.
    if not column.index.equals(onehot_df.index):
        column = column.reindex(onehot_df.index)
    dummies = pd.get_dummies(column, prefix="_" + column_name)
    # A pre-existing column of the same name never survived in the result.
    base = onehot_df.drop([column_name], axis=1, errors='ignore')
    return base.join(dummies)

def munge_onehot(df):
    """Build a frame of one-hot columns for the low-cardinality features of ``df``.

    Encodes countrycode, browserid, devid, thour and tweekday (in that order)
    through ``onehot`` and returns the result, indexed like ``df``.
    """
    encoded = pd.DataFrame(index=df.index)
    for column in ("countrycode", "browserid", "devid", "thour", "tweekday"):
        encoded = onehot(encoded, df, column)
    return encoded

# create one-hot features
# munge_onehot() runs on train and test separately, so each frame only gets
# dummy columns for the values it actually contains. The dummies are attached
# by index (left join, the default).
onehot_df = munge_onehot(train)
train_new = train_new.join(onehot_df, how='left')
onehot_df = munge_onehot(test)
test_new = test_new.join(onehot_df, how='left')

## Creating training-testing data

In [None]:
# Keep the target and the submission IDs before their columns are removed.
target = train_new['click']
test_new_ID = test_new.ID

# Remove the columns that are not model inputs from both frames.
non_features = ['click', 'datetime', 'ID']
train_new = train_new.drop(non_features, axis=1)
test_new = test_new.drop(non_features, axis=1)

# The one-hot columns were generated separately for train and test, so the two
# frames can end up with different dummy columns. Keep only the columns both
# frames have, in train order, so the model is fitted and applied on the same
# feature layout. This replaces the hard-coded removal of
# '_tweekday_1'..'_tweekday_4' from train, which raised KeyError if any of
# them was absent and did not cover mismatches in the other dummy groups.
shared_features = [col for col in train_new.columns if col in test_new.columns]
train_new = train_new[shared_features]
test_new = test_new[shared_features]

# 50/50 hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(train_new, target, test_size = 0.5,random_state=0)

## Training, predicting, saving

In [None]:
from catboost import CatBoostClassifier
from sklearn.metrics import roc_auc_score as ruc
# CatBoost classifier: 100 iterations, depth 9, learning rate 0.1, AUC as the
# evaluation metric, overfitting-detector threshold od_pval=0.01, fixed seed.
model = CatBoostClassifier(
    iterations=100,
    learning_rate=0.1,
    depth=9,
    eval_metric='AUC',
    od_pval=0.01,
    random_seed=0,
)

In [None]:
# Resolve the categorical columns by name instead of by hard-coded position.
# The previous positional list ([0-5, 9, 13-33]) only lines up with the
# label-encoded id columns and the engineered is_* flags when the frame's
# columns are in alphabetical order. With any other order it silently marks
# the wrong columns as categorical.
# NOTE(review): the name set is inferred from the columns this notebook
# label-encodes and engineers; confirm against the raw schema.
base_cat_features = {'browserid', 'category', 'countrycode', 'devid',
                     'merchant', 'offerid', 'siteid'}
cat_feature_idx = [
    idx for idx, col in enumerate(X_train.columns)
    if col in base_cat_features or col.startswith('is_')
]

# Fit on the training half, track AUC on the held-out half, and keep the
# best iteration found on it.
model.fit(X_train
          ,y_train
          ,cat_features=cat_feature_idx
          ,eval_set = (X_test, y_test)
          ,use_best_model=True
          ,verbose=True
         )

In [None]:
# predict_proba returns one column per class; the second column (index 1) is
# used as the predicted click score.
class_probabilities = model.predict_proba(test_new)
pred = class_probabilities[:, 1]

In [None]:
# Two-column submission file: the test row ID and its predicted score.
sub = pd.DataFrame({'ID': test_new_ID})
sub['click'] = pred
sub.to_csv('cb_20.csv', index=False)