# Data Analysis and Prediction of an "AD CLICK"
## Using Decision Tree             

  --Arhit Bose Tagore

### Importing Libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline
sns.set_style('darkgrid')
plt.style.use("dark_background")

### Importing the Train Dataset 

In [2]:
train_df = pd.read_csv("train_.csv")
df = train_df

In [3]:
train_df

Unnamed: 0.1,Unnamed: 0,id,click,hour,C1,banner_pos,site_id,site_domain,site_category,app_id,...,device_type,device_conn_type,C14,C15,C16,C17,C18,C19,C20,C21
0,0,1.000009e+18,0,14102100,1005,0,1fbe01fe,f3845767,28905ebd,ecad2386,...,1,2,15706,320,50,1722,0,35,-1,79
1,1,1.000017e+19,0,14102100,1005,0,1fbe01fe,f3845767,28905ebd,ecad2386,...,1,0,15704,320,50,1722,0,35,100084,79
2,2,1.000037e+19,0,14102100,1005,0,1fbe01fe,f3845767,28905ebd,ecad2386,...,1,0,15704,320,50,1722,0,35,100084,79
3,3,1.000064e+19,0,14102100,1005,0,1fbe01fe,f3845767,28905ebd,ecad2386,...,1,0,15706,320,50,1722,0,35,100084,79
4,4,1.000068e+19,0,14102100,1005,1,fe8cc448,9166c161,0569f928,ecad2386,...,1,0,18993,320,50,2161,0,35,-1,157
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
499995,499995,1.311196e+19,0,14102103,1005,0,85f751fd,c4e18dd6,50e219e0,53de0284,...,1,0,21611,320,50,2480,3,299,100111,61
499996,499996,1.311197e+19,0,14102103,1005,1,d9750ee7,98572c79,f028772b,ecad2386,...,1,0,17614,320,50,1993,2,1063,-1,33
499997,499997,1.311202e+19,0,14102103,1005,0,85f751fd,c4e18dd6,50e219e0,e2fcccd2,...,1,0,20633,320,50,2374,3,39,-1,23
499998,499998,1.311205e+19,0,14102103,1005,1,5b4d2eda,16a36ef3,f028772b,ecad2386,...,1,0,20355,216,36,2333,0,39,100077,157


In [4]:
df = df.drop_duplicates() 
df.duplicated().values.any()  #Finding any duplicates

False

In [5]:
df = df.fillna(method="ffill")
pd.isnull(df).any()  # Checking for Null Values

Unnamed: 0          False
id                  False
click               False
hour                False
C1                  False
banner_pos          False
site_id             False
site_domain         False
site_category       False
app_id              False
app_domain          False
app_category        False
device_id           False
device_ip           False
device_model        False
device_type         False
device_conn_type    False
C14                 False
C15                 False
C16                 False
C17                 False
C18                 False
C19                 False
C20                 False
C21                 False
dtype: bool

In [6]:
def to_date_column(df):
    df["dt_hour"] = pd.to_datetime(df["hour"], format="%y%m%d%H")
    df["year"] = df["dt_hour"].dt.year
    df["month"] = df["dt_hour"].dt.month
    df["day"] = df["dt_hour"].dt.day
    df["int_hour"] = df["dt_hour"].dt.hour
    df["is_weekday"] = df["dt_hour"].dt.dayofweek
    df["is_weekend"] = df.apply(lambda x: x["is_weekday"] in [5, 6], axis=1)
to_date_column(df)

In [7]:
df = df.drop(['Unnamed: 0'], axis=1)

In [8]:
df.head()

Unnamed: 0,id,click,hour,C1,banner_pos,site_id,site_domain,site_category,app_id,app_domain,...,C19,C20,C21,dt_hour,year,month,day,int_hour,is_weekday,is_weekend
0,1.000009e+18,0,14102100,1005,0,1fbe01fe,f3845767,28905ebd,ecad2386,7801e8d9,...,35,-1,79,2014-10-21,2014,10,21,0,1,False
1,1.000017e+19,0,14102100,1005,0,1fbe01fe,f3845767,28905ebd,ecad2386,7801e8d9,...,35,100084,79,2014-10-21,2014,10,21,0,1,False
2,1.000037e+19,0,14102100,1005,0,1fbe01fe,f3845767,28905ebd,ecad2386,7801e8d9,...,35,100084,79,2014-10-21,2014,10,21,0,1,False
3,1.000064e+19,0,14102100,1005,0,1fbe01fe,f3845767,28905ebd,ecad2386,7801e8d9,...,35,100084,79,2014-10-21,2014,10,21,0,1,False
4,1.000068e+19,0,14102100,1005,1,fe8cc448,9166c161,0569f928,ecad2386,7801e8d9,...,35,-1,157,2014-10-21,2014,10,21,0,1,False


In [9]:
df.tail()

Unnamed: 0,id,click,hour,C1,banner_pos,site_id,site_domain,site_category,app_id,app_domain,...,C19,C20,C21,dt_hour,year,month,day,int_hour,is_weekday,is_weekend
499995,1.311196e+19,0,14102103,1005,0,85f751fd,c4e18dd6,50e219e0,53de0284,d9b5648e,...,299,100111,61,2014-10-21 03:00:00,2014,10,21,3,1,False
499996,1.311197e+19,0,14102103,1005,1,d9750ee7,98572c79,f028772b,ecad2386,7801e8d9,...,1063,-1,33,2014-10-21 03:00:00,2014,10,21,3,1,False
499997,1.311202e+19,0,14102103,1005,0,85f751fd,c4e18dd6,50e219e0,e2fcccd2,5c5a694b,...,39,-1,23,2014-10-21 03:00:00,2014,10,21,3,1,False
499998,1.311205e+19,0,14102103,1005,1,5b4d2eda,16a36ef3,f028772b,ecad2386,7801e8d9,...,39,100077,157,2014-10-21 03:00:00,2014,10,21,3,1,False
499999,1.311207e+18,1,14102103,1005,0,2328ee8e,7804dea6,f028772b,ecad2386,7801e8d9,...,39,-1,32,2014-10-21 03:00:00,2014,10,21,3,1,False


In [10]:
df.describe()

Unnamed: 0,id,click,hour,C1,banner_pos,device_type,device_conn_type,C14,C15,C16,C17,C18,C19,C20,C21,year,month,day,int_hour,is_weekday
count,500000.0,500000.0,500000.0,500000.0,500000.0,500000.0,500000.0,500000.0,500000.0,500000.0,500000.0,500000.0,500000.0,500000.0,500000.0,500000.0,500000.0,500000.0,500000.0,500000.0
mean,9.314806e+18,0.164074,14102100.0,1005.034018,0.217276,1.036694,0.223636,18193.167466,319.183992,56.561368,2031.632062,1.109156,201.528508,42639.88713,74.050944,2014.0,10.0,21.0,1.321254,1.0
std,5.213424e+18,0.370343,0.9156416,0.966678,0.443134,0.489694,0.669603,3343.557221,21.037074,36.185037,417.213802,1.277892,273.749184,49497.494455,40.835674,0.0,0.0,0.0,0.915642,0.0
min,9984920000000.0,0.0,14102100.0,1001.0,0.0,0.0,0.0,375.0,120.0,20.0,112.0,0.0,33.0,-1.0,13.0,2014.0,10.0,21.0,0.0,1.0
25%,4.82805e+18,0.0,14102100.0,1005.0,0.0,1.0,0.0,15706.0,320.0,50.0,1722.0,0.0,35.0,-1.0,48.0,2014.0,10.0,21.0,1.0,1.0
50%,9.794425e+18,0.0,14102100.0,1005.0,0.0,1.0,0.0,18993.0,320.0,50.0,2161.0,0.0,39.0,-1.0,61.0,2014.0,10.0,21.0,1.0,1.0
75%,1.347337e+19,0.0,14102100.0,1005.0,0.0,1.0,0.0,20632.0,320.0,50.0,2351.0,3.0,297.0,100084.0,79.0,2014.0,10.0,21.0,2.0,1.0
max,1.84467e+19,1.0,14102100.0,1012.0,7.0,5.0,5.0,21705.0,1024.0,1024.0,2497.0,3.0,1835.0,100248.0,195.0,2014.0,10.0,21.0,3.0,1.0


In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 500000 entries, 0 to 499999
Data columns (total 31 columns):
 #   Column            Non-Null Count   Dtype         
---  ------            --------------   -----         
 0   id                500000 non-null  float64       
 1   click             500000 non-null  int64         
 2   hour              500000 non-null  int64         
 3   C1                500000 non-null  int64         
 4   banner_pos        500000 non-null  int64         
 5   site_id           500000 non-null  object        
 6   site_domain       500000 non-null  object        
 7   site_category     500000 non-null  object        
 8   app_id            500000 non-null  object        
 9   app_domain        500000 non-null  object        
 10  app_category      500000 non-null  object        
 11  device_id         500000 non-null  object        
 12  device_ip         500000 non-null  object        
 13  device_model      500000 non-null  object        
 14  devi

In [12]:
df.columns

Index(['id', 'click', 'hour', 'C1', 'banner_pos', 'site_id', 'site_domain',
       'site_category', 'app_id', 'app_domain', 'app_category', 'device_id',
       'device_ip', 'device_model', 'device_type', 'device_conn_type', 'C14',
       'C15', 'C16', 'C17', 'C18', 'C19', 'C20', 'C21', 'dt_hour', 'year',
       'month', 'day', 'int_hour', 'is_weekday', 'is_weekend'],
      dtype='object')

### Label Encoding

In [13]:
label_col = "click"
x_columns = set(list(df.columns)) - set(["id", "site_id", "app_id", "hour", "dt_hour", "device_id", "device_ip", ] + [label_col] )

In [14]:
x_train = df[x_columns]
y_train = df[label_col]

In [15]:
from collections import defaultdict
from sklearn.preprocessing import LabelEncoder


x_train_len = len(x_train)
d = defaultdict(LabelEncoder)
n_df = x_train.apply(lambda x: d[x.name].fit_transform(x))
n_df.head()

Unnamed: 0,C1,C19,is_weekend,int_hour,month,is_weekday,app_category,year,C17,C16,...,device_type,device_conn_type,day,C20,C14,app_domain,site_category,banner_pos,C18,site_domain
0,2,1,0,0,0,0,0,0,40,2,...,1,1,0,0,162,62,2,0,0,1503
1,2,1,0,0,0,0,0,0,40,2,...,1,0,0,60,160,62,2,0,0,1503
2,2,1,0,0,0,0,0,0,40,2,...,1,0,0,60,160,62,2,0,0,1503
3,2,1,0,0,0,0,0,0,40,2,...,1,0,0,60,162,62,2,0,0,1503
4,2,1,0,0,0,0,0,0,79,2,...,1,0,0,0,256,62,0,1,0,910


In [16]:
n_df.columns

Index(['C1', 'C19', 'is_weekend', 'int_hour', 'month', 'is_weekday',
       'app_category', 'year', 'C17', 'C16', 'device_model', 'C15', 'C21',
       'device_type', 'device_conn_type', 'day', 'C20', 'C14', 'app_domain',
       'site_category', 'banner_pos', 'C18', 'site_domain'],
      dtype='object')

In [17]:
X = n_df
y = df['click']


## Training Logistic Regression Model

In [22]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix


In [23]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)

In [24]:
model=DecisionTreeClassifier(criterion='entropy')
model.fit(X_train,y_train)

DecisionTreeClassifier(criterion='entropy')

In [25]:
predictions = model.predict(X_test)

In [26]:
report = classification_report(y_test, predictions)
print('report:', report, sep='\n')

report:
              precision    recall  f1-score   support

           0       0.86      0.94      0.90     83617
           1       0.40      0.19      0.26     16383

    accuracy                           0.82    100000
   macro avg       0.63      0.57      0.58    100000
weighted avg       0.78      0.82      0.79    100000



In [27]:
confusion_matrix(y_test, predictions)

array([[79006,  4611],
       [13284,  3099]], dtype=int64)

In [28]:
accuracy_score(y_test, predictions)

0.82105

## We can observe that our Decision Tree Model has an accuracy score of 82.10%. Our model can give the correct results 82 times for every 100 tests. 

In [29]:
model.predict_proba(X_train)

array([[0.66666667, 0.33333333],
       [0.72972973, 0.27027027],
       [1.        , 0.        ],
       ...,
       [0.66666667, 0.33333333],
       [0.95      , 0.05      ],
       [0.88297872, 0.11702128]])