In [28]:
import pandas as pd
import numpy as np
from sklearn import metrics
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import RandomForestClassifier

In [2]:
conversion_train = pd.read_csv('pre/train.csv')

In [3]:
# Percentage of label 1 & 0 Instance
print('Percentage of Label 1:','{:.1%}'.format(len(conversion_train[conversion_train['label'] == 1])/len(conversion_train)))
print('Percentage of Label 0:','{:.1%}'.format(len(conversion_train[conversion_train['label'] == 0])/len(conversion_train)))

Percentage of Label 1: 2.5%
Percentage of Label 0: 97.5%


In [5]:
print('amount of Label 1:', len(conversion_train[conversion_train['label'] == 1]))

amount of Label 1: 93262


In [8]:
y = conversion_train['label']
X = conversion_train.drop (['label', 'conversionTime'], axis = 1)

In [10]:
# Extra Tree classifier
et_train = ExtraTreesClassifier()
et_train.fit(X, y)

# create predict value 
y_et = et_train.fit(X,y).predict(X)

# check accuracy
print("Accuracy: ", round(metrics.accuracy_score(y, y_et),2))

# check feature importance score
importfeature = pd.DataFrame(et_train.feature_importances_, columns = ["Importance"], index = X.columns).sort_values(['Importance'], ascending = False)
print(importfeature)

Accuracy:  1.0
                  Importance
clickTime           0.405981
userID              0.378301
positionID          0.108349
creativeID          0.095870
telecomsOperator    0.007467
connectionType      0.004032


In [4]:
ad = pd.read_csv('pre/ad.csv')
train = pd.read_csv('pre/train.csv')
test = pd.read_csv('pre/test.csv')

In [8]:
ad_train = ad.merge(train, on='creativeID')

In [19]:
ad_y = ad_train['label']
ad_X = ad_train.drop (['label', 'conversionTime', 'connectionType','telecomsOperator', 'clickTime', 'userID', 'positionID'], axis = 1)

# Extra Tree classifier
et_ad_train = ExtraTreesClassifier()
et_ad_train.fit(ad_X, ad_y)

# create predict value 
ad_y_et = et_ad_train.fit(ad_X,ad_y).predict(ad_X)

# Perform 10-fold Cross-Validation
print('Average 10-fold Accuracy:', round(np.mean(cross_val_score(et_ad_train, ad_X, ad_y, cv=10, scoring='accuracy')),2))

# check feature importance score
importfeature = pd.DataFrame(et_ad_train.feature_importances_, columns = ["Importance"], index = ad_X.columns).sort_values(['Importance'], ascending = False)
print(importfeature)

Average 10-fold Accuracy: 0.8
              Importance
advertiserID    0.244221
appID           0.239816
creativeID      0.185204
camgaignID      0.180491
adID            0.135621
appPlatform     0.014647


In [30]:
ad_y = ad_train['label']
ad_X = ad_train.drop (['label', 'conversionTime', 'connectionType','telecomsOperator', 'clickTime', 'userID', 'positionID'], axis = 1)

# Extra Tree classifier
rf_ad_train = RandomForestClassifier(max_depth=5)
rf_ad_train.fit(ad_X, ad_y)

# create predict value 
ad_y_rf = rf_ad_train.fit(ad_X,ad_y).predict(ad_X)

# Perform 10-fold Cross-Validation
print('Average 10-fold Accuracy:', round(np.mean(cross_val_score(rf_ad_train, ad_X, ad_y, cv=10, scoring='accuracy')),2))

# check feature importance score
importfeature = pd.DataFrame(rf_ad_train.feature_importances_, columns = ["Importance"], index = ad_X.columns).sort_values(['Importance'], ascending = False)
print(importfeature)

Average 10-fold Accuracy: 0.98
              Importance
advertiserID    0.467968
appID           0.287404
creativeID      0.095064
adID            0.084437
camgaignID      0.055016
appPlatform     0.010111


In [20]:
position = pd.read_csv('pre/position.csv')
train = pd.read_csv('pre/train.csv')
test = pd.read_csv('pre/test.csv')

In [21]:
position_train = position.merge(train, on='positionID')

In [22]:
position_y = position_train['label']
position_X = position_train.drop (['label', 'conversionTime', 'connectionType','telecomsOperator', 'clickTime', 'userID', 'creativeID'], axis = 1)

# Extra Tree classifier
et_position_train = ExtraTreesClassifier()
et_position_train.fit(position_X, position_y)

# create predict value 
position_y_et = et_position_train.fit(position_X,position_y).predict(position_X)

# Perform 10-fold Cross-Validation
print('Average 10-fold Accuracy:', round(np.mean(cross_val_score(et_position_train, position_X, position_y, cv=10, scoring='accuracy')),2))

# check feature importance score
importfeature = pd.DataFrame(et_position_train.feature_importances_, columns = ["Importance"], index = position_X.columns).sort_values(['Importance'], ascending = False)
print(importfeature)

Average 10-fold Accuracy: 0.57
              Importance
positionID      0.646511
sitesetID       0.227305
positionType    0.126184


In [29]:
position_y = position_train['label']
position_X = position_train.drop (['label', 'conversionTime', 'connectionType','telecomsOperator', 'clickTime', 'userID', 'creativeID'], axis = 1)

# Random Forest 
rf_position_train = RandomForestClassifier(max_depth=5)
rf_position_train.fit(position_X, position_y)

# create predict value 
position_y_rf = rf_position_train.fit(position_X,position_y).predict(position_X)

# Perform 10-fold Cross-Validation
print('Average 10-fold Accuracy:', round(np.mean(cross_val_score(et_position_train, position_X, position_y, cv=10, scoring='accuracy')),2))

# check feature importance score
importfeature = pd.DataFrame(rf_position_train.feature_importances_, columns = ["Importance"], index = position_X.columns).sort_values(['Importance'], ascending = False)
print(importfeature)

Average 10-fold Accuracy: 0.57
              Importance
sitesetID       0.400982
positionType    0.363378
positionID      0.235641


In [23]:
user = pd.read_csv('pre/user.csv')
train = pd.read_csv('pre/train.csv')
test = pd.read_csv('pre/test.csv')

In [24]:
user_train = user.merge(train, on='userID')

In [25]:
user_y = user_train['label']
user_X = user_train.drop (['label', 'conversionTime', 'connectionType','telecomsOperator', 'clickTime', 'positionID', 'creativeID'], axis = 1)

# Extra Tree classifier
et_user_train = ExtraTreesClassifier()
et_user_train.fit(user_X, user_y)

# create predict value 
user_y_et = et_user_train.fit(user_X,user_y).predict(user_X)

# Perform 10-fold Cross-Validation
print('Average 10-fold Accuracy:', round(np.mean(cross_val_score(et_user_train, user_X, user_y, cv=10, scoring='accuracy')),2))

# check feature importance score
importfeature = pd.DataFrame(et_user_train.feature_importances_, columns = ["Importance"], index = user_X.columns).sort_values(['Importance'], ascending = False)
print(importfeature)

Average 10-fold Accuracy: 0.97
                Importance
userID            0.430214
residence         0.216464
age               0.167815
hometown          0.141433
education         0.027097
haveBaby          0.007770
marriageStatus    0.007246
gender            0.001961


In [31]:
user_y = user_train['label']
user_X = user_train.drop (['label', 'conversionTime', 'connectionType','telecomsOperator', 'clickTime', 'positionID', 'creativeID'], axis = 1)

# Extra Tree classifier
rf_user_train = RandomForestClassifier(max_depth=5)
rf_user_train.fit(user_X, user_y)

# create predict value 
user_y_rf = rf_user_train.fit(user_X,user_y).predict(user_X)

# Perform 10-fold Cross-Validation
print('Average 10-fold Accuracy:', round(np.mean(cross_val_score(rf_user_train, user_X, user_y, cv=10, scoring='accuracy')),2))

# check feature importance score
importfeature = pd.DataFrame(rf_user_train.feature_importances_, columns = ["Importance"], index = user_X.columns).sort_values(['Importance'], ascending = False)
print(importfeature)

Average 10-fold Accuracy: 0.98
                Importance
education         0.216283
hometown          0.188332
gender            0.180405
age               0.150618
haveBaby          0.141069
residence         0.075024
marriageStatus    0.027073
userID            0.021196
