In [1]:
import pandas as pd
import warnings

warnings.filterwarnings(action='ignore')

data = pd.read_csv('ctr_data_20000.csv')

In [2]:
data = data.drop(['id','hour','device_id','device_ip'], axis=1)

X = data.loc[:, data.columns != 'click'].astype('str')
y = data.loc[:, data.columns == 'click'].astype('int').values.ravel()

X_dic = X.to_dict('records')

In [3]:
from sklearn.feature_extraction import DictVectorizer

one_hot_encoder = DictVectorizer()

onehot_X = one_hot_encoder.fit_transform(X_dic)

In [4]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

train_X, test_X = onehot_X[0:15000], onehot_X[15000:]
train_y, test_y = y[0:15000], y[15000:]

rdf = RandomForestClassifier(n_estimators=100,
                             criterion = 'gini',
                             min_samples_split=30)

rdf.fit(train_X, train_y)

prediction = rdf.predict(test_X)

print(classification_report(test_y, prediction))

              precision    recall  f1-score   support

           0       0.84      0.98      0.90      4141
           1       0.47      0.10      0.17       859

    accuracy                           0.83      5000
   macro avg       0.65      0.54      0.54      5000
weighted avg       0.78      0.83      0.78      5000



In [5]:
from sklearn.ensemble import AdaBoostClassifier

train_X02, test_X02 = onehot_X[0:15000], onehot_X[15000:]
train_y02, test_y02 = y[0:15000], y[15000:]

ada_boost = AdaBoostClassifier(n_estimators = 50)
ada_boost.fit(train_X02, train_y02)

prediction02 = ada_boost.predict(test_X02)

print(classification_report(test_y02, prediction02))

              precision    recall  f1-score   support

           0       0.84      0.98      0.90      4141
           1       0.48      0.08      0.14       859

    accuracy                           0.83      5000
   macro avg       0.66      0.53      0.52      5000
weighted avg       0.78      0.83      0.77      5000



In [6]:
from sklearn.ensemble import GradientBoostingClassifier

train_X03, test_X03 = onehot_X[0:15000], onehot_X[15000:]
train_y03, test_y03 = y[0:15000], y[15000:]

gbm = GradientBoostingClassifier()
gbm.fit(train_X03, train_y03)

prediction03 = gbm.predict(test_X03)

print(classification_report(test_y02, prediction02))

              precision    recall  f1-score   support

           0       0.84      0.98      0.90      4141
           1       0.48      0.08      0.14       859

    accuracy                           0.83      5000
   macro avg       0.66      0.53      0.52      5000
weighted avg       0.78      0.83      0.77      5000



In [7]:
!pip install xgboost



In [8]:
from xgboost import XGBClassifier

train_X04, test_X04 = onehot_X[0:15000], onehot_X[15000:]
train_y04, test_y04 = y[0:15000], y[15000:]

xgb = XGBClassifier()
xgb.fit(train_X04, train_y04)

prediction04 = xgb.predict(test_X04)

print(classification_report(test_y04, prediction04))

              precision    recall  f1-score   support

           0       0.84      0.98      0.90      4141
           1       0.44      0.09      0.15       859

    accuracy                           0.82      5000
   macro avg       0.64      0.53      0.53      5000
weighted avg       0.77      0.82      0.77      5000



In [9]:
!pip install lightgbm



In [10]:
import lightgbm as lgb

train_data = lgb.Dataset(train_X, train_y)
test_data = lgb.Dataset(test_X, test_y)

In [11]:
parameters = {
    'application': 'binary',
    'objective': 'binary',
    'metric': 'auc',
    'is_unbalance': 'true',
    'boosting': 'gbdt',
    'num_leaves': 31,
    'feature_fraction': 0.5,
    'bagging_fraction': 0.5,
    'bagging_freq': 20,
    'learning_rate': 0.05,
    'verbose': 0
}

num_round = 30

lgbm = lgb.train(parameters, train_data, num_round, valid_sets = [test_data])

prediction = lgbm.predict(test_X)

print(classification_report(test_y, prediction>0.5 ))

You can set `force_col_wise=true` to remove the overhead.
[1]	valid_0's auc: 0.686474
[2]	valid_0's auc: 0.698008
[3]	valid_0's auc: 0.705473
[4]	valid_0's auc: 0.707587
[5]	valid_0's auc: 0.711021
[6]	valid_0's auc: 0.712422
[7]	valid_0's auc: 0.71286
[8]	valid_0's auc: 0.712578
[9]	valid_0's auc: 0.713436
[10]	valid_0's auc: 0.713087
[11]	valid_0's auc: 0.713639
[12]	valid_0's auc: 0.713597
[13]	valid_0's auc: 0.715383
[14]	valid_0's auc: 0.715502
[15]	valid_0's auc: 0.715977
[16]	valid_0's auc: 0.715069
[17]	valid_0's auc: 0.715158
[18]	valid_0's auc: 0.71558
[19]	valid_0's auc: 0.715502
[20]	valid_0's auc: 0.715871
[21]	valid_0's auc: 0.716133
[22]	valid_0's auc: 0.715791
[23]	valid_0's auc: 0.714979
[24]	valid_0's auc: 0.714734
[25]	valid_0's auc: 0.714825
[26]	valid_0's auc: 0.714656
[27]	valid_0's auc: 0.713657
[28]	valid_0's auc: 0.71243
[29]	valid_0's auc: 0.712365
[30]	valid_0's auc: 0.712433
              precision    recall  f1-score   support

           0       0.87      