# 使用Logistic Regression做CTR预估

## 数据：Avazu公司提供的点击率预估，预测一个广告是否会被点击

	https://www.kaggle.com/c/avazu-ctr-prediction/data

## 数据文件
**train** - Training set. 10 days of click-through data, ordered chronologically. Non-clicks and clicks are subsampled according to different strategies. 

	https://www.kaggle.com/c/avazu-ctr-prediction/download/train.gz

**test** - Test set. 1 day of ads to for testing your model predictions. 

	https://www.kaggle.com/c/avazu-ctr-prediction/download/test.gz

**sampleSubmission.csv** - Sample submission file in the correct format, corresponds to the All-0.5 Benchmark. 

	https://www.kaggle.com/c/avazu-ctr-prediction/download/sampleSubmission.gz

### 原始数据集太大，选择训练集中前100,000 个样本，构成train_small.csv

## 数据字段
id: ad identifier
click: 0/1 for non-click/click
hour: format is YYMMDDHH, so 14091123 means 23:00 on Sept. 11, 2014 UTC.

C1 -- anonymized categorical variable, banner_pos, site_id, site_domain, site_category, app_id, app_domain, app_category, device_id, device_ip, device_model, device_type, device_conn_type, C14-C21 -- anonymized categorical variables

# Load Data

In [4]:
import pandas as pd


# Initial setup
train_filename = "train_small.csv"
test_filename = "test.csv"
submission_filename = "submit.csv"

training_set = pd.read_csv(train_filename)

# Explore Data

In [5]:
training_set.head(10)

Unnamed: 0,id,click,hour,C1,banner_pos,site_id,site_domain,site_category,app_id,app_domain,...,device_type,device_conn_type,C14,C15,C16,C17,C18,C19,C20,C21
0,1.000009e+18,0,14102100,1005,0,1fbe01fe,f3845767,28905ebd,ecad2386,7801e8d9,...,1,2,15706,320,50,1722,0,35,-1,79
1,1.000017e+19,0,14102100,1005,0,1fbe01fe,f3845767,28905ebd,ecad2386,7801e8d9,...,1,0,15704,320,50,1722,0,35,100084,79
2,1.000037e+19,0,14102100,1005,0,1fbe01fe,f3845767,28905ebd,ecad2386,7801e8d9,...,1,0,15704,320,50,1722,0,35,100084,79
3,1.000064e+19,0,14102100,1005,0,1fbe01fe,f3845767,28905ebd,ecad2386,7801e8d9,...,1,0,15706,320,50,1722,0,35,100084,79
4,1.000068e+19,0,14102100,1005,1,fe8cc448,9166c161,0569f928,ecad2386,7801e8d9,...,1,0,18993,320,50,2161,0,35,-1,157
5,1.000072e+19,0,14102100,1005,0,d6137915,bb1ef334,f028772b,ecad2386,7801e8d9,...,1,0,16920,320,50,1899,0,431,100077,117
6,1.000072e+19,0,14102100,1005,0,8fda644b,25d4cfcd,f028772b,ecad2386,7801e8d9,...,1,0,20362,320,50,2333,0,39,-1,157
7,1.000092e+19,0,14102100,1005,1,e151e245,7e091613,f028772b,ecad2386,7801e8d9,...,1,0,20632,320,50,2374,3,39,-1,23
8,1.000095e+19,1,14102100,1005,0,1fbe01fe,f3845767,28905ebd,ecad2386,7801e8d9,...,1,2,15707,320,50,1722,0,35,-1,79
9,1.000126e+19,0,14102100,1002,0,84c7ba46,c4e18dd6,50e219e0,ecad2386,7801e8d9,...,0,0,21689,320,50,2496,3,167,100191,23


In [6]:
training_set.describe()

Unnamed: 0,id,click,hour,C1,banner_pos,device_type,device_conn_type,C14,C15,C16,C17,C18,C19,C20,C21
count,99999.0,99999.0,99999.0,99999.0,99999.0,99999.0,99999.0,99999.0,99999.0,99999.0,99999.0,99999.0,99999.0,99999.0,99999.0
mean,9.500834e+18,0.174902,14102100.0,1005.03444,0.198302,1.055741,0.199272,17682.106071,318.333943,56.818988,1964.02909,0.789328,131.735447,37874.606366,88.555386
std,5.669435e+18,0.379885,0.0,1.088705,0.402641,0.583986,0.635271,3237.726956,11.931998,36.924283,394.961129,1.223747,244.077816,48546.369299,45.482979
min,32375630000000.0,0.0,14102100.0,1001.0,0.0,0.0,0.0,375.0,120.0,20.0,112.0,0.0,33.0,-1.0,13.0
25%,4.183306e+18,0.0,14102100.0,1005.0,0.0,1.0,0.0,15704.0,320.0,50.0,1722.0,0.0,35.0,-1.0,61.0
50%,1.074496e+19,0.0,14102100.0,1005.0,0.0,1.0,0.0,17654.0,320.0,50.0,1993.0,0.0,35.0,-1.0,79.0
75%,1.457544e+19,0.0,14102100.0,1005.0,0.0,1.0,0.0,20362.0,320.0,50.0,2306.0,2.0,39.0,100083.0,156.0
max,1.84467e+19,1.0,14102100.0,1010.0,5.0,5.0,5.0,21705.0,728.0,480.0,2497.0,3.0,1835.0,100248.0,157.0


In [7]:
training_set.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 99999 entries, 0 to 99998
Data columns (total 24 columns):
id                  99999 non-null float64
click               99999 non-null int64
hour                99999 non-null int64
C1                  99999 non-null int64
banner_pos          99999 non-null int64
site_id             99999 non-null object
site_domain         99999 non-null object
site_category       99999 non-null object
app_id              99999 non-null object
app_domain          99999 non-null object
app_category        99999 non-null object
device_id           99999 non-null object
device_ip           99999 non-null object
device_model        99999 non-null object
device_type         99999 non-null int64
device_conn_type    99999 non-null int64
C14                 99999 non-null int64
C15                 99999 non-null int64
C16                 99999 non-null int64
C17                 99999 non-null int64
C18                 99999 non-null int64
C19                

In [8]:
training_set['C1'].unique()

array([1005, 1002, 1010, 1001, 1007, 1008])

In [11]:
var = ['C1','site_category','app_category','device_model','device_type','device_conn_type', 'C15','C16','C18','C19','C20']
for v in var:
    print '\n%s属性的不同取值和出现的次数\n'%v
    print training_set[v].value_counts()


C1属性的不同取值和出现的次数

1005    92454
1002     4211
1010     3191
1007      104
1001       29
1008       10
Name: C1, dtype: int64

site_category属性的不同取值和出现的次数

28905ebd    37696
50e219e0    25398
f028772b    19760
3e814130    12480
76b2941d     1843
335d28a8     1329
0569f928      675
72722551      314
f66779e6      279
75fa27f6      146
c0dd3be3       55
bcf865d9       13
a818d37a        7
42a36e14        2
110ab22d        1
e787de0e        1
Name: site_category, dtype: int64

app_category属性的不同取值和出现的次数

07d7df22    78827
0f2161f8    14971
f95efa07     2895
8ded1f7a     1347
cef3e649     1309
75d80bbe      171
4ce2e9fc      130
d1327cf5      129
09481d60       50
fc6fa53d       41
dc97ec06       40
a3c42688       32
0f9a328c       18
879c24eb       16
a86a3e89        8
7113d72a        7
a7fd01ec        4
8df2e842        2
4681bb9d        2
Name: app_category, dtype: int64

device_model属性的不同取值和出现的次数

8a4875bd    6886
d787e91b    5438
1f0bc64f    3769
4ea23a13    2301
711ee120    2108
c6263d8a

In [4]:
# id: ad identifier
# click: 0/1 for non-click/click
# hour: format is YYMMDDHH, so 14091123 means 23:00 on Sept. 11, 2014 UTC.
# C1 -- anonymized categorical variable
# banner_pos
# site_id
# site_domain
# site_category
# app_id
# app_domain
# app_category
# device_id
# device_ip
# device_model
# device_type
# device_conn_type
# C14-C21 -- anonymized categorical variables
from sklearn.externals import joblib
from sklearn.cross_validation import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

from utils import load_df



In [5]:
# 结果衡量
def print_metrics(true_values, predicted_values):
    print "Accuracy: ", metrics.accuracy_score(true_values, predicted_values)
    print "AUC: ", metrics.roc_auc_score(true_values, predicted_values)
    print "Confusion Matrix: ", + metrics.confusion_matrix(true_values, predicted_values)
    print metrics.classification_report(true_values, predicted_values)

# 拟合分类器
def classify(classifier_class, train_input, train_targets):
    classifier_object = classifier_class()
    classifier_object.fit(train_input, train_targets)
    return classifier_object

# 模型存储
def save_model(clf):
    joblib.dump(clf, 'classifier.pkl')

In [6]:
#drop了9个类别型的特征
train_data = load_df('train_small.csv').values

  if self.run_code(code, result):


In [8]:
train_data[:,:]

array([[       0, 14102100,     1005, ...,       35,       -1,       79],
       [       0, 14102100,     1005, ...,       35,   100084,       79],
       [       0, 14102100,     1005, ...,       35,   100084,       79],
       ..., 
       [       0, 14102100,     1005, ...,       35,       -1,       79],
       [       1, 14102100,     1005, ...,       35,       -1,       79],
       [       0, 14102100,     1005, ...,       35,       -1,       79]])

In [9]:
# 训练和存储模型
X_train, X_test, y_train, y_test = train_test_split(train_data[0::, 1::], train_data[0::, 0],
                                                    test_size=0.3, random_state=0)

classifier = classify(LogisticRegression, X_train, y_train)
predictions = classifier.predict(X_test)
print_metrics(y_test, predictions)
save_model(classifier)

Accuracy:  0.8233
AUC:  0.5
Confusion Matrix:  [[24699     0]
 [ 5301     0]]
             precision    recall  f1-score   support

          0       0.82      1.00      0.90     24699
          1       0.00      0.00      0.00      5301

avg / total       0.68      0.82      0.74     30000



  'precision', 'predicted', average, warn_for)


AUC为0.5，模型性能并不好，数据采样是一个原因，特征工程和模型简单是另一个原因（被点击的样本都预测错了:(）

In [12]:
# 按照指定的格式生成结果
def create_submission(ids, predictions, filename='submission.csv'):
    submissions = np.concatenate((ids.reshape(len(ids), 1), predictions.reshape(len(predictions), 1)), axis=1)
    df = DataFrame(submissions)
    df.to_csv(filename, header=['id', 'click'], index=False)

In [17]:
import numpy as np
from pandas import DataFrame

classifier = joblib.load('classifier.pkl')
test_data_df = load_df('test.csv', training=False)
ids = test_data_df.values[0:, 0]
predictions = classifier.predict(test_data_df.values[0:, 1:])
create_submission(ids, predictions)

## 样本数目太多，也可以采用SGDClassifier