In [1]:
import os
import pandas as pd
import shutil

In [2]:
# Every file this notebook touches lives under one project folder.
PROJECT_ROOT = 'D:/click_rate'

# Full training CSV and the truncated copy derived from it.
TRAIN_PATH = PROJECT_ROOT + '/train/train.csv'
SMALL_TRAIN_PATH = PROJECT_ROOT + '/train/train_small.csv'
# Presumably the held-out competition test set; not read by the visible cells.
TEST_PATH = PROJECT_ROOT + '/test/test.csv'
# Directory (trailing slash included) and file name of the persisted model.
MODEL_PATH = PROJECT_ROOT + '/model/'
MODEL_NAME = 'lr.m'

In [3]:
# Make sure the model output directory exists before anything is saved to it.
# exist_ok=True keeps the cell safe to re-run and removes the window between
# a separate os.path.exists() check and the directory creation.
os.makedirs(MODEL_PATH, exist_ok=True)

In [4]:
def make_small_train_dataset(n_rows=20000, src_path=None, dst_path=None):
    """
    Copy the leading lines of the training CSV into a smaller file.

    The first line of the source (the CSV header) is always copied, followed
    by up to ``n_rows`` further lines, so the defaults write 20001 lines in
    total. The source is streamed lazily rather than read into memory as a
    whole, which keeps this usable on very large files.

    Parameters
    ----------
    n_rows : int
        Maximum number of data lines to copy after the header line.
    src_path : str, optional
        File to read from; defaults to the module-level ``TRAIN_PATH``.
    dst_path : str, optional
        File to write (overwritten if it exists); defaults to the
        module-level ``SMALL_TRAIN_PATH``.

    Raises
    ------
    ValueError
        If ``n_rows`` is negative.
    OSError
        If the source cannot be opened or the destination cannot be written.
    """
    from itertools import islice  # local import keeps this cell self-contained

    if n_rows < 0:
        raise ValueError('n_rows must be non-negative')
    if src_path is None:
        src_path = TRAIN_PATH
    if dst_path is None:
        dst_path = SMALL_TRAIN_PATH

    # Both handles are owned by the same `with`, so the output file is closed
    # even if reading or writing fails part-way. The source is opened first,
    # so a missing input no longer truncates an existing output file.
    with open(src_path, 'r') as src, open(dst_path, 'w') as dst:
        # +1 accounts for the header line.
        dst.writelines(islice(src, n_rows + 1))
    
# Write SMALL_TRAIN_PATH from the leading lines of TRAIN_PATH.
make_small_train_dataset()

In [5]:
# Load the reduced training file and report its (rows, columns) dimensions.
data = pd.read_csv(SMALL_TRAIN_PATH)
n_rows_cols = data.shape
print('data shape:', n_rows_cols)

data shape: (20000, 24)


In [14]:
# Label distribution: the two classes turn out to be imbalanced.
click_counts = data['click'].value_counts()
click_counts

0    16564
1     3436
Name: click, dtype: int64

In [6]:
# Print each column's name, non-null count and dtype.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20000 entries, 0 to 19999
Data columns (total 24 columns):
id                  20000 non-null uint64
click               20000 non-null int64
hour                20000 non-null int64
C1                  20000 non-null int64
banner_pos          20000 non-null int64
site_id             20000 non-null object
site_domain         20000 non-null object
site_category       20000 non-null object
app_id              20000 non-null object
app_domain          20000 non-null object
app_category        20000 non-null object
device_id           20000 non-null object
device_ip           20000 non-null object
device_model        20000 non-null object
device_type         20000 non-null int64
device_conn_type    20000 non-null int64
C14                 20000 non-null int64
C15                 20000 non-null int64
C16                 20000 non-null int64
C17                 20000 non-null int64
C18                 20000 non-null int64
C19                 

In [7]:
# Lift pandas' display truncation so every column of the preview is shown.
# These are global options and stay in effect until they are changed again.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
data.head(10)  # preview the first 10 rows

Unnamed: 0,id,click,hour,C1,banner_pos,site_id,site_domain,site_category,app_id,app_domain,app_category,device_id,device_ip,device_model,device_type,device_conn_type,C14,C15,C16,C17,C18,C19,C20,C21
0,1000009418151094273,0,14102100,1005,0,1fbe01fe,f3845767,28905ebd,ecad2386,7801e8d9,07d7df22,a99f214a,ddd2926e,44956a24,1,2,15706,320,50,1722,0,35,-1,79
1,10000169349117863715,0,14102100,1005,0,1fbe01fe,f3845767,28905ebd,ecad2386,7801e8d9,07d7df22,a99f214a,96809ac8,711ee120,1,0,15704,320,50,1722,0,35,100084,79
2,10000371904215119486,0,14102100,1005,0,1fbe01fe,f3845767,28905ebd,ecad2386,7801e8d9,07d7df22,a99f214a,b3cf8def,8a4875bd,1,0,15704,320,50,1722,0,35,100084,79
3,10000640724480838376,0,14102100,1005,0,1fbe01fe,f3845767,28905ebd,ecad2386,7801e8d9,07d7df22,a99f214a,e8275b8f,6332421a,1,0,15706,320,50,1722,0,35,100084,79
4,10000679056417042096,0,14102100,1005,1,fe8cc448,9166c161,0569f928,ecad2386,7801e8d9,07d7df22,a99f214a,9644d0bf,779d90c2,1,0,18993,320,50,2161,0,35,-1,157
5,10000720757801103869,0,14102100,1005,0,d6137915,bb1ef334,f028772b,ecad2386,7801e8d9,07d7df22,a99f214a,05241af0,8a4875bd,1,0,16920,320,50,1899,0,431,100077,117
6,10000724729988544911,0,14102100,1005,0,8fda644b,25d4cfcd,f028772b,ecad2386,7801e8d9,07d7df22,a99f214a,b264c159,be6db1d7,1,0,20362,320,50,2333,0,39,-1,157
7,10000918755742328737,0,14102100,1005,1,e151e245,7e091613,f028772b,ecad2386,7801e8d9,07d7df22,a99f214a,e6f67278,be74e6fe,1,0,20632,320,50,2374,3,39,-1,23
8,10000949271186029916,1,14102100,1005,0,1fbe01fe,f3845767,28905ebd,ecad2386,7801e8d9,07d7df22,a99f214a,37e8da74,5db079b5,1,2,15707,320,50,1722,0,35,-1,79
9,10001264480619467364,0,14102100,1002,0,84c7ba46,c4e18dd6,50e219e0,ecad2386,7801e8d9,07d7df22,c357dbff,f1ac7184,373ecbe6,0,0,21689,320,50,2496,3,167,100191,23


In [8]:
# Re-apply display limits after the untruncated preview.
# NOTE(review): 60 matches pandas' documented default for max_rows. For
# max_columns, 0 means "auto-detect from terminal width" in the pandas docs,
# which may differ from the notebook default. Confirm, or consider
# pd.reset_option() to restore the real defaults.
pd.set_option('display.max_columns', 0)
pd.set_option('display.max_rows', 60)

In [9]:
from sklearn import linear_model
try:
    # train_test_split lives here since scikit-learn 0.18; the old
    # `sklearn.cross_validation` module was removed in 0.20.
    from sklearn import model_selection
except ImportError:
    from sklearn import cross_validation as model_selection
try:
    import joblib  # used to persist the model
except ImportError:
    # Older scikit-learn releases bundled joblib here; the alias was removed in 0.23.
    from sklearn.externals import joblib
from sklearn.metrics import precision_recall_curve, roc_curve, auc  # model evaluation
from sklearn.metrics import classification_report  # model evaluation


# Keep the label plus four integer-typed feature columns. The label must stay
# first because the slicing below relies on column order.
model_features = ['click', 'device_type', 'C1', 'C15', 'C16']
data = data[model_features]
# `DataFrame.as_matrix()` was removed in pandas 1.0; `.values` yields the same ndarray.
values = data.values
X = values[:, 1:]  # feature columns
y = values[:, 0]  # label column ('click')


# Split the dataset: hold out 30% of the rows, with a fixed seed for a reproducible split.
X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, test_size=0.3, random_state=0)


# check
print('X_train shape:', X_train.shape)
print('y_train shape:', y_train.shape)
print('X_test shape:', X_test.shape)
print('y_test shape:', y_test.shape)

X_train shape: (14000, 4)
y_train shape: (14000,)
X_test shape: (6000, 4)
y_test shape: (6000,)




In [10]:
# Train the model.
# solver='liblinear' is pinned explicitly: the L1 penalty is not supported by
# 'lbfgs', the default solver from scikit-learn 0.22 onwards, so relying on
# the default makes this cell raise on current releases.
# NOTE(review): no class weighting is used although the label is imbalanced,
# and the recorded report shows 0.00 recall for the positive class. Consider
# class_weight='balanced' or richer features.
clf = linear_model.LogisticRegression(C=1.0, penalty='l1', solver='liblinear', tol=1e-6)
clf.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l1', random_state=None, solver='liblinear', tol=1e-06,
          verbose=0, warm_start=False)

In [11]:
# Persist the fitted classifier as MODEL_NAME inside MODEL_PATH.
model_file = os.path.join(MODEL_PATH, MODEL_NAME)
joblib.dump(clf, model_file)

['D:/click_rate/model/lr.m']

In [12]:
# Score the held-out rows: hard class labels and per-class probabilities.
pred = clf.predict(X_test)
proba = clf.predict_proba(X_test)

In [13]:
# Model evaluation: per-class precision / recall / F1 on the held-out rows.
class_labels = ['neg', 'pos']
report = classification_report(y_test, pred, target_names=class_labels)
print(report)

[0.162      0.16218473 0.16511226 0.16502506 0.28427419 0.28952772
 0.29192547 0.36162362 0.38247012 0.43283582 0.43939394 0.
 1.        ]
[1.         0.99897119 0.98353909 0.98251029 0.14506173 0.14506173
 0.14506173 0.10082305 0.09876543 0.08950617 0.08950617 0.
 0.        ]
[0.05022744 0.06862439 0.1656042  0.16585197 0.16597597 0.18440219
 0.21669815 0.23930388 0.26167362 0.33026526 0.45074713 0.76067009]
             precision    recall  f1-score   support

        neg       0.84      1.00      0.91      5028
        pos       0.00      0.00      0.00       972

avg / total       0.70      0.84      0.76      6000

