In [1]:
# from IPython.core.interactiveshell import InteractiveShell
# InteractiveShell.ast_node_interactivity = "all"

%matplotlib inline

## 交互数据

In [2]:
import pandas as pd

# The training log is read with explicit column names, so the first line of
# the file is treated as data rather than as a header.
TRAIN_INTERACTION_COLUMNS = [
    'user_id', 'photo_id', 'click', 'like', 'follow',
    'time', 'playing_time', 'duration_time',
]

train_interaction_df = pd.read_table(
    '../data/train_interaction.txt',
    names=TRAIN_INTERACTION_COLUMNS,
)

In [3]:
# Show the first five rows to check that the supplied column names line up
# with the data.
train_interaction_df.head(n=5)

Unnamed: 0,user_id,photo_id,click,like,follow,time,playing_time,duration_time
0,1637,6484142,0,0,0,761036604603,0,11
1,1637,4647664,0,0,0,761035501863,0,11
2,1637,4994626,1,0,0,761036933553,11,10
3,1637,4416881,0,0,0,761119247892,0,9
4,1637,2597756,0,0,0,761119421332,0,11


In [4]:
# The test log carries only these four columns; it has no click/like/follow
# or playing_time fields.
TEST_INTERACTION_COLUMNS = ['user_id', 'photo_id', 'time', 'duration_time']

test_interaction_df = pd.read_table(
    '../data/test_interaction.txt',
    names=TEST_INTERACTION_COLUMNS,
)

In [5]:
# Show the first five rows of the test log.
test_interaction_df.head(n=5)

Unnamed: 0,user_id,photo_id,time,duration_time
0,29999,8154819,761158905921,17
1,29999,8374672,761163438550,9
2,29999,7987126,761143659968,11
3,29999,7912672,761159000400,17
4,29999,9062638,761163738888,11


## 训练集

In [6]:
# Training frame: the four columns that the test log also provides, plus the
# `click` label to be predicted.
train_columns = ['user_id', 'photo_id', 'time', 'duration_time', 'click']
train_df = train_interaction_df[train_columns]
train_df.head(n=5)

Unnamed: 0,user_id,photo_id,time,duration_time,click
0,1637,6484142,761036604603,11,0
1,1637,4647664,761035501863,11,0
2,1637,4994626,761036933553,10,1
3,1637,4416881,761119247892,9,0
4,1637,2597756,761119421332,11,0


## 测试集

In [7]:
# Give the test frame the same column layout as the training frame. `click`
# is not in the test log, so reindex creates it filled with NaN; it is filled
# with predicted probabilities later.
test_columns = ['user_id', 'photo_id', 'time', 'duration_time', 'click']
test_df = test_interaction_df.reindex(columns=test_columns)
test_df.head(n=5)

Unnamed: 0,user_id,photo_id,time,duration_time,click
0,29999,8154819,761158905921,17,
1,29999,8374672,761163438550,9,
2,29999,7987126,761143659968,11,
3,29999,7912672,761159000400,17,
4,29999,9062638,761163738888,11,


## 训练

In [8]:
# Convert a timestamp into an hour bucket in the range 0-23.
def to_hour(time):
    """Return the hour-of-day bucket (0-23) for a timestamp.

    The timestamp is divided by 3,600,000, truncated to a whole number of
    hours, and reduced modulo 24. No time-zone shift is applied, so the hour
    is relative to the timestamp's own origin.

    NOTE(review): 3,600,000 is the number of milliseconds in an hour, so the
    input is presumably in milliseconds -- confirm against the data spec.
    """
    whole_hours = int(time / 3600000)
    return whole_hours % 24

In [9]:
# Bucket duration_time into 8 levels:
# 0, 1-10, 11-20, 21-30, 31-40, 41-50, 51-60, and 61 or more.
import math

def to_category(duration_time):
    """Return the duration bucket (0-7) for a duration value.

    The bucket is ceil(duration_time / 10), capped at 7: 0 maps to 0,
    1-10 to 1, 11-20 to 2, ..., 51-60 to 6, and anything above 60 to 7.
    """
    return min(math.ceil(duration_time / 10), 7)

In [10]:
import os

from sklearn import preprocessing
from sklearn import linear_model
from sklearn import svm
# NOTE(review): `sklearn.externals.joblib` and the `n_values` argument used
# below only exist in older scikit-learn releases. Pin the version this
# notebook was written against, or migrate both together.
from sklearn.externals import joblib

# One-hot encoder for the two categorical features: hour of day (24 values,
# 0-23) and duration bucket (8 values, 0-7). The category counts come from
# n_values, so fitting on a single dummy row is enough.
enc = preprocessing.OneHotEncoder(n_values=[24, 8])
enc.fit([[0, 0]])

# joblib.dump does not create missing directories, so make sure the model
# directory exists before the first model is written.
os.makedirs('../submit/pred_user_interaction_model_lr/', exist_ok=True)

# Train and persist one click model per user.
# groupby(sort=False) walks the frame once and yields users in order of first
# appearance (the same order as `user_id.unique()`), instead of re-scanning
# the whole frame with a boolean mask for every user.
for user_id, user_rows in train_df.groupby('user_id', sort=False):
    # Build the feature frame with assign() so no column is written onto a
    # filtered slice of train_df (avoids chained-assignment warnings).
    features = user_rows[['time', 'duration_time']].assign(
        time=user_rows['time'].apply(to_hour),
        duration_time=user_rows['duration_time'].apply(to_category),
    )
    train_X = enc.transform(features.values).toarray()
    train_y = user_rows['click'].values
#     clf = linear_model.LogisticRegression(C=100.0, random_state=0)
    clf = linear_model.LogisticRegressionCV()
#     clf = svm.SVC(C=100, random_state=0, probability=True, kernel='linear', decision_function_shape='ovo')
    clf.fit(train_X, train_y)
    # One file per user, named after the user id; the prediction step loads
    # models from the same directory.
    joblib.dump(clf, '../submit/pred_user_interaction_model_lr/' + str(user_id))
#     joblib.dump(clf, '../submit/pred_user_interaction_model_svm/' + str(user_id))
    

## 预测

In [11]:
# Load the persisted per-user models, keyed by user id, for every user that
# appears in the test frame.
models = {}
for user_id in test_df['user_id'].unique():
    model_path = '../submit/pred_user_interaction_model_lr/' + str(user_id)
    models[user_id] = joblib.load(model_path)
#     models[user_id] = joblib.load('../submit/pred_user_interaction_model_svm/' + str(user_id))

In [12]:
# Prediction function

def predict(row):
    """Return the predicted click probability for one test row.

    Args:
        row: a row exposing `user_id`, `time` and `duration_time`.

    Returns:
        The probability the user's model assigns to class 1 (click),
        rounded to six decimal places.

    The model is looked up in the module-level `models` dict by user id, and
    the features are encoded with the module-level encoder `enc`.
    """
    model = models[row['user_id']]
    raw_features = [[to_hour(row['time']), to_category(row['duration_time'])]]
    encoded = enc.transform(raw_features).toarray()
    probabilities = model.predict_proba(encoded)
    # predict_proba columns follow model.classes_, so locate the column that
    # corresponds to the positive class (label 1) instead of assuming it.
    positive_column = np.where(model.classes_ == 1)[0][0]
    return round(probabilities[0, positive_column], 6)


In [13]:
# Fill the `click` column with the predicted click probability of each row.
# numpy is imported here because `predict` refers to `np` when it is called.
import numpy as np

test_df['click'] = test_df.apply(predict, axis=1)

In [14]:
# Keep only the columns that are written to the submission file.
submission_columns = ['user_id', 'photo_id', 'click']
test_df = test_df[submission_columns]
test_df.head(n=5)

Unnamed: 0,user_id,photo_id,click
0,29999,8154819,0.337003
1,29999,8374672,0.336918
2,29999,7987126,0.337003
3,29999,7912672,0.337003
4,29999,9062638,0.336913


## 导出

In [15]:
# Export the predictions as a tab-separated file with no header row and no
# index column; float values are written with six decimal places.
test_df.to_csv(
    '../submit/pred_user_interaction_lr.txt',
    sep='\t',
    header=False,
    index=False,
    float_format='%.6f',
)
# test_df.to_csv('../submit/pred_user_interaction_svm.txt', index=False, header=False, sep='\t', float_format='%.6f')

## 评分

0.639526