## 加载交互数据

In [3]:
from IPython.core.interactiveshell import InteractiveShell
# Display the result of every top-level expression in a cell rather than only
# the last one, so cells that call both `.info()` and `.head()` show both.
InteractiveShell.ast_node_interactivity = "all"

In [6]:
import pandas as pd

# Location of the raw interaction log, relative to the notebook directory.
train_interaction_path = '../data/train_interaction.txt'

# Column labels applied to the file in order; passed through `names=` below.
train_columns = [
    'user_id', 'photo_id',
    'click', 'like', 'follow',
    'time', 'playing_time', 'duration_time',
]

# read_table uses a tab as its default delimiter.
train_interaction_df = pd.read_table(
    train_interaction_path,
    names=train_columns,
)


In [7]:
# Sanity-check the load: column dtypes / memory footprint, then the first rows.
train_interaction_df.info()
train_interaction_df.head()

Unnamed: 0,user_id,photo_id,click,like,follow,time,playing_time,duration_time
0,1637,6484142,0,0,0,761036604603,0,11
1,1637,4647664,0,0,0,761035501863,0,11
2,1637,4994626,1,0,0,761036933553,11,10
3,1637,4416881,0,0,0,761119247892,0,9
4,1637,2597756,0,0,0,761119421332,0,11


## 验证集

取 photo_id > 6750000 的交互记录作为验证集（新作品），约占全部数据的 10%。这样划分更符合测试集中都是新作品的事实。

In [8]:
# Hold out interactions on the newest photos (photo_id above the cut-off) as
# the validation split.
is_new_photo = train_interaction_df['photo_id'] > 6750000
valid_interaction_df = train_interaction_df[is_new_photo]

In [9]:
# Check the size and dtypes of the validation split, then preview its rows.
valid_interaction_df.info()
valid_interaction_df.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2152299 entries, 21 to 20854323
Data columns (total 8 columns):
user_id          int64
photo_id         int64
click            int64
like             int64
follow           int64
time             int64
playing_time     int64
duration_time    int64
dtypes: int64(8)
memory usage: 147.8 MB


Unnamed: 0,user_id,photo_id,click,like,follow,time,playing_time,duration_time
21,1637,7300221,0,0,0,761036382656,0,7
30,1637,7090577,1,0,0,761036770788,15,15
47,1637,7427964,0,0,0,761119147039,0,17
52,1637,7115682,0,0,0,761036422329,0,140
56,1637,7089041,0,0,0,761034513725,0,15


## 训练集
剩余photo_id<=6750000的，作为训练集，约90%

In [10]:
# Keep only interactions on older photos (photo_id at or below the cut-off).
# NOTE: this rebinds train_interaction_df to the training split only, so the
# validation split must already have been taken from the full frame.
is_old_photo = train_interaction_df['photo_id'] <= 6750000
train_interaction_df = train_interaction_df[is_old_photo]

In [11]:
# Check the size and dtypes of the training split, then preview its rows.
train_interaction_df.info()
train_interaction_df.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 18702045 entries, 0 to 20854343
Data columns (total 8 columns):
user_id          int64
photo_id         int64
click            int64
like             int64
follow           int64
time             int64
playing_time     int64
duration_time    int64
dtypes: int64(8)
memory usage: 1.3 GB


Unnamed: 0,user_id,photo_id,click,like,follow,time,playing_time,duration_time
0,1637,6484142,0,0,0,761036604603,0,11
1,1637,4647664,0,0,0,761035501863,0,11
2,1637,4994626,1,0,0,761036933553,11,10
3,1637,4416881,0,0,0,761119247892,0,9
4,1637,2597756,0,0,0,761119421332,0,11


## 用户平均值法

用户的点击率（字段click的平均值）作为预测值，即

$$
y_{ui} = \frac{\sum_{j \in Train(u)} click_{uj}}{\left| Train(u) \right|}
$$


In [12]:
# Per-user mean of `click` over the training split, indexed by user_id.
clicks_by_user = train_interaction_df.groupby('user_id')['click']
y_u = clicks_by_user.mean()

In [13]:
# Preview the per-user click rates (index: user_id, value: mean click).
y_u.head()

user_id
2    0.179757
3    0.126752
4    0.079392
7    0.168467
9    0.252546
Name: click, dtype: float64

## 预测

用每个用户在训练集上的平均点击率，作为该用户在验证集中每条记录的点击概率预测值。

In [14]:
# Start the prediction frame from the validation (user_id, photo_id) pairs.
# Reindexing onto a column list that includes the not-yet-existing
# click_probability column adds it filled with NaN.
pred_columns = ['user_id', 'photo_id', 'click_probability']
valid_pairs = valid_interaction_df[['user_id', 'photo_id']]
pred_df = valid_pairs.reindex(columns=pred_columns)

In [15]:
# Confirm the prediction frame has the expected three columns and that
# click_probability is still empty (NaN) at this point.
pred_df.info()
pred_df.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2152299 entries, 21 to 20854323
Data columns (total 3 columns):
user_id              int64
photo_id             int64
click_probability    float64
dtypes: float64(1), int64(2)
memory usage: 65.7 MB


Unnamed: 0,user_id,photo_id,click_probability
21,1637,7300221,
30,1637,7090577,
47,1637,7427964,
52,1637,7115682,
56,1637,7089041,


In [16]:
# Fill the click_probability column with each user's mean click rate from the
# training split.

import numpy as np

# Users that never appear in the training split have no entry in y_u. Fall back
# to the overall training click rate for them instead of failing on the lookup.
global_click_rate = train_interaction_df['click'].mean()

# Vectorised lookup: Series.map aligns each user_id with its value in y_u and
# yields NaN for ids missing from y_u. This replaces the row-by-row
# DataFrame.apply(axis=1), which runs a Python lambda once per row.
user_click_rate = (
    pred_df['user_id']
    .map(y_u)
    .fillna(global_click_rate)
    .round(6)
)

# Only rows that are still NaN are filled, so re-running the cell keeps any
# value that is already present.
pred_df['click_probability'] = pred_df['click_probability'].fillna(user_click_rate)

In [21]:
# Preview the predictions after click_probability has been filled in.
pred_df.head()

Unnamed: 0,user_id,photo_id,click_probability
21,1637,7300221,0.215165
30,1637,7090577,0.215165
47,1637,7427964,0.215165
52,1637,7115682,0.215165
56,1637,7089041,0.215165


## 导出

In [17]:
# Export the predictions as tab-separated text with no header row and no
# index column.
pred_df.to_csv(
    '../out/pred_user_mean.txt',
    sep='\t',
    header=False,
    index=False,
)

In [18]:
# Export the validation ground truth (user_id, photo_id, click) in the same
# tab-separated, header-less, index-less layout as the predictions file.
valid_labels = valid_interaction_df[['user_id', 'photo_id', 'click']]
valid_labels.to_csv(
    '../out/valid.txt',
    sep='\t',
    header=False,
    index=False,
)

## 评分

In [20]:
# Run the evaluation script on the exported predictions file and the exported
# validation labels file (in that argument order).
# NOTE(review): the metric behind the reported 'score' is defined inside
# evaluation_script.py and is not visible here — confirm before comparing runs.
%run evaluation_script.py ../out/pred_user_mean.txt ../out/valid.txt

{'code': 0, 'message': 'validation success'}
{'code': 0, 'score': 0.7069625893799051, 'message': 'success'}
