## 加载交互数据

In [1]:
# Display the value of every top-level expression in a cell, not only the
# last one, so cells such as `df.info(); df.head()` show both results.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [2]:
import pandas as pd

# Training interactions: a tab-separated file without a header row, so the
# column names are supplied explicitly.
train_columns = [
    'user_id', 'photo_id',
    'click', 'like', 'follow',
    'time', 'playing_time', 'duration_time',
]
train_interaction_path = '../data/train_interaction.txt'

train_interaction_df = pd.read_csv(
    train_interaction_path,
    sep='\t',
    header=None,
    names=train_columns,
)

In [3]:
# Print the schema / memory summary, then show the first rows.
train_interaction_df.info()
train_interaction_df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20854344 entries, 0 to 20854343
Data columns (total 8 columns):
user_id          int64
photo_id         int64
click            int64
like             int64
follow           int64
time             int64
playing_time     int64
duration_time    int64
dtypes: int64(8)
memory usage: 1.2 GB


Unnamed: 0,user_id,photo_id,click,like,follow,time,playing_time,duration_time
0,1637,6484142,0,0,0,761036604603,0,11
1,1637,4647664,0,0,0,761035501863,0,11
2,1637,4994626,1,0,0,761036933553,11,10
3,1637,4416881,0,0,0,761119247892,0,9
4,1637,2597756,0,0,0,761119421332,0,11


## 用户平均值法

用户的点击率作为预测值，即

$$
y_{ui} =\frac{\sum_{i \in Train(u), click=1}^{}{1}}{\sum_{i \in Train(u)}^{}{1}}
$$


In [4]:
# Per-user click rate: the mean of the `click` column within each user_id
# group. The result is a Series indexed by user_id.
y_u = (
    train_interaction_df
    .groupby('user_id')['click']
    .agg('mean')
)

In [5]:
# Preview the first few per-user click rates.
y_u.head()

user_id
2    0.174791
3    0.130316
4    0.081518
7    0.167005
9    0.252302
Name: click, dtype: float64

## 预测

用训练集中每个用户的点击率，作为该用户在测试集中每条交互的点击概率预测值。

In [6]:
# Test interactions: a tab-separated file without a header row, so the
# column names are supplied explicitly.
test_columns = ['user_id', 'photo_id', 'time', 'duration_time']
test_interaction_path = '../data/test_interaction.txt'

test_interaction_df = pd.read_csv(
    test_interaction_path,
    sep='\t',
    header=None,
    names=test_columns,
)

In [7]:
# Print the schema / memory summary, then show the first rows.
test_interaction_df.info()
test_interaction_df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3291332 entries, 0 to 3291331
Data columns (total 4 columns):
user_id          int64
photo_id         int64
time             int64
duration_time    int64
dtypes: int64(4)
memory usage: 100.4 MB


Unnamed: 0,user_id,photo_id,time,duration_time
0,29999,8154819,761158905921,17
1,29999,8374672,761163438550,9
2,29999,7987126,761143659968,11
3,29999,7912672,761159000400,17
4,29999,9062638,761163738888,11


In [8]:
# Prediction skeleton: the two id columns from the test set plus an all-NaN
# float placeholder column that is filled in later.
pred_df = (
    test_interaction_df
    .loc[:, ['user_id', 'photo_id']]
    .assign(click_probability=float('nan'))
)

In [9]:
# Print the schema / memory summary, then show the first rows
# (click_probability is still empty at this point).
pred_df.info()
pred_df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3291332 entries, 0 to 3291331
Data columns (total 3 columns):
user_id              int64
photo_id             int64
click_probability    float64
dtypes: float64(1), int64(2)
memory usage: 75.3 MB


Unnamed: 0,user_id,photo_id,click_probability
0,29999,8154819,
1,29999,8374672,
2,29999,7987126,
3,29999,7912672,
4,29999,9062638,


In [10]:
# Fill the click_probability column with each user's training click rate.

import numpy as np  # no longer needed by this cell; kept so `np` stays defined in the kernel

# Vectorised lookup: map every user_id to its click rate in `y_u`, rounded to
# 6 decimals. This replaces a row-wise `DataFrame.apply(axis=1)`, which ran a
# Python lambda once per row and upcast each row to float64 (so user_id was
# looked up as a float label).
# A user_id absent from `y_u` maps to NaN instead of raising KeyError, so a
# later missing-value check can catch it.
user_click_rate = pred_df['user_id'].map(y_u).round(6)

# Only fill entries that are still NaN; existing values are preserved, which
# keeps the cell idempotent when re-run.
pred_df['click_probability'] = pred_df['click_probability'].fillna(user_click_rate)

In [13]:
# Check for rows whose click_probability is still missing; the first element
# of the shape is the number of such rows.

pred_df[pred_df['click_probability'].isna()].shape

(0, 3)

## 导出


In [14]:
# Write the predictions as a tab-separated file with no header row and no
# index column.
pred_df.to_csv('../submit/pred_user_mean.txt', index=False, header=False, sep='\t')

## 评分

0.678191
