## 加载交互数据

In [1]:
import pandas as pd

# Column names assigned to the interaction log, which is read without a header row.
interaction_columns = [
    'user_id', 'photo_id', 'click', 'like', 'follow',
    'time', 'playing_time', 'duration_time',
]

# NOTE(review): absolute, machine-specific path -- consider a configurable data directory.
train_interaction_df = pd.read_table(
    '/Users/sun/PycharmProjects/kuaishou/data/train_interaction.txt',
    names=interaction_columns,
)
# Print dtypes, row count and memory footprint of the loaded frame.
train_interaction_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20854344 entries, 0 to 20854343
Data columns (total 8 columns):
user_id          int64
photo_id         int64
click            int64
like             int64
follow           int64
time             int64
playing_time     int64
duration_time    int64
dtypes: int64(8)
memory usage: 1.2 GB


In [2]:
# Per-column summary statistics (count, mean, std, min, quartiles, max) of the full interaction log.
train_interaction_df.describe()

Unnamed: 0,user_id,photo_id,click,like,follow,time,playing_time,duration_time
count,20854340.0,20854340.0,20854340.0,20854340.0,20854340.0,20854340.0,20854340.0,20854340.0
mean,18831.17,3748311.0,0.2024292,0.003035866,0.0009912563,761064700000.0,2.947877,19.89893
std,10931.91,2185500.0,0.4018105,0.055015,0.03146862,36790460.0,418.8978,27.35497
min,2.0,1.0,0.0,0.0,0.0,760994900000.0,0.0,0.0
25%,9404.0,1830334.0,0.0,0.0,0.0,761032200000.0,0.0,11.0
50%,18921.0,3806133.0,0.0,0.0,0.0,761066900000.0,0.0,11.0
75%,28240.0,5613448.0,0.0,0.0,0.0,761099300000.0,0.0,17.0
max,37820.0,7560365.0,1.0,1.0,1.0,761124900000.0,1912441.0,3831.0


In [3]:
# Preview the first five rows to check that the columns line up with the assigned names.
train_interaction_df.head()

Unnamed: 0,user_id,photo_id,click,like,follow,time,playing_time,duration_time
0,1637,6484142,0,0,0,761036604603,0,11
1,1637,4647664,0,0,0,761035501863,0,11
2,1637,4994626,1,0,0,761036933553,11,10
3,1637,4416881,0,0,0,761119247892,0,9
4,1637,2597756,0,0,0,761119421332,0,11


## 验证集
取 photo_id > 6750000 的交互记录作为验证集（2152299 条，约占全部 20854344 条的 10%）。

In [44]:
# Hold out every interaction whose photo_id is above the cut-off as the validation set.
# NOTE(review): this must run before train_interaction_df is rebound to the training
# split further down; re-running it afterwards selects no rows.
is_valid_row = train_interaction_df['photo_id'] > 6750000
valid_interaction_df = train_interaction_df[is_valid_row]
valid_interaction_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2152299 entries, 21 to 20854323
Data columns (total 8 columns):
user_id          int64
photo_id         int64
click            int64
like             int64
follow           int64
time             int64
playing_time     int64
duration_time    int64
dtypes: int64(8)
memory usage: 147.8 MB


## 训练集
其余 photo_id <= 6750000 的交互记录作为训练集（18702045 条，约占 90%）。

In [45]:
# Keep only the interactions at or below the photo_id cut-off as the training set.
# NOTE(review): this rebinds train_interaction_df, so the full data set is no longer
# reachable under that name once this cell has run.
is_train_row = train_interaction_df['photo_id'] <= 6750000
train_interaction_df = train_interaction_df[is_train_row]
train_interaction_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 18702045 entries, 0 to 20854343
Data columns (total 8 columns):
user_id          int64
photo_id         int64
click            int64
like             int64
follow           int64
time             int64
playing_time     int64
duration_time    int64
dtypes: int64(8)
memory usage: 1.3 GB


## 全局平均值法
用训练集的全局点击率作为所有 (用户, 作品) 对的预测值，即
$$
\hat{y}_{ui} = \frac{\sum_{(u,i) \in Train,\ click_{ui}=1} 1}{\sum_{(u,i) \in Train} 1}
$$


In [54]:
# Global mean of `click` over the training split, i.e. the overall click-through rate.
# Select the column before averaging so only `click` is reduced, instead of computing
# the mean of all eight columns and discarding seven of them.
y = train_interaction_df['click'].mean()
# Round to six decimal places; this value is used as the constant prediction.
y = round(y, 6)
y

0.20269

## 预测集
用全局平均值作为预测值，预测验证集数据

In [59]:
# One row per validation interaction: keep the (user_id, photo_id) pair and attach
# the same constant score y to every row as click_probability.
pred_df = (
    valid_interaction_df
    .loc[:, ['user_id', 'photo_id']]
    .assign(click_probability=y)
)

## 导出结果

In [63]:
# Write the predictions as tab-separated text with neither an index column nor a header row.
# NOTE(review): absolute, machine-specific output path -- consider a configurable output directory.
pred_df.to_csv(
    '/Users/sun/PycharmProjects/kuaishou/out/pred_global_mean.txt',
    sep='\t',
    header=False,
    index=False,
)

## 评分
```
python py/evaluation_script.py out/pred_global_mean.txt out/valid.txt 
```

结果（所有样本的预测值相同，无法对样本排序；若评分指标为 AUC，则 0.5 相当于随机水平，可作为后续模型的基线）：
```
{'code': 0, 'message': 'validation success'}
{'code': 0, 'score': 0.5, 'message': 'success'}
```
