In [1]:
# from IPython.core.interactiveshell import InteractiveShell
# InteractiveShell.ast_node_interactivity = "all"

%matplotlib inline

## 面部特征数据

In [2]:
import pandas as pd

train_face_path = '../data/train_face.txt'
train_face_columns = ['photo_id', 'data']
train_face_df = pd.read_table(train_face_path, names=train_face_columns)

In [3]:
train_face_df.head()

Unnamed: 0,photo_id,data
0,9,"[[0.0377, 1, 3, 50]]"
1,3,"[[0.219, 0, 22, 66]]"
2,8,"[[0.0849, 0, 13, 54], [0.1221, 0, 22, 88]]"
3,2,"[[0.2018, 1, 18, 60]]"
4,19,"[[0.0123, 0, 24, 52]]"


In [4]:
# 取面部特征中面积最大的

import json

def get_max_area_face(row):
    ls = json.loads(row)
    r = [0, 0, 0, 0]
    for l in ls:
        if r[0] < l[0]:
            r = l
    return r


get_max_area_face('[[0.0849, 0, 13, 54], [0.1221, 0, 22, 88]]')
    

[0.1221, 0, 22, 88]

In [5]:
train_face_df['data'] = train_face_df['data'].apply(get_max_area_face)

In [6]:
train_face_df.head()

Unnamed: 0,photo_id,data
0,9,"[0.0377, 1, 3, 50]"
1,3,"[0.219, 0, 22, 66]"
2,8,"[0.1221, 0, 22, 88]"
3,2,"[0.2018, 1, 18, 60]"
4,19,"[0.0123, 0, 24, 52]"


In [7]:
# data列拆分：
train_face_details_df = pd.DataFrame(
    data=list(train_face_df['data'].values), 
    columns=['area', 'gender', 'age', 'score']
)

In [8]:
train_face_df = pd.concat([train_face_df, train_face_details_df], axis=1).drop(['data'], axis=1)

In [9]:
train_face_df.head()

Unnamed: 0,photo_id,area,gender,age,score
0,9,0.0377,1,3,50
1,3,0.219,0,22,66
2,8,0.1221,0,22,88
3,2,0.2018,1,18,60
4,19,0.0123,0,24,52


## 交互数据

In [10]:
import pandas as pd

train_interaction_path = '../data/train_interaction.txt'
train_interaction_columns = ['user_id', 'photo_id', 'click', 'like', 'follow', 'time', 'playing_time', 'duration_time']
train_interaction_df = pd.read_table(train_interaction_path, names=train_interaction_columns)

In [11]:
train_interaction_df.head()

Unnamed: 0,user_id,photo_id,click,like,follow,time,playing_time,duration_time
0,1637,6484142,0,0,0,761036604603,0,11
1,1637,4647664,0,0,0,761035501863,0,11
2,1637,4994626,1,0,0,761036933553,11,10
3,1637,4416881,0,0,0,761119247892,0,9
4,1637,2597756,0,0,0,761119421332,0,11


## 拼接数据

In [12]:
# 拼接交互数据与面部特征数据
train_df = pd.merge(train_interaction_df, train_face_df, on='photo_id', how='left')

In [13]:
train_df.head()

Unnamed: 0,user_id,photo_id,click,like,follow,time,playing_time,duration_time,area,gender,age,score
0,1637,6484142,0,0,0,761036604603,0,11,,,,
1,1637,4647664,0,0,0,761035501863,0,11,,,,
2,1637,4994626,1,0,0,761036933553,11,10,0.3783,1.0,27.0,48.0
3,1637,4416881,0,0,0,761119247892,0,9,,,,
4,1637,2597756,0,0,0,761119421332,0,11,,,,


In [14]:
train_df = train_df[['user_id', 'photo_id', 'time', 'duration_time', 'click', 'gender', 'age', 'score']]

In [15]:
# 空值填充为0
train_df = train_df.fillna(0)

In [16]:
train_df.head()

Unnamed: 0,user_id,photo_id,time,duration_time,click,gender,age,score
0,1637,6484142,761036604603,11,0,0.0,0.0,0.0
1,1637,4647664,761035501863,11,0,0.0,0.0,0.0
2,1637,4994626,761036933553,10,1,1.0,27.0,48.0
3,1637,4416881,761119247892,9,0,0.0,0.0,0.0
4,1637,2597756,761119421332,11,0,0.0,0.0,0.0


## 数据转化

In [17]:
# 时间戳标准化为0~24小时
def to_hour(time):
    return int(time / 3600000) % 24

In [18]:
train_df['time'] = train_df['time'].apply(to_hour)
train_df.head()

Unnamed: 0,user_id,photo_id,time,duration_time,click,gender,age,score
0,1637,6484142,7,11,0,0.0,0.0,0.0
1,1637,4647664,6,11,0,0.0,0.0,0.0
2,1637,4994626,7,10,1,1.0,27.0,48.0
3,1637,4416881,6,9,0,0.0,0.0,0.0
4,1637,2597756,6,11,0,0.0,0.0,0.0


In [19]:
from sklearn import preprocessing

age_tsf = preprocessing.MinMaxScaler()
score_tsf = preprocessing.MinMaxScaler()
time_tsf = preprocessing.MinMaxScaler()
dur_tsf = preprocessing.QuantileTransformer()

train_df['age'] = age_tsf.fit_transform(train_df['age'].values.reshape(-1, 1)).flatten()
train_df['score'] = score_tsf.fit_transform(train_df['score'].values.reshape(-1, 1)).flatten()
train_df['time'] = time_tsf.fit_transform(train_df['time'].values.reshape(-1, 1)).flatten()
train_df['duration_time'] = dur_tsf.fit_transform(train_df['duration_time'].values.reshape(-1, 1)).flatten()




In [20]:
train_df.head()

Unnamed: 0,user_id,photo_id,time,duration_time,click,gender,age,score
0,1637,6484142,0.304348,0.399399,0,0.0,0.0,0.0
1,1637,4647664,0.26087,0.399399,0,0.0,0.0,0.0
2,1637,4994626,0.304348,0.193193,1,1.0,0.692308,0.484848
3,1637,4416881,0.26087,0.133133,0,0.0,0.0,0.0
4,1637,2597756,0.26087,0.399399,0,0.0,0.0,0.0


## 清除数据

In [21]:
import gc

del train_face_df
del train_face_details_df
del train_face_columns
del train_face_path

del train_interaction_df
del train_interaction_columns
del train_interaction_path

gc.collect()

67

## 逻辑回归

In [22]:
# 每个用户训练一个逻辑回归模型，并导出

from sklearn import linear_model
from sklearn.externals import joblib

for user_id in train_df['user_id'].unique():
    train_X = train_df[train_df['user_id'] == user_id][['time', 'duration_time', 'gender', 'age', 'score']].values
    train_y = train_df[train_df['user_id'] == user_id]['click'].values
    clf = linear_model.LogisticRegression()
    clf.fit(train_X, train_y)
    # 持久化为磁盘文件
    joblib.dump(clf, '../submit/pred_user_interest_face_model/' + str(user_id))
    

In [23]:
del train_df
gc.collect()

330

## 预测

In [49]:
test_interaction_df = pd.read_table(
    '../data/test_interaction.txt', 
    names=['user_id', 'photo_id', 'time', 'duration_time']
)

In [50]:
test_interaction_df.head()

Unnamed: 0,user_id,photo_id,time,duration_time
0,29999,8154819,761158905921,17
1,29999,8374672,761163438550,9
2,29999,7987126,761143659968,11
3,29999,7912672,761159000400,17
4,29999,9062638,761163738888,11


In [51]:
test_face_df = pd.read_table(
    '../data/test_face.txt', 
    names=['photo_id', 'data']
)

In [52]:
test_face_df.head()

Unnamed: 0,photo_id,data
0,7560371,"[[0.0044, 0, 26, 49], [0.0047, 0, 27, 46], [0...."
1,7560367,"[[0.0049, 0, 27, 46], [0.0047, 1, 27, 47], [0...."
2,7560375,"[[0.0795, 1, 1, 43]]"
3,7560370,"[[0.0877, 1, 6, 52]]"
4,7560378,"[[0.1951, 0, 22, 87]]"


In [53]:
test_face_df['data'] = test_face_df['data'].apply(get_max_area_face)
# data列拆分：
test_face_details_df = pd.DataFrame(
    data=list(test_face_df['data'].values), 
    columns=['area', 'gender', 'age', 'score']
)
test_face_df = pd.concat([test_face_df, test_face_details_df], axis=1).drop(['data'], axis=1)

In [54]:
test_face_df.head()

Unnamed: 0,photo_id,area,gender,age,score
0,7560371,0.0047,0,27,46
1,7560367,0.0059,1,29,38
2,7560375,0.0795,1,1,43
3,7560370,0.0877,1,6,52
4,7560378,0.1951,0,22,87


In [55]:
pred_df = pd.merge(test_interaction_df, test_face_df, on='photo_id', how='left')

In [56]:
pred_df = pred_df.fillna(0)

In [57]:
pred_df.head()

Unnamed: 0,user_id,photo_id,time,duration_time,area,gender,age,score
0,29999,8154819,761158905921,17,0.0113,1.0,31.0,45.0
1,29999,8374672,761163438550,9,0.0207,1.0,28.0,40.0
2,29999,7987126,761143659968,11,0.1103,0.0,27.0,53.0
3,29999,7912672,761159000400,17,0.0,0.0,0.0,0.0
4,29999,9062638,761163738888,11,0.0,0.0,0.0,0.0


In [58]:
# 数据转换
pred_df['time'] = pred_df['time'].apply(to_hour)
pred_df['age'] = age_tsf.fit_transform(pred_df['age'].values.reshape(-1, 1)).flatten()
pred_df['score'] = score_tsf.fit_transform(pred_df['score'].values.reshape(-1, 1)).flatten()
pred_df['time'] = time_tsf.fit_transform(pred_df['time'].values.reshape(-1, 1)).flatten()
pred_df['duration_time'] = dur_tsf.fit_transform(pred_df['duration_time'].values.reshape(-1, 1)).flatten()



In [59]:
pred_df = pred_df.reindex(columns=['user_id', 'photo_id', 'time', 'duration_time', 'area', 'gender', 'age', 'score', 'click_probability'])


In [60]:
pred_df.head()

Unnamed: 0,user_id,photo_id,time,duration_time,area,gender,age,score,click_probability
0,29999,8154819,0.833333,0.734234,0.0113,1.0,0.794872,0.454545,
1,29999,8374672,0.916667,0.14014,0.0207,1.0,0.717949,0.40404,
2,29999,7987126,0.416667,0.42042,0.1103,0.0,0.692308,0.535354,
3,29999,7912672,0.833333,0.734234,0.0,0.0,0.0,0.0,
4,29999,9062638,0.916667,0.42042,0.0,0.0,0.0,0.0,


In [61]:
del test_interaction_df
del test_face_df
gc.collect()

123

In [62]:
# 装载预测模型

pred_models = {}

for user_id in pred_df['user_id'].unique():
    pred_models[user_id] = joblib.load('../submit/pred_user_interest_face_model/' + str(user_id))
    

In [63]:
# 预测函数
def predict(row):
#     print(np.array([row['time'], row['duration_time'], row['area'], row['gender'], row['age'], row['score']]))
    user_id = int(row['user_id'])
    clf = pred_models[user_id]
    proba = clf.predict_proba(np.array([row['time'], row['duration_time'], row['gender'], row['age'], row['score']]).reshape(1, -1))[0, 1]
    return round(proba, 6)


In [64]:
# 预测填充click_probability列

import numpy as np

pred_df['click_probability'] = pred_df.apply(predict, axis = 1)

In [65]:
pred_df = pred_df[['user_id', 'photo_id', 'click_probability']]

In [66]:
pred_df.head()

Unnamed: 0,user_id,photo_id,click_probability
0,29999,8154819,0.472987
1,29999,8374672,0.535491
2,29999,7987126,0.31651
3,29999,7912672,0.359282
4,29999,9062638,0.396593


## 导出

In [67]:
# 导出预测结果
pred_df.to_csv('../submit/pred_user_interest_face.txt', index=False, header=False, sep='\t')

## 评分
