In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Load the two raw CSV tables: the item table and the user behaviour log.
item_csv = '../data/tianchi_fresh_comp_train_item.csv'
user_csv = '../data/tianchi_fresh_comp_train_user.csv'
train_item = pd.read_csv(item_csv)
train_user = pd.read_csv(user_csv)

In [3]:
# Preview the first rows of the item table.
train_item.head()

Unnamed: 0,item_id,item_geohash,item_category
0,100002303,,3368
1,100003592,,7995
2,100006838,,12630
3,100008089,,7791
4,100012750,,9614


In [4]:
# Preview the first rows of the user behaviour log.
train_user.head()

Unnamed: 0,user_id,item_id,behavior_type,user_geohash,item_category,time
0,10001082,285259775,1,97lk14c,4076,2014-12-08 18
1,10001082,4368907,1,,5503,2014-12-12 12
2,10001082,4368907,1,,5503,2014-12-12 12
3,10001082,53616768,1,,9762,2014-12-02 15
4,10001082,151466952,1,,5232,2014-12-12 11


In [5]:
# Non-null value count for every column of the behaviour log.
train_user.count()

user_id          23291027
item_id          23291027
behavior_type    23291027
user_geohash      7380017
item_category    23291027
time             23291027
dtype: int64

user_geohash 字段约 68% 的取值为空（23,291,027 行中仅 7,380,017 行非空），故在建模时直接删除该列。

In [6]:
# Remove the user_geohash column; the remaining columns are kept unchanged.
train_user = train_user.drop(labels=['user_geohash'], axis=1)
train_user.head()

Unnamed: 0,user_id,item_id,behavior_type,item_category,time
0,10001082,285259775,1,4076,2014-12-08 18
1,10001082,4368907,1,5503,2014-12-12 12
2,10001082,4368907,1,5503,2014-12-12 12
3,10001082,53616768,1,9762,2014-12-02 15
4,10001082,151466952,1,5232,2014-12-12 11


time字段里包含日期和小时，需要提取出日期

In [7]:
def _date_part(stamp):
    """Return the text before the first space of a 'date hour' stamp."""
    return stamp.split(' ')[0]


# Replace each 'YYYY-MM-DD HH' stamp with just its date part.
train_user['time'] = train_user['time'].map(_date_part)
train_user.head()

Unnamed: 0,user_id,item_id,behavior_type,item_category,time
0,10001082,285259775,1,4076,2014-12-08
1,10001082,4368907,1,5503,2014-12-12
2,10001082,4368907,1,5503,2014-12-12
3,10001082,53616768,1,9762,2014-12-02
4,10001082,151466952,1,5232,2014-12-12


In [8]:
# 划分训练集和测试集
test_user_df = train_user[train_user['time'] == '2014-12-18']
train_user_df = train_user[train_user['time'] < '2014-12-18']
print test_user_df.head(5)
print train_user_df.head(5)

       user_id    item_id  behavior_type  item_category        time
217  100029775  247380548              1          10223  2014-12-18
232  100029775  247380548              2          10223  2014-12-18
250  100029775  205264014              1           1863  2014-12-18
274  100029775  205264014              1           1863  2014-12-18
317  100029775   87557153              1          10894  2014-12-18
    user_id    item_id  behavior_type  item_category        time
0  10001082  285259775              1           4076  2014-12-08
1  10001082    4368907              1           5503  2014-12-12
2  10001082    4368907              1           5503  2014-12-12
3  10001082   53616768              1           9762  2014-12-02
4  10001082  151466952              1           5232  2014-12-12


In [9]:
# One-hot encode behavior_type. get_dummies joins prefix and value with '_',
# so the trailing underscore in the prefix yields names like 'behavior_type__1'.
behavior_codes = train_user_df['behavior_type']
user_behavior_type = pd.get_dummies(behavior_codes, prefix='behavior_type_')
user_behavior_type.head()

Unnamed: 0,behavior_type__1,behavior_type__2,behavior_type__3,behavior_type__4
0,1,0,0,0
1,1,0,0,0
2,1,0,0,0
3,1,0,0,0
4,1,0,0,0


In [10]:
# Keep the identifying columns and place the one-hot indicators next to them
# (column-wise concat, aligned on the row index).
id_columns = train_user_df[['user_id', 'item_id', 'time']]
train_user_df = pd.concat([id_columns, user_behavior_type], axis=1)
train_user_df.head()

Unnamed: 0,user_id,item_id,time,behavior_type__1,behavior_type__2,behavior_type__3,behavior_type__4
0,10001082,285259775,2014-12-08,1,0,0,0
1,10001082,4368907,2014-12-12,1,0,0,0
2,10001082,4368907,2014-12-12,1,0,0,0
3,10001082,53616768,2014-12-02,1,0,0,0
4,10001082,151466952,2014-12-12,1,0,0,0


In [11]:
# Collapse to one row per (time, user_id, item_id); summing the indicators
# gives the number of events of each behavior type for that combination.
group_keys = ['time', 'user_id', 'item_id']
train_user_df = train_user_df.groupby(group_keys).sum().reset_index()
train_user_df.head()

Unnamed: 0,time,user_id,item_id,behavior_type__1,behavior_type__2,behavior_type__3,behavior_type__4
0,2014-11-18,492,59758671,2,0,0,0
1,2014-11-18,492,76093985,1,0,0,0
2,2014-11-18,492,110036513,2,0,0,0
3,2014-11-18,492,176404510,1,0,0,0
4,2014-11-18,492,178412255,2,0,0,0


In [12]:
# Move `time` into the index so a single day's rows can be picked with .loc.
train_user_df = train_user_df.set_index(keys='time')
train_user_df.head()

Unnamed: 0_level_0,user_id,item_id,behavior_type__1,behavior_type__2,behavior_type__3,behavior_type__4
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2014-11-18,492,59758671,2,0,0,0
2014-11-18,492,76093985,1,0,0,0
2014-11-18,492,110036513,2,0,0,0
2014-11-18,492,176404510,1,0,0,0
2014-11-18,492,178412255,2,0,0,0


In [13]:
# Feature rows: every column of the aggregated frame for 2014-12-16.
feature_day = '2014-12-16'
train_x = train_user_df.loc[feature_day, :]
train_x.head()

Unnamed: 0_level_0,user_id,item_id,behavior_type__1,behavior_type__2,behavior_type__3,behavior_type__4
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2014-12-16,3726,54088634,2,0,0,0
2014-12-16,3726,163729118,1,0,0,0
2014-12-16,3726,345226281,2,1,0,0
2014-12-16,3726,395752908,2,0,0,0
2014-12-16,36465,5527236,2,1,0,0


In [14]:
# Label source: the purchase counts (behavior_type__4) recorded on 2014-12-17,
# together with the user/item keys needed to join them to the features.
label_day = '2014-12-17'
label_columns = ['user_id', 'item_id', 'behavior_type__4']
train_y = train_user_df.loc[label_day, label_columns]
train_y.head()

Unnamed: 0_level_0,user_id,item_id,behavior_type__4
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2014-12-17,492,333812946,0
2014-12-17,3726,37476258,0
2014-12-17,3726,64607517,0
2014-12-17,3726,73091924,0
2014-12-17,3726,112317995,0


In [15]:
# Left-join the label-day purchase counts onto the feature rows. Pairs without
# a match get NaN, which fillna turns into 0. Both frames carry a
# behavior_type__4 column, so pandas suffixes them _x (left) and _y (right).
pair_keys = ['user_id', 'item_id']
train_data = train_x.merge(train_y, on=pair_keys, how='left').fillna(0)
train_data.head()

Unnamed: 0,user_id,item_id,behavior_type__1,behavior_type__2,behavior_type__3,behavior_type__4_x,behavior_type__4_y
0,3726,54088634,2,0,0,0,0.0
1,3726,163729118,1,0,0,0,0.0
2,3726,345226281,2,1,0,0,0.0
3,3726,395752908,2,0,0,0,0.0
4,36465,5527236,2,1,0,0,0.0


In [16]:
# Binary label: 1 when the joined purchase count is positive, otherwise 0.
# A vectorized comparison replaces the per-row Python lambda; it produces the
# same 0/1 integer column without calling a Python function for every row.
train_data['label'] = (train_data['behavior_type__4_y'] > 0).astype('int64')
train_data.head()

Unnamed: 0,user_id,item_id,behavior_type__1,behavior_type__2,behavior_type__3,behavior_type__4_x,behavior_type__4_y,label
0,3726,54088634,2,0,0,0,0.0,0
1,3726,163729118,1,0,0,0,0.0,0
2,3726,345226281,2,1,0,0,0.0,0
3,3726,395752908,2,0,0,0,0.0,0
4,36465,5527236,2,1,0,0,0.0,0


In [17]:
# One-hot encode behavior_type for the test split. The prefix differs from the
# one used for the training split, so the resulting column names differ too.
test_behavior_codes = test_user_df['behavior_type']
test_user_behavior_type = pd.get_dummies(test_behavior_codes, prefix='test_user_behavior_type')
test_user_behavior_type.head()

Unnamed: 0,test_user_behavior_type_1,test_user_behavior_type_2,test_user_behavior_type_3,test_user_behavior_type_4
217,1,0,0,0
232,0,1,0,0
250,1,0,0,0
274,1,0,0,0
317,1,0,0,0


In [18]:
# Keep the identifying columns and place the test one-hot indicators next to
# them (column-wise concat, aligned on the row index).
test_id_columns = test_user_df[['user_id', 'item_id', 'time']]
test_user_df = pd.concat([test_id_columns, test_user_behavior_type], axis=1)
test_user_df.head()

Unnamed: 0,user_id,item_id,time,test_user_behavior_type_1,test_user_behavior_type_2,test_user_behavior_type_3,test_user_behavior_type_4
217,100029775,247380548,2014-12-18,1,0,0,0
232,100029775,247380548,2014-12-18,0,1,0,0
250,100029775,205264014,2014-12-18,1,0,0,0
274,100029775,205264014,2014-12-18,1,0,0,0
317,100029775,87557153,2014-12-18,1,0,0,0


In [19]:
# Collapse the test split to one row per (time, user_id, item_id), summing the
# indicators into per-behavior event counts.
test_group_keys = ['time', 'user_id', 'item_id']
test_user_df = test_user_df.groupby(test_group_keys).sum().reset_index()
test_user_df.head()

Unnamed: 0,time,user_id,item_id,test_user_behavior_type_1,test_user_behavior_type_2,test_user_behavior_type_3,test_user_behavior_type_4
0,2014-12-18,492,68197904,3,0,0,0
1,2014-12-18,492,169720786,2,0,0,0
2,2014-12-18,492,262056903,3,0,0,0
3,2014-12-18,492,319959973,2,0,0,0
4,2014-12-18,3726,2742779,2,0,0,0


In [20]:
# NOTE: this cell used to call test_user_df.set_index('time') without keeping
# the result. set_index returns a new frame rather than modifying its input,
# so that call had no effect and has been removed. test_user_df keeps its
# integer index, with `time` as an ordinary first column.
test_user_df.head()

Unnamed: 0,time,user_id,item_id,test_user_behavior_type_1,test_user_behavior_type_2,test_user_behavior_type_3,test_user_behavior_type_4
0,2014-12-18,492,68197904,3,0,0,0
1,2014-12-18,492,169720786,2,0,0,0
2,2014-12-18,492,262056903,3,0,0,0
3,2014-12-18,492,319959973,2,0,0,0
4,2014-12-18,3726,2742779,2,0,0,0


In [21]:
# Held-out feature rows: every column of the aggregated frame for 2014-12-17.
holdout_feature_day = '2014-12-17'
test_x = train_user_df.loc[holdout_feature_day, :]
test_x.head()

Unnamed: 0_level_0,user_id,item_id,behavior_type__1,behavior_type__2,behavior_type__3,behavior_type__4
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2014-12-17,492,333812946,4,0,0,0
2014-12-17,3726,37476258,1,0,0,0
2014-12-17,3726,64607517,2,0,0,0
2014-12-17,3726,73091924,2,0,0,0
2014-12-17,3726,112317995,1,0,0,0


In [22]:
# Held-out label source: the type-4 behavior counts from the test split plus
# the user/item keys needed to join them to the features.
holdout_label_columns = ['user_id', 'item_id', 'test_user_behavior_type_4']
test_y = test_user_df[holdout_label_columns]
test_y.head()

Unnamed: 0,user_id,item_id,test_user_behavior_type_4
0,492,68197904,0
1,492,169720786,0
2,492,262056903,0
3,492,319959973,0
4,3726,2742779,0


In [23]:
# Left-join the held-out label counts onto the held-out feature rows; pairs
# without a match get NaN, which fillna turns into 0.
holdout_pair_keys = ['user_id', 'item_id']
test_data = test_x.merge(test_y, on=holdout_pair_keys, how='left').fillna(0)
test_data.head()

Unnamed: 0,user_id,item_id,behavior_type__1,behavior_type__2,behavior_type__3,behavior_type__4,test_user_behavior_type_4
0,492,333812946,4,0,0,0,0.0
1,3726,37476258,1,0,0,0,0.0
2,3726,64607517,2,0,0,0,0.0
3,3726,73091924,2,0,0,0,0.0
4,3726,112317995,1,0,0,0,0.0


In [24]:
# Binary label: 1 when the joined type-4 count is positive, otherwise 0.
# A vectorized comparison replaces the per-row Python lambda; it produces the
# same 0/1 integer column without calling a Python function for every row.
test_data['label'] = (test_data['test_user_behavior_type_4'] > 0).astype('int64')
test_data.head()

Unnamed: 0,user_id,item_id,behavior_type__1,behavior_type__2,behavior_type__3,behavior_type__4,test_user_behavior_type_4,label
0,492,333812946,4,0,0,0,0.0,0
1,3726,37476258,1,0,0,0,0.0,0
2,3726,64607517,2,0,0,0,0.0,0
3,3726,73091924,2,0,0,0,0.0,0
4,3726,112317995,1,0,0,0,0.0,0


In [25]:
# Model inputs: the four behavior-count columns of the feature day
# (positions 2-5 of train_data); target: the binary label (last column).
train_feature_columns = [
    'behavior_type__1',
    'behavior_type__2',
    'behavior_type__3',
    'behavior_type__4_x',
]
x_train = train_data[train_feature_columns]
y_train = train_data['label']

In [26]:
# Held-out inputs: the four behavior-count columns (positions 2-5 of
# test_data); held-out target: the binary label (last column).
holdout_feature_columns = [
    'behavior_type__1',
    'behavior_type__2',
    'behavior_type__3',
    'behavior_type__4',
]
x_test = test_data[holdout_feature_columns]
y_test = test_data['label']

## Logistic Regression

In [27]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

In [28]:
# 建模
logistic = LogisticRegression()

# 精确度
precision = np.mean(cross_val_score(logistic, x_train, y_train, cv=10, scoring='precision'))
print '精准度：', precision

# 召回率
recall = np.mean(cross_val_score(logistic, x_train, y_train, cv=10, scoring='recall'))
print '召回率：', recall

# F1
F1 = np.mean(cross_val_score(logistic, x_train, y_train, cv=10, scoring='f1'))
print 'F1值：', F1

精准度： 0.166666666667
召回率： 0.00654205607477
F1值： 0.0125807223152


In [69]:
# Fit on the full training features and labels.
logistic.fit(x_train, y_train)

# Score the aggregated test-split rows. The four count columns are selected by
# name, in the same 1-4 order as the training features. The previous positional
# slice `iloc[:, 3:]` also picked up the `label` column that a later cell adds
# to test_user_df, so re-running this cell afterwards passed five columns to a
# model fitted on four.
score_columns = [
    'test_user_behavior_type_1',
    'test_user_behavior_type_2',
    'test_user_behavior_type_3',
    'test_user_behavior_type_4',
]
y_predict = logistic.predict(test_user_df[score_columns])

In [71]:
# Store the predictions on the test frame and keep only the rows predicted as 1.
test_user_df['label'] = y_predict
predicted_positive = test_user_df['label'] == 1
submission = test_user_df[predicted_positive]
submission.head()

Unnamed: 0,time,user_id,item_id,test_user_behavior_type_1,test_user_behavior_type_2,test_user_behavior_type_3,test_user_behavior_type_4,label
14755,2014-12-18,5538350,371523534,10,1,3,1,1
21103,2014-12-18,11244543,385729946,11,0,3,1,1
22888,2014-12-18,11759070,88640862,14,0,4,0,1
30277,2014-12-18,14073732,239991623,8,0,3,0,1
39500,2014-12-18,17219776,128702084,10,0,2,0,1


In [72]:
# The submission keeps only the (user_id, item_id) pair of each predicted row.
submission_columns = ['user_id', 'item_id']
submission = submission[submission_columns]
submission.head()

Unnamed: 0,user_id,item_id
14755,5538350,371523534
21103,11244543,385729946
22888,11759070,88640862
30277,14073732,239991623
39500,17219776,128702084


In [74]:
# Write the predicted (user_id, item_id) pairs to a CSV in the working
# directory; index=False leaves the row index out of the file.
submission.to_csv('tianchi_mobile_recommendation_predict.csv', index=False)