## 读取数据

In [1]:
import pandas as pd
import numpy as np
import os

In [2]:
# 生成读取行为小时数据函数
def read_action_hourly(app_name, date_from, date_to):
    dt_from = pd.to_datetime(date_from)
    dt_to = pd.to_datetime(date_to)
    dfs = []
    
    for day in pd.date_range(date_from, date_to, freq='d'):
        f = 'data/sample-data/section10/action_hourly/{}/{}/action_hourly.tsv'.format(app_name, day.strftime('%Y-%m-%d'))
#         print(f)
        dfs.append(pd.read_csv(f, sep='\t'))
        
    df = pd.concat(dfs)
    return df

In [3]:
action_hourly = read_action_hourly('game-01', '2013-08-01', '2013-08-08')

In [4]:
action_hourly.head()

Unnamed: 0,log_date,log_hour,app_name,user_id,count
0,2013-08-01,16,game-01,7339,1
1,2013-08-01,20,game-01,1973,87
2,2013-08-01,10,game-01,1973,30
3,2013-08-01,11,game-01,1973,48
4,2013-08-01,23,game-01,94,3


In [5]:
# 将数据整理成时间段为列
train_list = []
dates = action_hourly['log_date'].unique()

for i in range(len(dates)-1):
    day = dates[i]
    x = action_hourly[action_hourly['log_date'] == day]
    df = pd.pivot_table(x, index='user_id', columns='log_hour', values='count')
    df = df.apply(lambda x: np.where(x>=7, 1, 0))
    df.columns = ['{}{}_{}'.format('p', i+1, h) for h in df.columns]
    train_list.append(df)

In [6]:
train_list[0].head()

Unnamed: 0_level_0,p1_0,p1_1,p1_2,p1_3,p1_4,p1_5,p1_6,p1_7,p1_8,p1_9,...,p1_14,p1_15,p1_16,p1_17,p1_18,p1_19,p1_20,p1_21,p1_22,p1_23
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
71,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
78,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
94,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
99,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
131,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0


In [7]:
# 生成用于训练的数据
train_data = train_list[0]

for i in range(1,len(train_list)):
    df = train_list[i]
    train_data = pd.merge(train_data, df, on='user_id', how='left')
    train_data = train_data.fillna(0).astype(int)

In [8]:
train_data.head()

Unnamed: 0_level_0,p1_0,p1_1,p1_2,p1_3,p1_4,p1_5,p1_6,p1_7,p1_8,p1_9,...,p7_14,p7_15,p7_16,p7_17,p7_18,p7_19,p7_20,p7_21,p7_22,p7_23
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
71,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
78,0,0,0,0,0,0,0,0,1,0,...,0,0,1,1,1,0,1,0,0,0
94,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
99,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
131,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,1,0,0,0,0


In [9]:
# 生成用于输出的数据
ans0 = action_hourly[action_hourly['log_date'] == dates[-1]]

In [10]:
# 将数据整理成以时间段为列
ans = ans0.pivot_table(index='user_id', columns='log_hour', values='count')
ans = ans.apply(lambda x: np.where(x>=7, 1, 0))
ans.columns = ['{}_{}'.format('a', h) for h in ans.columns]

In [11]:
ans.head()

Unnamed: 0_level_0,a_0,a_1,a_2,a_3,a_4,a_5,a_6,a_7,a_8,a_9,...,a_14,a_15,a_16,a_17,a_18,a_19,a_20,a_21,a_22,a_23
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
12,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
66,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
72,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
78,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,1,1,0,0,0,0
97,0,0,0,0,0,0,0,0,0,1,...,1,0,0,0,0,0,0,0,0,0


In [12]:
# 将数据与训练数据合并
train_data = pd.merge(train_data, ans, on='user_id', how='left')

In [13]:
train_data = train_data.fillna(0).astype(int)

In [14]:
train_data.head()

Unnamed: 0_level_0,p1_0,p1_1,p1_2,p1_3,p1_4,p1_5,p1_6,p1_7,p1_8,p1_9,...,a_14,a_15,a_16,a_17,a_18,a_19,a_20,a_21,a_22,a_23
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
71,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
78,0,0,0,0,0,0,0,0,1,0,...,0,0,1,0,1,1,0,0,0,0
94,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
99,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
131,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,1,1,0,0


## 随机森林算法

In [15]:
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier

In [16]:
rf_fit_list = []
for h in range(0, 24):
    rf = RandomForestClassifier(n_estimators=100, random_state=0)
    df = train_data[['p{}_{}'.format(i, h) for i in range(1, 8)]+['a_{}'.format(h)]]
    df1 = df[df.iloc[:, -1] == 1]
    df0_orig = df[df.iloc[:, -1] == 0]
    df0 = df0_orig.sample(len(df1), random_state=0)
    df = pd.concat([df1, df0])
    X = df.iloc[:, :-1]
    y = df.iloc[:, -1]
    rf.fit(X, y)
# y

In [17]:
from sklearn.metrics import confusion_matrix

In [18]:
confusion_matrix(y, rf.predict(X))

array([[59,  6],
       [19, 46]], dtype=int64)

In [19]:
# X_train, X_test, y_train, y_test = train_test_split(test_size=0.25)

In [22]:
rf = RandomForestClassifier(n_estimators=100)

In [23]:
df = train_data[['p{}_{}'.format(i, h) for i in range(1, 8)]+['a_{}'.format(h)]]

In [24]:
X = df.iloc[:, :-1]
y = df.iloc[:, -1]

In [25]:
rf.fit(X, y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [26]:
confusion_matrix(y, rf.predict(X))

array([[468,   2],
       [ 40,  25]], dtype=int64)

In [27]:
(468 + 25) / (468+2+40+25)

0.9214953271028037