## 读取数据

In [1]:
import pandas as pd
import numpy as np

In [2]:
%matplotlib inline

In [4]:
# Daily active users (DAU) log: the preview below shows one row per
# (day, app, user, device) access record.
dau_path = 'data/section7-dau.csv'
dau = pd.read_csv(dau_path)

In [5]:
dau.head(2)

Unnamed: 0,region_month,region_day,app_name,user_id,device
0,2013-01,2013-01-01,game-02,10061580,FP
1,2013-01,2013-01-01,game-02,10154440,FP


In [6]:
# Monthly active users (MAU): one row per distinct (month, user, device).
mau = dau.loc[:, ['region_month', 'user_id', 'device']].drop_duplicates()

In [7]:
# MAU restricted to feature-phone (device == 'FP') accesses.
fp_mau = dau.loc[dau['device'] == 'FP', ['region_month', 'user_id', 'device']].drop_duplicates()

In [8]:
# MAU restricted to smartphone (device == 'SP') accesses.
sp_mau = dau.loc[dau['device'] == 'SP', ['region_month', 'user_id', 'device']].drop_duplicates()

In [9]:
# Split each device's MAU into January and February subsets. Copies are taken
# so that flag columns can be added to them later without touching the parents.
fp_in_jan = fp_mau['region_month'] == '2013-01'
fp_in_feb = fp_mau['region_month'] == '2013-02'
fp_mau1 = fp_mau.loc[fp_in_jan].copy()
fp_mau2 = fp_mau.loc[fp_in_feb].copy()

sp_in_jan = sp_mau['region_month'] == '2013-01'
sp_in_feb = sp_mau['region_month'] == '2013-02'
sp_mau1 = sp_mau.loc[sp_in_jan].copy()
sp_mau2 = sp_mau.loc[sp_in_feb].copy()

In [10]:
# Flag every MAU row as "accessed", then left-join February's flags onto the
# January feature-phone users. Users with no February access get NaN here.
mau['is_access'] = 1
# mau holds one row per (month, user, device), so a user active on both FP and
# SP in February appears twice. De-duplicate per user before joining so the
# merge cannot multiply January rows and double-count those users downstream.
feb_access = mau.loc[mau['region_month'] == '2013-02', ['user_id', 'is_access']].drop_duplicates()
fp_mau1 = pd.merge(fp_mau1, feb_access, how='left', on='user_id')

In [11]:
fp_mau1.head()

Unnamed: 0,region_month,user_id,device,is_access
0,2013-01,10061580,FP,1.0
1,2013-01,10154440,FP,
2,2013-01,10165615,FP,1.0
3,2013-01,10321356,FP,1.0
4,2013-01,10447112,FP,1.0


In [12]:
# Users with no February match came back as NaN from the left join; treat
# them as "did not access" (0).
fp_mau1['is_access'] = fp_mau1['is_access'].fillna(0)

In [13]:
# Did the January feature-phone users keep using a feature phone in February?
# Flag the February FP users, then left-join that flag onto the January users.
fp_mau2['is_fp'] = 1
feb_fp_flags = fp_mau2[['user_id', 'is_fp']]
fp_mau1 = fp_mau1.merge(feb_fp_flags, how='left', on='user_id')

In [14]:
# No February FP record -> NaN from the left join -> 0.
fp_mau1['is_fp'] = fp_mau1['is_fp'].fillna(0)

In [15]:
fp_mau1.head()

Unnamed: 0,region_month,user_id,device,is_access,is_fp
0,2013-01,10061580,FP,1.0,1.0
1,2013-01,10154440,FP,0.0,0.0
2,2013-01,10165615,FP,1.0,1.0
3,2013-01,10321356,FP,1.0,1.0
4,2013-01,10447112,FP,1.0,1.0


In [16]:
# Did the January feature-phone users access via a smartphone in February?
# Flag the February SP users, left-join the flag, and turn "no match" into 0.
sp_mau2['is_sp'] = 1
feb_sp_flags = sp_mau2[['user_id', 'is_sp']]
fp_mau1 = fp_mau1.merge(feb_sp_flags, how='left', on='user_id')
fp_mau1['is_sp'] = fp_mau1['is_sp'].fillna(0)

In [17]:
fp_mau1.head()

Unnamed: 0,region_month,user_id,device,is_access,is_fp,is_sp
0,2013-01,10061580,FP,1.0,1.0,0.0
1,2013-01,10154440,FP,0.0,0.0,0.0
2,2013-01,10165615,FP,1.0,1.0,0.0
3,2013-01,10321356,FP,1.0,1.0,0.0
4,2013-01,10447112,FP,1.0,1.0,0.0


In [18]:
# Keep only January feature-phone users who either did not access at all in
# February or accessed via a smartphone in February.
no_feb_access = fp_mau1['is_access'] == 0
used_sp_in_feb = fp_mau1['is_sp'] == 1
fp_mau1 = fp_mau1[no_feb_access | used_sp_in_feb]

In [19]:
fp_mau1.head()

Unnamed: 0,region_month,user_id,device,is_access,is_fp,is_sp
1,2013-01,10154440,FP,0.0,0.0,0.0
7,2013-01,10528830,FP,0.0,0.0,0.0
20,2013-01,1163733,FP,1.0,0.0,1.0
21,2013-01,11727630,FP,0.0,0.0,0.0
43,2013-01,13401362,FP,1.0,0.0,1.0


In [20]:
# Daily access records of feature-phone users during January, each tagged
# with is_access = 1 so the records can be pivoted into a day-by-day matrix.
is_fp_device = dau['device'] == 'FP'
is_january = dau['region_month'] == '2013-01'
fp_dau1 = dau[is_fp_device & is_january].assign(is_access=1)

In [21]:
fp_dau1.head()

Unnamed: 0,region_month,region_day,app_name,user_id,device,is_access
0,2013-01,2013-01-01,game-02,10061580,FP,1
1,2013-01,2013-01-01,game-02,10154440,FP,1
3,2013-01,2013-01-01,game-02,10165615,FP,1
4,2013-01,2013-01-01,game-02,10321356,FP,1
6,2013-01,2013-01-01,game-02,10447112,FP,1


In [22]:
# Pivot to a user x day matrix: 1 if the user accessed on that day, else 0.
fp_dau1_cast = fp_dau1.pivot_table(index='user_id', columns='region_day', values='is_access').fillna(0)
# Name each column after the day-of-month of its actual date (X1..X31) rather
# than by position, so a calendar day with no records cannot shift the labels
# or cause a length-mismatch error.
fp_dau1_cast.columns = ['X' + str(pd.to_datetime(day).day) for day in fp_dau1_cast.columns]
# Guarantee the full X1..X31 layout; a day nobody accessed becomes an all-zero column.
fp_dau1_cast = fp_dau1_cast.reindex(columns=['X' + str(i) for i in range(1, 32)], fill_value=0.0)
fp_dau1_cast = fp_dau1_cast.reset_index()

In [23]:
fp_dau1_cast.head(2)

Unnamed: 0,user_id,X1,X2,X3,X4,X5,X6,X7,X8,X9,...,X22,X23,X24,X25,X26,X27,X28,X29,X30,X31
0,397286,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
1,471341,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [24]:
# Attach the February smartphone flag (is_sp) to the January daily-access
# matrix. The inner join keeps only users that survived the fp_mau1 filter.
sp_target = fp_mau1[['user_id', 'is_sp']]
fp_dau1_cast = fp_dau1_cast.merge(sp_target, how='inner', on='user_id')

In [25]:
fp_dau1_cast.head()

Unnamed: 0,user_id,X1,X2,X3,X4,X5,X6,X7,X8,X9,...,X23,X24,X25,X26,X27,X28,X29,X30,X31,is_sp
0,471341,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,503874,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1073544,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1073864,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1163733,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0


In [26]:
# Class balance of the target: how many users have is_sp == 0 vs is_sp == 1.
fp_dau1_cast['is_sp'].value_counts()

0.0    190
1.0     62
Name: is_sp, dtype: int64

## 基于逻辑回归分析建立模型

### statsmodels

In [27]:
import statsmodels.api as sm

In [28]:
# Explanatory variables: access flags for a hand-picked subset of January days.
# Target: whether the user accessed via a smartphone in February.
feature_days = ['X1', 'X4', 'X5', 'X7', 'X10', 'X13', 'X22', 'X29', 'X31']
X = fp_dau1_cast[feature_days]
y = fp_dau1_cast['is_sp']

In [29]:
# statsmodels does not add an intercept on its own; add_constant supplies the
# 'const' column that shows up in the model summary.
X_new = sm.add_constant(X)

  return ptp(axis=axis, out=out, **kwargs)


In [30]:
# Fit the logistic regression of is_sp on the selected day flags plus intercept.
logit_spec = sm.Logit(y, X_new)
model = logit_spec.fit()

Optimization terminated successfully.
         Current function value: 0.251449
         Iterations 8


In [31]:
model.summary()

0,1,2,3
Dep. Variable:,is_sp,No. Observations:,252.0
Model:,Logit,Df Residuals:,242.0
Method:,MLE,Df Model:,9.0
Date:,"Sat, 30 Nov 2019",Pseudo R-squ.:,0.5493
Time:,19:12:29,Log-Likelihood:,-63.365
converged:,True,LL-Null:,-140.6
,,LLR p-value:,1.045e-28

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,-3.6036,0.427,-8.440,0.000,-4.440,-2.767
X1,1.5334,0.572,2.681,0.007,0.412,2.654
X4,1.7753,0.642,2.764,0.006,0.516,3.034
X5,-1.0353,0.762,-1.358,0.174,-2.529,0.459
X7,1.7002,0.711,2.392,0.017,0.307,3.094
X10,-2.6753,0.942,-2.841,0.005,-4.521,-0.829
X13,1.3726,0.755,1.819,0.069,-0.107,2.852
X22,1.6233,0.638,2.543,0.011,0.372,2.874
X29,2.0012,0.648,3.088,0.002,0.731,3.271


In [35]:
# Fitted probability that the user switched to a smartphone, rounded to two
# decimals for display.
fitted_probs = model.predict(X_new)
fp_dau1_cast['prob'] = [round(p, 2) for p in fitted_probs]

In [36]:
# Predict migration to a smartphone (1) when the fitted probability exceeds 0.5.
# The threshold is applied to the unrounded model output: the 'prob' column is
# rounded to two decimals, so a value such as 0.504 would be stored as 0.5 and
# wrongly classified as 0, disagreeing with model.pred_table().
fp_dau1_cast['pred'] = np.where(model.predict(X_new) > 0.5, 1, 0)

In [37]:
fp_dau1_cast.head()

Unnamed: 0,user_id,X1,X2,X3,X4,X5,X6,X7,X8,X9,...,X25,X26,X27,X28,X29,X30,X31,is_sp,prob,pred
0,471341,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.43,0
1,503874,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.11,0
2,1073544,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
3,1073864,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.03,0
4,1163733,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,0.39,0


In [41]:
# Actual-vs-predicted count table produced by the fitted statsmodels model,
# wrapped in a DataFrame with integer counts.
confusion_counts = model.pred_table()
pred_table = pd.DataFrame(confusion_counts, columns=[0, 1]).astype(int)

In [42]:
pred_table

Unnamed: 0,0,1
0,180,10
1,20,42


In [46]:
# Accuracy = correctly classified users (table diagonal) / all users.
np.trace(pred_table.values) / pred_table.values.sum()

0.8809523809523809

### sklearn

In [47]:
from sklearn.linear_model import LogisticRegression as LR

In [48]:
# scikit-learn logistic regression with the lbfgs solver. As the printed repr
# below shows, it uses an L2 penalty with C=1.0, so its fit need not match the
# statsmodels result above exactly.
lr = LR(solver='lbfgs')

In [50]:
# Fit on the same day-flag features and target used for the statsmodels model
# (X without the added constant; the repr shows fit_intercept=True).
lr.fit(X, y)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='lbfgs',
          tol=0.0001, verbose=0, warm_start=False)

In [52]:
# Accuracy on the same data the model was fitted on (in-sample, not a
# held-out estimate).
lr.score(X, y)

0.8849206349206349

In [54]:
# Store scikit-learn's predicted class in 'pre' (the statsmodels prediction
# lives in the separate 'pred' column).
fp_dau1_cast['pre'] = lr.predict(X)

In [55]:
# Number of users for each (actual is_sp, predicted pre) combination.
pair_counts = fp_dau1_cast.groupby(['is_sp', 'pre'])['user_id'].count()
pair_counts.rename('count').reset_index()

Unnamed: 0,is_sp,pre,count
0,0.0,0.0,185
1,0.0,1.0,5
2,1.0,0.0,24
3,1.0,1.0,38


In [56]:
from sklearn.metrics import confusion_matrix

In [58]:
# Confusion matrix of the actual target against scikit-learn's predictions.
lr_pred = lr.predict(X)
confusion_ = confusion_matrix(y, lr_pred)

In [59]:
confusion_

array([[185,   5],
       [ 24,  38]], dtype=int64)

In [60]:
# Accuracy = correctly classified users (matrix diagonal) / all users.
np.trace(confusion_) / confusion_.sum()

0.8849206349206349