In [20]:
import numpy as np                     # 引入基础软件包numpy
import pandas as pd                    # 引入基础软件包pandas
import statsmodels.api as sm  # 引入Logistic regression软件包statsmodels
from sklearn.model_selection import train_test_split # 引入训练集/测试集构造工具包
from sklearn import metrics            # 引入模型评价指标AUC计算工具包
import matplotlib.pyplot as plt        # 引入绘图软件包
import scipy                           # 引入scipy软件包完成卡方检验

In [21]:
# 数据读入
data_path =  u'./Data/wide_data.csv'
raw_data = pd.read_table(data_path, sep=',', header=0)

In [22]:
raw_data.head(5).T

Unnamed: 0,0,1,2,3,4
OBS_ID,1.0,2.0,3.0,4.0,5.0
MODE,3.0,3.0,3.0,3.0,3.0
availability_AIR,1.0,1.0,1.0,1.0,1.0
availability_TRAIN,1.0,1.0,1.0,1.0,1.0
availability_BUS,1.0,1.0,1.0,1.0,1.0
availability_CAR,1.0,1.0,1.0,1.0,1.0
HINC,35.0,30.0,40.0,70.0,45.0
PSIZE,1.0,2.0,1.0,3.0,2.0
TTME_AIR,69.0,64.0,69.0,64.0,64.0
TTME_TRAIN,34.0,44.0,34.0,44.0,44.0


In [23]:
raw_data['y'] = raw_data['MODE'].apply(lambda x : 1 if x==3 else 0)

In [24]:
model_data = raw_data[[
    'OBS_ID'
    , 'HINC', 'PSIZE'
    , 'TTME_AIR', 'TTME_TRAIN', 'TTME_BUS'
    , 'INVC_AIR', 'INVC_TRAIN', 'INVC_BUS', 'INVC_CAR'
    , 'INVT_AIR', 'INVT_TRAIN','INVT_BUS', 'INVT_CAR'
    , 'y'
]]

In [25]:
model_data.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
OBS_ID,210.0,105.5,60.765944,1.0,53.25,105.5,157.75,210.0
HINC,210.0,34.547619,19.711317,2.0,20.0,34.5,50.0,72.0
PSIZE,210.0,1.742857,1.012161,1.0,1.0,1.0,2.0,6.0
TTME_AIR,210.0,61.009524,15.719427,5.0,64.0,64.0,69.0,99.0
TTME_TRAIN,210.0,35.690476,12.279224,1.0,34.0,34.0,44.0,99.0
TTME_BUS,210.0,41.657143,12.077365,5.0,35.0,35.0,53.0,60.0
INVC_AIR,210.0,85.252381,27.409147,44.0,60.0,81.0,107.0,180.0
INVC_TRAIN,210.0,51.338095,27.032056,11.0,31.0,42.0,73.0,112.0
INVC_BUS,210.0,33.457143,12.591125,12.0,25.0,32.0,44.0,70.0
INVC_CAR,210.0,20.995238,14.67788,2.0,11.0,16.0,29.75,86.0


In [26]:
model_data.info()                      # 查看每一列的数据类型 & 数值缺失情况
# 重要返回信息
# | RangeIndex: 210 entries, 0 to 209
# | Data columns (total 9 columns):
# | ...
# | HINC              210 non-null int64
# | ...
# 1.缺失值处理
model_data = model_data.dropna()       # 缺失值处理，删除
model_data = model_data.fillna(0)      # 缺失值处理，填充（零，均值，中位数，预测值等）

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 210 entries, 0 to 209
Data columns (total 15 columns):
OBS_ID        210 non-null int64
HINC          210 non-null int64
PSIZE         210 non-null int64
TTME_AIR      210 non-null float64
TTME_TRAIN    210 non-null float64
TTME_BUS      210 non-null float64
INVC_AIR      210 non-null float64
INVC_TRAIN    210 non-null float64
INVC_BUS      210 non-null float64
INVC_CAR      210 non-null float64
INVT_AIR      210 non-null float64
INVT_TRAIN    210 non-null float64
INVT_BUS      210 non-null float64
INVT_CAR      210 non-null float64
y             210 non-null int64
dtypes: float64(11), int64(4)
memory usage: 24.7 KB


In [27]:
# 2.数值型
import re                               # 正则表达式工具包
float_patten = '^(-?\\d+)(\\.\\d+)?$'   # 定义浮点数正则patten
float_re = re.compile(float_patten)     # 编译
model_data['HINC'][model_data['HINC'].apply(lambda x : 'not_float' if float_re.match(str(x)) == None else 'float') == 'not_float'] # 查看非浮点型数据
# 重要返回信息
# | 2    null
# | Name: distance, dtype: object

Series([], Name: HINC, dtype: int64)

In [28]:
# model_data = model_data[model_data['HINC'] != 'null']
# model_data['HINC'] = model_data['HINC'].astype(float)

In [29]:
# 离散变量分析
multi_categorical_x_list = ['PSIZE']

print('======离散变量显著性======')
for categorical_x in multi_categorical_x_list:
    crosstab = pd.crosstab( model_data['y'],model_data[categorical_x])
    p=scipy.stats.chi2_contingency(crosstab)[1]
    print(categorical_x,":",p)

PSIZE : 0.0024577358937625327


In [30]:
# 连续变量分析
continuous_x_list = ['HINC','TTME_AIR', 'TTME_TRAIN', 'TTME_BUS'
    ,'INVC_AIR', 'INVC_TRAIN', 'INVC_BUS', 'INVC_CAR'
    , 'INVT_AIR', 'INVT_TRAIN','INVT_BUS', 'INVT_CAR' ]
result = []
for continuous_x in continuous_x_list:
    logistic = sm.Logit(model_data['y'],model_data[continuous_x]).fit()
    p = logistic.pvalues[continuous_x]
    y_predict = logistic.predict(model_data[continuous_x])
    AUC = metrics.roc_auc_score(model_data['y'],y_predict)
    result.append(continuous_x+":"+str(p)+'  AUC:'+str(AUC))
print('======连续变量显著性======')
for i in result:
    print(i)

Optimization terminated successfully.
         Current function value: 0.655165
         Iterations 4
Optimization terminated successfully.
         Current function value: 0.618397
         Iterations 4
Optimization terminated successfully.
         Current function value: 0.631421
         Iterations 4
Optimization terminated successfully.
         Current function value: 0.626189
         Iterations 4
Optimization terminated successfully.
         Current function value: 0.574280
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.623141
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.607407
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.564209
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.601355
         Iterations 4
Optimization terminated successfully.
         Current function value: 0.618396
  

In [31]:
crosstab = pd.crosstab( model_data['y'],model_data['PSIZE'])
p=scipy.stats.chi2_contingency(crosstab)[1]
print("PSIZE:",p)

PSIZE: 0.0024577358937625327


In [32]:
logistic = sm.Logit(model_data['y'],model_data['INVT_CAR']).fit()
p = logistic.pvalues['INVT_CAR']
y_predict = logistic.predict(model_data['INVT_CAR'])
AUC = metrics.roc_auc_score(model_data['y'],y_predict)
result = 'INVT_CAR:'+str(p)+'  AUC:'+str(AUC)
print(result)

Optimization terminated successfully.
         Current function value: 0.592252
         Iterations 4
INVT_CAR:2.971604856310474e-09  AUC:0.6242563699629587


In [33]:
from statsmodels.stats.outliers_influence import variance_inflation_factor #诊断多重共线性，vif
X = model_data[[
     'HINC', 'PSIZE'
    ,'TTME_AIR', 'TTME_TRAIN', 'TTME_BUS'
    ,'INVC_AIR', 'INVC_TRAIN', 'INVC_BUS', 'INVC_CAR'
    , 'INVT_AIR', 'INVT_TRAIN','INVT_BUS', 'INVT_CAR'
]]

vif = pd.DataFrame()
vif['VIF Factor'] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]
vif['features'] = X.columns
print('================多重共线性==============')
print(vif)

    VIF Factor    features
0     5.711776        HINC
1    11.084716       PSIZE
2    10.452382    TTME_AIR
3    12.324754  TTME_TRAIN
4    21.219219    TTME_BUS
5    27.755673    INVC_AIR
6    16.198591  INVC_TRAIN
7    33.760088    INVC_BUS
8    10.367841    INVC_CAR
9    18.772230    INVT_AIR
10  114.079612  INVT_TRAIN
11  129.731893    INVT_BUS
12   42.598680    INVT_CAR


In [34]:
# 建模数据构造
X = model_data[[ 'HINC','PSIZE','TTME_TRAIN' , 'INVC_CAR']]
y = raw_data['y']
# 哑变量处理
dummies = pd.get_dummies(X['PSIZE'], drop_first=False)
dummies.columns = [ 'PSIZE'+'_'+str(x) for x in dummies.columns.values]
X = pd.concat([X, dummies], axis=1)
X = X.drop('PSIZE',axis=1) # 删去原离散变量
# X = X.drop('PSIZE_1',axis=1) 
# X = X.drop('PSIZE_2',axis=1) 
# X = X.drop('PSIZE_3',axis=1) 
X = X.drop('PSIZE_4',axis=1) 
X = X.drop('PSIZE_5',axis=1) 
X = X.drop('PSIZE_6',axis=1) 

X['Intercept'] = 1

# 训练集与测试集的比例为80%和20%
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size = 0.8, random_state=1234)

# 全变量建模
logistic = sm.Logit(y_train,X_train).fit()
print(logistic.summary2())
# 重要返回信息
# | ------------------------------------------------------------------
# |                Coef.   Std.Err.     z     P>|z|    [0.025   0.975]
# | ------------------------------------------------------------------
# | HINC           0.0264    0.0100   2.6477  0.0081   0.0068   0.0459
# | TTME_TRAIN     0.0389    0.0195   1.9916  0.0464   0.0006   0.0772
# | INVC_CAR      -0.0512    0.0204  -2.5103  0.0121  -0.0913  -0.0112
# | PSIZE_1       -0.3077    0.7317  -0.4206  0.6741  -1.7419   1.1264
# | PSIZE_2       -1.0800    0.6417  -1.6829  0.0924  -2.3378   0.1778
# | PSIZE_3       -0.7585    0.7582  -1.0004  0.3171  -2.2444   0.7275
# | Intercept     -1.8879    1.1138  -1.6951  0.0901  -4.0708   0.2950
# | =================================================================
print("========训练集AUC========")
y_train_predict = logistic.predict(X_train)
print(metrics.roc_auc_score(y_train,y_train_predict))
print("========测试集AUC========")
y_test_predict = logistic.predict(X_test)
print(metrics.roc_auc_score(y_test,y_test_predict))
# | ========训练集AUC========
# | 0.7533854166666667
# | ========测试集AUC========
# | 0.6510263929618768

Optimization terminated successfully.
         Current function value: 0.509064
         Iterations 6
                         Results: Logit
Model:              Logit            Pseudo R-squared: 0.149     
Dependent Variable: y                AIC:              185.0455  
Date:               2020-02-21 00:11 BIC:              206.9133  
No. Observations:   168              Log-Likelihood:   -85.523   
Df Model:           6                LL-Null:          -100.51   
Df Residuals:       161              LLR p-value:      3.9775e-05
Converged:          1.0000           Scale:            1.0000    
No. Iterations:     6.0000                                       
------------------------------------------------------------------
               Coef.   Std.Err.     z     P>|z|    [0.025   0.975]
------------------------------------------------------------------
HINC           0.0264    0.0100   2.6477  0.0081   0.0068   0.0459
TTME_TRAIN     0.0389    0.0195   1.9916  0.0464   0.0006   0.

In [35]:
# 建模数据构造
X = model_data[[ 'HINC','PSIZE','TTME_TRAIN' , 'INVC_CAR']]
y = raw_data['y']
# 哑变量处理
dummies = pd.get_dummies(X['PSIZE'], drop_first=False)
dummies.columns = [ 'PSIZE'+'_'+str(x) for x in dummies.columns.values]
X = pd.concat([X, dummies], axis=1)
X = X.drop('PSIZE',axis=1) # 删去原离散变量
X = X.drop('PSIZE_1',axis=1) 
X = X.drop('PSIZE_2',axis=1) 
X = X.drop('PSIZE_3',axis=1) 
X = X.drop('PSIZE_4',axis=1) 
X = X.drop('PSIZE_5',axis=1) 
X = X.drop('PSIZE_6',axis=1) 
X['Intercept'] = 1
# 训练集与测试集的比例为80%和20%
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size = 0.8, random_state=1234)
# 全变量建模
logistic = sm.Logit(y_train,X_train).fit()
print(logistic.summary2())
# 重要返回信息
# | ------------------------------------------------------------------
# |                Coef.   Std.Err.     z     P>|z|    [0.025   0.975]
# | ------------------------------------------------------------------
# | HINC           0.0266    0.0096   2.7731  0.0056   0.0078   0.0454
# | TTME_TRAIN     0.0335    0.0161   2.0838  0.0372   0.0020   0.0650
# | INVC_CAR      -0.0450    0.0168  -2.6805  0.0074  -0.0778  -0.0121
# | Intercept     -2.3486    0.8275  -2.8384  0.0045  -3.9704  -0.7269
# | =================================================================
print("========训练集AUC========")
y_train_predict = logistic.predict(X_train)
print(metrics.roc_auc_score(y_train,y_train_predict))
print("========测试集AUC========")
y_test_predict = logistic.predict(X_test)
print(metrics.roc_auc_score(y_test,y_test_predict))
# | ========训练集AUC========
# | 0.7344618055555555
# | ========测试集AUC========
# | 0.7419354838709677

Optimization terminated successfully.
         Current function value: 0.521493
         Iterations 6
                         Results: Logit
Model:              Logit            Pseudo R-squared: 0.128     
Dependent Variable: y                AIC:              183.2216  
Date:               2020-02-21 00:11 BIC:              195.7174  
No. Observations:   168              Log-Likelihood:   -87.611   
Df Model:           3                LL-Null:          -100.51   
Df Residuals:       164              LLR p-value:      1.0518e-05
Converged:          1.0000           Scale:            1.0000    
No. Iterations:     6.0000                                       
------------------------------------------------------------------
               Coef.   Std.Err.     z     P>|z|    [0.025   0.975]
------------------------------------------------------------------
HINC           0.0266    0.0096   2.7731  0.0056   0.0078   0.0454
TTME_TRAIN     0.0335    0.0161   2.0838  0.0372   0.0020   0.

In [36]:
np.exp(0.0266)-1

0.026956937820850735

In [37]:
np.exp(-0.0450)-1

-0.04400251816690004