# Python建模库介绍

In [1]:
import numpy as np
import pandas as pd

In [5]:
data = pd.DataFrame({ 'x0': [1, 2, 3, 4, 5], 'x1': [0.01, -0.01, 0.25, -4.1, 0.], 'y': [-1.5, 0., 3.6, 1.3, -2.]})
data.loc[:, 'x0':'x1'].values

array([[ 1.  ,  0.01],
       [ 2.  , -0.01],
       [ 3.  ,  0.25],
       [ 4.  , -4.1 ],
       [ 5.  ,  0.  ]])

In [9]:
data['category']=pd.Categorical(['a', 'b', 'a', 'a','b'], categories=['a', 'b'])
pd.get_dummies(data, prefix='category')

Unnamed: 0,x0,x1,y,category_a,category_b
0,1,0.01,-1.5,1,0
1,2,-0.01,0.0,0,1
2,3,0.25,3.6,1,0
3,4,-4.1,1.3,1,0
4,5,0.0,-2.0,0,1


-----用Patsy创建模型描述-----

In [30]:
import patsy
y, X=patsy.dmatrices('y ~ x0 + x1 + 0', data)
# Pasty对象可以直接传递到算法（比如numpy.linalg.lstsq），它执行普通最小二乘回归
coef, resid, _, _ = np.linalg.lstsq(X, y)
coef=pd.Series(coef.squeeze(), index=X.design_info.column_names)
coef

  after removing the cwd from sys.path.


x0    0.009254
x1   -0.254854
dtype: float64

In [35]:
y, X = patsy.dmatrices('y ~ x0 + np.log(np.abs(x1) + 1)', data)
# 常见的变量转化包括标准化（平均值为0，方差为1）和中心化（减去平均值）
y, X = patsy.dmatrices('y ~ standardize(x0) + center(x1)', data)
new_data = pd.DataFrame({ 'x0': [6, 7, 8, 9], 'x1': [3.1, -0.5, 0, 2.3], 'y': [1, 2, 3, 4]})
new_X=patsy.build_design_matrices([X.design_info], new_data)
new_X

[DesignMatrix with shape (4, 3)
   Intercept  standardize(x0)  center(x1)
           1          2.12132        3.87
           1          2.82843        0.27
           1          3.53553        0.77
           1          4.24264        3.07
   Terms:
     'Intercept' (column 0)
     'standardize(x0)' (column 1)
     'center(x1)' (column 2)]

In [42]:
# 按照名称数据集的列相加时，用特殊I函数将它们封装起来
y, X=patsy.dmatrices('y ~ I(x0+x1)', data)
X

DesignMatrix with shape (5, 2)
  Intercept  I(x0 + x1)
          1        1.01
          1        1.99
          1        3.25
          1       -0.10
          1        5.00
  Terms:
    'Intercept' (column 0)
    'I(x0 + x1)' (column 1)

In [43]:
data = pd.DataFrame({ 'key1': ['a', 'a', 'b', 'b', 'a', 'b', 'a', 'b'], 
                     'key2': [0, 1, 0, 1, 0, 1, 0, 0], 
                     'v1': [1, 2, 3, 4, 5, 6, 7, 8], 
                     'v2': [-1, 0, 2.5, -0.5, 4.0, -1.2, 0.2, -1.7] })
data['key2']=data['key2'].map({0: 'zero', 1: 'one'})
y, X=patsy.dmatrices('v2 ~ key1+key2+key1:key2', data)
X

DesignMatrix with shape (8, 4)
  Intercept  key1[T.b]  key2[T.zero]  key1[T.b]:key2[T.zero]
          1          0             1                       0
          1          0             0                       0
          1          1             1                       1
          1          1             0                       0
          1          0             1                       0
          1          1             0                       0
          1          0             1                       0
          1          1             1                       1
  Terms:
    'Intercept' (column 0)
    'key1' (column 1)
    'key2' (column 2)
    'key1:key2' (column 3)

-----statsmodels介绍-----

In [44]:
import statsmodels.api as sm
import statsmodels.formula.api as smf

In [62]:
def dnorm(mean, variance, size=1):
    if isinstance(size, int):
        size=size,
    return mean+np.sqrt(variance)*np.random.randn(*size)
np.random.seed(12345)
N=100
X=np.c_[dnorm(0, 0.4, size=N),dnorm(0, 0.6, size=N),dnorm(0, 0.2, size=N)]
eps=dnorm(0, 0.1, size=N)
beta = [0.1, 0.3, 0.5]
y=np.dot(X, beta)+eps
# 线性模型通常要拟合一个截距。sm.add_constant函数可以添加一个截距的列到现存的矩阵
X_model=sm.add_constant(X)
# sm.OLS类可以拟合一个普通最小二乘回归
model=sm.OLS(y, X)
# 这个模型的fit方法返回了一个回归结果对象，它包含估计的模型参数和其他内容
results=model.fit()
results.params
# 对结果使用summary方法可以打印模型的详细诊断结果
results.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.43
Model:,OLS,Adj. R-squared:,0.413
Method:,Least Squares,F-statistic:,24.42
Date:,"Tue, 29 Jan 2019",Prob (F-statistic):,7.44e-12
Time:,14:53:59,Log-Likelihood:,-34.305
No. Observations:,100,AIC:,74.61
Df Residuals:,97,BIC:,82.42
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
x1,0.1783,0.053,3.364,0.001,0.073,0.283
x2,0.2230,0.046,4.818,0.000,0.131,0.315
x3,0.5010,0.080,6.237,0.000,0.342,0.660

0,1,2,3
Omnibus:,4.662,Durbin-Watson:,2.201
Prob(Omnibus):,0.097,Jarque-Bera (JB):,4.098
Skew:,0.481,Prob(JB):,0.129
Kurtosis:,3.243,Cond. No.,1.74


In [73]:
data = pd.DataFrame(X, columns=['col0', 'col1', 'col2'])
data['y'] = y
# 使用statsmodels的公式API和Patsy的公式字符串
results=smf.ols('y ~ col0+col1+col2',  data=data).fit()
results.params
results.tvalues
# 给出一个样本外数据，根据估计的模型参数计算预测值
results.predict(data[:5])

0   -0.002327
1   -0.141904
2    0.041226
3   -0.323070
4   -0.100535
dtype: float64

时间序列分析

In [74]:
init_x=4
import random
values=[init_x, init_x]
N=1000
b0=0.8
b1=-0.4
noise=dnorm(0, 0.1, N)
for i in range(N):
    new_x=values[-1]*b0+values[-2]*b1+noise[i]
    values.append(new_x)
# 拟合AR模型时，你可能不知道滞后项的个数，因此可以用较多的滞后量来拟合这个模型
MAXLAGS = 5
model = sm.tsa.AR(values)
results=model.fit(MAXLAGS)
results.params

array([-0.00616093,  0.78446347, -0.40847891, -0.01364148,  0.01496872,
        0.01429462])

-----scikit-learn介绍-----

In [89]:
# 用一个kaggle竞赛的经典数据集，关于泰坦尼克号乘客的生还率
train=pd.read_csv('E:/useFiles/datasets/titanic/train.csv')
test=pd.read_csv('E:/useFiles/datasets/titanic/test.csv')
train[:4]
# statsmodels和scikit-learn通常不能接收缺失数据
train.isnull().sum()
test.isnull().sum()
# 用训练数据集的中位数补全两个表年龄的空值
impute_value=train['Age'].median()
train['Age']=train['Age'].fillna(impute_value)
test['Age']=train['Age'].fillna(impute_value)
# 增加了一个列IsFemale，作为“Sex”列的编码
train['IsFemale'] = (train['Sex'] == 'female').astype(int)
test['IsFemale'] = (test['Sex'] == 'female').astype(int)

# 确定一些模型变量
predictors=['Pclass','IsFemale','Age']
X_train=train[predictors].values
X_test=test[predictors].values
y_train=train['Survived'].values

# 用scikit-learn的LogisticRegression逻辑回归模型（分类算法）创建一个模型实例
from sklearn.linear_model import LogisticRegression
model=LogisticRegression()
# 可以用模型的fit方法，将它拟合到训练数据
model.fit(X_train, y_train)

# 用model.predict对测试数据进行预测
y_predict=model.predict(X_test)
y_predict[:10]


array([0, 0, 0, 0, 1, 0, 0, 0, 1, 0], dtype=int64)

In [97]:
# 交叉验证可以用来进行参数调节，避免对训练数据过拟合，可以提高预测性或对新数据的健壮性
# 交叉验证通过分割训练数据来模拟样本外预测，基于模型的精度得分（比如均方差），可以对模型参数进行网格搜索
from sklearn.linear_model import LogisticRegressionCV
model_cv=LogisticRegressionCV()
model_cv.fit(X_train, y_train)

# 要手动进行交叉验证，可以使用cross_val_score处理数据分割
from sklearn.model_selection import cross_val_score
model=LogisticRegression(C=10)
scores=cross_val_score(model, X_train, y_train, cv=4)
scores
# 交叉验证过的模型需要更长的时间来训练，但会有更高的模型性能

array([0.77232143, 0.80269058, 0.77027027, 0.78828829])

In [124]:
# 调用sklearn波士顿房价数据
from sklearn import datasets
# 调用线性回归函数
from sklearn.linear_model import LinearRegression
# 导入数据集，这里将全部数据用于训练，没有对数据进行划分
loaded_data=datasets.load_boston()
data_X=loaded_data.data
data_y=loaded_data.target
# 设置线性回归模块
model=LinearRegression()
# 训练数据
model.fit(data_X, data_y)
# 预测，与原标签进行比较
print(data_y[:4])
model.predict(data_X[:4, :])


[24.  21.6 34.7 33.4]


array([30.00821269, 25.0298606 , 30.5702317 , 28.60814055])

In [None]:
关于建模和数据科学工具的书：
Andreas Mueller and Sarah Guido (O’Reilly)的 《Introduction to MachineLearning with Python》
Jake VanderPlas (O’Reilly)的 《Python Data Science Handbook》
Joel Grus (O’Reilly) 的 《Data Science from Scratch: First Principles》
Sebastian Raschka (Packt Publishing) 的《Python Machine Learning》
Aurélien Géron (O’Reilly) 的《Hands-On Machine Learning with Scikit-Learn and TensorFlow》

不断熟悉各种统计和机器学习框架的文档，学习最新的功能和API。