In [1]:
import pandas as pd
from pandas import Series, DataFrame # 最常用的类

In [2]:
import matplotlib.pyplot as plt
import numpy as np
np.random.seed(12345)
plt.rc('figure', figsize=(10, 6))
PREVIOUS_MAX_ROWS = pd.options.display.max_rows
pd.options.display.max_rows = 20
np.set_printoptions(precision=4, suppress=True) # numpy 不要使用科学计数法，小数4位

**使用那个库进行模型开发取决于应用**

两个流行的**建模工具包**
- `scikit-learn`
- `statsmodels`

## pandas与建模代码的组合

**使用`pandas`用于数据的载入和数据清洗，之后切换到模型库去建立模型是一个常见的模型开发工作流**

机器学习中，**特征工程**是模型开发的重要组成部分之一， 特征工程是指从原生数据集中**提取**可用于模型上下文的**有效信息**的数据转换过程和分析

`pandas`和其他数据分析库的**结合点**通常是 **NumPy**， 要将DataFrame转换为NumPy数组，使用`.values`属性

有些库**对pandas有本地化支持**，可以自动为你做一下工作
- 将数据从DataFrame转换到NumPy中并将模型参数名附于输出表的列或Series上
- 其他情况下，不得不手动去处理这些“**元数据管理**”操作

In [3]:
import pandas as pd
import numpy as np
data = pd.DataFrame({
    'x0': [1, 2, 3, 4, 5],
    'x1': [0.01, -0.01, 0.25, -4.1, 0.],
    'y': [-1.5, 0., 3.6, 1.3, -2.]})
data

data.values

array([[ 1.  ,  0.01, -1.5 ],
       [ 2.  , -0.01,  0.  ],
       [ 3.  ,  0.25,  3.6 ],
       [ 4.  , -4.1 ,  1.3 ],
       [ 5.  ,  0.  , -2.  ]])

In [4]:
data.columns

Index(['x0', 'x1', 'y'], dtype='object')

In [5]:
pd.DataFrame( data.values,  columns=data.columns )

Unnamed: 0,x0,x1,y
0,1.0,0.01,-1.5
1,2.0,-0.01,0.0
2,3.0,0.25,3.6
3,4.0,-4.1,1.3
4,5.0,0.0,-2.0


In [6]:
df2 = pd.DataFrame(data.values, columns=['one', 'two', 'three'])
df2

Unnamed: 0,one,two,three
0,1.0,0.01,-1.5
1,2.0,-0.01,0.0
2,3.0,0.25,3.6
3,4.0,-4.1,1.3
4,5.0,0.0,-2.0


In [9]:
model_cols = ['x0', 'x1']
data.loc[:, model_cols].values

array([[ 1.  ,  0.01],
       [ 2.  , -0.01],
       [ 3.  ,  0.25],
       [ 4.  , -4.1 ],
       [ 5.  ,  0.  ]])

In [13]:
data.iloc[:, :-1].values

array([[ 1.  ,  0.01, -1.5 ],
       [ 2.  , -0.01,  0.  ],
       [ 3.  ,  0.25,  3.6 ],
       [ 4.  , -4.1 ,  1.3 ],
       [ 5.  ,  0.  , -2.  ]])

In [15]:
data[['x0', 'x1']].values

array([[ 1.  ,  0.01],
       [ 2.  , -0.01],
       [ 3.  ,  0.25],
       [ 4.  , -4.1 ],
       [ 5.  ,  0.  ]])

In [21]:
data.reindex(columns=['x0', 'x1']).values

array([[ 1.  ,  0.01],
       [ 2.  , -0.01],
       [ 3.  ,  0.25],
       [ 4.  , -4.1 ],
       [ 5.  ,  0.  ]])

In [10]:
data['category'] = pd.Categorical(['a', 'b', 'a', 'a', 'b'],
                                  categories=['a', 'b'])
data

Unnamed: 0,x0,x1,y,category
0,1,0.01,-1.5,a
1,2,-0.01,0.0,b
2,3,0.25,3.6,a
3,4,-4.1,1.3,a
4,5,0.0,-2.0,b


In [22]:
dummies = pd.get_dummies(data.category, prefix='category')
dummies

Unnamed: 0,category_a,category_b
0,1,0
1,0,1
2,1,0
3,1,0
4,0,1


In [24]:
dummies = pd.get_dummies(data, prefix='category')
dummies

Unnamed: 0,x0,x1,y,category_a,category_b
0,1,0.01,-1.5,1,0
1,2,-0.01,0.0,0,1
2,3,0.25,3.6,1,0
3,4,-4.1,1.3,1,0
4,5,0.0,-2.0,0,1


## 使用Patsy创建模型描述

`Patsy`是一个用于**描述统计模型(尤其是线性模型 )** 的Python库， 使用一种**小型基于字符串的“公式语法”**，这种语法受到了R,S统计编程中公式语言的启示

`Patsy`能够**很好的支持statsmodels中特定的线性模型**

Pasty的**公式是特殊字符串语法**
$$ yo \text~ x0 + x1$$
语法$a+b$并不是**加法**， 而是指**为模型而创建的设计矩阵中的名词列**

`pasty.dmatrices`函数在数据集上，使用一个公式字符串，并为一个线性模型产生一个**设计矩阵**

In [25]:
data = pd.DataFrame({
    'x0': [1, 2, 3, 4, 5],
    'x1': [0.01, -0.01, 0.25, -4.1, 0.],
    'y': [-1.5, 0., 3.6, 1.3, -2.]})
data

Unnamed: 0,x0,x1,y
0,1,0.01,-1.5
1,2,-0.01,0.0
2,3,0.25,3.6
3,4,-4.1,1.3
4,5,0.0,-2.0


In [27]:
import patsy

In [28]:
patsy.dmatrices( 'y ~ x0+x1', data)

(DesignMatrix with shape (5, 1)
      y
   -1.5
    0.0
    3.6
    1.3
   -2.0
   Terms:
     'y' (column 0),
 DesignMatrix with shape (5, 3)
   Intercept  x0     x1
           1   1   0.01
           1   2  -0.01
           1   3   0.25
           1   4  -4.10
           1   5   0.00
   Terms:
     'Intercept' (column 0)
     'x0' (column 1)
     'x1' (column 2))

In [36]:
y, X = patsy.dmatrices( 'y ~ x0+x1', data)

In [34]:
patsy.dmatrices( 'y ~ x0+x1+0', data) # 不要截距项 

(DesignMatrix with shape (5, 1)
      y
   -1.5
    0.0
    3.6
    1.3
   -2.0
   Terms:
     'y' (column 0),
 DesignMatrix with shape (5, 2)
   x0     x1
    1   0.01
    2  -0.01
    3   0.25
    4  -4.10
    5   0.00
   Terms:
     'x0' (column 0)
     'x1' (column 1))

In [37]:
coef, resid, _, _ = np.linalg.lstsq( X, y )

  coef, resid, _, _ = np.linalg.lstsq( X, y )


In [38]:
coef

array([[ 0.3129],
       [-0.0791],
       [-0.2655]])

In [39]:
resid

array([19.6379])

In [40]:
coef = pd.Series(coef.squeeze(), index=X.design_info.column_names)
coef

Intercept    0.312910
x0          -0.079106
x1          -0.265464
dtype: float64

### Patsy公式中数据转换

可以将Python代码混合到你的Patsy公式中，在执行公式时，**Patsy库将尝试在封闭的作用域中寻找使用你使用的函数**

In [41]:
y, X = patsy.dmatrices('y ~ x0 + np.log(np.abs(x1) + 1)', data)
X

DesignMatrix with shape (5, 3)
  Intercept  x0  np.log(np.abs(x1) + 1)
          1   1                 0.00995
          1   2                 0.00995
          1   3                 0.22314
          1   4                 1.62924
          1   5                 0.00000
  Terms:
    'Intercept' (column 0)
    'x0' (column 1)
    'np.log(np.abs(x1) + 1)' (column 2)

In [42]:
y, X = patsy.dmatrices('y ~ standardize(x0) + center(x1)', data)
X

DesignMatrix with shape (5, 3)
  Intercept  standardize(x0)  center(x1)
          1         -1.41421        0.78
          1         -0.70711        0.76
          1          0.00000        1.02
          1          0.70711       -3.33
          1          1.41421        0.77
  Terms:
    'Intercept' (column 0)
    'standardize(x0)' (column 1)
    'center(x1)' (column 2)

作为建模的一部分，可能会**在一个数据集上拟合一个模型，之后基于另一个模型评价该模型**， 这个过程中可以保留部分数据或者之后再加入新数据

在应用像剧中和标准化这样的转换时，在**基于新数据使用模型或进行预测时要小心**，这些转换被称为**有状态的转换**，因为在形成新数据集时必须使用原数据集中的均值或标准差等统计值

In [43]:
new_data = pd.DataFrame({
    'x0': [6, 7, 8, 9],
    'x1': [3.1, -0.5, 0, 2.3],
    'y': [1, 2, 3, 4]})
new_data

Unnamed: 0,x0,x1,y
0,6,3.1,1
1,7,-0.5,2
2,8,0.0,3
3,9,2.3,4


In [44]:
new_X = patsy.build_design_matrices([X.design_info], new_data)
new_X

[DesignMatrix with shape (4, 3)
   Intercept  standardize(x0)  center(x1)
           1          2.12132        3.87
           1          2.82843        0.27
           1          3.53553        0.77
           1          4.24264        3.07
   Terms:
     'Intercept' (column 0)
     'standardize(x0)' (column 1)
     'center(x1)' (column 2)]

In [45]:
y, X = patsy.dmatrices('y ~ I(x0 + x1)', data)
X

DesignMatrix with shape (5, 2)
  Intercept  I(x0 + x1)
          1        1.01
          1        1.99
          1        3.25
          1       -0.10
          1        5.00
  Terms:
    'Intercept' (column 0)
    'I(x0 + x1)' (column 1)

### 分类数据和Patsy

当你在Patsy**公式中使用费数组名称列时，将会被默认转换为 虚拟变量**， 如果有拦截，其中一个级别将被排出以避免共线性

In [46]:
data = pd.DataFrame({
    'key1': ['a', 'a', 'b', 'b', 'a', 'b', 'a', 'b'],
    'key2': [0, 1, 0, 1, 0, 1, 0, 0],
    'v1': [1, 2, 3, 4, 5, 6, 7, 8],
    'v2': [-1, 0, 2.5, -0.5, 4.0, -1.2, 0.2, -1.7]
})

data

Unnamed: 0,key1,key2,v1,v2
0,a,0,1,-1.0
1,a,1,2,0.0
2,b,0,3,2.5
3,b,1,4,-0.5
4,a,0,5,4.0
5,b,1,6,-1.2
6,a,0,7,0.2
7,b,0,8,-1.7


In [47]:
y,X = patsy.dmatrices( 'v2~key1', data )
X

DesignMatrix with shape (8, 2)
  Intercept  key1[T.b]
          1          0
          1          0
          1          1
          1          1
          1          0
          1          1
          1          0
          1          1
  Terms:
    'Intercept' (column 0)
    'key1' (column 1)

In [48]:
y,X = patsy.dmatrices( 'v2~key1+0', data )
X

DesignMatrix with shape (8, 2)
  key1[a]  key1[b]
        1        0
        1        0
        0        1
        0        1
        1        0
        0        1
        1        0
        0        1
  Terms:
    'key1' (columns 0:2)

In [49]:
y, X = patsy.dmatrices('v2 ~ C(key2)', data)
X

DesignMatrix with shape (8, 2)
  Intercept  C(key2)[T.1]
          1             0
          1             1
          1             0
          1             1
          1             0
          1             1
          1             0
          1             0
  Terms:
    'Intercept' (column 0)
    'C(key2)' (column 1)

In [50]:
data['key2'] = data['key2'].map({0: 'zero', 1: 'one'})
data

Unnamed: 0,key1,key2,v1,v2
0,a,zero,1,-1.0
1,a,one,2,0.0
2,b,zero,3,2.5
3,b,one,4,-0.5
4,a,zero,5,4.0
5,b,one,6,-1.2
6,a,zero,7,0.2
7,b,zero,8,-1.7


In [51]:
y, X = patsy.dmatrices('v2 ~ key1 + key2', data)
X

DesignMatrix with shape (8, 3)
  Intercept  key1[T.b]  key2[T.zero]
          1          0             1
          1          0             0
          1          1             1
          1          1             0
          1          0             1
          1          1             0
          1          0             1
          1          1             1
  Terms:
    'Intercept' (column 0)
    'key1' (column 1)
    'key2' (column 2)

In [52]:
y, X = patsy.dmatrices('v2 ~ key1 + key2 + key1:key2', data)
X

DesignMatrix with shape (8, 4)
  Intercept  key1[T.b]  key2[T.zero]  key1[T.b]:key2[T.zero]
          1          0             1                       0
          1          0             0                       0
          1          1             1                       1
          1          1             0                       0
          1          0             1                       0
          1          1             0                       0
          1          0             1                       0
          1          1             1                       1
  Terms:
    'Intercept' (column 0)
    'key1' (column 1)
    'key2' (column 2)
    'key1:key2' (column 3)

##  statsmodels介绍

`statsmodels`是一个Python库，用于**拟合多种统计模型，执行统计测试以及数据探索和可视化**， `statsmodels`包含更多的“经典”频率学派统计方法， 而贝叶斯方法和机器学习模型可在其他库中找到

- 线性模型，广义线性模型和鲁棒性线性模型
- 线性混合效应模型
- 方差分析方法
- 时间序列过程和状态空间模型
- 广义的矩量法

### 评估线性模型

统计模型中有几种线性回归模型，从较基本的（如，普通最小二乘）到更复杂的（例如，迭代重新加权的最小二乘）

`statsmodels`中的线性模型有两种不同的接口
1. **基于数组的**
2. **基于公式的**

In [54]:
import statsmodels.api as sm
import statsmodels.formula.api as smf

In [67]:
def dnorm(mean, variance, size=1):
    if isinstance(size, int):
        size = size,
    return mean + np.sqrt(variance) * np.random.randn(*size)

In [68]:
np.random.seed(12345)
N = 100
X = np.c_[
    
    dnorm(0, 0.4, size=N),
    dnorm(0, 0.4, size=N),
    dnorm(0, 0.4, size=N)
    
]
X 

array([[-0.1295, -0.9902,  0.7131],
       [ 0.3029, -0.3558, -0.3595],
       [-0.3285, -0.0207,  0.1957],
       [-0.3515, -0.5876, -0.3652],
       [ 1.2433, -0.3052, -0.7391],
       [ 0.8813, -0.0229, -0.5218],
       [ 0.0588,  0.6928, -1.6725],
       [ 0.1782,  0.6204, -0.0968],
       [ 0.4864, -0.3728, -0.4756],
       [ 0.7883,  1.0004, -0.0839],
       [ 0.637 , -0.3344,  0.9217],
       [-0.8198,  0.289 ,  0.3855],
       [ 0.1739,  0.5882, -0.3123],
       [ 0.1448, -0.9925,  0.7842],
       [ 0.8557, -0.6467, -0.0858],
       [ 0.5606, -0.2548,  0.9044],
       [-1.2659,  0.1394, -0.5356],
       [-0.2352, -0.1223,  0.3815],
       [ 1.0556,  0.4232,  0.7992],
       [-0.2774, -1.0429, -0.1616],
       [-0.3414, -1.4248, -0.2819],
       [ 0.3017, -0.738 ,  0.2962],
       [ 2.0548,  0.2236, -0.6082],
       [-0.6459,  0.4441, -1.1539],
       [-0.365 , -0.1737,  0.3956],
       [ 0.0785, -0.088 ,  0.6469],
       [ 0.1914,  0.0681,  0.7004],
       [ 0.3313, -0.3836,  0

In [59]:
eps = dnorm(0, 0.1, size=N) 

In [62]:
beta = [0.1, 0.3, 0.5]

In [60]:
y = np.dot(X, [0.1, 0.3, 0.5]) * eps

In [61]:
y 

array([ 0.0257,  0.1142, -0.007 ,  0.0431, -0.0405, -0.0107, -0.2605,
       -0.1114,  0.0871, -0.051 ,  0.1405,  0.0577, -0.0014,  0.014 ,
       -0.0138, -0.0594, -0.04  , -0.016 ,  0.4255, -0.1878,  0.0201,
       -0.0095, -0.0209,  0.022 , -0.0321,  0.0316,  0.1606,  0.0237,
        0.0086, -0.0341, -0.0184,  0.447 ,  0.0641,  0.4334,  0.0326,
        0.1485, -0.0338,  0.0003, -0.0241,  0.044 , -0.156 , -0.0585,
       -0.0851, -0.1858,  0.0085, -0.1493,  0.0218,  0.0214, -0.0062,
       -0.0363, -0.0163,  0.0691, -0.0417, -0.1064,  0.1187, -0.2252,
        0.0343, -0.0044,  0.0262,  0.0225, -0.0525,  0.0085, -0.1271,
       -0.3843,  0.0311, -0.0398,  0.0545, -0.0779,  0.0772, -0.0103,
       -0.3595,  0.0356, -0.3162, -0.0105, -0.0068,  0.0369,  0.0937,
       -0.0167,  0.0094, -0.1703, -0.0252,  0.0154,  0.0319, -0.0236,
        0.4024, -0.0002, -0.6177, -0.1472,  0.0876,  0.0217, -0.0089,
        0.1352,  0.0191,  0.0967, -0.0182,  0.0063, -0.0531,  0.0471,
       -0.0077,  0.0

In [69]:
x_mode  = sm.add_constant(X)

In [72]:
x_mode[:5]

array([[ 1.    , -0.1295, -0.9902,  0.7131],
       [ 1.    ,  0.3029, -0.3558, -0.3595],
       [ 1.    , -0.3285, -0.0207,  0.1957],
       [ 1.    , -0.3515, -0.5876, -0.3652],
       [ 1.    ,  1.2433, -0.3052, -0.7391]])

In [75]:
model = sm.OLS(y,X)

In [78]:
model.fit().params

array([0.0225, 0.0014, 0.0578])

In [80]:
model.fit().summary()

0,1,2,3
Dep. Variable:,y,R-squared (uncentered):,0.074
Model:,OLS,Adj. R-squared (uncentered):,0.046
Method:,Least Squares,F-statistic:,2.595
Date:,"Thu, 19 Nov 2020",Prob (F-statistic):,0.0569
Time:,11:55:54,Log-Likelihood:,56.378
No. Observations:,100,AIC:,-106.8
Df Residuals:,97,BIC:,-98.94
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
x1,0.0225,0.021,1.049,0.297,-0.020,0.065
x2,0.0014,0.023,0.063,0.950,-0.044,0.047
x3,0.0578,0.023,2.520,0.013,0.012,0.103

0,1,2,3
Omnibus:,18.102,Durbin-Watson:,1.797
Prob(Omnibus):,0.0,Jarque-Bera (JB):,94.905
Skew:,0.004,Prob(JB):,2.4600000000000003e-21
Kurtosis:,7.773,Cond. No.,1.1


In [81]:
data = pd.DataFrame(X, columns=['col0', 'col1', 'col2'])
data['y'] = y
data[:5]

Unnamed: 0,col0,col1,col2,y
0,-0.129468,-0.990209,0.713082,0.02571
1,0.30291,-0.355782,-0.359465,0.114244
2,-0.328522,-0.020659,0.195658,-0.007031
3,-0.351475,-0.587555,-0.365171,0.043087
4,1.243269,-0.305206,-0.739109,-0.040477


In [83]:
results = smf.ols('y ~ col0 + col1 + col2', data=data).fit()
results.params
results.tvalues

Intercept   -0.015065
col0         1.043712
col1         0.062095
col2         2.464623
dtype: float64

### 评估时间序列处理

`statsmodels`中的另一类模型用于**时间序列分析**，其中包括**自回归模型，卡尔曼滤波和其他状态的空间参数，以及多变量自回归模型**

In [84]:
init_x = 4

import random
values = [init_x, init_x]
N = 1000

b0 = 0.8
b1 = -0.4
noise = dnorm(0, 0.1, N)
for i in range(N):
    new_x = values[-1] * b0 + values[-2] * b1 + noise[i]
    values.append(new_x)

In [85]:
values

[4,
 4,
 2.152523777752035,
 -0.32393996541078857,
 -1.2397719558149354,
 -0.9715996753389815,
 -0.16118465379544833,
 0.3194575104761386,
 0.7385035048026619,
 -0.25310599187150973,
 -0.7872278373809024,
 -0.6800016407228995,
 0.10203863666923912,
 0.6458097097082918,
 0.43973486913794985,
 0.22179001127225442,
 0.09275453205781499,
 -0.15200457222244781,
 -0.04525697830633495,
 -0.09817758493656323,
 0.6127294075449543,
 0.9752106985440546,
 0.5017355496159331,
 0.23279905799896766,
 0.6473629005975279,
 0.38145641217990833,
 -0.24802662215445037,
 -0.24744014381637924,
 0.31330744942782,
 -0.09607014293098726,
 -0.2477557196828938,
 -0.3863265250050961,
 -0.17719324959580113,
 -0.4602117274121091,
 -0.6688260465492446,
 0.47564294572985005,
 0.43006663646413656,
 -0.08389730298513823,
 -0.0379347901096814,
 -0.3628681369756984,
 -0.08074173433966625,
 0.40354609765419075,
 0.5664598890683412,
 0.03575187506493488,
 -0.4851566606359389,
 0.07580181765662425,
 0.28478289140952945,
 0.

In [86]:
MAXLAGS = 5
model = sm.tsa.AR(values)
results = model.fit(MAXLAGS)

statsmodels.tsa.AR has been deprecated in favor of statsmodels.tsa.AutoReg and
statsmodels.tsa.SARIMAX.

AutoReg adds the ability to specify exogenous variables, include time trends,
and add seasonal dummies. The AutoReg API differs from AR since the model is
treated as immutable, and so the entire specification including the lag
length must be specified when creating the model. This change is too
substantial to incorporate into the existing AR api. The function
ar_select_order performs lag length selection for AutoReg models.

AutoReg only estimates parameters using conditional MLE (OLS). Use SARIMAX to
estimate ARX and related models using full MLE via the Kalman Filter.





In [87]:
results.params

array([ 0.002 ,  0.7857, -0.4179,  0.0121, -0.0169,  0.0353])

## scikit-learn介绍

`scikit-learn`是**使用最广泛且最受信任的通用Python机器学习库**，包含**广泛的标准监督和无监督的机器学习方法**，包括用于模型选择和评估，数据转换，数据加载和模型持久化的工具，这些模型可用于分类，聚类，预测和其他常见任务

In [90]:
train = pd.read_csv('pydata-book/datasets/titanic/train.csv')
test = pd.read_csv('pydata-book/datasets/titanic/test.csv')

In [91]:
train

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [92]:
test

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0000,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S
...,...,...,...,...,...,...,...,...,...,...,...
413,1305,3,"Spector, Mr. Woolf",male,,0,0,A.5. 3236,8.0500,,S
414,1306,1,"Oliva y Ocana, Dona. Fermina",female,39.0,0,0,PC 17758,108.9000,C105,C
415,1307,3,"Saether, Mr. Simon Sivertsen",male,38.5,0,0,SOTON/O.Q. 3101262,7.2500,,S
416,1308,3,"Ware, Mr. Frederick",male,,0,0,359309,8.0500,,S


In [95]:
train.columns.difference(test.columns)

Index(['Survived'], dtype='object')

In [100]:
train.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [103]:
test.isna().sum()

PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64

In [106]:
inmput_value = train['Age'].median();inmput_value

28.0

In [108]:
train['Age'] = train['Age'].fillna(inmput_value)

In [109]:
test['Age'] = test['Age'].fillna(inmput_value)

In [137]:
train['IsFemale'] = (train['Sex'] == 'female').astype(int)
test['IsFemale'] = (test['Sex'] == 'female').astype(int)

In [138]:
train

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,IsFemale
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S,0
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,1
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S,1
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S,1
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S,0
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S,1
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,28.0,1,2,W./C. 6607,23.4500,,S,1
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C,0


In [139]:
predictors = ['Pclass', 'IsFemale', 'Age']

In [140]:
X_train = train[predictors].values
X_test = test[predictors].values


In [141]:
y_train = train['Survived'].values

In [142]:
X_train[:5]

array([[ 3.,  0., 22.],
       [ 1.,  1., 38.],
       [ 3.,  1., 26.],
       [ 1.,  1., 35.],
       [ 3.,  0., 35.]])

In [143]:
from sklearn.linear_model import LogisticRegression
model = LogisticRegression()

In [144]:
model.fit(X_train, y_train)

LogisticRegression()

In [145]:
y_predict = model.predict(X_test)
y_predict[:10]

array([0, 0, 0, 0, 1, 0, 1, 0, 1, 0])

In [146]:
from sklearn.linear_model import LogisticRegressionCV
model_cv = LogisticRegressionCV(10)
model_cv.fit(X_train, y_train)



LogisticRegressionCV()

In [147]:
from sklearn.model_selection import cross_val_score
model = LogisticRegression(C=10)
scores = cross_val_score(model, X_train, y_train, cv=4)
scores

array([0.7758, 0.7982, 0.7758, 0.7883])