In [2]:
# Numeric and tabular libraries used by the cells below.
import numpy as np
import pandas as pd
# Seed NumPy's legacy global random state so np.random.* calls are repeatable.
np.random.seed(12345)
import matplotlib.pyplot as plt
# Default size applied to every matplotlib figure created after this point.
plt.rc('figure', figsize=(10, 6))
# Capture the row limit that was in force before it is overridden below.
PREVIOUS_MAX_ROWS = pd.get_option('display.max_rows')

# Keep DataFrame reprs compact: limit columns, rows and per-cell text width.
pd.set_option('display.max_columns', 20)
pd.set_option('display.max_rows', 20)
pd.set_option('display.max_colwidth', 80)

# Print NumPy arrays with 4 decimals and without scientific notation.
np.set_printoptions(suppress=True, precision=4)

In [3]:
# Small example frame; x0 and x1 are used as predictors and y as the response
# in the formula cells further down.
data = pd.DataFrame(
    dict(
        x0=[1, 2, 3, 4, 5],
        x1=[0.01, -0.01, 0.25, -4.1, 0.0],
        y=[-1.5, 0.0, 3.6, 1.3, -2.0],
    )
)
data

Unnamed: 0,x0,x1,y
0,1,0.01,-1.5
1,2,-0.01,0.0
2,3,0.25,3.6
3,4,-4.1,1.3
4,5,0.0,-2.0


In [4]:
data.columns

Index(['x0', 'x1', 'y'], dtype='object')

In [5]:
data.to_numpy()

array([[ 1.  ,  0.01, -1.5 ],
       [ 2.  , -0.01,  0.  ],
       [ 3.  ,  0.25,  3.6 ],
       [ 4.  , -4.1 ,  1.3 ],
       [ 5.  ,  0.  , -2.  ]])

In [6]:
# Round-trip through a plain ndarray and attach fresh column labels.
df2 = pd.DataFrame(
    data=data.to_numpy(),
    columns=['one', 'two', 'three'],
)
df2

Unnamed: 0,one,two,three
0,1.0,0.01,-1.5
1,2.0,-0.01,0.0
2,3.0,0.25,3.6
3,4.0,-4.1,1.3
4,5.0,0.0,-2.0


In [7]:
# New frame = `data` plus a string column; `data` itself is left untouched.
df3 = data.assign(strings=['a', 'b', 'c', 'd', 'e'])

In [8]:
df3

Unnamed: 0,x0,x1,y,strings
0,1,0.01,-1.5,a
1,2,-0.01,0.0,b
2,3,0.25,3.6,c
3,4,-4.1,1.3,d
4,5,0.0,-2.0,e


In [9]:
df3.to_numpy()

array([[1, 0.01, -1.5, 'a'],
       [2, -0.01, 0.0, 'b'],
       [3, 0.25, 3.6, 'c'],
       [4, -4.1, 1.3, 'd'],
       [5, 0.0, -2.0, 'e']], dtype=object)

In [10]:
# Columns to extract as a plain array.
model_cols = ['x0', 'x1']
data[model_cols].to_numpy()

array([[ 1.  ,  0.01],
       [ 2.  , -0.01],
       [ 3.  ,  0.25],
       [ 4.  , -4.1 ],
       [ 5.  ,  0.  ]])

In [11]:
# Integer codes (positions into `dim`) and the labels they point to.
values = pd.Series([0, 1, 0, 0, 0, 1, 0, 0])
dim = pd.Series(['apple', 'orange'])
values

0    0
1    1
2    0
3    0
4    0
5    1
6    0
7    0
dtype: int64

In [12]:
dim

0     apple
1    orange
dtype: object

In [13]:
dim.take(values)

0     apple
1    orange
0     apple
0     apple
0     apple
1    orange
0     apple
0     apple
dtype: object

In [14]:
# Attach a two-level categorical column to `data` (modifies `data` in place).
data['category'] = pd.Categorical(
    values=['a', 'b', 'a', 'a', 'b'],
    categories=['a', 'b'],
)
data

Unnamed: 0,x0,x1,y,category
0,1,0.01,-1.5,a
1,2,-0.01,0.0,b
2,3,0.25,3.6,a
3,4,-4.1,1.3,a
4,5,0.0,-2.0,b


In [15]:
data.category

0    a
1    b
2    a
3    a
4    b
Name: category, dtype: category
Categories (2, object): ['a', 'b']

In [16]:
# One-hot encode the categorical column: every level becomes its own
# `category_<level>` indicator column.
#
# The dtype is pinned to uint8 (the default in pandas < 2.0). From pandas 2.0
# on, get_dummies returns booleans unless told otherwise, which would display
# True/False instead of 1/0 and put bool columns into `data_with_dummies`.
dummies = pd.get_dummies(data['category'], prefix='category', dtype='uint8')
dummies

Unnamed: 0,category_a,category_b
0,1,0
1,0,1
2,1,0
3,1,0
4,0,1


In [17]:
# Swap the categorical column for its indicator columns (index-aligned join).
data_with_dummies = data.drop(columns='category').join(dummies)
data_with_dummies

Unnamed: 0,x0,x1,y,category_a,category_b
0,1,0.01,-1.5,1,0
1,2,-0.01,0.0,0,1
2,3,0.25,3.6,1,0
3,4,-4.1,1.3,1,0
4,5,0.0,-2.0,0,1


In [18]:
data

Unnamed: 0,x0,x1,y,category
0,1,0.01,-1.5,a
1,2,-0.01,0.0,b
2,3,0.25,3.6,a
3,4,-4.1,1.3,a
4,5,0.0,-2.0,b


In [19]:
data.drop('category',axis=1)

Unnamed: 0,x0,x1,y
0,1,0.01,-1.5
1,2,-0.01,0.0
2,3,0.25,3.6
3,4,-4.1,1.3
4,5,0.0,-2.0


In [20]:
import patsy
from patsy.highlevel import dmatrices
# Build the response (y) and design (x) matrices from the formula; the left
# side of `~` names the response column, the right side the model terms.
y,x=patsy.dmatrices('y~x0+x1',data)

In [21]:
print(y)

[[-1.5]
 [ 0. ]
 [ 3.6]
 [ 1.3]
 [-2. ]]


In [22]:
x

DesignMatrix with shape (5, 3)
  Intercept  x0     x1
          1   1   0.01
          1   2  -0.01
          1   3   0.25
          1   4  -4.10
          1   5   0.00
  Terms:
    'Intercept' (column 0)
    'x0' (column 1)
    'x1' (column 2)

In [23]:
np.asarray(y)

array([[-1.5],
       [ 0. ],
       [ 3.6],
       [ 1.3],
       [-2. ]])

In [24]:
np.asarray(x)

array([[ 1.  ,  1.  ,  0.01],
       [ 1.  ,  2.  , -0.01],
       [ 1.  ,  3.  ,  0.25],
       [ 1.  ,  4.  , -4.1 ],
       [ 1.  ,  5.  ,  0.  ]])

In [25]:
patsy.dmatrices('y~x0+x1+0',data)[1]

DesignMatrix with shape (5, 2)
  x0     x1
   1   0.01
   2  -0.01
   3   0.25
   4  -4.10
   5   0.00
  Terms:
    'x0' (column 0)
    'x1' (column 1)

In [26]:
# Least-squares fit of y on the design matrix x. Four values come back; only
# the first two are kept (as `coef` and `resid`), the last two are discarded.
coef,resid,_,_=np.linalg.lstsq(x,y,rcond=None)

In [27]:
coef

array([[ 0.3129],
       [-0.0791],
       [-0.2655]])

In [28]:
coef=pd.Series(coef.squeeze(),index=x.design_info.column_names)
coef

Intercept    0.312910
x0          -0.079106
x1          -0.265464
dtype: float64

In [29]:
y,x=patsy.dmatrices('y~x0+np.log(np.abs(x1)+1)',data)
x

DesignMatrix with shape (5, 3)
  Intercept  x0  np.log(np.abs(x1) + 1)
          1   1                 0.00995
          1   2                 0.00995
          1   3                 0.22314
          1   4                 1.62924
          1   5                 0.00000
  Terms:
    'Intercept' (column 0)
    'x0' (column 1)
    'np.log(np.abs(x1) + 1)' (column 2)

In [30]:
# Formula using patsy's built-in transforms. The center(x1) column shown in
# the output equals data['x1'] - data['x1'].mean(), as the next cell confirms.
y,x=patsy.dmatrices('y~standardize(x0)+center(x1)',data)
x

DesignMatrix with shape (5, 3)
  Intercept  standardize(x0)  center(x1)
          1         -1.41421        0.78
          1         -0.70711        0.76
          1          0.00000        1.02
          1          0.70711       -3.33
          1          1.41421        0.77
  Terms:
    'Intercept' (column 0)
    'standardize(x0)' (column 1)
    'center(x1)' (column 2)

In [31]:
data['x1']-data['x1'].mean()

0    0.78
1    0.76
2    1.02
3   -3.33
4    0.77
Name: x1, dtype: float64

In [32]:
# Out-of-sample rows with the same columns as the frame the formula was fit on.
new_data = pd.DataFrame(
    dict(
        x0=[6, 7, 8, 9],
        x1=[3.1, -0.5, 0, 2.3],
        y=[1, 2, 3, 4],
    )
)

# Re-apply the design stored on `x` to the new rows. The result is a list with
# one design matrix per design_info passed in (a single one here).
new_x = patsy.build_design_matrices(
    [x.design_info],
    new_data,
)
new_x

[DesignMatrix with shape (4, 3)
   Intercept  standardize(x0)  center(x1)
           1          2.12132        3.87
           1          2.82843        0.27
           1          3.53553        0.77
           1          4.24264        3.07
   Terms:
     'Intercept' (column 0)
     'standardize(x0)' (column 1)
     'center(x1)' (column 2)]

In [33]:
y,x=patsy.dmatrices('y~I(x0 + x1)',data)
x

DesignMatrix with shape (5, 2)
  Intercept  I(x0 + x1)
          1        1.01
          1        1.99
          1        3.25
          1       -0.10
          1        5.00
  Terms:
    'Intercept' (column 0)
    'I(x0 + x1)' (column 1)

In [34]:
# Rebind `data` to a new example frame: two key columns (one string-valued,
# one 0/1 integer) and two numeric value columns.
data = pd.DataFrame(
    dict(
        key1=['a', 'a', 'b', 'b', 'a', 'b', 'a', 'b'],
        key2=[0, 1, 0, 1, 0, 1, 0, 0],
        v1=[1, 2, 3, 4, 5, 6, 7, 8],
        v2=[-1, 0, 2.5, -0.5, 4.0, -1.2, 0.2, -1.7],
    )
)
# Design matrices for v2 against the string column key1; the output of the
# next cell shows key1 encoded as a single key1[T.b] indicator column.
y,x=patsy.dmatrices('v2~key1',data)

In [35]:
x

DesignMatrix with shape (8, 2)
  Intercept  key1[T.b]
          1          0
          1          0
          1          1
          1          1
          1          0
          1          1
          1          0
          1          1
  Terms:
    'Intercept' (column 0)
    'key1' (column 1)

In [36]:
y,x=patsy.dmatrices('v2~key1+0',data)

In [37]:
x

DesignMatrix with shape (8, 2)
  key1[a]  key1[b]
        1        0
        1        0
        0        1
        0        1
        1        0
        0        1
        1        0
        0        1
  Terms:
    'key1' (columns 0:2)

In [38]:
y,x=patsy.dmatrices('v2~C(key2)',data)
x

DesignMatrix with shape (8, 2)
  Intercept  C(key2)[T.1]
          1             0
          1             1
          1             0
          1             1
          1             0
          1             1
          1             0
          1             0
  Terms:
    'Intercept' (column 0)
    'C(key2)' (column 1)

In [39]:
# Relabel the 0/1 codes in key2 with the strings 'zero' / 'one'.
#
# `replace` is used rather than `map` so the cell can be re-run safely. `map`
# turns every value that is not a key of the dict into NaN, so running the
# cell a second time (when the column already holds 'zero'/'one') would blank
# the whole column. `replace` leaves unmatched values as they are.
data['key2'] = data['key2'].replace({0: 'zero', 1: 'one'})
data

Unnamed: 0,key1,key2,v1,v2
0,a,zero,1,-1.0
1,a,one,2,0.0
2,b,zero,3,2.5
3,b,one,4,-0.5
4,a,zero,5,4.0
5,b,one,6,-1.2
6,a,zero,7,0.2
7,b,zero,8,-1.7


In [40]:
y,x=patsy.dmatrices('v2~key1+key2',data)
x

DesignMatrix with shape (8, 3)
  Intercept  key1[T.b]  key2[T.zero]
          1          0             1
          1          0             0
          1          1             1
          1          1             0
          1          0             1
          1          1             0
          1          0             1
          1          1             1
  Terms:
    'Intercept' (column 0)
    'key1' (column 1)
    'key2' (column 2)

In [41]:
y,x=patsy.dmatrices('v2~key1+key2+key1:key2',data)
x

DesignMatrix with shape (8, 4)
  Intercept  key1[T.b]  key2[T.zero]  key1[T.b]:key2[T.zero]
          1          0             1                       0
          1          0             0                       0
          1          1             1                       1
          1          1             0                       0
          1          0             1                       0
          1          1             0                       0
          1          0             1                       0
          1          1             1                       1
  Terms:
    'Intercept' (column 0)
    'key1' (column 1)
    'key2' (column 2)
    'key1:key2' (column 3)

In [42]:
import statsmodels.api as sm
import statsmodels.formula.api as smf

In [44]:
# Shared, explicitly seeded generator used by dnorm below.
rng = np.random.default_rng(seed=12345)


def dnorm(mean, variance, size=1):
    """Draw normal samples with the given mean and variance from `rng`.

    Parameters
    ----------
    mean : float
        Value added to every scaled draw.
    variance : float
        Variance of the samples; draws are scaled by ``sqrt(variance)``.
    size : int or tuple of int, default 1
        Output shape. An integer ``n`` yields a 1-D array of length ``n``;
        a tuple yields an array of that shape.

    Returns
    -------
    numpy.ndarray
        ``mean + sqrt(variance) * z`` where ``z`` is standard normal.
    """
    # Pass `size` straight through instead of unpacking it with `*size`:
    # unpacking a multi-element tuple would push the second element into
    # standard_normal's `dtype` parameter, and NumPy integer sizes are not
    # iterable at all. For plain ints the draws are unchanged.
    return mean + np.sqrt(variance) * rng.standard_normal(size)
N = 100

# Three predictor columns with variances 0.4, 0.6 and 0.2. They are drawn in
# this order, followed by the noise term, so the shared generator is consumed
# in a fixed sequence.
X = np.c_[tuple(dnorm(0, variance, size=N) for variance in (0.4, 0.6, 0.2))]
eps = dnorm(0, 0.1, size=N)

# Coefficients applied to the columns of X when the response is built.
beta = [0.1, 0.3, 0.5]

eps


array([ 0.0613, -0.0409,  0.1121, -0.3424,  0.0775,  0.0698, -0.2088,
       -0.0691, -0.1758,  0.4289, -0.9865, -0.4578, -0.5394, -0.1196,
       -0.2146, -0.13  ,  0.2274, -0.5162, -0.2674, -0.0706, -0.3802,
        0.117 ,  0.1044, -0.2247, -0.5601,  0.5062,  0.149 ,  0.134 ,
        0.0229, -0.0023,  0.2628, -0.0049, -0.1995,  0.2244,  0.0189,
       -0.2443, -0.2866,  0.3686,  0.1996,  0.6247, -0.1665, -0.2351,
        0.1277,  0.4362,  0.1887,  0.137 ,  0.5967, -0.2351, -0.3946,
       -0.2355, -0.1002,  0.1181,  0.3232,  0.1283,  0.4817, -0.2617,
        0.6026, -0.2568,  0.3702, -0.2966,  0.1086,  0.0161,  0.1637,
        0.2376, -0.0674, -0.0211,  0.0756, -0.4127, -0.2312, -0.5137,
        0.0017,  0.7027,  0.2344, -0.2737, -0.2238,  0.3086, -0.15  ,
       -0.5086, -0.3235, -0.3334, -0.323 ,  0.2138,  0.0224, -0.4707,
       -0.1707, -0.36  , -0.3821, -0.4765,  0.5525, -0.0972,  0.1822,
        0.2747, -0.1215,  0.1692,  0.171 ,  0.0703,  0.6861,  0.0898,
       -0.0723,  0.2

In [45]:
X

array([[-0.9005, -0.1894, -1.0279],
       [ 0.7993, -1.546 , -0.3274],
       [-0.5507, -0.1203,  0.3294],
       [-0.1639,  0.824 ,  0.2083],
       [-0.0477, -0.2131, -0.0482],
       [-0.4686, -1.4356, -0.1527],
       [-0.8651, -0.0963,  0.7086],
       [ 0.4104,  0.608 ,  0.1262],
       [ 0.2284,  0.1565,  0.4068],
       [-1.2351, -0.3316,  0.1767],
       [ 1.4846,  1.4317, -0.2994],
       [ 0.6125,  1.4717,  0.6956],
       [-0.4803, -0.0762, -0.5537],
       [ 0.5706,  0.6301, -0.5349],
       [-0.2953,  0.304 , -0.1919],
       [-0.0384,  0.6053, -0.3263],
       [ 0.4989,  1.1257, -0.2493],
       [-0.7948,  0.6353, -0.2683],
       [ 0.3642,  0.0679,  0.4413],
       [ 0.8848, -0.5062,  0.0242],
       [ 0.8363, -0.6289,  0.1574],
       [-0.1895, -0.0198, -0.7102],
       [ 0.5711,  0.8971, -0.3788],
       [-1.0256,  0.2328,  0.485 ],
       [-0.1   ,  0.0411, -0.5384],
       [ 0.2843,  0.1993,  0.5271],
       [-0.8498,  0.0277, -0.4609],
       [-0.0517,  0.4239,  0

In [46]:
# Response = linear combination of the predictor columns plus noise.
y = X.dot(beta) + eps
y

array([-0.5995, -0.5885,  0.1856, -0.0075, -0.0154, -0.4841,  0.0301,
        0.2175,  0.0973,  0.2943, -0.5582,  0.3928, -0.8872, -0.141 ,
       -0.2488, -0.1155,  0.4903, -0.5393,  0.01  , -0.1218, -0.4065,
       -0.263 ,  0.2412, -0.0149, -0.8269,  0.858 , -0.1582,  0.3229,
       -0.3182, -0.2518,  0.012 , -0.2769,  0.4892,  0.0271,  0.3262,
       -0.6701, -0.4364,  0.1988,  0.2911,  1.2293, -0.1345,  0.1162,
       -0.2833,  0.8264,  0.6517,  0.3693,  0.4606, -0.36  , -0.6794,
       -0.3239,  0.2289,  0.3339, -0.0289,  0.3515,  0.4105,  0.0234,
       -0.0882, -0.4222,  0.9503, -0.8432, -0.1774, -0.5828, -0.0479,
        0.4998, -0.41  , -0.0651, -0.1192, -0.7378,  0.1129, -0.5059,
        0.2002,  1.0372,  0.3964,  0.3722,  0.0822, -0.0632,  0.1685,
       -0.3024,  0.1657, -0.1187, -0.4788,  0.1031, -0.2355, -0.9313,
        0.3353, -0.032 , -0.5318, -0.0093,  0.3378, -0.3119, -0.0479,
        0.3288, -0.1556,  0.3523, -0.1236, -0.0679,  0.8316,  0.0703,
       -0.3865, -0.2

In [47]:
X[:5]

array([[-0.9005, -0.1894, -1.0279],
       [ 0.7993, -1.546 , -0.3274],
       [-0.5507, -0.1203,  0.3294],
       [-0.1639,  0.824 ,  0.2083],
       [-0.0477, -0.2131, -0.0482]])

In [48]:
y[:5]

array([-0.5995, -0.5885,  0.1856, -0.0075, -0.0154])

In [50]:
x_model=sm.add_constant(X)
x_model[:5]

array([[ 1.    , -0.9005, -0.1894, -1.0279],
       [ 1.    ,  0.7993, -1.546 , -0.3274],
       [ 1.    , -0.5507, -0.1203,  0.3294],
       [ 1.    , -0.1639,  0.824 ,  0.2083],
       [ 1.    , -0.0477, -0.2131, -0.0482]])

In [51]:
# NOTE(review): the model is built on X, which has no constant column, rather
# than on the x_model produced by sm.add_constant in the previous cell. No
# intercept is therefore estimated, which matches the three parameters and the
# "R-squared (uncentered)" label in the outputs that follow.
model=sm.OLS(y,X)

In [52]:
results=model.fit()
results.params

array([0.0668, 0.268 , 0.4505])

In [53]:
print(results.summary())

                                 OLS Regression Results                                
Dep. Variable:                      y   R-squared (uncentered):                   0.469
Model:                            OLS   Adj. R-squared (uncentered):              0.452
Method:                 Least Squares   F-statistic:                              28.51
Date:                Fri, 21 Oct 2022   Prob (F-statistic):                    2.66e-13
Time:                        11:45:42   Log-Likelihood:                         -25.611
No. Observations:                 100   AIC:                                      57.22
Df Residuals:                      97   BIC:                                      65.04
Df Model:                           3                                                  
Covariance Type:            nonrobust                                                  
                 coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------

In [54]:
# Wrap the simulated predictors in a labelled frame and append the response,
# so the formula interface can refer to columns by name.
data = pd.DataFrame(X, columns=['col0', 'col1', 'col2']).assign(y=y)
data.head()

Unnamed: 0,col0,col1,col2,y
0,-0.900506,-0.18943,-1.02787,-0.599527
1,0.799252,-1.545984,-0.327397,-0.588454
2,-0.550655,-0.120254,0.329359,0.185634
3,-0.163916,0.82404,0.208275,-0.007477
4,-0.047651,-0.213147,-0.048244,-0.015374


In [55]:
results=smf.ols('y~col0+col1+col2',data=data).fit()
results.params

Intercept   -0.020799
col0         0.065813
col1         0.268970
col2         0.449419
dtype: float64

In [56]:
results.tvalues

Intercept   -0.652501
col0         1.219768
col1         6.312369
col2         6.567428
dtype: float64

In [57]:
results.predict(data[:5])

0   -0.592959
1   -0.531160
2    0.058636
3    0.283658
4   -0.102947
dtype: float64

In [58]:
init_x = 4

# Simulate x_t = b0 * x_{t-1} + b1 * x_{t-2} + noise_t, seeding the recursion
# with two copies of the initial value.
values = [init_x] * 2
N = 1000
b0, b1 = 0.8, -0.4
noise = dnorm(0, 0.1, N)
for i in range(N):
    new_x = b0 * values[-1] + b1 * values[-2] + noise[i]
    values += [new_x]
    

In [59]:
from statsmodels.tsa.ar_model import AutoReg

# Lag count handed to AutoReg as its second argument. The series in `values`
# was generated from two lags; five are fitted here.
MAXLAGS =5
# Note: this rebinds `model` and `results`, replacing the regression objects
# created in earlier cells.
model=AutoReg(values,MAXLAGS)
results=model.fit()

In [60]:
results.params

array([ 0.0235,  0.8097, -0.4287, -0.0334,  0.0427, -0.0567])

In [126]:
# Load the Titanic train/test CSV files; the paths are relative, so they
# resolve against the kernel's current working directory.
train=pd.read_csv('datasets/titanic/train.csv')
test=pd.read_csv('datasets/titanic/test.csv')
# Preview the first four training rows.
train.head(4)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Thayer)",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S


In [127]:
test.head(4)

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S


In [128]:
train.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [129]:
test.isna().sum()

PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64

In [130]:
# Fill missing ages in BOTH frames with the median computed from the training
# set only, so nothing from the test set feeds into the imputation.
impute_value = train['Age'].median()
train['Age'] = train['Age'].fillna(value=impute_value)
test['Age'] = test['Age'].fillna(value=impute_value)


In [131]:
# Encode Sex as a 0/1 indicator column (1 where Sex is 'female').
train['IsFemale'] = train['Sex'].eq('female').astype(int)
test['IsFemale'] = test['Sex'].eq('female').astype(int)


In [132]:
# Feature columns fed to the classifiers. The name `predicotrs` (sic) is kept
# as spelled because a later cell indexes `test` with it.
predicotrs = ['Pclass', 'IsFemale', 'Age']

# Same column selection for both frames, converted to plain arrays.
x_train, x_test = (frame[predicotrs].to_numpy() for frame in (train, test))
y_train = train['Survived'].to_numpy()
x_train[:5]


array([[ 3.,  0., 22.],
       [ 1.,  1., 38.],
       [ 3.,  1., 26.],
       [ 1.,  1., 35.],
       [ 3.,  0., 35.]])

In [133]:
test[predicotrs][:5]

Unnamed: 0,Pclass,IsFemale,Age
0,3,0,34.5
1,3,1,47.0
2,2,0,62.0
3,3,0,27.0
4,3,1,22.0


In [134]:
y_train[:5]

array([0, 1, 1, 1, 0], dtype=int64)

In [135]:
x_test[:5]

array([[ 3. ,  0. , 34.5],
       [ 3. ,  1. , 47. ],
       [ 2. ,  0. , 62. ],
       [ 3. ,  0. , 27. ],
       [ 3. ,  1. , 22. ]])

In [136]:
from sklearn.linear_model import LinearRegression, LogisticRegression

model=LogisticRegression()


In [137]:
model.fit(x_train,y_train)


LogisticRegression()

In [138]:
y_predict=model.predict(x_test)
y_predict[:10]

array([0, 0, 0, 0, 1, 0, 1, 0, 1, 0], dtype=int64)

In [139]:
from sklearn.linear_model import LogisticRegressionCV
# Logistic regression with built-in cross-validation, fitted on the training
# arrays. Cs=10 is passed straight to scikit-learn; see its documentation for
# how that setting is turned into candidate regularisation strengths.
model_cv=LogisticRegressionCV(Cs=10)
model_cv.fit(x_train,y_train)


LogisticRegressionCV()

In [140]:
from sklearn.model_selection import cross_val_score

# Note: this rebinds `model` to a new, unfitted estimator; the classifier
# fitted in the earlier cell is no longer reachable under this name.
model=LogisticRegression(C=10)
# Score the estimator with cv=4 on the training arrays; the output shows one
# score per split.
scores=cross_val_score(model,x_train,y_train,cv=4)
scores

array([0.7758, 0.7982, 0.7758, 0.7883])