In [2]:
import numpy as np
import pandas as pd

Reading csv files

In [3]:
df=pd.read_csv('patients101.csv')

In [7]:
## checking the first few lines of the csv
df.head(10)
#df.tail()

Unnamed: 0,age,totalchol,sysBP,weight,height,sedmins,obese,marriage,gender
0,52,193,128,92.300003,152.1,60,obese,other,F
1,63,194,112,71.099998,151.7,300,obese,married,F
2,48,225,128,58.099998,162.9,480,normal,divorced,F
3,21,145,106,79.800003,170.0,120,overweight,married,M
4,66,224,124,116.2,160.0,480,obese,widowed,F
5,31,270,118,77.5,165.8,480,overweight,married,M
6,64,165,158,88.0,183.7,20,overweight,married,M
7,73,241,124,75.800003,170.2,240,overweight,married,M
8,39,240,122,100.8,170.3,2,obese,married,F
9,73,183,196,81.199997,160.6,240,obese,married,F


In [9]:
### checking the type of each column
df.dtypes
### checking dimensions of df
df.shape

age            int64
totalchol      int64
sysBP          int64
weight       float64
height       float64
sedmins        int64
obese         object
marriage      object
gender        object
dtype: object

Run basic linear Regression

In [10]:
from sklearn.linear_model import LinearRegression

In [11]:
### taking 'age' and 'totalchol' as covariate, 'sysBP' as response
X = df[['age','totalchol']]
y = df['sysBP']

In [12]:
### to run linear model, we need an LinearRegression class
linearmodel = LinearRegression()
linearmodel.fit(X,y)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [15]:
linearmodel.intercept_

92.99651464509856

In [13]:
### checking linear model coefficient
linearmodel.coef_

array([0.63869737, 0.00300319])

In [14]:
### using our fitted model for prediction
linearmodel.predict(X[0:5])

array([126.7883926 , 133.81706682, 124.32970507, 106.8446213 ,
       135.8232545 ])

In [19]:
### for discrete data like gender
X_1 = df[['age','totalchol','gender']]
X_1.head()

Unnamed: 0,age,totalchol,gender
0,52,193,F
1,63,194,F
2,48,225,F
3,21,145,M
4,66,224,F


In [20]:
### the following code will not work
#linearmodel.fit(X_1,y)
### create dummy variables to deal with discrete value
gender_dummy = pd.get_dummies(X_1['gender'])

In [21]:
X_1_withDummies = pd.concat([X_1.drop('gender',axis = 1),gender_dummy], axis=1, sort=False)
X_1_withDummies.head()

Unnamed: 0,age,totalchol,F,M
0,52,193,1,0
1,63,194,1,0
2,48,225,1,0
3,21,145,0,1
4,66,224,1,0


In [22]:
linearmodel.fit(X_1_withDummies,y)
### remark: after the above fit, your first fit with only 'age' and 'totalchol' is upgraded.

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [23]:
linearmodel.coef_

array([ 0.63488443,  0.00425716, -1.02301701,  1.02301701])

Exercise: Try other regression like logistic regression with gender as the response

Matrix Related

In [14]:
np.random.seed(456) # set seed number
A = np.random.rand(5,5)*5 # generate a 5-by-5 matrix A

## compute eigenvalues and eigenvectors by default method
eigenvals, eigenvecs = np.linalg.eig(A)
## print the eigenvectors
print(eigenvecs)

[[ 0.477782   -0.07423003  0.68949419 -0.89650641 -0.84354297]
 [ 0.53081157  0.6750859  -0.48336982  0.3289025   0.46911142]
 [ 0.36522739 -0.3950795  -0.23209361  0.26666139  0.12164777]
 [ 0.58220347  0.50568844  0.46009943 -0.03088222 -0.11999566]
 [ 0.13270835 -0.35628685 -0.15935013  0.12663881  0.19792066]]


In [15]:
## print the eigenvalues
print(eigenvals)

[10.93980155  2.1571264   1.32795015 -0.52341536  0.06642139]


In [16]:
### matrix norm
np.linalg.norm(A)

13.10976091757407

In [17]:
### the norm function is default to be Frobenius norm
np.sqrt(np.sum(A**2))

13.10976091757407

In [18]:
### matrix nuclear norm
np.linalg.norm(A,ord = 'nuc')

19.745225836681367

Exercise: nuclear norm of A can be calculated as :
$$trace(\sqrt{A*A})$$
Use the formula above to check the nuclear norm with the one calculated by np.linalg.norm

PCA

In [19]:
from sklearn.decomposition import PCA,KernelPCA

In [20]:
pca = PCA(n_components=5)
pca.fit(A)

PCA(copy=True, iterated_power='auto', n_components=5, random_state=None,
    svd_solver='auto', tol=0.0, whiten=False)

In [21]:
pca.singular_values_

array([4.19290602e+00, 3.91564940e+00, 1.64096295e+00, 9.94234693e-01,
       2.72616511e-16])

In [22]:
pca.explained_variance_ratio_

array([4.80418784e-01, 4.18983886e-01, 7.35846574e-02, 2.70126723e-02,
       2.03092569e-33])

pca is closely related with the covariance matrix

In [23]:
var_A = np.cov(A)

In [24]:
np.linalg.svd(var_A)

(array([[-0.63266383,  0.09762752, -0.72857525,  0.24368086,  0.00175856],
        [ 0.74662103,  0.0569548 , -0.65743801, -0.05052401,  0.06734821],
        [ 0.03287184,  0.83860299,  0.06018111, -0.06682672, -0.53626196],
        [-0.20187217, -0.0153911 , -0.14952001, -0.96548082,  0.06709166],
        [-0.02146131,  0.53266608,  0.10476341,  0.0380518 ,  0.83867929]]),
 array([4.15967350e+00, 1.48448199e+00, 1.12406605e+00, 6.13834847e-02,
        1.97487468e-16]),
 array([[-0.63266383,  0.74662103,  0.03287184, -0.20187217, -0.02146131],
        [ 0.09762752,  0.0569548 ,  0.83860299, -0.0153911 ,  0.53266608],
        [-0.72857525, -0.65743801,  0.06018111, -0.14952001,  0.10476341],
        [ 0.24368086, -0.05052401, -0.06682672, -0.96548082,  0.0380518 ],
        [-0.00175856, -0.06734821,  0.53626196, -0.06709166, -0.83867929]]))