In [1]:
# load libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import linear_model
from sklearn.preprocessing import scale, StandardScaler
from sklearn.decomposition import PCA

In [2]:
# load dataset
data = pd.read_csv("data_lecture_12.csv")
data.head()

Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,OD280_OD315_of_diluted_wines,proline
0,14.23,1.71,2.43,15.6,127,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065
1,13.2,1.78,2.14,11.2,100,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050
2,13.16,2.36,2.67,18.6,101,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185
3,14.37,1.95,2.5,16.8,113,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480
4,13.24,2.59,2.87,21.0,118,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735


In [3]:
# partition dataset into training and validation sets using holdout method
X_train, X_val, Y_train, Y_val = train_test_split(data[['malic_acid', 'ash', 'alcalinity_of_ash', 'magnesium', 'total_phenols',\
                                                         'flavanoids', 'nonflavanoid_phenols', 'proanthocyanins', 'color_intensity',\
                                                         'hue', 'OD280_OD315_of_diluted_wines', 'proline']],\
                                                    data['alcohol'], test_size = 0.2, random_state = 0)

In [4]:
# explore training set
print(X_train.info())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 142 entries, 161 to 172
Data columns (total 12 columns):
malic_acid                      142 non-null float64
ash                             142 non-null float64
alcalinity_of_ash               142 non-null float64
magnesium                       142 non-null int64
total_phenols                   142 non-null float64
flavanoids                      142 non-null float64
nonflavanoid_phenols            142 non-null float64
proanthocyanins                 142 non-null float64
color_intensity                 142 non-null float64
hue                             142 non-null float64
OD280_OD315_of_diluted_wines    142 non-null float64
proline                         142 non-null int64
dtypes: float64(10), int64(2)
memory usage: 14.4 KB
None


In [5]:
# explore validation set
print(X_val.info())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 36 entries, 54 to 44
Data columns (total 12 columns):
malic_acid                      36 non-null float64
ash                             36 non-null float64
alcalinity_of_ash               36 non-null float64
magnesium                       36 non-null int64
total_phenols                   36 non-null float64
flavanoids                      36 non-null float64
nonflavanoid_phenols            36 non-null float64
proanthocyanins                 36 non-null float64
color_intensity                 36 non-null float64
hue                             36 non-null float64
OD280_OD315_of_diluted_wines    36 non-null float64
proline                         36 non-null int64
dtypes: float64(10), int64(2)
memory usage: 3.7 KB
None


In [6]:
# scale data
X_train = scale(X_train)
X_val = scale(X_val)
print([X_train.mean(), X_val.mean()]) # show means
print([X_train.std(), X_val.std()]) # show standard deviation

[-7.114809524006285e-17, 1.2592807455239045e-16]
[1.0, 1.0]


In [7]:
# build regression model with all as predictors
model = linear_model.LinearRegression().fit(X = X_train, y = Y_train)
score_train = model.score(X = X_train, y = Y_train) # R squared (training)
score_val = model.score(X = X_val, y = Y_val) # R squared (validation)
print(model.coef_)
print([score_train, score_val])

[ 0.11423491  0.05735224 -0.10531089  0.0446806  -0.0131364   0.21775444
  0.05690744 -0.15666349  0.42101127  0.05115575  0.08839278  0.24272   ]
[0.6048488360784352, 0.41220150505672914]


In [8]:
# build LASSO regression model
model = linear_model.Lasso(alpha = 1).fit(X = X_train, y = Y_train)
print(model.coef_)
print(model.intercept_)
score_train = model.score(X = X_train, y = Y_train) # R squared (training)
score_val = model.score(X = X_val, y = Y_val) # R squared (validation)
print([score_train, score_val])

[ 0.  0. -0.  0.  0.  0. -0.  0.  0. -0.  0.  0.]
12.984859154929579
[0.0, -0.008879893188963761]


In [9]:
# reduce dimensionality using PCA
pca = PCA(n_components = 6).fit(X_train) # project data into principal components
X_train_pca = pca.transform(X_train) # apply projection on training set
X_val_pca = pca.transform(X_val) # apply projection on validation set

In [10]:
# show percentage of variance explained by each principal component
print(pca.explained_variance_ratio_)

[0.39461488 0.1662307  0.11371862 0.07620343 0.06504009 0.05129967]


In [11]:
# QUESTION: how many principal components do we use?
# ANSWER: we use the first six principal components that explain 86.7% of the variance

In [12]:
# show coefficients of principal components
print(pca.components_)

[[-0.25694531 -0.02386606 -0.23700857  0.11726249  0.38652386  0.42819794
  -0.30911422  0.30881861 -0.1389186   0.32088422  0.38579505  0.26212283]
 [-0.24469632 -0.42237884 -0.03778652 -0.36660273 -0.17228757 -0.07659385
  -0.00339845 -0.1698935  -0.5501029   0.28175272  0.13125497 -0.40279499]
 [-0.04010051 -0.58253958 -0.62559843  0.09732088 -0.14217041 -0.11347446
  -0.20693714 -0.08955465  0.24509037 -0.17577975 -0.20556826  0.20257547]
 [-0.17453814  0.06056349  0.16958229  0.77976808 -0.23862671 -0.1555229
  -0.35750766 -0.13182968 -0.24595393  0.11793382 -0.12333527 -0.11462073]
 [ 0.61797779 -0.25927728  0.06276918  0.10478326  0.07462063  0.06148154
  -0.25223024  0.42222642 -0.15909143 -0.30885211  0.16887734 -0.36809851]
 [-0.39146633 -0.24777433  0.11642874  0.18101695 -0.02195904 -0.06475922
   0.44560084  0.66420598  0.13637706  0.04473311 -0.21592031 -0.16556636]]


In [13]:
# build regression model with principal components as predictors
model = linear_model.LinearRegression().fit(X = X_train_pca, y = Y_train)
score_train = model.score(X = X_train_pca, y = Y_train) # R squared (training)
score_val = model.score(X = X_val_pca, y = Y_val) # R squared (validation)
print(model.coef_)
print([score_train, score_val])

[ 0.07732375 -0.35592129  0.13684361 -0.16614539 -0.17149378 -0.1551822 ]
[0.5712788929279324, 0.5181850359346911]
