<a href="https://colab.research.google.com/github/sunnyskydream/ML-practice/blob/main/1_A_Supervised_Learning_Multiple_Linear_Regression.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import pandas as pd
import scipy as sp
import sklearn

import matplotlib.pyplot as plt
import seaborn as sns
sns.set_theme()
%matplotlib inline

import requests, zipfile
import io

# Download the UCI imports-85 automobile data; the raw file has no header row,
# so column names are assigned explicitly below.
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/autos/imports-85.data'
# Bound the request time and fail loudly on an HTTP error instead of silently
# parsing an error page as CSV.
response = requests.get(url, timeout = 30)
response.raise_for_status()
res = response.content

auto = pd.read_csv(io.StringIO(res.decode('utf-8')), header = None)
# Column labels are written without leading whitespace so that every column can be
# addressed by its plain name (e.g. auto['make'] rather than auto[' make']).
auto.columns = [
    'symboling', 'normalized-losses', 'make', 'fuel-type', 'aspiration',
    'num-of-doors', 'body-style', 'drive-wheels', 'engine-location', 'wheel-base',
    'length', 'width', 'height', 'curb-weight', 'engine-type',
    'num-of-cylinders', 'engine-size', 'fuel-system', 'bore', 'stroke',
    'compression-ratio', 'horsepower', 'peak-rpm', 'city-mpg', 'highway-mpg',
    'price',
]
print('auto.shape: {}'.format(auto.shape))
auto.head()

auto.shape: (205, 26)


Unnamed: 0,symboling,normalized-losses,make,fuel-type,aspiration,num-of-doors,body-style,drive-wheels,engine-location,wheel-base,length,width,height,curb-weight,engine-type,num-of-cylinders,engine-size,fuel-system,bore,stroke,compression-ratio,horsepower,peak-rpm,city-mpg,highway-mpg,price
0,3,?,alfa-romero,gas,std,two,convertible,rwd,front,88.6,168.8,64.1,48.8,2548,dohc,four,130,mpfi,3.47,2.68,9.0,111,5000,21,27,13495
1,3,?,alfa-romero,gas,std,two,convertible,rwd,front,88.6,168.8,64.1,48.8,2548,dohc,four,130,mpfi,3.47,2.68,9.0,111,5000,21,27,16500
2,1,?,alfa-romero,gas,std,two,hatchback,rwd,front,94.5,171.2,65.5,52.4,2823,ohcv,six,152,mpfi,2.68,3.47,9.0,154,5000,19,26,16500
3,2,164,audi,gas,std,four,sedan,fwd,front,99.8,176.6,66.2,54.3,2337,ohc,four,109,mpfi,3.19,3.4,10.0,102,5500,24,30,13950
4,2,164,audi,gas,std,four,sedan,4wd,front,99.4,176.6,66.4,54.3,2824,ohc,five,136,mpfi,3.19,3.4,8.0,115,5500,18,22,17450


In [2]:
# Narrow the frame to the target (price) plus the three columns used as features.
selected_columns = ['price', 'horsepower', 'width', 'height']
auto = auto[selected_columns]
# Per column, count the cells whose value is the literal string '?'.
auto.isin(['?']).sum()

price         4
horsepower    2
width         0
height        0
dtype: int64

In [3]:
# Turn every '?' cell into NaN, then discard each row that has any missing value.
auto = auto.replace(to_replace = '?', value = np.nan)
auto = auto.dropna()
print('auto.shape: {}'.format(auto.shape))

auto.shape: (199, 4)


In [4]:
# Convert price and horsepower to numeric dtypes in one assign call, then print
# the resulting dtypes to confirm the conversion took effect.
auto = auto.assign(
    price = pd.to_numeric(auto['price']),
    horsepower = pd.to_numeric(auto['horsepower']),
)
print('change data type confirm \n{}\n'.format(auto.dtypes))

change data type confirm 
price           int64
horsepower      int64
width         float64
height        float64
dtype: object



In [5]:
# Check multicollinearity: the width/horsepower correlation is fairly high, but all
# three features are kept here for testing purposes.
auto.corr()

Unnamed: 0,price,horsepower,width,height
price,1.0,0.810533,0.753871,0.13499
horsepower,0.810533,1.0,0.615315,-0.087407
width,0.753871,0.615315,1.0,0.309223
height,0.13499,-0.087407,0.309223,1.0


In [6]:
# Fit a multiple linear regression that predicts price from the remaining columns.
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

# price is the target; every other column of auto is a feature.
y = auto['price']
X = auto.drop(columns = 'price')

# Hold out half of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.5, random_state = 0)

# Create the estimator and fit it on the training half (fit returns the estimator).
model = LinearRegression().fit(X_train, y_train)

# R squared on each half of the split.
train_r2 = model.score(X_train, y_train)
test_r2 = model.score(X_test, y_test)
print('R square(train): {:.3f}'.format(train_r2))
print('R square(test): {:.3f}'.format(test_r2))

# Fitted coefficients labelled by feature name, followed by the intercept.
coefficients = pd.Series(model.coef_, index = X.columns)
print('\n coefficient\n{}'.format(coefficients))
print('\n intercept\n{}'.format(model.intercept_))

R square(train): 0.733
R square(test): 0.737

 coefficient
horsepower      81.651078
width         1829.174506
height         229.510077
dtype: float64

 intercept
-128409.0463033857


Regularization 正則化 抑制模型複雜度的作用項<br/>
Regularization 方法的一般形式是 $\min_{w} \; \sum_{i=1}^{n} L\big(y_i, f(x_i; w)\big) + \lambda R(w)$，其中 $L$ 是损失函数，$R$ 是 regularization term，$\lambda$ 控制惩罚的强度。

一般方法有
*   L1 regularization：对系数绝对值之和进行惩罚。
*   L2 regularization：对系数平方和进行惩罚。
*   Elastic-net 混合regularization

https://www.zhihu.com/question/59939602

在多元線性回歸中，模型的複雜度是由分析者調整投入的解釋變數數量來控制；相對於此，Lasso/Ridge 回歸則是由模型本身來抑制參數的大小。

Lasso Regression <br/>
Ridge Regression

In [7]:
# Preview the first rows of auto ahead of the Ridge vs. LinearRegression comparison.
auto.head(n = 5)

Unnamed: 0,price,horsepower,width,height
0,13495,111,64.1,48.8
1,16500,111,64.1,48.8
2,16500,154,65.5,52.4
3,13950,102,66.2,54.3
4,17450,115,66.4,54.3


In [9]:
# Compare ordinary least squares with Ridge (L2-penalized) regression on one split.
# LinearRegression is imported here as well so the cell does not depend on an
# import made in an earlier cell.
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.model_selection import train_test_split

# price is the target; the remaining columns are the features.
X = auto.drop('price', axis = 1)
y = auto['price']

# Train/test split: half of the rows each, with a fixed seed for repeatability.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.5, random_state = 0)

# Initialize both estimators.
linear = LinearRegression()
ridge = Ridge(random_state = 0)
# NOTE(review): the features are not standardized and the L2 penalty is
# scale-sensitive; consider scaling before drawing conclusions from Ridge.

# Fit each estimator and report R squared on both halves.
# The loop variable rebinds the notebook-level name `model`; once the loop
# finishes, `model` refers to the Ridge estimator.
for model in [linear, ridge]:
  model.fit(X_train, y_train)
  print('{}(train): {:.6f}'.format(model.__class__.__name__ , model.score(X_train,y_train)))
  print('{}(test): {:.6f}'.format(model.__class__.__name__ , model.score(X_test,y_test)))

# LinearRegression scores marginally higher on the train set and Ridge marginally
# higher on the test set. That is the direction regularization would be expected
# to push, but the gap in the recorded run is below 0.001, so it is weak evidence.

LinearRegression(train): 0.733358
LinearRegression(test): 0.737069
Ridge(train): 0.733355
Ridge(test): 0.737768


In [11]:
#https://stackoverflow.com/questions/36367736/use-name-as-attribute