# Linear Regression, Ridge and Lasso

In [13]:
# house pricing dataset
from sklearn.datasets import fetch_california_housing
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [14]:
# Load the California housing dataset and confirm the sizes of the feature
# matrix and the target vector before going any further.
housing = fetch_california_housing()
print(f"{housing.data.shape} {housing.target.shape}")

(20640, 8) (20640,)


In [15]:
# Print the raw feature array; the middle rows and columns are abbreviated with "...".
print(housing.data)

[[   8.3252       41.            6.98412698 ...    2.55555556
    37.88       -122.23      ]
 [   8.3014       21.            6.23813708 ...    2.10984183
    37.86       -122.22      ]
 [   7.2574       52.            8.28813559 ...    2.80225989
    37.85       -122.24      ]
 ...
 [   1.7          17.            5.20554273 ...    2.3256351
    39.43       -121.22      ]
 [   1.8672       18.            5.32951289 ...    2.12320917
    39.43       -121.32      ]
 [   2.3886       16.            5.25471698 ...    2.61698113
    39.37       -121.24      ]]


In [16]:
# List the names of the eight features, one per column of housing.data.
print(housing.feature_names)

['MedInc', 'HouseAge', 'AveRooms', 'AveBedrms', 'Population', 'AveOccup', 'Latitude', 'Longitude']


In [17]:
# Wrap the feature array in a DataFrame; at this point the columns still
# carry the default integer labels 0-7.
x = pd.DataFrame(housing.data)
x.head()

Unnamed: 0,0,1,2,3,4,5,6,7
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25


In [18]:
# Replace the default integer column labels with the dataset's feature names.
# Note: this modifies x in place, so the preview shown by the previous cell
# no longer reflects the current state of x.
x.columns = housing.feature_names
x.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25


In [19]:
# Regression target taken straight from the dataset: one value per row of x.
y = housing.target
y

array([4.526, 3.585, 3.521, ..., 0.923, 0.847, 0.894])

In [20]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing; the fixed random_state keeps the
# split the same from one run to the next.
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=0,
)

In [21]:
# Training split size: expect 70% of the 20640 rows, all 8 features.
X_train.shape

(14448, 8)

In [22]:
# Test split size: the remaining 30% of the rows.
X_test.shape

(6192, 8)

In [31]:
# 5-fold cross-validation of a plain linear regression on the full dataset.
# cross_val_score fits clones of the estimator, so the instance passed in is
# left unfitted. It gets its own name (cv_model) so that running this cell
# never rebinds `lg`, the model that the train/test cells fit and predict
# with; re-running this cell therefore cannot leave `lg` unfitted.
# No `scoring` argument is passed, so each fold is scored with the
# estimator's own score method (R^2 for LinearRegression).

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score
cv_model = LinearRegression()
scores = cross_val_score(cv_model, x, y, cv=5)
print(scores.mean())  # average score over the 5 folds
print(scores)         # one score per fold

0.5530311140279577
[0.54866323 0.46820691 0.55078434 0.53698703 0.66051406]


In [25]:
# Baseline model: fit an ordinary linear regression on the training split
# only, keeping the test split unseen for the evaluation cells that follow.

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score
lg = LinearRegression().fit(X_train, y_train)  # fit() returns the fitted estimator itself
lg

In [26]:
# Predict target values for the held-out test rows with the fitted model.
y_pre = lg.predict(X_test)
y_pre

array([2.2702672 , 2.79059912, 1.90984782, ..., 3.56895295, 0.98163812,
       2.72552317])

In [27]:
# Actual target values for the test rows, for comparison with the predictions in y_pre.
y_test

array([1.369, 2.413, 2.007, ..., 3.519, 0.925, 2.983])

In [28]:
from sklearn.metrics import mean_absolute_error, r2_score, mean_squared_error

# Mean absolute error of the test-set predictions (same units as the target).
mean_absolute_error(y_true=y_test, y_pred=y_pre)

0.5361818140641836

In [29]:
# Coefficient of determination (R^2) on the test set; 1.0 would be a perfect fit.
r2_score(y_true=y_test, y_pred=y_pre)

0.5926087785518775

In [30]:
# Mean squared error on the test set (in squared target units).
mean_squared_error(y_true=y_test, y_pred=y_pre)

0.543148967003724