### normal_equation code

In [20]:
import numpy as np


class LinearRegression(object):
    """Ordinary least-squares linear regression solved via the normal equation.

    Parameters
    ----------
    fit_intercept : bool, default True
        When True, a column of ones is prepended to the feature matrix so
        that a constant term is estimated together with the feature weights.
    copy_X : bool, default True
        When True, ``fit`` converts X into a fresh float array; when False,
        X is referenced directly if it already is a float array.
    """

    def __init__(self, fit_intercept=True, copy_X=True):
        self.fit_intercept = fit_intercept
        self.copy_X = copy_X

        self._coef = None
        self._intercept = None
        self._new_X = None

    def fit(self, X, y):
        """
        Fit the linear regression model.

        The weights are obtained from the normal equation
        ``(X^T X) w = X^T y``. The matrix actually used for fitting is
        stored in ``self._new_X`` before the computation runs.
        If fit_intercept is True:
            - a column vector of ones is inserted as column 0 of X.
        After fitting, the feature weights are stored in ``self._coef``
        (1-D numpy array) and the constant-term weight in
        ``self._intercept`` (scalar). ``self._intercept`` is None when
        fit_intercept is False.

        Parameters
        ----------
        X : numpy array, 2-D matrix of shape [n_samples, n_features]
        y : numpy array, 1-D vector of shape [n_samples]

        Returns
        -------
        self : the current instance

        Raises
        ------
        numpy.linalg.LinAlgError
            If ``X^T X`` is singular (e.g. perfectly collinear features).
        """
        if self.copy_X:
            self._new_X = np.array(X, dtype=float)
        else:
            self._new_X = np.asarray(X, dtype=float)
        # Column vector so that X^T y has shape (n_weights, 1).
        target = np.asarray(y, dtype=float).reshape(-1, 1)

        if self.fit_intercept:
            bias_column = np.ones((self._new_X.shape[0], 1))
            self._new_X = np.concatenate((bias_column, self._new_X), axis=1)

        gram = self._new_X.T.dot(self._new_X)
        moment = self._new_X.T.dot(target)
        # Solve the linear system directly instead of forming an explicit
        # inverse of X^T X.
        weights = np.linalg.solve(gram, moment).ravel()

        if self.fit_intercept:
            self._intercept = weights[0]
            self._coef = weights[1:]
        else:
            self._intercept = None
            self._coef = weights
        return self

    def predict(self, X):
        """
        Return predictions of the fitted model for the matrix X.

        X is expected in raw form, without a bias column.
        If fit_intercept is True:
            - the fitted constant term is added to every prediction, which
              is equivalent to prepending a column of ones to X.

        Parameters
        ----------
        X : numpy array, 2-D matrix of shape [n_samples, n_features]

        Returns
        -------
        y : numpy array, 1-D vector of predicted values of shape [n_samples]

        Raises
        ------
        ValueError
            If the model has not been fitted yet.
        """
        if self._coef is None:
            raise ValueError(
                "This LinearRegression instance is not fitted yet; "
                "call fit() first.")
        features = np.asarray(X, dtype=float)
        predictions = features.dot(self._coef)
        if self.fit_intercept:
            predictions = predictions + self._intercept
        return predictions

    @property
    def coef(self):
        """Fitted feature weights (1-D array), or None before fitting."""
        return self._coef

    @property
    def intercept(self):
        """Fitted constant term, or None if absent or not yet fitted."""
        return self._intercept

### linear_regression_example

In [186]:
class LinearRegression(object):
    """Ordinary least-squares linear regression solved via the normal equation.

    Parameters
    ----------
    fit_intercept : bool, default True
        When True, a column of ones is prepended to the feature matrix so
        that a constant term is estimated together with the feature weights.
    copy_X : bool, default True
        Stored on the instance but not consulted by ``fit``, which always
        works on its own copy of the input.
    """

    def __init__(self, fit_intercept=True, copy_X=True):
        self.fit_intercept = fit_intercept
        self.copy_X = copy_X

        self._coef = None
        self._intercept = None
        self._new_X = None

    def fit(self, x, y):
        """Estimate the weights from ``(X^T X) w = X^T y``.

        Parameters
        ----------
        x : array-like, 2-D, shape [n_samples, n_features]
        y : array-like, 1-D, shape [n_samples]

        Returns
        -------
        self : the fitted instance, so calls can be chained.

        Raises
        ------
        numpy.linalg.LinAlgError
            If ``X^T X`` is singular (e.g. perfectly collinear features).
        """
        # Float conversion keeps X^T X out of integer arithmetic, which
        # could overflow for large integer features.
        self._new_X = np.array(x, dtype=float)
        # asarray accepts lists as well as arrays; column shape gives
        # X^T y the shape (n_weights, 1).
        target = np.asarray(y, dtype=float).reshape(-1, 1)

        if self.fit_intercept:
            bias_column = np.ones((len(self._new_X), 1))
            self._new_X = np.concatenate((bias_column, self._new_X), axis=1)

        gram = self._new_X.T.dot(self._new_X)
        moment = self._new_X.T.dot(target)
        # Solve the system directly rather than forming an explicit inverse.
        weights = np.linalg.solve(gram, moment).ravel()

        if self.fit_intercept:
            self._intercept = weights[0]
            self._coef = weights[1:]
        else:
            # Drop any intercept left over from an earlier fit.
            self._intercept = None
            self._coef = weights
        return self

    def predict(self, x):
        """Return predictions for raw (no bias column) feature rows.

        Parameters
        ----------
        x : array-like, 2-D, shape [n_samples, n_features]

        Returns
        -------
        numpy array, 1-D, shape [n_samples]

        Raises
        ------
        ValueError
            If the model has not been fitted yet.
        """
        if self._coef is None:
            raise ValueError(
                "This LinearRegression instance is not fitted yet; "
                "call fit() first.")
        features = np.array(x, dtype=float)
        predictions = features.dot(self._coef)
        if self.fit_intercept:
            # Same result as prepending a column of ones to the features.
            predictions = predictions + self._intercept
        return predictions

    @property
    def coef(self):
        """Fitted feature weights (1-D array), or None before fitting."""
        return self._coef

    @property
    def intercept(self):
        """Fitted constant term, or None if absent or not yet fitted."""
        return self._intercept

In [187]:
import pandas as pd
import numpy as np
import imp

In [188]:
df = pd.read_csv("C:/Data/test2.csv")
df.head()

Unnamed: 0,x,y
0,77,79.775152
1,21,23.177279
2,22,25.609262
3,20,17.857388
4,36,41.849864


In [189]:
# reshape(-1, 1) turns the single feature column into a 2-D
# (n_samples, 1) matrix; the target y is left as a 1-D vector.
x = df["x"].values.reshape(-1,1)
y = df["y"].values

In [190]:
lr = LinearRegression(fit_intercept=True)

In [191]:
lr.fit(x,y)

In [192]:
y.shape

(300,)

In [193]:
lr.fit(x,y)

In [194]:
lr.intercept

-0.46181077366111367

In [195]:
lr.coef

array([1.01433536])

In [196]:
lr.predict(x)[:10]

array([77.64201157, 20.83923168, 21.85356704, 19.82489633, 36.05426201,
       14.75321955, 62.42698124, 95.90004796, 19.82489633,  4.609866  ])

### Validation

In [197]:
from sklearn import linear_model
# The `normalize` keyword was deprecated in scikit-learn 1.0 and removed in
# 1.2; False was its default, so omitting it builds the same model and
# avoids a TypeError on current releases.
sk_lr = linear_model.LinearRegression()
sk_lr.fit(x,y)

LinearRegression()

In [198]:
sk_lr.intercept_

-0.4618107736611776

In [199]:
import numpy.testing as npt
# Check that the hand-written model's intercept agrees with scikit-learn's.
# Only the intercept is compared here, not the coefficients.
npt.assert_almost_equal(sk_lr.intercept_, lr.intercept)

In [201]:
sk_lr.coef_

array([1.01433536])

In [204]:
X_test = df["x"].values.reshape(-1,1)

In [206]:
lr.predict(X_test)[:5]

array([77.64201157, 20.83923168, 21.85356704, 19.82489633, 36.05426201])

In [205]:
sk_lr.predict(X_test)[:5]

array([77.64201157, 20.83923168, 21.85356704, 19.82489633, 36.05426201])

### Load Dataset

In [209]:
df = pd.read_csv("C:/Data/mlr09.csv")
df.head()

Unnamed: 0,height_in_feet,weight_in_pounds,successful_field_goals,percent_of_successful_free_throws,average_points_scored
0,6.8,225,0.442,0.672,9.2
1,6.3,180,0.435,0.797,11.7
2,6.4,190,0.456,0.761,15.8
3,6.2,180,0.416,0.651,8.6
4,6.9,205,0.449,0.9,23.2


In [212]:
y = df["average_points_scored"].values
y

array([ 9.2, 11.7, 15.8,  8.6, 23.2, 27.4,  9.3, 16. ,  4.7, 12.5, 20.1,
        9.1,  8.1,  8.6, 20.3, 25. , 19.2,  3.3, 11.2, 10.5, 10.1,  7.2,
       13.6,  9. , 24.6, 12.6,  5.6,  8.7,  7.7, 24.1, 11.7,  7.7,  9.6,
        7.2, 12.3,  8.9, 13.6, 11.2,  2.8,  3.2,  9.4, 11.9, 15.4,  7.4,
       18.9,  7.9, 12.2, 11. ,  2.8, 11.8, 17.1, 11.6,  5.8,  8.3])

In [213]:
df.iloc[:,:-1].head()

Unnamed: 0,height_in_feet,weight_in_pounds,successful_field_goals,percent_of_successful_free_throws
0,6.8,225,0.442,0.672
1,6.3,180,0.435,0.797
2,6.4,190,0.456,0.761
3,6.2,180,0.416,0.651
4,6.9,205,0.449,0.9


In [214]:
X = df.iloc[:,:-1].values

In [215]:
X[:5]

array([[  6.8  , 225.   ,   0.442,   0.672],
       [  6.3  , 180.   ,   0.435,   0.797],
       [  6.4  , 190.   ,   0.456,   0.761],
       [  6.2  , 180.   ,   0.416,   0.651],
       [  6.9  , 205.   ,   0.449,   0.9  ]])

### Rescaled

In [221]:
# Standardise every feature column: subtract its mean, divide by its
# (population) standard deviation.
mu_X = X.mean(axis=0)
std_X = X.std(axis=0)

rescaled_X = (X - mu_X) / std_X

In [219]:
mu_X

array([  6.58703704, 209.90740741,   0.44911111,   0.74185185])

In [217]:
rescaled_X[:5]

array([[ 0.46843663,  0.50336336, -0.12692668, -0.70404955],
       [-0.63137111, -0.99746237, -0.25187012,  0.55584824],
       [-0.41140956, -0.66394554,  0.12296022,  0.19299768],
       [-0.85133266, -0.99746237, -0.59100234, -0.91571238],
       [ 0.68839818, -0.1636703 , -0.00198323,  1.59400403]])

### Validation

In [222]:
lr.fit(rescaled_X,y)

In [223]:
lr.coef

array([-1.67779283,  0.28359762,  2.68586629,  1.12816882])

In [225]:
lr.intercept

11.790740740740738

In [226]:
sk_lr.fit(rescaled_X,y)

LinearRegression()

In [227]:
sk_lr.coef_

array([-1.67779283,  0.28359762,  2.68586629,  1.12816882])

In [228]:
sk_lr.intercept_

11.790740740740736

### Linear Regression with sklearn

In [229]:
from sklearn.datasets import load_boston
import matplotlib.pyplot as plt
import numpy as np

In [234]:
# NOTE(review): load_boston was removed in scikit-learn 1.2, so this cell
# presumably only runs on older releases — confirm the pinned version.
boston = load_boston()

In [235]:
boston.keys()

dict_keys(['data', 'target', 'feature_names', 'DESCR', 'filename'])

In [240]:
print(boston["data"])

[[6.3200e-03 1.8000e+01 2.3100e+00 ... 1.5300e+01 3.9690e+02 4.9800e+00]
 [2.7310e-02 0.0000e+00 7.0700e+00 ... 1.7800e+01 3.9690e+02 9.1400e+00]
 [2.7290e-02 0.0000e+00 7.0700e+00 ... 1.7800e+01 3.9283e+02 4.0300e+00]
 ...
 [6.0760e-02 0.0000e+00 1.1930e+01 ... 2.1000e+01 3.9690e+02 5.6400e+00]
 [1.0959e-01 0.0000e+00 1.1930e+01 ... 2.1000e+01 3.9345e+02 6.4800e+00]
 [4.7410e-02 0.0000e+00 1.1930e+01 ... 2.1000e+01 3.9690e+02 7.8800e+00]]


In [241]:
# Features are used as-is; the target becomes a single-column 2-D array.
x_data = boston.data
y_data = boston.target.reshape(-1, 1)

x_data[:3]

array([[6.3200e-03, 1.8000e+01, 2.3100e+00, 0.0000e+00, 5.3800e-01,
        6.5750e+00, 6.5200e+01, 4.0900e+00, 1.0000e+00, 2.9600e+02,
        1.5300e+01, 3.9690e+02, 4.9800e+00],
       [2.7310e-02, 0.0000e+00, 7.0700e+00, 0.0000e+00, 4.6900e-01,
        6.4210e+00, 7.8900e+01, 4.9671e+00, 2.0000e+00, 2.4200e+02,
        1.7800e+01, 3.9690e+02, 9.1400e+00],
       [2.7290e-02, 0.0000e+00, 7.0700e+00, 0.0000e+00, 4.6900e-01,
        7.1850e+00, 6.1100e+01, 4.9671e+00, 2.0000e+00, 2.4200e+02,
        1.7800e+01, 3.9283e+02, 4.0300e+00]])

In [243]:
from sklearn import preprocessing

# Fit a min-max scaler (default settings) on the features and scale them in
# one call; the fitted scaler stays available as `minmax_scale`.
minmax_scale = preprocessing.MinMaxScaler()
x_scaled_data = minmax_scale.fit_transform(x_data)

x_scaled_data[:3]

array([[0.00000000e+00, 1.80000000e-01, 6.78152493e-02, 0.00000000e+00,
        3.14814815e-01, 5.77505269e-01, 6.41606591e-01, 2.69203139e-01,
        0.00000000e+00, 2.08015267e-01, 2.87234043e-01, 1.00000000e+00,
        8.96799117e-02],
       [2.35922539e-04, 0.00000000e+00, 2.42302053e-01, 0.00000000e+00,
        1.72839506e-01, 5.47997701e-01, 7.82698249e-01, 3.48961980e-01,
        4.34782609e-02, 1.04961832e-01, 5.53191489e-01, 1.00000000e+00,
        2.04470199e-01],
       [2.35697744e-04, 0.00000000e+00, 2.42302053e-01, 0.00000000e+00,
        1.72839506e-01, 6.94385898e-01, 5.99382080e-01, 3.48961980e-01,
        4.34782609e-02, 1.04961832e-01, 5.53191489e-01, 9.89737254e-01,
        6.34657837e-02]])

In [263]:
from sklearn import linear_model

# The `normalize` keyword was deprecated in scikit-learn 1.0 and removed in
# 1.2; False was its default, so omitting it builds the same model and
# avoids a TypeError on current releases.
regr = linear_model.LinearRegression(fit_intercept=True,
                                    copy_X=True,
                                    n_jobs=8)

regr.fit(x_scaled_data, y_data)
regr

LinearRegression(n_jobs=8)

In [248]:
regr.coef_

array([[ -9.60975755,   4.64204584,   0.56083933,   2.68673382,
         -8.63457306,  19.88368651,   0.06721501, -16.22666104,
          7.03913802,  -6.46332721,  -8.95582398,   3.69282735,
        -19.01724361]])

In [249]:
regr.intercept_

array([26.62026758])

In [254]:
regr.predict(x_scaled_data[:10])

array([[30.00384338],
       [25.02556238],
       [30.56759672],
       [28.60703649],
       [27.94352423],
       [25.25628446],
       [23.00180827],
       [19.53598843],
       [11.52363685],
       [18.92026211]])

In [266]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation. No random_state is passed, so the
# split (and any metric computed from it) presumably differs between runs —
# TODO confirm whether reproducibility is needed here.
X_train, X_test, y_train, y_test =train_test_split(x_scaled_data, y_data, test_size=0.2)

In [267]:
from sklearn import linear_model

# The `normalize` keyword was deprecated in scikit-learn 1.0 and removed in
# 1.2; False was its default, so omitting it builds the same model and
# avoids a TypeError on current releases.
regr = linear_model.LinearRegression(fit_intercept=True,
                                    copy_X=True,
                                    n_jobs=8)

# Fit on the training split only; the held-out rows are used for scoring.
regr.fit(X_train, y_train)
regr

LinearRegression(n_jobs=8)

In [269]:
y_true = y_test
y_pred = regr.predict(X_test)

In [270]:
# Root-mean-squared error computed by hand: sqrt(sum of squared residuals / n).
np.sqrt(np.square(y_true - y_pred).sum() / len(y_true))

5.311009812970682

In [271]:
from sklearn.metrics import mean_squared_error
np.sqrt(mean_squared_error(y_true, y_pred))

5.311009812970682