# 머신러닝 선형 회귀

In [None]:
import os
import pandas as pd
import numpy as np
from plt_rcs import *
import hds

In [None]:
# Load the used-car price data set from the shortened URL (requires network access).
df = pd.read_csv('https://bit.ly/UsedCarsPrice')

In [None]:
# Preview the first rows.
df.head()

In [None]:
# Column dtypes, non-null counts and memory usage.
df.info()

In [None]:
# Column labels, for reference when selecting variables below.
df.columns

In [None]:
# Columns that are cast to str for the exploratory steps; the same list is used
# further down to cast them back to int before modelling.
# NOTE(review): presumably these are 0/1 indicator columns - confirm against the data.
cols = ['MetColor', 'Automatic']

In [None]:
# Cast to str so the columns are treated as categorical (object) in summaries.
df[cols] = df[cols].astype(str)

In [None]:
# Verify the dtype change.
df.dtypes

In [None]:
# Summary statistics of the numeric columns, rounded to 3 decimals.
df.describe().round(3)

In [None]:
# View the rows ordered by KM (ascending) to spot suspicious small values.
df.sort_values('KM')

In [None]:
# Keep only the rows whose KM is greater than 1.
df = df.loc[df['KM'].gt(1)]

In [None]:
# (rows, columns) after the KM filter.
df.shape

In [None]:
# Summary of the object-typed columns (count, unique, top, freq).
df.describe(include=object)

In [None]:
# Smallest and largest Price; used to pick the histogram bin range below.
df['Price'].agg(func=['min', 'max'])

In [None]:
plt.rc(group='figure', figsize=(4,4))

In [None]:
sns.histplot(
    data=df, x='Price',
    binrange=(4000, 16000), binwidth=1000,
    fc='0.8', ec='0'
)
plt.show()

In [None]:
sns.kdeplot(
    data=df, x='Price',
    fill=True, color='0.8'
)
plt.axvline(x=df['Price'].median())
plt.axvline(x=df['Price'].mean(), color='red', linestyle='--')
plt.show()

In [None]:
# `hds` is a helper package imported at the top of the notebook; what its plot
# functions draw is inferred from their names only - TODO confirm against hds.
# Correlation heatmap over the columns of df (presumably the numeric ones).
hds.plot.corr_heatmap(data=df)

In [None]:
# Price against each numeric predictor, one cell per predictor
# (presumably a scatter plot with a fitted regression line).
hds.plot.regline(data=df, x='Age', y='Price')

In [None]:
hds.plot.regline(data=df, x='KM', y='Price')

In [None]:
hds.plot.regline(data=df, x='HP', y='Price')

In [None]:
hds.plot.regline(data=df, x='CC', y='Price')

In [None]:
hds.plot.regline(data=df, x='Doors', y='Price')

In [None]:
hds.plot.regline(data=df, x='Weight', y='Price')

In [None]:
# Price by level of each categorical column, one cell per column
# (presumably grouped box plots), drawn with the 'Pastel1' palette.
hds.plot.box_group(data=df, x='FuelType', y='Price', palette='Pastel1')

In [None]:
hds.plot.box_group(data=df, x='MetColor', y='Price', palette='Pastel1')

In [None]:
hds.plot.box_group(data=df, x='Automatic', y='Price', palette='Pastel1')

In [None]:
# Keep only the rows whose Doors value is not 2 and whose Weight is at most 1250.
cond1 = df['Doors'].ne(2)
cond2 = df['Weight'].le(1250)
# Take an explicit copy: columns of this frame are re-assigned further down, and
# writing into a row-filtered selection makes pandas emit SettingWithCopyWarning.
df = df.loc[cond1 & cond2, :].copy()
# (rows, columns) after the filter.
df.shape

In [None]:
# Dtypes before restoring the flag columns.
df.dtypes

In [None]:
# Cast the columns listed in `cols` back from str to int so they can be used as predictors.
df[cols] = df[cols].astype(int)

In [None]:
# Verify the dtype change.
df.dtypes

In [None]:
# Mean Price for each FuelType level.
df.groupby('FuelType')['Price'].mean()

In [None]:
# One-hot encode FuelType as integer indicator columns.
# NOTE(review): drop_first is not set, so every FuelType level gets its own
# indicator; combined with a fitted intercept these columns are perfectly
# collinear - confirm this is intended for the unregularized model.
df = pd.get_dummies(data=df, columns=['FuelType'], dtype=int)
df.head()

In [None]:
# Name of the target column.
yvar = 'Price'
# Target as an independent Series; predictors are every remaining column.
y = df[yvar].copy()
X = df.drop(columns=yvar)
# Render both objects (a cell would otherwise echo only its last expression).
display(X, y)

## 데이터셋 분할

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=1234)

In [None]:
X_train.shape

In [None]:
X_valid.shape

In [None]:
# Mean target in each split, echoed together as (train, valid). A notebook cell
# only displays its last expression, so a bare first line is computed and dropped.
# Recorded values from the original run:
#   train: np.float64(9697.907297830374)
#   valid: np.float64(9692.633858267716)
y_train.mean(), y_valid.mean()

## 선형 회귀모델 학습

In [None]:
from sklearn.linear_model import LinearRegression

In [None]:
# Ordinary least-squares model with default settings.
model_linear = LinearRegression()

In [None]:
# Fit on the training split.
model_linear.fit(X=X_train, y=y_train)

In [None]:
# Hyper-parameters of the estimator; recorded output of the original run follows.
model_linear.get_params()
# {'copy_X': True,
#  'fit_intercept': True,
#  'n_jobs': None,
#  'positive': False,
#  'tol': 1e-06}

In [None]:
# R^2 on the training and validation splits, echoed together as (train, valid).
# A notebook cell only displays its last expression, so scoring the two splits
# on separate bare lines would silently drop the training score.
# Recorded values from the original run:
#   train: 0.7453200377582148
#   valid: 0.7350351379962555
model_linear.score(X=X_train, y=y_train), model_linear.score(X=X_valid, y=y_valid)

In [None]:
# Fitted slope coefficients, one per predictor column; recorded output follows.
model_linear.coef_
# array([-9.93166454e+01, -1.67673109e-02, -2.96617966e+01, -1.05935604e+01,
#         3.84406446e+01,  2.65917973e+00, -2.74794384e+01,  1.74106123e+01,
#         6.13430567e+02, -1.94223202e+03,  1.32880146e+03])

In [None]:
# Fitted intercept; recorded output follows.
model_linear.intercept_
# np.float64(-3566.4391928929563)

## 릿지 선형 회귀모델 학습

In [None]:
from sklearn.linear_model import Ridge

In [None]:
# Ridge (L2-penalized) regression with regularization strength alpha=1.
model_ridge = Ridge(alpha=1)

In [None]:
# Fit on the training split.
model_ridge.fit(X=X_train, y=y_train)

In [None]:
# Hyper-parameters of the estimator.
model_ridge.get_params()

In [None]:
# R^2 on the training and validation splits, echoed together as (train, valid).
# A notebook cell only displays its last expression, so scoring the two splits
# on separate bare lines would silently drop the training score.
# Recorded training score from the original run: 0.745024558772102
model_ridge.score(X=X_train, y=y_train), model_ridge.score(X=X_valid, y=y_valid)

## 라쏘 선형 회귀모델 학습

In [None]:
from sklearn.linear_model import Lasso

In [None]:
# Lasso (L1-penalized) regression with regularization strength alpha=1.
model_lasso = Lasso(alpha=1)

In [None]:
# Fit on the training split.
model_lasso.fit(X=X_train, y=y_train)

In [None]:
# R^2 on the training and validation splits, echoed together as (train, valid).
# A notebook cell only displays its last expression, so scoring the two splits
# on separate bare lines would silently drop the training score.
# Recorded values from the original run:
#   train: 0.7451306731162222
#   valid: 0.7368623585441034
model_lasso.score(X=X_train, y=y_train), model_lasso.score(X=X_valid, y=y_valid)

## 세 모델의 회귀계수 비교

In [None]:
# Side-by-side table of the fitted coefficients: one row per predictor,
# one column per model, in the order Linear / Ridge / Lasso.
pd.DataFrame(
    {
        label: fitted.coef_
        for label, fitted in (
            ('Linear', model_linear),
            ('Ridge', model_ridge),
            ('Lasso', model_lasso),
        )
    },
    index=X_train.columns,
)

## 예측값 생성

In [None]:
y_pred_linear = model_linear.predict(X=X_valid)
y_pred_ridge = model_ridge.predict(X=X_valid)
y_pred_lasso = model_lasso.predict(X=X_valid)

In [None]:
pd.DataFrame(
    data={
        'Real': y_valid,
        'Pred_Linear': y_pred_linear,
        'Pred_Ridge': y_pred_ridge,
        'Pred_Lasso': y_pred_lasso
    }
)

In [None]:
from sklearn.metrics import mean_squared_error
from sklearn.metrics import root_mean_squared_error
from sklearn.metrics import mean_absolute_percentage_error

In [None]:
mean_squared_error(y_true=y_valid, y_pred=y_pred_linear)
# 972596.4199871043

In [None]:
root_mean_squared_error(y_true=y_valid, y_pred=y_pred_ridge)
# 981.7528319556632

In [None]:
mean_absolute_percentage_error(y_true=y_valid, y_pred=y_pred_lasso)
# 0.08251582058445193

In [None]:
# `hds.stat.regmetrics` comes from the project helper package; presumably it
# reports several regression metrics at once - TODO confirm against hds.
# Validation metrics for the plain linear model.
hds.stat.regmetrics(y_true=y_valid, y_pred=y_pred_linear)

In [None]:
# Validation metrics for the ridge model.
hds.stat.regmetrics(y_true=y_valid, y_pred=y_pred_ridge)

In [None]:
# Validation metrics for the lasso model.
hds.stat.regmetrics(y_true=y_valid, y_pred=y_pred_lasso)

## 최적의 규제 상수 탐색

In [None]:
# Candidate regularization strengths 0.1, 0.2, ..., 20.0 (200 values).
# Built from an integer range and divided once: np.arange with a float step
# accumulates rounding error (e.g. 10.700000000000001 instead of 10.7) and its
# length is not guaranteed for non-integer steps.
alphas = np.arange(1, 201) / 10

In [None]:
vl_req = []

for alpha in alphas:
    model_ridge.set_params(alpha=alpha).fit(X=X_train, y=y_train)
    vl_req.append(model_ridge.score(X=X_valid, y=y_valid))

In [None]:
sns.lineplot(x=alphas, y=vl_req)

In [None]:
np.max(vl_req)
# np.float64(0.7397932326581274)
index = np.argmax(vl_req)
# np.int64(106)

In [None]:
# 최적의 람다값 확인
alphas[index]
# np.float64(10.700000000000001)

### 라쏘

In [None]:
model_lasso.get_params()

In [None]:
vl_req = []

for alpha in alphas:
    model_lasso.set_params(alpha=alpha, max_iter=2000).fit(X=X_train, y=y_train)
    vl_req.append(model_lasso.score(X=X_valid, y=y_valid))

In [None]:
sns.lineplot(x=alphas, y=vl_req)

In [None]:
# Best validation R^2 found on the grid.
np.max(vl_req)

In [None]:
# Position of the best score and the alpha at that position.
index = np.argmax(vl_req)
alphas[index]

In [None]:
# Refit the lasso model at the selected alpha.
model_lasso.set_params(alpha=alphas[index]).fit(X=X_train, y=y_train)

In [None]:
# Predict the validation split with the refitted model.
y_pred_lasso = model_lasso.predict(X=X_valid)

In [None]:
# NOTE(review): alpha was selected on this same validation split, so these
# metrics are optimistic; a separate test split would give an unbiased estimate.
hds.stat.regmetrics(y_true=y_valid, y_pred=y_pred_lasso)