<a href="https://colab.research.google.com/github/udadai/ML-study/blob/main/tips_wine_regression.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 간단한 회귀

## tips

In [138]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

### 데이터

In [139]:
tips = pd.read_csv('tips.csv')
tips.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [140]:
tips.tail()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
239,29.03,5.92,Male,No,Sat,Dinner,3
240,27.18,2.0,Female,Yes,Sat,Dinner,2
241,22.67,2.0,Male,Yes,Sat,Dinner,2
242,17.82,1.75,Male,No,Sat,Dinner,2
243,18.78,3.0,Female,No,Thur,Dinner,2


In [141]:
tips.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 244 entries, 0 to 243
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   total_bill  244 non-null    float64
 1   tip         244 non-null    float64
 2   sex         244 non-null    object 
 3   smoker      244 non-null    object 
 4   day         244 non-null    object 
 5   time        244 non-null    object 
 6   size        244 non-null    int64  
dtypes: float64(2), int64(1), object(4)
memory usage: 13.5+ KB


In [142]:
pd.unique(tips['sex'])

array(['Female', 'Male'], dtype=object)

In [143]:
pd.unique(tips['smoker'])

array(['No', 'Yes'], dtype=object)

In [144]:
pd.unique(tips['day'])

array(['Sun', 'Sat', 'Thur', 'Fri'], dtype=object)

In [145]:
pd.unique(tips['time'])

array(['Dinner', 'Lunch'], dtype=object)

In [146]:
# tips['sex_replace'] = tips['sex'].replace({'Female':0, 'Male':1})
# tips['sex_map'] = tips['sex'].map({'Female':0, 'Male':1})

In [147]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
tips['sex_le'] = le.fit_transform(tips['sex'])
tips['smoker_le'] = le.fit_transform(tips['smoker'])
tips['day_le'] = le.fit_transform(tips['day'])
tips['time_le'] = le.fit_transform(tips['time'])
tips.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,sex_le,smoker_le,day_le,time_le
0,16.99,1.01,Female,No,Sun,Dinner,2,0,0,2,0
1,10.34,1.66,Male,No,Sun,Dinner,3,1,0,2,0
2,21.01,3.5,Male,No,Sun,Dinner,3,1,0,2,0
3,23.68,3.31,Male,No,Sun,Dinner,2,1,0,2,0
4,24.59,3.61,Female,No,Sun,Dinner,4,0,0,2,0


In [148]:
tips_target = tips['tip']
tips_input = tips.drop(['tip', 'sex', 'smoker', 'day', 'time'], axis=1)
tips_input.head()

Unnamed: 0,total_bill,size,sex_le,smoker_le,day_le,time_le
0,16.99,2,0,0,2,0
1,10.34,3,1,0,2,0
2,21.01,3,1,0,2,0
3,23.68,2,1,0,2,0
4,24.59,4,0,0,2,0


In [149]:
train_input, test_input, train_target, test_target = train_test_split(tips_input, tips_target, random_state=42)

In [150]:
from sklearn.preprocessing import StandardScaler
ss = StandardScaler()
ss.fit(train_input)
train_scaled = ss.transform(train_input)
test_scaled = ss.transform(test_input)

### 모델

In [151]:
from sklearn.neighbors import KNeighborsRegressor
knr = KNeighborsRegressor(n_neighbors=8)
knr.fit(train_scaled, train_target)
print(knr.score(train_scaled, train_target))
print(knr.score(test_scaled, test_target))

0.5066367995716345
0.3183101605393934


In [152]:
from sklearn.linear_model import LinearRegression
lr = LinearRegression()
lr.fit(train_scaled, train_target)
print(lr.score(train_scaled, train_target))
print(lr.score(test_scaled, test_target))

0.47760215904261927
0.3470695928047469


### 검증

In [153]:
from sklearn.metrics import mean_absolute_error, mean_squared_error
knr_predict = knr.predict(test_scaled)
knr_mae = mean_absolute_error(test_target, knr_predict)
lr_predict = lr.predict(test_scaled)
lr_mae = mean_absolute_error(test_target, lr_predict)
print(knr_mae, lr_mae)

0.7714344262295082 0.7006905273520009


In [154]:
knr_predict = knr.predict(test_scaled)
knr_mse = mean_squared_error(test_target, knr_predict)
lr_predict = lr.predict(test_scaled)
lr_mse = mean_squared_error(test_target, lr_predict)
print(np.sqrt(knr_mse), np.sqrt(lr_mse))

0.9523324472913512 0.9320272674782089


## wine

### 데이터

In [155]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [156]:
redwine = pd.read_csv('redwine.csv')
redwine.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5.0
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5.0
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5.0
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6.0
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5.0


In [157]:
redwine.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1599 entries, 0 to 1598
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   fixed acidity         1599 non-null   float64
 1   volatile acidity      1599 non-null   float64
 2   citric acid           1599 non-null   float64
 3   residual sugar        1599 non-null   float64
 4   chlorides             1599 non-null   float64
 5   free sulfur dioxide   1599 non-null   float64
 6   total sulfur dioxide  1599 non-null   float64
 7   density               1599 non-null   float64
 8   pH                    1599 non-null   float64
 9   sulphates             1599 non-null   float64
 10  alcohol               1599 non-null   float64
 11  quality               1599 non-null   float64
dtypes: float64(12)
memory usage: 150.0 KB


In [158]:
redwine_target = redwine['quality']
redwine_input = redwine.drop(['quality'], axis=1)

In [159]:
train_input, test_input, train_target, test_target = train_test_split(redwine_input, redwine_target, random_state=42)

### 모델

In [160]:
from sklearn.neighbors import KNeighborsRegressor
knr = KNeighborsRegressor()
knr.fit(train_input, train_target)
print(knr.score(train_input, train_target))
print(knr.score(test_input, test_target))

0.4493132723664731
0.14483455418034874


In [161]:
from sklearn.linear_model import LinearRegression
lr = LinearRegression()
lr.fit(train_input, train_target)
print(lr.score(train_input, train_target))
print(lr.score(test_input, test_target))

0.354852874520566
0.3722831200818114


### 검증

In [162]:
from sklearn.metrics import mean_absolute_error, mean_squared_error
knr_predict = knr.predict(train_input)
knr_mae = mean_absolute_error(train_target, knr_predict)
lr_predict = lr.predict(train_input)
lr_mae = mean_absolute_error(train_target, lr_predict)
print(knr_mae, lr_mae)

0.44904086738949134 0.5001720018903489
