## 데이터 전처리

In [252]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

In [253]:
# Load the raw training table; the path is resolved relative to the notebook's working directory.
df = pd.read_csv(filepath_or_buffer='../csv/train.csv')

In [254]:
# Thresholds used by the row filters below.
REFERENCE_YEAR = 2022          # year that vehicle age is measured against
MIN_MILEAGE = 200              # rows whose 주행거리 is below this are discarded
MAX_ANNUAL_MILEAGE = 100000    # rows whose 연평균주행거리 is above this are discarded

# Columns removed before modelling.
drop_cols = ['ID','판매도시','판매구역','모델출시년도']

# Build the cleaned frame as one pipeline so every step works on a fresh frame
# instead of writing a new column into a filtered slice.
df = (
    df.drop(columns=drop_cols)
    .loc[lambda d: d["주행거리"] >= MIN_MILEAGE]
    # Keep only vehicles with a positive age. An age of 0 divides by zero (the
    # resulting inf used to be removed only as a side effect of the annual-mileage
    # cap), and a negative age would give a negative annual mileage that turns
    # into NaN when the log transform is applied later in the notebook.
    .loc[lambda d: d['생산년도'] < REFERENCE_YEAR]
    # Average mileage per year of vehicle age.
    .assign(연평균주행거리=lambda d: d['주행거리'] / (REFERENCE_YEAR - d['생산년도']))
    .loc[lambda d: d['연평균주행거리'] <= MAX_ANNUAL_MILEAGE]
    .reset_index(drop=True)
)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 56695 entries, 0 to 56694
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   생산년도         56695 non-null  int64  
 1   브랜드          56695 non-null  object 
 2   차량모델명        56695 non-null  object 
 3   주행거리         56695 non-null  int64  
 4   배기량          56695 non-null  int64  
 5   압축천연가스(CNG)  56695 non-null  int64  
 6   경유           56695 non-null  int64  
 7   가솔린          56695 non-null  int64  
 8   하이브리드        56695 non-null  int64  
 9   액화석유가스(LPG)  56695 non-null  int64  
 10  가격           56695 non-null  float64
 11  연평균주행거리      56695 non-null  float64
dtypes: float64(2), int64(8), object(2)
memory usage: 5.2+ MB


In [255]:
# Label encoding - 차량모델명 column: each distinct model name is replaced by an integer code.
# NOTE(review): the integer codes carry an arbitrary order that a linear model
# treats as a numeric magnitude - confirm this is intended for a nominal feature.
label_encoder = LabelEncoder()
df['차량모델명'] = label_encoder.fit_transform(df['차량모델명'])

# One-hot encoding - 브랜드 column. 차량모델명 is already numeric at this point, so
# 브랜드 is the only text column left (see the df.info() output above); naming it
# explicitly keeps the step from silently picking up any other text column.
df = pd.get_dummies(df, columns=['브랜드'])

## 스케일링

In [256]:
# Log transform: overwrite each listed column with its natural logarithm.
for column in ('주행거리', '연평균주행거리', '배기량'):
    df[column] = np.log(df[column])

In [257]:
# # Standard scaling (currently disabled)
# NOTE(review): if re-enabled as written, the scaler is fitted on the whole of df,
# which at this point still contains the target column 가격 and has not yet been
# split into train and test sets.
# scaler = StandardScaler()
# scaled_data = scaler.fit_transform(df)
# scaled_df = pd.DataFrame(scaled_data, columns=df.columns)

In [258]:
# Separate the feature matrix from the regression target (가격).
X_car_df = df.drop(columns=['가격'])
y_car_df = df['가격']

In [259]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_car_df,
    y_car_df,
    test_size=0.2,
    random_state=78,
)

In [260]:
# Fit an ordinary least-squares model on the training split and predict the held-out rows.
linear_model = LinearRegression()
linear_model.fit(X_train, y_train)
linear_pred = linear_model.predict(X_test)

# Evaluation metrics on the test split.
# Note: the value printed under the label "정확도" is the R^2 score.
r2 = r2_score(y_test, linear_pred)
rmse = np.sqrt(mean_squared_error(y_test, linear_pred))
mae = mean_absolute_error(y_test, linear_pred)

print(f'''
      정확도 : {r2}
      RMSE : {rmse}
      MAE : {mae}
      ''')


      정확도 : 0.7956653650827804
      RMSE : 15.70754991901913
      MAE : 11.484296756368353
      
