# [과제 2] 회귀분석
### - Ch 1, Ch 2를 토대로 자유롭게 회귀분석과 회귀진단을 진행해주세요.
### - 주석으로 설명 및 근거 자세하게 달아주시면 감사하겠습니다. :)

In [1]:
%matplotlib inline 

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings(action='ignore')

# Data 

데이터 출처 : https://www.kaggle.com/avikasliwal/used-cars-price-prediction 

< y > 
* **Price** : The price of the used car in INR Lakhs.



< X > 
* Name : The brand and model of the car
* Location : The location in which the car is being sold or is available for purchase.
* Year : The year or edition of the model.
* Kilometers_Driven : The total kilometres driven in the car by the previous owner(s) in KM.
* Fuel_Type : The type of fuel used by the car. (Petrol, Diesel, Electric, CNG, LPG)
* Transmission : The type of transmission used by the car. (Automatic / Manual)
* Owner_Type : Whether the ownership is Firsthand, Second hand or other.
* Mileage : The standard mileage offered by the car company in kmpl or km/kg
* Engine : The displacement volume of the engine in CC.
* Power : The maximum power of the engine in bhp.
* Seats : The number of seats in the car.
* New_Price : The price of a new car of the same model.

In [None]:
# 칼럼명
# 'Name', 'Location', 'Year', 'Kilometers_Driven', 'Fuel_Type', 'Transmission', 'Owner_Type', 'Mileage', 'Engine', 'Power', 'Seats'

In [87]:
# Load the used-car dataset; the first CSV column becomes the row index.
csv_path = "assignment2_data.csv"
data = pd.read_csv(csv_path, index_col=0)

# Preview the first five rows.
data.head()

Unnamed: 0,Name,Location,Year,Kilometers_Driven,Fuel_Type,Transmission,Owner_Type,Mileage,Engine,Power,Seats,New_Price,Price
0,Maruti Wagon R LXI CNG,Mumbai,2010,72000,CNG,Manual,First,26.6 km/kg,998 CC,58.16 bhp,5.0,,1.75
1,Hyundai Creta 1.6 CRDi SX Option,Pune,2015,41000,Diesel,Manual,First,19.67 kmpl,1582 CC,126.2 bhp,5.0,,12.5
2,Honda Jazz V,Chennai,2011,46000,Petrol,Manual,First,18.2 kmpl,1199 CC,88.7 bhp,5.0,8.61 Lakh,4.5
3,Maruti Ertiga VDI,Chennai,2012,87000,Diesel,Manual,First,20.77 kmpl,1248 CC,88.76 bhp,7.0,,6.0
4,Audi A4 New 2.0 TDI Multitronic,Coimbatore,2013,40670,Diesel,Automatic,Second,15.2 kmpl,1968 CC,140.8 bhp,5.0,,17.74


In [88]:
# Show each column's dtype and non-null count.
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6019 entries, 0 to 6018
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Name               6019 non-null   object 
 1   Location           6019 non-null   object 
 2   Year               6019 non-null   int64  
 3   Kilometers_Driven  6019 non-null   int64  
 4   Fuel_Type          6019 non-null   object 
 5   Transmission       6019 non-null   object 
 6   Owner_Type         6019 non-null   object 
 7   Mileage            6017 non-null   object 
 8   Engine             5983 non-null   object 
 9   Power              5983 non-null   object 
 10  Seats              5977 non-null   float64
 11  New_Price          824 non-null    object 
 12  Price              6019 non-null   float64
dtypes: float64(2), int64(2), object(9)
memory usage: 658.3+ KB


In [89]:
# Count the missing values in each column.
data.isna().sum()

Name                    0
Location                0
Year                    0
Kilometers_Driven       0
Fuel_Type               0
Transmission            0
Owner_Type              0
Mileage                 2
Engine                 36
Power                  36
Seats                  42
New_Price            5195
Price                   0
dtype: int64

In [90]:
# Drop every row that is missing any of these four features.
required_columns = ['Mileage', 'Engine', 'Power', 'Seats']
data.dropna(subset=required_columns, inplace=True)

# New_Price is removed as a whole column because most of its values are missing.
data.drop(columns=['New_Price'], inplace=True)

In [91]:
# Strip the unit suffixes so the measurements can be used as real numbers.
# Columns carrying a unit: Mileage (km/kg, kmpl), Engine (CC), Power (bhp).

mileage_units = {'km/kg':'', 'kmpl':''}
mileage_text = data['Mileage'].replace(mileage_units, regex=True)
data['Mileage'] = mileage_text.astype(float)

engine_text = data['Engine'].replace('CC', '', regex=True)
data['Engine'] = engine_text.astype(float)

# Power only has its unit removed here; it is not cast to float in this cell.
power_text = data['Power'].replace('bhp', '', regex=True)
data['Power'] = power_text

data.head()

Unnamed: 0,Name,Location,Year,Kilometers_Driven,Fuel_Type,Transmission,Owner_Type,Mileage,Engine,Power,Seats,Price
0,Maruti Wagon R LXI CNG,Mumbai,2010,72000,CNG,Manual,First,26.6,998.0,58.16,5.0,1.75
1,Hyundai Creta 1.6 CRDi SX Option,Pune,2015,41000,Diesel,Manual,First,19.67,1582.0,126.2,5.0,12.5
2,Honda Jazz V,Chennai,2011,46000,Petrol,Manual,First,18.2,1199.0,88.7,5.0,4.5
3,Maruti Ertiga VDI,Chennai,2012,87000,Diesel,Manual,First,20.77,1248.0,88.76,7.0,6.0
4,Audi A4 New 2.0 TDI Multitronic,Coimbatore,2013,40670,Diesel,Automatic,Second,15.2,1968.0,140.8,5.0,17.74


In [92]:
# Some Power entries contain the literal text "null"; report how many.
has_null_text = data['Power'].str.contains('null')
print(has_null_text.sum())

# Remove those rows, then convert the remaining Power values to float.
null_rows = data.loc[has_null_text].index
data.drop(index=null_rows, inplace=True)
data['Power'] = data['Power'].astype(float)

103


In [93]:
# Pairwise correlations between the numeric columns.
# The frame still holds text columns (Name, Location, Fuel_Type, ...), and on
# pandas >= 2.0 DataFrame.corr() no longer skips them by default and raises
# instead. Selecting the numeric dtypes first gives the same matrix on every
# pandas version.
data.select_dtypes(include='number').corr()

Unnamed: 0,Year,Kilometers_Driven,Mileage,Engine,Power,Seats,Price
Year,1.0,-0.169369,0.285623,-0.068045,0.014531,0.007833,0.299475
Kilometers_Driven,-0.169369,1.0,-0.060608,0.09303,0.03349,0.083072,-0.008249
Mileage,0.285623,-0.060608,1.0,-0.637258,-0.538844,-0.331576,-0.341652
Engine,-0.068045,0.09303,-0.637258,1.0,0.866301,0.401116,0.658047
Power,0.014531,0.03349,-0.538844,0.866301,1.0,0.10146,0.772843
Seats,0.007833,0.083072,-0.331576,0.401116,0.10146,1.0,0.055547
Price,0.299475,-0.008249,-0.341652,0.658047,0.772843,0.055547,1.0


In [75]:
from sklearn.linear_model import LinearRegression

# Independent variables (features).
predictors = ['Kilometers_Driven', 'Mileage', 'Engine', 'Power']

# Dependent variable (target).
outcome = 'Price'

features = data[predictors]
target = data[outcome]

# Fit an ordinary least-squares linear regression of Price on the predictors.
model = LinearRegression()
model.fit(features, target)

# Report the fitted intercept and one coefficient per predictor.
print(f'Intercept: {model.intercept_:.3f}')
print('Coefficients:')
for position, column in enumerate(predictors):
    print(f' {column}: {model.coef_[position]}')

Intercept: -15.621
Coefficients:
 Kilometers_Driven: -3.89107087808992e-06
 Mileage: 0.29101141464194175
 Engine: 0.0007361766034286732
 Power: 0.16716099590277736


In [95]:
from sklearn.metrics import mean_squared_error, r2_score

# In-sample evaluation: predictions are made on the same rows the model was fit on.
actual = data[outcome]
fitted = model.predict(data[predictors])

# Root mean squared error and coefficient of determination.
RMSE = np.sqrt(mean_squared_error(actual, fitted))
r2 = r2_score(actual, fitted)

print(f'RMSE: {RMSE:.0f}')
print(f'r2: {r2:.4f}')

RMSE: 7
r2: 0.6064


In [96]:
import statsmodels.api as sm

# Design matrix: the predictors plus an explicit constant column for the intercept.
design = data[predictors].assign(const=1)

# NOTE: this rebinds `model`, replacing the scikit-learn estimator of the same name.
model = sm.OLS(data[outcome], design)
results = model.fit()

# Print the full regression table (coefficients, standard errors, fit statistics).
print(results.summary())

                            OLS Regression Results                            
Dep. Variable:                  Price   R-squared:                       0.606
Model:                            OLS   Adj. R-squared:                  0.606
Method:                 Least Squares   F-statistic:                     2259.
Date:                Tue, 31 Jan 2023   Prob (F-statistic):               0.00
Time:                        19:46:16   Log-Likelihood:                -19806.
No. Observations:                5872   AIC:                         3.962e+04
Df Residuals:                    5867   BIC:                         3.966e+04
Df Model:                           4                                         
Covariance Type:            nonrobust                                         
                        coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------
Kilometers_Driven -3.891e-06   1.01e-0

- R-squared: 1에 가까울수록 모델이 데이터를 잘 설명함을 의미함 (단, R-squared가 높다고 해서 선형성 가정이 충족되는 것은 아니므로, 선형성은 잔차도로 별도 확인이 필요함)
  - 해당 회귀모델은 Price 분산의 약 60.6%를 설명한다.
- F-statistic(F통계량): 모델의 유의성을 검정하는 검정 통계량
- Prob(F-statistic): 유의수준 0.05보다 작으면 회귀모형이 통계적으로 유의하다고 판단함 (이 모델은 0.00)
- Durbin-Watson: 2에 가까움(1.5~2.5) => 잔차의 독립성 만족