In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os 

In [6]:
# Load the train / test CSV files from the repository-relative ./data directory.
train_data = pd.read_csv("./data/train-data.csv")
test_data = pd.read_csv('./data/test-data.csv')

In [94]:
# Column dtypes and non-null counts of the raw training frame.
train_data.info()

In [8]:
# Same summary for the raw test frame.
test_data.info()

In [9]:
# Total number of distinct values in the Name column (NaN, if any, counts as one value).
print(train_data['Name'].nunique(dropna=False))

In [10]:
train_data.head() # 'Unnamed: 0' is not needed, so it is dropped in the next cell

In [11]:
# Drop the column in place. Re-running this cell raises KeyError because the
# column no longer exists.
train_data.drop(['Unnamed: 0'],axis=1,inplace=True)

In [12]:
# Check the result.
train_data.head()

In [13]:
# Same cleanup for the test frame (also in place).
test_data.drop(['Unnamed: 0'],axis=1,inplace=True)

In [14]:
test_data.head()

* Name : 차종

* Location : 차량 위치

* Kilometers_Driven : 주행거리(km)

* Fuel_Type: 연료 타입

* Transmission : 오토, 기어

* Owner_Type : 차주 

* Mileage : 연비 kmpl(km/L), km/kg
        *"Km/kg"은 차량이 1kg의 연료로 주행할 수 있는 거리를 측정하고 "km/l"은 차량이 1리터의 연료로 주행할 수 있는 km를 측정합니다.
        *"Km/kg"은 일반적으로 압축 천연 가스(CNG) 또는 액화 석유 가스(LPG)로 작동하는 차량에 사용되는 반면 "km/l"은 휘발유 또는 디젤로 작동하는 차량에 더 일반적으로 사용됩니다.

* Engine : CC 배기량을 표기할 때는 'cc'나 '리터'의 단위를 사용합니다.

* Power : bhp 보통 마력(馬力; Horse Power)이란 단어를 많이 쓰지만 이 제동마력(制動馬力)은 기관의 회전을 적정한 방법으로 제어했을 때 얻어지는 동력(動力)을 말한다.
* Seats : 좌석 개수
* New_Price : Lakh(십만)
* Price : Lakh(십만)

In [15]:
# Number of missing values per column in the training frame.
train_data.isnull().sum()

In [16]:
# Number of missing values per column in the test frame.
test_data.isnull().sum()

In [17]:
# Keep only the numeric part of Power; entries that contain no digits (the original
# note mentions "null") become NaN.
# The pattern is a raw string: "\d" and "\." inside a normal literal are invalid
# escape sequences and trigger a warning on recent Python versions.
# expand=False returns a Series (rather than a one-column DataFrame) for the single group.
POWER_PATTERN = r'(\d+\.\d+|\d+)'
train_data['Power'] = train_data['Power'].str.extract(POWER_PATTERN, expand=False).astype(float)
test_data['Power'] = test_data['Power'].str.extract(POWER_PATTERN, expand=False).astype(float)
# NaN -> 0 in every column; later cells treat 0 as the "missing" marker.
train_data.fillna(0, inplace=True)
test_data.fillna(0, inplace=True)
# New_Price carries too little information, so it is dropped.
train_data.drop(['New_Price'],axis=1,inplace=True)
test_data.drop(['New_Price'],axis=1,inplace=True)

In [18]:
# Re-check the missing-value counts of the training frame after the fill.
train_data.isnull().sum()

In [19]:
# Re-check the missing-value counts of the test frame after the fill.
test_data.isnull().sum()

In [20]:
train_data.head()

In [21]:
# Training data: remove the unit text from Mileage and Engine.
# Every cell is converted with str() before splitting because the column mixes
# text values with plain numbers, and a number has no .split().
# Only the part before the first space is kept.
data_len = int(len(train_data['Name']))
Mileage = [float(str(raw_value).split(' ')[0]) for raw_value in train_data['Mileage']]
Engine = [int(str(raw_value).split(' ')[0]) for raw_value in train_data['Engine']]

train_data['Mileage'] = Mileage
train_data['Engine'] = Engine


In [22]:
# Test data: remove the unit text from Mileage and Engine.
# Every cell is converted with str() before splitting because the column mixes
# text values with plain numbers, and a number has no .split().
# Only the part before the first space is kept.
data_len = int(len(test_data['Name']))
Mileage = [float(str(raw_value).split(' ')[0]) for raw_value in test_data['Mileage']]
Engine = [int(str(raw_value).split(' ')[0]) for raw_value in test_data['Engine']]

test_data['Mileage'] = Mileage
test_data['Engine'] = Engine



In [23]:
# dtypes after the unit text was removed (training frame).
train_data.info()

In [24]:
# dtypes after the unit text was removed (test frame).
test_data.info()

In [25]:
# Summary statistics of the numeric training columns.
train_data.describe()

In [26]:
# Box plot of Kilometers_Driven to look for outliers.
plt.figure(figsize=(10,10))
plt.boxplot(train_data['Kilometers_Driven'])

In [27]:
# Rows whose Kilometers_Driven exceeds 6,000,000.
train_data[train_data['Kilometers_Driven']>6000000]

In [28]:
# Replace the outlier readings (> 6,000,000 km, the rows selected in the previous cell)
# with the mean of the remaining rows, so the outlier does not inflate its own
# replacement value. Previously only the hard-coded label 2328 was patched.
# `.loc[mask, column]` writes into the frame itself; the chained form
# `train_data[col][label] = ...` may assign to a temporary copy (SettingWithCopyWarning)
# and does nothing under pandas Copy-on-Write.
km_outlier_mask = train_data['Kilometers_Driven'] > 6000000
train_data.loc[km_outlier_mask, 'Kilometers_Driven'] = int(
    train_data.loc[~km_outlier_mask, 'Kilometers_Driven'].mean()
)

In [29]:
# Value now stored at label 2328.
train_data['Kilometers_Driven'][2328] 

In [30]:
# Box plot of Kilometers_Driven again, after the replacement.
plt.figure(figsize=(10,10))
plt.boxplot(train_data['Kilometers_Driven'])

In [31]:
# Original note: there are no rows with 0 km, so nothing needs fixing here.
train_data[train_data['Kilometers_Driven']==0]

In [32]:
# Box plot of Mileage.
plt.figure(figsize=(10,10))
plt.boxplot(train_data['Mileage'])

In [33]:
# Names of the cars whose Mileage is 0 (one entry per row, so names can repeat).
Mileage_zero_list = list(train_data[train_data['Mileage']==0]['Name'])

In [34]:
Mileage_zero_list

In [35]:
# Keep the zero-mileage names for which some row of the same name has a positive
# Mileage (the per-name sum is then > 0), i.e. the value can be borrowed.
find_not_zero_name = [
    model_name
    for model_name in Mileage_zero_list
    if train_data.loc[train_data['Name'] == model_name, 'Mileage'].sum() > 0
]

In [36]:
find_not_zero_name

In [37]:
# All rows of the model found above.
train_data[train_data['Name']=='Honda City 1.5 GXI']

In [38]:
# Index labels of those rows; the next cell writes to them.
replace_data_list = train_data[train_data['Name']=='Honda City 1.5 GXI'].index

In [39]:
# Fill in what can be taken from other listings of the same car: every
# 'Honda City 1.5 GXI' row (labels in replace_data_list) gets these values.
# `.loc[rows, column]` writes into the frame itself; the chained form
# `train_data[col][rows] = ...` may assign to a temporary copy (SettingWithCopyWarning)
# and does nothing under pandas Copy-on-Write.
train_data.loc[replace_data_list, 'Mileage'] = 12.8
train_data.loc[replace_data_list, 'Engine'] = 1493
train_data.loc[replace_data_list, 'Power'] = 100
train_data.loc[replace_data_list, 'Seats'] = 5


In [40]:
# Show the same rows again to confirm the fill.
train_data[train_data['Name']=='Honda City 1.5 GXI']

In [41]:
# Mileage values at the patched labels.
train_data['Mileage'][replace_data_list]

In [42]:
# Names of the rows where each column still holds the 0 "missing" marker.
Engine_zero_list = list(train_data[train_data['Engine']==0]['Name'])
Power_zero_list = list(train_data[train_data['Power']==0]['Name'])
Seats_list = list(train_data[train_data['Seats']==0]['Name'])
Price_zero_list = list(train_data[train_data['Price']==0]['Name'])

In [43]:
Engine_zero_list

In [44]:
Power_zero_list

In [45]:
Seats_list

In [46]:
Price_zero_list # Price does not need any missing-value replacement

In [47]:
def _names_with_known_value(names, column):
    """Return the entries of `names` (order and duplicates kept) for which the rows of
    the same Name in train_data have a positive total in `column`."""
    return [
        model_name
        for model_name in names
        if train_data.loc[train_data['Name'] == model_name, column].sum() > 0
    ]


# Names whose missing Engine / Power / Seats can be borrowed from another listing.
Engine_not_zero_name = _names_with_known_value(Engine_zero_list, 'Engine')
Power_not_zero_name = _names_with_known_value(Power_zero_list, 'Power')
Seats_not_zero_name = _names_with_known_value(Seats_list, 'Seats')


In [48]:
# Remove duplicates. sorted() makes the order reproducible: the iteration order of a
# set of strings changes between kernel sessions, and later cells index this list
# by position ([0], [1]).
Engine_not_zero_name = sorted(set(Engine_not_zero_name))
print(Engine_not_zero_name)

In [49]:
# Remove duplicates; sorted() keeps the order identical from one kernel session to the
# next (plain set iteration order is not stable for strings).
Power_not_zero_name = sorted(set(Power_not_zero_name))
print(Power_not_zero_name)

In [50]:
# Remove duplicates; sorted() keeps the order identical from one kernel session to the
# next (plain set iteration order is not stable for strings).
Seats_not_zero_name = sorted(set(Seats_not_zero_name))
print(Seats_not_zero_name)

In [51]:
# All listings of the first model whose Engine value can be recovered.
train_data[train_data['Name']==Engine_not_zero_name[0]]

In [52]:
# Manually fill the missing specs of row 2264.
# NOTE(review): the values were presumably read off another listing of the same
# model shown above - confirm.
# `.loc[label, column]` writes into the frame itself; chained `train_data[col][label] = ...`
# may assign to a temporary copy and does nothing under pandas Copy-on-Write.
train_data.loc[2264, 'Engine'] = 1197
train_data.loc[2264, 'Power'] = 78.9
train_data.loc[2264, 'Seats'] = 5

In [53]:
# All listings of the second model whose Engine value can be recovered.
train_data[train_data['Name']==Engine_not_zero_name[1]]

In [54]:
# Values typed in by hand for row 4604.
# `.loc[label, column]` writes into the frame itself; chained `train_data[col][label] = ...`
# may assign to a temporary copy and does nothing under pandas Copy-on-Write.
train_data.loc[4604, 'Engine'] = 1198
train_data.loc[4604, 'Power'] = 88.8
train_data.loc[4604, 'Seats'] = 5

In [55]:
# Handle Power in one pass: for each recoverable model, every row of that model
# receives the first non-zero Power value found among its listings.
for name in Power_not_zero_name:
    same_model = train_data['Name'] == name
    known_power = train_data.loc[same_model & (train_data['Power'] != 0), 'Power']
    train_data.loc[same_model, 'Power'] = known_power.iloc[0]

In [56]:
# Seats: for each recoverable model, every row of that model receives the first
# non-zero Seats value found among its listings.
for name in Seats_not_zero_name:
    same_model = train_data['Name'] == name
    known_seats = train_data.loc[same_model & (train_data['Seats'] != 0), 'Seats']
    train_data.loc[same_model, 'Seats'] = known_seats.iloc[0]

In [57]:
# Verify again: index labels of the rows that still hold 0 in each column.
Engine_zero_idx = list(train_data[train_data['Engine']==0].index)
Power_zero_idx = list(train_data[train_data['Power']==0].index)
Seats_zero_idx = list(train_data[train_data['Seats']==0].index)


In [58]:
# Rows that cannot be recovered: the union of the three label lists (set() removes repeats).
del_idx = list(set(Engine_zero_idx + Power_zero_idx + Seats_zero_idx))

In [59]:
# Number of rows about to be removed.
len(del_idx)

In [60]:
# Remove the unrecoverable rows. Re-running this cell raises KeyError because the
# labels are already gone.
train_data = train_data.drop(del_idx)

In [61]:
train_data.shape

In [62]:
# Check that the 0 "missing" markers were removed: count, per column, how many
# cells are still equal to 0.
# (Summing the masked values instead - `train_data[train_data == 0].sum()` - only adds
# up zeros and therefore shows 0 no matter how many zero cells remain.)
(train_data == 0).sum()

In [63]:
# Distinct values left in the Name column (NaN, if any, counts as one value).
train_data['Name'].nunique(dropna=False)

In [64]:
# Keep only the first word of every Name (presumably the manufacturer - confirm).
car_name = [full_name.split(' ', 1)[0] for full_name in train_data['Name']]

In [65]:
# Overwrite Name with the shortened values built in the previous cell.
train_data['Name'] = car_name

In [66]:
train_data.head()

In [67]:
# Renumber the rows 0..n-1 after the earlier row removal; the old labels are discarded.
train_data = train_data.reset_index(drop=True)

In [68]:
train_data.head()

In [69]:
# Number of rows per Name value.
train_data['Name'].value_counts()

In [70]:
# Merge the two spellings of the same name. `.loc[mask, column]` writes into the
# frame itself; chained `train_data['Name'][mask] = ...` may assign to a temporary
# copy (SettingWithCopyWarning) and does nothing under pandas Copy-on-Write.
train_data.loc[train_data['Name'] == 'ISUZU', 'Name'] = 'Isuzu'

In [71]:
# Row counts per Name after merging the spellings.
train_data['Name'].value_counts()

In [72]:
train_data.head()

In [73]:
# Align the fuel-economy unit: CNG and LPG rows are scaled by fixed factors.
# NOTE(review): 1.64 and 1.3 are presumably km/kg -> km/l conversion factors;
# confirm their source.
# Not idempotent: running the cell twice applies the factors twice.
# `.loc[mask, column]` writes into the frame itself; the chained form
# `train_data['Mileage'][mask] = ...` may assign to a temporary copy and does nothing
# under pandas Copy-on-Write.
is_cng = train_data['Fuel_Type'] == 'CNG'
is_lpg = train_data['Fuel_Type'] == 'LPG'
train_data.loc[is_cng, 'Mileage'] = train_data.loc[is_cng, 'Mileage'] * 1.64
train_data.loc[is_lpg, 'Mileage'] = train_data.loc[is_lpg, 'Mileage'] * 1.3

In [74]:
# Distinct values of every categorical column that is one-hot encoded next.
Car_list = list(train_data['Name'].unique())
Location_list = list(train_data['Location'].unique())
Fuel_Type_list = list(train_data['Fuel_Type'].unique())
# Fixed: this list was previously built from the Owner_Type column.
Transmission = list(train_data['Transmission'].unique())
Owner_Type_list = list(train_data['Owner_Type'].unique())
# Printed order: Name, Location, Fuel_Type, Transmission, Owner_Type.
print(len(Car_list), len(Location_list), len(Fuel_Type_list), len(Transmission), len(Owner_Type_list))

In [75]:
# One-hot encode the five categorical columns; the original columns are replaced by
# one indicator column per category value.
# NOTE(review): only train_data is encoded in the visible cells; test_data is not.
train_data = pd.get_dummies(train_data, columns=['Name', 'Location', 'Fuel_Type', 'Transmission', 'Owner_Type'])

In [76]:
# Displays the whole encoded frame.
train_data

In [77]:
from sklearn.model_selection import train_test_split

In [78]:
# Features: every column except Price. Target: the natural log of Price.
# NOTE(review): np.log gives -inf for a Price of 0; assumes no such rows remain - confirm.
X = train_data.drop('Price', axis=1)
y = train_data['Price']
y_log = np.log(y)

In [79]:
# Hold out 10% of the rows; random_state fixes the shuffle so the split is repeatable.
# y_train / y_test are on the log scale.
X_train, X_test, y_train, y_test = train_test_split(X, y_log, test_size=0.1, random_state=10)

In [93]:
X_train.head()

In [92]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

# Variance inflation factor of every feature column of the training split.
# The design matrix is converted to a float array once, outside the loop. With boolean
# indicator columns `X_train.values` would be an object array, which the least-squares
# fit inside variance_inflation_factor cannot process.
vif_matrix = X_train.to_numpy(dtype=float)

vif = pd.DataFrame()
vif['features'] = X_train.columns
vif["VIF Factor"] = [variance_inflation_factor(vif_matrix, i) for i in range(vif_matrix.shape[1])]

# Kept as the last expression of the cell so the rounded table is what gets displayed
# (a stray trailing `plt` used to replace it with the module repr).
vif.round(1)


In [81]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import accuracy_score, confusion_matrix, r2_score, mean_absolute_error, mean_squared_error


In [82]:
# Baseline model: linear regression with default settings, fitted on the training split.
lr = LinearRegression()
lr.fit(X_train, y_train)

In [83]:
# Predictions for the hold-out split (log-price scale, like y_test).
y_predict = lr.predict(X_test)

In [84]:
# `lr` is already fitted on (X_train, y_train) by the cell that creates it, so the
# second, identical fit was dropped and only the prediction remains.
# The (misspelled) name `y_prediect` is kept because the metric cells below read it.
y_prediect = lr.predict(X_test)

In [85]:
# First line: lr.score on the training split. Then R^2 and RMSE on the hold-out
# split; the targets are log prices, so the RMSE is in log units.
# RMSE is computed as sqrt(MSE): the `squared=False` argument of mean_squared_error
# is deprecated in recent scikit-learn releases and later removed.
print('Train data Accuracy : ', format(lr.score(X_train, y_train)))
print('Test data r-square : ', format(r2_score(y_test,y_prediect)))
print('Root mean squared error : ', format(np.sqrt(mean_squared_error(y_test, y_prediect))))

In [86]:
pip install xgboost

In [87]:
pip install lightgbm

In [88]:
pip install catboost

In [89]:
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor, plot_importance
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import GridSearchCV, cross_val_score, StratifiedKFold, learning_curve
from sklearn.preprocessing import StandardScaler, MinMaxScaler

In [None]:
# Seed shared by every estimator that uses randomness, so the comparison table is
# reproducible. (The variable was defined before but never passed to a model.)
random_state = 10

# Candidate regressors. The list keeps its original name `classifiers`.
classifiers = [
    LinearRegression(),
    Ridge(alpha=0.1),
    Lasso(alpha=0.1),
    DecisionTreeRegressor(max_depth=5, random_state=random_state),
    RandomForestRegressor(n_estimators=100, max_depth=5, random_state=random_state),
    GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, max_depth=5, random_state=random_state),
    XGBRegressor(n_estimators=100, learning_rate=0.1, max_depth=5, random_state=random_state),
    LGBMRegressor(n_estimators=100, learning_rate=0.1, max_depth=5, random_state=random_state),
    CatBoostRegressor(iterations=100, learning_rate=0.1, depth=5, random_state=random_state),
    KNeighborsRegressor(n_neighbors=5),
]

cv_list = []     # model names, used as row labels of the result table
cv_results = []  # per model: [train score, hold-out R^2, hold-out RMSE]
for classifier in classifiers:
    # The class name is a stable label. Splitting str(model) at '(' only works for
    # estimators whose text form looks like "Name(...)".
    cv_list.append(type(classifier).__name__)
    print('classifier:', str(classifier))
    classifier.fit(X_train, y_train)        # fit on the training split
    y_predict = classifier.predict(X_test)  # predict the hold-out split
    cv_results.append([
        classifier.score(X_train, y_train),
        r2_score(y_test, y_predict),
        # sqrt(MSE) instead of the deprecated `squared=False` argument
        np.sqrt(mean_squared_error(y_test, y_predict)),
    ])


In [None]:
# Comparison table: one row per model (labelled with cv_list), metrics rounded to 2 decimals.
metric_columns = ['Train data Accuracy', 'Test data r-square', 'Root mean squared error']
result = pd.DataFrame(np.round(cv_results, 2), index=cv_list, columns=metric_columns)

In [None]:
# Display the model comparison table.
result

In [None]:
# Exhaustive grid search for XGBoost: 3 x 3 x 3 parameter combinations, each
# evaluated with 5-fold cross-validation on the training split.
XGB = XGBRegressor()
xgb_param_grid = { "n_estimators" : [100, 200, 300],
                  "learning_rate" : [0.1, 0.05, 0.01],
                  "max_depth" : [3, 5, 10]}

gsXGB = GridSearchCV(XGB,param_grid = xgb_param_grid, cv=5)
gsXGB.fit(X_train,y_train)
print("Best Parameters : ", gsXGB.best_params_)
print("Best Score : ", gsXGB.best_score_)
# Hold-out predictions of the best estimator; `y_pred` is not read by any later visible cell.
y_pred = gsXGB.predict(X_test)


In [None]:
# Final XGBoost model with fixed hyper-parameters.
# NOTE(review): presumably the best parameters reported by the grid search above;
# confirm, or read them from gsXGB.best_params_.
XGB = XGBRegressor(n_estimators=300, learning_rate=0.1, max_depth=5)
XGB.fit(X_train, y_train)
y_prediect = XGB.predict(X_test)

# RMSE is computed as sqrt(MSE): the `squared=False` argument of mean_squared_error
# is deprecated in recent scikit-learn releases and later removed.
print('Train data Accuracy : ', format(XGB.score(X_train, y_train)))
print('Test data r-square : ', format(r2_score(y_test,y_prediect)))
print('Root mean squared error : ', format(np.sqrt(mean_squared_error(y_test, y_prediect))))

In [None]:
# Exhaustive grid search for LightGBM: 3 x 3 x 3 parameter combinations, each
# evaluated with 5-fold cross-validation on the training split.
LGB = LGBMRegressor()
lgb_param_grid = { "n_estimators" : [100, 200, 300],
                  "learning_rate" : [0.1, 0.05, 0.01],
                  "max_depth" : [3, 5, 10]}

gsLGB = GridSearchCV(LGB,param_grid = lgb_param_grid, cv=5)
gsLGB.fit(X_train,y_train)
print("Best Parameters : ", gsLGB.best_params_)
print("Best Score : ", gsLGB.best_score_)
# Hold-out predictions of the best estimator; `y_pred` is not read by any later visible cell.
y_pred = gsLGB.predict(X_test)


In [None]:
# Final LightGBM model with fixed hyper-parameters.
# NOTE(review): presumably the best parameters reported by the grid search above;
# confirm, or read them from gsLGB.best_params_.
LGB = LGBMRegressor(n_estimators=300, learning_rate=0.1, max_depth=10)
LGB.fit(X_train, y_train)
y_prediect = LGB.predict(X_test)

# RMSE is computed as sqrt(MSE): the `squared=False` argument of mean_squared_error
# is deprecated in recent scikit-learn releases and later removed.
print('Train data Accuracy : ', format(LGB.score(X_train, y_train)))
print('Test data r-square : ', format(r2_score(y_test,y_prediect)))
print('Root mean squared error : ', format(np.sqrt(mean_squared_error(y_test, y_prediect))))

In [None]:
# Ten most important features of the fitted XGBoost model. `plot_importance` must be
# the function imported from xgboost when this cell runs.
plot_importance(XGB,max_num_features=10)

In [None]:
# Import LightGBM's plotting helper under its own name. Re-importing it as
# `plot_importance` rebinds the xgboost function of the same name, so re-running the
# XGBoost importance cell afterwards would call the wrong library.
from lightgbm import LGBMRegressor, plot_importance as lgbm_plot_importance

# Ten most important features of the fitted LightGBM model.
lgbm_plot_importance(LGB, max_num_features=10)

In [None]:
# Hold-out R^2 (the regressors' .score) of the two tuned models. The labels are
# corrected: both lines used to read "Train data Accuracy" although they score the
# test split, and they did not say which model each number belongs to.
print('XGBoost test data r-square : ', format(XGB.score(X_test, y_test)))
print('LightGBM test data r-square : ', format(LGB.score(X_test, y_test)))