In [2]:
import warnings
from pathlib import Path

import matplotlib as mpl
import matplotlib.font_manager as fm
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
warnings.filterwarnings('ignore')

# Korean-capable font for plot labels. The path below points into a hashed
# macOS system asset directory, so it is specific to one installation; when the
# file is not there, fall back to looking the font up by family name instead of
# failing on a missing file.
font_path = '/System/Library/AssetsV2/com_apple_MobileAsset_Font7/bad9b4bf17cf1669dde54184ba4431c22dcad27b.asset/AssetData/NanumGothic.ttc'
if Path(font_path).exists():
    fontprop = fm.FontProperties(fname = font_path, size = 10)
else:
    fontprop = fm.FontProperties(family = 'NanumGothic', size = 10)

# Load the data sets (paths are relative to the notebook's working directory).
# The 'ID' column is dropped from train only; test keeps it.
train = pd.read_csv('EV_Cost_data/train.csv').drop(columns = 'ID')
test = pd.read_csv('EV_Cost_data/test.csv')

# 제조사들의 가격을 분석하여 제조사들의 수준 카테고리화

In [92]:
# Preprocess the training target: bucket every price into a price band.

from target_preprocessing import categorize_by_quantiles, categorize_by_mean_std, categorize_by_kmeans, categorize_by_gmm

# Only the mean/std-based bucketing is used here. Judging by the output this
# cell prints, the band boundaries are mean - std and mean + std of the price.
train['가격구간'] = categorize_by_mean_std(train, column="가격(백만원)")
train.head()

Mean: 62.331948779511805, Std: 36.64675928062657
Boundaries: Low <= 25.685189498885237, Mid = (25.685189498885237, 98.97870806013837], High > 98.97870806013837


Unnamed: 0,제조사,모델,차량상태,배터리용량,구동방식,주행거리(km),보증기간(년),사고이력,연식(년),가격(백만원),가격구간
0,P사,TayGTS,Nearly New,86.077,AWD,13642,0,No,2,159.66,고가
1,K사,Niro,Nearly New,56.0,FWD,10199,6,No,0,28.01,중가
2,A사,eT,Brand New,91.2,AWD,2361,7,No,0,66.27,중가
3,A사,RSeTGT,Nearly New,,AWD,21683,3,No,0,99.16,고가
4,B사,i5,Pre-Owned,61.018,AWD,178205,1,No,0,62.02,중가


In [93]:
# List the distinct manufacturer labels present in the training data.
train['제조사'].unique()

array(['P사', 'K사', 'A사', 'B사', 'H사', 'T사', 'V사'], dtype=object)

In [94]:
# For each manufacturer, list which price bands occur among its listings.
for manufacturer in train['제조사'].unique():
    is_current = train['제조사'] == manufacturer
    unique_ranges = train.loc[is_current, '가격구간'].unique()
    print(f"{manufacturer} 가격 구간: {unique_ranges}")

P사 가격 구간: ['고가' '중가']
K사 가격 구간: ['중가' '저가']
A사 가격 구간: ['중가' '고가']
B사 가격 구간: ['중가' '저가']
H사 가격 구간: ['중가' '저가']
T사 가격 구간: ['중가']
V사 가격 구간: ['중가']


In [95]:
# For each manufacturer, count how many listings fall into each price band.
for manufacturer in train['제조사'].unique():
    is_current = train['제조사'] == manufacturer
    quality = train.loc[is_current, '가격구간'].value_counts()
    print(f"{manufacturer} 가격 구간: {quality}")

P사 가격 구간: 가격구간
고가    984
중가     87
Name: count, dtype: int64
K사 가격 구간: 가격구간
중가    693
저가    471
Name: count, dtype: int64
A사 가격 구간: 가격구간
중가    854
고가    288
Name: count, dtype: int64
B사 가격 구간: 가격구간
중가    781
저가    388
Name: count, dtype: int64
H사 가격 구간: 가격구간
중가    947
저가    290
Name: count, dtype: int64
T사 가격 구간: 가격구간
중가    1109
Name: count, dtype: int64
V사 가격 구간: 가격구간
중가    605
Name: count, dtype: int64


In [96]:
# 제조사별 가격 구간 비율 계산 및 조건에 따라 분류
def categorize_manufacturer(row):
    """Assign a manufacturer tier from its price-band shares.

    Parameters
    ----------
    row : mapping (e.g. a pandas Series or a dict)
        Share of a manufacturer's listings per price band, keyed by
        '고가' (high), '중가' (mid) and '저가' (low). A band that is absent
        from ``row`` is treated as a share of 0.

    Returns
    -------
    str
        One of '고가 제조사', '중가 제조사', '초고가 제조사' or '저가 제조사'.
    """
    # A band key is missing altogether when no listing in the data falls into
    # that band; read it as 0 instead of raising KeyError.
    high = row.get('고가', 0)
    mid = row.get('중가', 0)
    low = row.get('저가', 0)

    # The rules are checked in this order and the first match wins.
    if 0.2 < high < 0.7:
        return '고가 제조사'
    if low < 0.35 and mid > 0.5:
        return '중가 제조사'
    if high >= 0.7:
        return '초고가 제조사'
    return '저가 제조사'

def make_maun_cate(df):
    """Attach per-manufacturer price-band shares and a tier label to ``df``.

    For every manufacturer the share of its rows in each '가격구간' value is
    computed, a tier is derived from those shares with
    ``categorize_manufacturer``, and both are left-merged back onto ``df`` by
    '제조사'.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns '제조사' and '가격구간'.

    Returns
    -------
    pandas.DataFrame
        A new frame: ``df`` plus one share column per price band and the
        '제조사_카테고리' column. The input frame is not modified.

    Notes
    -----
    NOTE(review): the shares come from ``df``'s own '가격구간' column, which
    this notebook derives from the price. A frame without prices cannot be
    passed through this function as is — confirm how the test set should
    receive these columns.
    """
    # Share of each price band within every manufacturer.
    price_distribution = df.groupby('제조사')['가격구간'].value_counts(normalize=True).unstack(fill_value=0)

    # Tier label derived from those shares.
    price_distribution['제조사_카테고리'] = price_distribution.apply(categorize_manufacturer, axis=1)

    # When df already carries these columns (the function was applied before),
    # drop the old copies first; otherwise merge would emit suffixed
    # duplicates such as '고가_x' / '고가_y'.
    stale_columns = [col for col in price_distribution.columns if col in df.columns]
    df = df.drop(columns=stale_columns).merge(price_distribution, on = '제조사', how = 'left')
    return df


# Keep the merged frame: cells further down read these columns from `train`,
# so the result must be assigned rather than only displayed.
train = make_maun_cate(train)
train.head()


Unnamed: 0,제조사,모델,차량상태,배터리용량,구동방식,주행거리(km),보증기간(년),사고이력,연식(년),가격(백만원),가격구간,고가,저가,중가,제조사_카테고리
0,P사,TayGTS,Nearly New,86.077,AWD,13642,0,No,2,159.66,고가,0.918768,0.0,0.081232,초고가 제조사
1,K사,Niro,Nearly New,56.0,FWD,10199,6,No,0,28.01,중가,0.0,0.404639,0.595361,저가 제조사
2,A사,eT,Brand New,91.2,AWD,2361,7,No,0,66.27,중가,0.252189,0.0,0.747811,고가 제조사
3,A사,RSeTGT,Nearly New,,AWD,21683,3,No,0,99.16,고가,0.252189,0.0,0.747811,고가 제조사
4,B사,i5,Pre-Owned,61.018,AWD,178205,1,No,0,62.02,중가,0.0,0.331908,0.668092,중가 제조사


In [88]:
# Preview `train` to check for the band-share and manufacturer-tier columns.
train.head()

Unnamed: 0,제조사,모델,차량상태,배터리용량,구동방식,주행거리(km),보증기간(년),사고이력,연식(년),가격(백만원),가격구간,고가,저가,중가,제조사_카테고리
0,P사,TayGTS,Nearly New,86.077,AWD,13642,0,No,2,159.66,고가,0.918768,0.0,0.081232,초고가 제조사
1,K사,Niro,Nearly New,56.0,FWD,10199,6,No,0,28.01,중가,0.0,0.404639,0.595361,저가 제조사
2,A사,eT,Brand New,91.2,AWD,2361,7,No,0,66.27,중가,0.252189,0.0,0.747811,고가 제조사
3,A사,RSeTGT,Nearly New,,AWD,21683,3,No,0,99.16,고가,0.252189,0.0,0.747811,고가 제조사
4,B사,i5,Pre-Owned,61.018,AWD,178205,1,No,0,62.02,중가,0.0,0.331908,0.668092,중가 제조사


# 주행거리 / (연식 + 1) -> 연간 주행거리 파생 변수 생성

In [75]:
# Preview `train` before deriving the annual-mileage feature.
train.head()

Unnamed: 0,제조사,모델,차량상태,배터리용량,구동방식,주행거리(km),보증기간(년),사고이력,연식(년),가격(백만원),가격구간,고가,저가,중가,제조사_카테고리
0,P사,TayGTS,Nearly New,86.077,AWD,13642,0,No,2,159.66,고가,0.918768,0.0,0.081232,초고가 제조사
1,K사,Niro,Nearly New,56.0,FWD,10199,6,No,0,28.01,중가,0.0,0.404639,0.595361,저가 제조사
2,A사,eT,Brand New,91.2,AWD,2361,7,No,0,66.27,중가,0.252189,0.0,0.747811,고가 제조사
3,A사,RSeTGT,Nearly New,,AWD,21683,3,No,0,99.16,고가,0.252189,0.0,0.747811,고가 제조사
4,B사,i5,Pre-Owned,61.018,AWD,178205,1,No,0,62.02,중가,0.0,0.331908,0.668092,중가 제조사


In [76]:
def annual_km(df, mileage_column='주행거리(km)', year_column='연식(년)', new_column='연간_주행거리'):
    """Add a mileage-per-year column to ``df`` and return the same frame.

    The new column holds ``mileage / (year + 1)``, so a row whose year value
    is 0 is divided by 1. The column is written onto the frame that was passed
    in; no copy is made.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``mileage_column`` and ``year_column``.
    mileage_column : str
        Name of the column holding the total mileage.
    year_column : str
        Name of the column holding the year value.
    new_column : str
        Name of the column to create (or overwrite).

    Returns
    -------
    pandas.DataFrame
        ``df`` itself, with ``new_column`` set.
    """
    years_plus_one = df[year_column] + 1
    df[new_column] = df[mileage_column] / years_plus_one
    return df

# Add the annual-mileage column to the training data and preview the result.
train = annual_km(train)
train.head()

Unnamed: 0,제조사,모델,차량상태,배터리용량,구동방식,주행거리(km),보증기간(년),사고이력,연식(년),가격(백만원),가격구간,고가,저가,중가,제조사_카테고리,연간_주행거리
0,P사,TayGTS,Nearly New,86.077,AWD,13642,0,No,2,159.66,고가,0.918768,0.0,0.081232,초고가 제조사,4547.333333
1,K사,Niro,Nearly New,56.0,FWD,10199,6,No,0,28.01,중가,0.0,0.404639,0.595361,저가 제조사,10199.0
2,A사,eT,Brand New,91.2,AWD,2361,7,No,0,66.27,중가,0.252189,0.0,0.747811,고가 제조사,2361.0
3,A사,RSeTGT,Nearly New,,AWD,21683,3,No,0,99.16,고가,0.252189,0.0,0.747811,고가 제조사,21683.0
4,B사,i5,Pre-Owned,61.018,AWD,178205,1,No,0,62.02,중가,0.0,0.331908,0.668092,중가 제조사,178205.0


# 차량 상태 이상치 전처리

In [77]:
def calculate_annual_mileage(df, mileage_column='주행거리(km)', year_column='연식(년)', new_column='연간_주행거리',
                             status_column='차량상태', mileage_threshold=50000):
    """Fix inconsistent vehicle-condition labels, then (re)compute annual mileage.

    Despite its name, this function has a side effect on ``status_column``:

    * rows labelled 'Nearly New' whose mileage is above ``mileage_threshold``
      are relabelled 'Pre-Owned';
    * rows labelled 'Pre-Owned' whose mileage is at or below
      ``mileage_threshold`` are relabelled 'Nearly New'.

    Any other label is left untouched. Afterwards ``new_column`` is set to
    ``mileage / (year + 1)``. All changes are written onto the frame that was
    passed in.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``mileage_column``, ``year_column`` and
        ``status_column``.
    mileage_column, year_column, new_column : str
        Column names for total mileage, year value and the output column.
    status_column : str
        Column holding the vehicle-condition label.
    mileage_threshold : int or float
        Mileage separating 'Nearly New' from 'Pre-Owned'.

    Returns
    -------
    pandas.DataFrame
        ``df`` itself, modified in place.
    """
    mileage = df[mileage_column]
    status = df[status_column]

    # Both masks are built from the labels as they were on entry. The two
    # conditions are mutually exclusive on mileage, so no row can be flipped
    # twice regardless of the order of the assignments below.
    to_pre_owned = (status == 'Nearly New') & (mileage > mileage_threshold)
    to_nearly_new = (status == 'Pre-Owned') & (mileage <= mileage_threshold)
    df.loc[to_pre_owned, status_column] = 'Pre-Owned'
    df.loc[to_nearly_new, status_column] = 'Nearly New'

    # Calculate annual mileage
    df[new_column] = mileage / (df[year_column] + 1)
    return df

In [78]:
# Relabel inconsistent vehicle conditions; the call also recomputes the
# annual-mileage column.
train = calculate_annual_mileage(train)
train.head()

Unnamed: 0,제조사,모델,차량상태,배터리용량,구동방식,주행거리(km),보증기간(년),사고이력,연식(년),가격(백만원),가격구간,고가,저가,중가,제조사_카테고리,연간_주행거리
0,P사,TayGTS,Nearly New,86.077,AWD,13642,0,No,2,159.66,고가,0.918768,0.0,0.081232,초고가 제조사,4547.333333
1,K사,Niro,Nearly New,56.0,FWD,10199,6,No,0,28.01,중가,0.0,0.404639,0.595361,저가 제조사,10199.0
2,A사,eT,Brand New,91.2,AWD,2361,7,No,0,66.27,중가,0.252189,0.0,0.747811,고가 제조사,2361.0
3,A사,RSeTGT,Nearly New,,AWD,21683,3,No,0,99.16,고가,0.252189,0.0,0.747811,고가 제조사,21683.0
4,B사,i5,Pre-Owned,61.018,AWD,178205,1,No,0,62.02,중가,0.0,0.331908,0.668092,중가 제조사,178205.0


# 범주형 변수 숫자 전처리

In [79]:
from sklearn.preprocessing import LabelEncoder

def encoding(df, columns, return_encoders=False):
    """Label-encode the given columns of ``df`` in place.

    A separate ``LabelEncoder`` is fitted per column and the column is
    overwritten with the resulting integer codes.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are to be encoded; it is modified in place.
    columns : iterable of str
        Names of the columns to encode.
    return_encoders : bool, default False
        When True, also return the fitted encoders so the same mapping can be
        applied to other data (``transform``) or undone
        (``inverse_transform``). Previously the encoders were built and then
        thrown away.

    Returns
    -------
    pandas.DataFrame or tuple
        ``df`` when ``return_encoders`` is False (the default); otherwise
        ``(df, label_encoders)`` where ``label_encoders`` maps each column
        name to its fitted ``LabelEncoder``.
    """
    label_encoders = {}
    for col in columns:
        le = LabelEncoder()
        df[col] = le.fit_transform(df[col])
        label_encoders[col] = le
    if return_encoders:
        return df, label_encoders
    return df

# Label-encode every object-dtype column currently in `train`, then preview.
encoding_col = train.select_dtypes(include=['object']).columns
train = encoding(train, encoding_col)
train.head()

Unnamed: 0,제조사,모델,차량상태,배터리용량,구동방식,주행거리(km),보증기간(년),사고이력,연식(년),가격(백만원),가격구간,고가,저가,중가,제조사_카테고리,연간_주행거리
0,4,16,1,86.077,0,13642,0,0,2,159.66,0,0.918768,0.0,0.081232,3,4547.333333
1,3,10,1,56.0,1,10199,6,0,0,28.01,2,0.0,0.404639,0.595361,1,10199.0
2,0,17,0,91.2,0,2361,7,0,0,66.27,2,0.252189,0.0,0.747811,0,2361.0
3,0,12,1,,0,21683,3,0,0,99.16,0,0.252189,0.0,0.747811,0,21683.0
4,1,19,2,61.018,0,178205,1,0,0,62.02,2,0.0,0.331908,0.668092,2,178205.0
