# 라이브러리 임포트

In [1]:
!pip install lightgbm
!pip install imbalanced-learn
!pip install xgboost
!pip install CatBoost
import pandas as pd
import numpy as np
import kagglehub
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (accuracy_score, precision_score, recall_score,
                             f1_score, roc_auc_score, confusion_matrix, classification_report)
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.ensemble import VotingClassifier
import matplotlib.pyplot as plt
import seaborn as sns
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import StratifiedKFold
from xgboost import XGBClassifier

Collecting CatBoost
  Downloading catboost-1.2.8-cp311-cp311-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.8-cp311-cp311-manylinux2014_x86_64.whl (99.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.2/99.2 MB[0m [31m9.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: CatBoost
Successfully installed CatBoost-1.2.8


# Kaggle 데이터셋 다운로드

In [2]:
# 'electric-vehicle-charging-patterns' 데이터셋 다운로드
path = kagglehub.dataset_download("valakhorasani/electric-vehicle-charging-patterns")

# 다운로드 경로 출력
print("Path to dataset files:", path)

# 데이터셋의 CSV 파일 경로 확인
file_path = path + "/ev_charging_patterns.csv"  # CSV 파일 경로

# 데이터를 Pandas로 읽기
df = pd.read_csv(file_path)

# 데이터 확인
print(df.head())

Downloading from https://www.kaggle.com/api/v1/datasets/download/valakhorasani/electric-vehicle-charging-patterns?dataset_version_number=1...


100%|██████████| 130k/130k [00:00<00:00, 52.8MB/s]

Extracting files...
Path to dataset files: /root/.cache/kagglehub/datasets/valakhorasani/electric-vehicle-charging-patterns/versions/1
  User ID Vehicle Model  Battery Capacity (kWh) Charging Station ID  \
0  User_1        BMW i3              108.463007         Station_391   
1  User_2  Hyundai Kona              100.000000         Station_428   
2  User_3    Chevy Bolt               75.000000         Station_181   
3  User_4  Hyundai Kona               50.000000         Station_327   
4  User_5  Hyundai Kona               50.000000         Station_108   

  Charging Station Location  Charging Start Time    Charging End Time  \
0                   Houston  2024-01-01 00:00:00  2024-01-01 00:39:00   
1             San Francisco  2024-01-01 01:00:00  2024-01-01 03:01:00   
2             San Francisco  2024-01-01 02:00:00  2024-01-01 04:48:00   
3                   Houston  2024-01-01 03:00:00  2024-01-01 06:42:00   
4               Los Angeles  2024-01-01 04:00:00  2024-01-01 05:46:00   





# openchargemap api 데이터 수집 / 전처리

In [3]:
import requests

# OpenChargeMap API 키 설정
api_key = "e90f9a6e-a6e1-4600-9bf3-87ed02344041"
url = "https://api.openchargemap.io/v3/poi/"

# API 요청 파라미터
params = {
    "output": "json",
    "countrycode": "KR",
    "maxresults": 100,
    "compact": "true",
    "key": "e90f9a6e-a6e1-4600-9bf3-87ed02344041"
}

# 데이터 요청
response = requests.get(url, params=params)
if response.status_code == 200:
    opencharge_data = response.json()
    print("OpenChargeMap 데이터 수집 성공!")
else:
    print("API 요청 실패:", response.status_code)

# JSON 데이터를 Pandas DataFrame으로 변환
opencharge_df = pd.json_normalize(
    opencharge_data,
    record_path=["Connections"],  # 'Connections' 키를 기반으로 확장
    meta=[
        "ID",
        ["AddressInfo", "Title"],
        ["AddressInfo", "Latitude"],
        ["AddressInfo", "Longitude"],
        ["AddressInfo", "StateOrProvince"],
        ["AddressInfo", "Postcode"],
        "UsageCost",
        "NumberOfPoints"
    ],
    meta_prefix="meta_",  # 메타데이터 열에 접두사 추가
    errors='ignore'
)

# 열 이름 정리
opencharge_df = opencharge_df.rename(columns={
    "meta_ID": "Charging Station ID",
    "meta_AddressInfo.Title": "Station Name",
    "meta_AddressInfo.Latitude": "Latitude",
    "meta_AddressInfo.Longitude": "Longitude",
    "meta_AddressInfo.StateOrProvince": "State/Province",
    "meta_AddressInfo.Postcode": "Postcode",
    "UsageCost": "Usage Cost",
    "NumberOfPoints": "Number of Connections",
    "PowerKW": "Power (kW)"
})

# 확인
print(opencharge_df.head())

# 2. 데이터 전처리
# 불필요한 열 제거 (예: User ID, Charging Start Time, Charging End Time)
if 'User ID' in df.columns:
    df = df.drop(columns=['User ID', 'Charging Start Time', 'Charging End Time'])

# 결측치 처리
numeric_cols = df.select_dtypes(include=[np.number]).columns
df[numeric_cols] = df[numeric_cols].fillna(df[numeric_cols].mean())  # 숫자형 결측치는 평균값으로 채움

categorical_cols = df.select_dtypes(exclude=[np.number]).columns
for col in categorical_cols:
    if not df[col].mode().empty:  # mode()가 비어있지 않은지 확인
        most_frequent = df[col].mode()[0]  # 최빈값 추출
        df[col] = df[col].fillna(most_frequent)  # 결측값 채우기
    else:
        # Categorical 열의 경우 'Unknown'을 새로운 카테고리로 추가 후 채움
        if df[col].dtype.name == 'category':
            df[col] = df[col].cat.add_categories('Unknown').fillna('Unknown')
        else:
            df[col] = df[col].fillna('Unknown')  # 일반 문자열 열에 대해 결측값 채우기

print("결측치 처리 완료!")
print(df.head())

OpenChargeMap 데이터 수집 성공!
       ID  ConnectionTypeID ConnectionType Reference  StatusTypeID StatusType  \
0  569521                27           None      None            50       None   
1  569520                33           None      None            50       None   
2  569519                33           None      None            50       None   
3  569518                33           None      None            50       None   
4  569517                27           None      None            50       None   

   LevelID Level  Amps Voltage  ...  Quantity  Comments Charging Station ID  \
0        3  None  None    None  ...         6      None              302906   
1        3  None  None    None  ...         6      None              302905   
2        3  None  None    None  ...         8      None              302904   
3        3  None  None    None  ...         8      None              302903   
4        3  None  None    None  ...         6      None              302902   

             

# 데이터 병합


In [4]:
# OpenChargeMap 데이터 정리
opencharge_df_clean = opencharge_df[[
    "Charging Station ID", "Latitude", "Longitude",
    "Power (kW)", "meta_UsageCost", "meta_NumberOfPoints"
]].copy()

# 열 이름 정리
opencharge_df_clean = opencharge_df_clean.rename(columns={
    "meta_UsageCost": "Usage Cost",
    "meta_NumberOfPoints": "Number of Connections",
    "Power (kW)": "Power (kW)"
})

# 결측값 처리
opencharge_df_clean['Power (kW)'] = opencharge_df_clean['Power (kW)'].fillna(opencharge_df_clean['Power (kW)'].mean())
opencharge_df_clean['Usage Cost'] = opencharge_df_clean['Usage Cost'].fillna(0)
opencharge_df_clean['Number of Connections'] = opencharge_df_clean['Number of Connections'].fillna(1)

# Kaggle 데이터셋과 병합
columns_to_remove = ['Power (kW)', 'Usage Cost', 'Number of Connections', 'Latitude', 'Longitude']
df = df.drop(columns=[col for col in columns_to_remove if col in df.columns], errors='ignore')

# 병합 수행
df = df.merge(opencharge_df_clean, how='left', on='Charging Station ID')

# 중복 열 정리: 새 열을 우선 적용
df['Power (kW)'] = df['Power (kW)'].combine_first(opencharge_df_clean['Power (kW)'])
df['Usage Cost'] = df['Usage Cost'].combine_first(opencharge_df_clean['Usage Cost'])
df['Number of Connections'] = df['Number of Connections'].combine_first(opencharge_df_clean['Number of Connections'])
df['Latitude'] = df['Latitude'].combine_first(opencharge_df_clean['Latitude'])
df['Longitude'] = df['Longitude'].combine_first(opencharge_df_clean['Longitude'])

# 병합된 중간 열 제거
df.drop(columns=[
    'Power (kW)_x', 'Power (kW)_y',
    'Usage Cost_x', 'Usage Cost_y',
    'Number of Connections_x', 'Number of Connections_y',
    'Latitude_x', 'Latitude_y',
    'Longitude_x', 'Longitude_y'
], errors='ignore', inplace=True)

# 병합 후 확인
print("병합 및 정리 후 데이터프레임:")
print(df.head())

병합 및 정리 후 데이터프레임:
  Vehicle Model  Battery Capacity (kWh) Charging Station ID  \
0        BMW i3              108.463007         Station_391   
1  Hyundai Kona              100.000000         Station_428   
2    Chevy Bolt               75.000000         Station_181   
3  Hyundai Kona               50.000000         Station_327   
4  Hyundai Kona               50.000000         Station_108   

  Charging Station Location  Energy Consumed (kWh)  Charging Duration (hours)  \
0                   Houston              60.712346                   0.591363   
1             San Francisco              12.339275                   3.133652   
2             San Francisco              19.128876                   2.452653   
3                   Houston              79.457824                   1.266431   
4               Los Angeles              19.629104                   2.019765   

   Charging Rate (kW)  Charging Cost (USD) Time of Day Day of Week  ...  \
0           36.389181            13.08771

  opencharge_df_clean['Usage Cost'] = opencharge_df_clean['Usage Cost'].fillna(0)
  opencharge_df_clean['Number of Connections'] = opencharge_df_clean['Number of Connections'].fillna(1)


# 파생 변수 생성


In [5]:
# 파생 변수 생성
df['Cost Category'] = pd.cut(
    df['Usage Cost'], bins=[-1, 5, 20, 50, np.inf],
    labels=['Low', 'Medium', 'High', 'Very High']
)

df['Avg Power per Connection'] = df['Power (kW)'] / df['Number of Connections']

df['Charging Time Category'] = pd.cut(df['Charging Duration (hours)'],
                                      bins=[-1, 1, 3, 6, np.inf],
                                      labels=['Short', 'Medium', 'Long', 'Very Long'])

# 결측치 처리: 파생 변수에 대한 결측치 처리
df['Avg Power per Connection'] = df['Avg Power per Connection'].fillna(0)

# 결과 확인
print(df[['Charging Station ID', 'Usage Cost', 'Cost Category','Charging Time Category', 'Avg Power per Connection']].head())

  Charging Station ID  Usage Cost Cost Category Charging Time Category  \
0         Station_391         0.0           Low                  Short   
1         Station_428         0.0           Low                   Long   
2         Station_181         0.0           Low                 Medium   
3         Station_327         0.0           Low                 Medium   
4         Station_108         0.0           Low                 Medium   

   Avg Power per Connection  
0                 19.166667  
1                 41.666667  
2                 31.250000  
3                 31.250000  
4                 41.666667  


# 범주형 변수 인코딩 & 특성 및 타겟 변수 분리
- User Type - 범주형 데이터 (Label Encoding)

In [6]:
print("데이터프레임의 열 목록:")
print(df.columns)

# 특성 데이터와 타겟 변수 분리
X = df.drop(columns=['User Type', 'Charger Type','Battery Capacity (kWh)', 'Vehicle Age (years)'])  # 타겟 열 제거
y_user = df['User Type']
y_charger = df['Charger Type']

# 범주형 및 수치형 열 분리
categorical_features = ['Vehicle Model', 'Charging Station Location', 'Time of Day',
                        'Day of Week', 'Charging Station ID', 'Cost Category']
numeric_features = X.select_dtypes(include=['float64', 'int64']).columns.tolist()

# Latitude와 Longitude를 명확하게 수치형으로 변환 후 결측치 처리
X['Latitude'] = pd.to_numeric(X['Latitude'], errors='coerce').fillna(0)
X['Longitude'] = pd.to_numeric(X['Longitude'], errors='coerce').fillna(0)

# 결측치 처리: 범주형 데이터는 'Unknown', 수치형 데이터는 평균값 또는 0
for col in categorical_features:
    if isinstance(X[col].dtype, pd.CategoricalDtype):
        X[col] = X[col].cat.add_categories("Unknown").fillna("Unknown")
    else:
        X[col] = X[col].fillna("Unknown").astype(str)

for col in numeric_features:
    if col in ['Latitude', 'Longitude']:
        X[col] = X[col].fillna(0)  # Latitude와 Longitude는 결측치를 0으로 채움
    else:
        X[col] = X[col].fillna(X[col].mean())  # 다른 수치형 열은 평균값으로 채움

if 'Latitude' in numeric_features and 'Longitude' in numeric_features:
    X['Latitude'] = X['Latitude'].fillna(0)  # Latitude 결측치를 0으로 채움
    X['Longitude'] = X['Longitude'].fillna(0)  # Longitude 결측치를 0으로 채움

# 결측치 처리 확인
print("결측치 처리 후 데이터 확인:")
print(X.isnull().sum())  # 모든 열의 결측치 확인

# ColumnTransformer 정의: 범주형은 OneHotEncoder, 수치형은 그대로 유지
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

encoder = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
        ('num', 'passthrough', numeric_features)  # 수치형 열은 그대로 유지
    ]
)

# 인코딩 수행
X_encoded = encoder.fit_transform(X)

# SciPy 희소 행렬을 NumPy 배열로 변환
import scipy.sparse
if isinstance(X_encoded, scipy.sparse.spmatrix):
    X_encoded = X_encoded.toarray()

# SMOTE 실행 전에 데이터 확인
print("첫 번째 데이터 확인:", X_encoded[0])  # 인코딩된 첫 번째 행 확인

# 타겟 변수 인코딩
from sklearn.preprocessing import LabelEncoder

label_encoder_user = LabelEncoder()
y_user_encoded = label_encoder_user.fit_transform(y_user)

label_encoder_charger = LabelEncoder()
y_charger_encoded = label_encoder_charger.fit_transform(y_charger)

# 최종 확인
print("인코딩 완료. 결측치 및 데이터 준비가 완료되었습니다.")
print("X_encoded shape:", X_encoded.shape)
print("y_user_encoded shape:", y_user_encoded.shape)
print("y_charger_encoded shape:", y_charger_encoded.shape)

# 결측치 최종 확인
print("최종 결측치 확인 (Latitude & Longitude):")
print(X[['Latitude', 'Longitude']].isnull().sum())

print(df.columns)

데이터프레임의 열 목록:
Index(['Vehicle Model', 'Battery Capacity (kWh)', 'Charging Station ID',
       'Charging Station Location', 'Energy Consumed (kWh)',
       'Charging Duration (hours)', 'Charging Rate (kW)',
       'Charging Cost (USD)', 'Time of Day', 'Day of Week',
       'State of Charge (Start %)', 'State of Charge (End %)',
       'Distance Driven (since last charge) (km)', 'Temperature (°C)',
       'Vehicle Age (years)', 'Charger Type', 'User Type', 'Latitude',
       'Longitude', 'Power (kW)', 'Usage Cost', 'Number of Connections',
       'Cost Category', 'Avg Power per Connection', 'Charging Time Category'],
      dtype='object')
결측치 처리 후 데이터 확인:
Vehicle Model                               0
Charging Station ID                         0
Charging Station Location                   0
Energy Consumed (kWh)                       0
Charging Duration (hours)                   0
Charging Rate (kW)                          0
Charging Cost (USD)                         0
Time of Day     

# 데이터 분할 (훈련 세트와 테스트 세트)

In [8]:
# 1. SMOTE 적용 전에 NaN 값 처리
# NaN이 있으면 0 또는 다른 값으로 대체
X_encoded = np.nan_to_num(X_encoded, nan=0.0)

# 확인: NaN이 있는지 확인
print("NaN 값 처리 후 확인:", np.isnan(X_encoded).sum())

# 2. 데이터 분할 (User Type과 Charger Type 각각)
X_train_user, X_test_user, y_train_user, y_test_user = train_test_split(
    X_encoded, y_user_encoded, test_size=0.2, random_state=42
)
X_train_charger, X_test_charger, y_train_charger, y_test_charger = train_test_split(
    X_encoded, y_charger_encoded, test_size=0.2, random_state=42
)

# 3. 데이터 불균형 처리 (SMOTE 적용)
from imblearn.over_sampling import SMOTE

smote = SMOTE(random_state=42)

# 데이터 불균형 처리 (SMOTE 적용)
smote = SMOTE(random_state=42)
X_train_user_smote, y_train_user_smote = smote.fit_resample(X_train_user, y_train_user)
X_train_charger_smote, y_train_charger_smote = smote.fit_resample(X_train_charger, y_train_charger)

NaN 값 처리 후 확인: 0


# 하이퍼파라미터 최적화 및 모델 학습

In [9]:
# 데이터 스케일링 (User Type)
scaler_user = StandardScaler()
X_train_user_scaled = scaler_user.fit_transform(X_train_user_smote)
X_test_user_scaled = scaler_user.transform(X_test_user)

scaler_charger = StandardScaler()
X_train_charger_scaled = scaler_charger.fit_transform(X_train_charger_smote)
X_test_charger_scaled = scaler_charger.transform(X_test_charger)

# 5. VotingClassifier 생성 및 학습
voting_clf_user = VotingClassifier(estimators=[
    ('lgbm', LGBMClassifier(random_state=42, n_estimators=200, max_depth=10)),
    ('rf', RandomForestClassifier(random_state=42, n_estimators=200, max_depth=10)),
    ('xgb', XGBClassifier(random_state=42, n_estimators=200, max_depth=10)),
    ('cat', CatBoostClassifier(random_state=42, iterations=200, depth=10, verbose=0)),
    ('lr', LogisticRegression(random_state=42, max_iter=1000))
], voting='soft')

voting_clf_user.fit(X_train_user_scaled, y_train_user_smote)

voting_clf_charger = VotingClassifier(estimators=[
    ('lgbm', LGBMClassifier(random_state=42, n_estimators=200, max_depth=10)),
    ('rf', RandomForestClassifier(random_state=42, n_estimators=200, max_depth=10)),
    ('xgb', XGBClassifier(random_state=42, n_estimators=200, max_depth=10)),
    ('cat', CatBoostClassifier(random_state=42, iterations=200, depth=10, verbose=0)),
    ('lr', LogisticRegression(random_state=42, max_iter=1000))
], voting='soft')

voting_clf_charger.fit(X_train_charger_scaled, y_train_charger_smote)



[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.007575 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2303
[LightGBM] [Info] Number of data points in the train set: 1137, number of used features: 34
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000665 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2191
[LightGBM] [Info] Number of data points in the train set: 1086, number of used features: 34
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612


# 모델 평가

In [10]:
# User Type 예측
y_pred_user = voting_clf_user.predict(X_test_user_scaled)
print("User Type Evaluation:")
print("Accuracy:", accuracy_score(y_test_user, y_pred_user))
print("Classification Report:\n", classification_report(y_test_user, y_pred_user))

# Charger Type 예측
y_pred_charger = voting_clf_charger.predict(X_test_charger_scaled)
print("Charger Type Evaluation:")
print("Accuracy:", accuracy_score(y_test_charger, y_pred_charger))
print("Classification Report:\n", classification_report(y_test_charger, y_pred_charger))



User Type Evaluation:
Accuracy: 0.3409090909090909
Classification Report:
               precision    recall  f1-score   support

           0       0.32      0.40      0.36        75
           1       0.36      0.35      0.35        97
           2       0.35      0.28      0.31        92

    accuracy                           0.34       264
   macro avg       0.34      0.34      0.34       264
weighted avg       0.34      0.34      0.34       264

Charger Type Evaluation:
Accuracy: 0.38257575757575757
Classification Report:
               precision    recall  f1-score   support

           0       0.38      0.35      0.36        84
           1       0.41      0.44      0.43        97
           2       0.35      0.35      0.35        83

    accuracy                           0.38       264
   macro avg       0.38      0.38      0.38       264
weighted avg       0.38      0.38      0.38       264



# 최종 모델 평가

In [12]:
# 최종 모델 평가 함수
def evaluate_model(model, X_test, y_test, target_names, title):
    print(f"\n--- {title} Evaluation ---")
    # 예측
    y_pred = model.predict(X_test)
    # 정확도 출력
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Accuracy: {accuracy:.4f}")
    # 정밀도, 재현율, F1-Score 출력
    print("Classification Report:")
    print(classification_report(y_test, y_pred, target_names=target_names))

# User Type 최종 모델 평가
target_names_user = ['Casual Driver', 'Commuter', 'Long-Distance Traveler']
evaluate_model(voting_clf_user, X_test_user_scaled, y_test_user, target_names_user, "User Type")

# Charger Type 최종 모델 평가
target_names_charger = ['Slow Charger', 'Fast Charger', 'Ultra-fast Charger']
evaluate_model(voting_clf_charger, X_test_charger_scaled, y_test_charger, target_names_charger, "Charger Type")


--- User Type Evaluation ---




Accuracy: 0.3409
Classification Report:
                        precision    recall  f1-score   support

         Casual Driver       0.32      0.40      0.36        75
              Commuter       0.36      0.35      0.35        97
Long-Distance Traveler       0.35      0.28      0.31        92

              accuracy                           0.34       264
             macro avg       0.34      0.34      0.34       264
          weighted avg       0.34      0.34      0.34       264


--- Charger Type Evaluation ---




Accuracy: 0.3826
Classification Report:
                    precision    recall  f1-score   support

      Slow Charger       0.38      0.35      0.36        84
      Fast Charger       0.41      0.44      0.43        97
Ultra-fast Charger       0.35      0.35      0.35        83

          accuracy                           0.38       264
         macro avg       0.38      0.38      0.38       264
      weighted avg       0.38      0.38      0.38       264

