In [1]:
import os

import numpy as np
import pandas as pd

import plotly.express as px
import matplotlib.pyplot as plt

# load data

In [2]:
train_df = pd.read_csv('data/train.csv')
print(train_df.shape)
train_df.head()

(84406, 20)


Unnamed: 0,ID,월,요일,시간,소관경찰서,소관지역,사건발생거리,강수량(mm),강설량(mm),적설량(cm),풍향,안개,짙은안개,번개,진눈깨비,서리,연기/연무,눈날림,범죄발생지,TARGET
0,TRAIN_00000,9,화요일,10,137,8.0,2.611124,0.0,0.0,0.0,245.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,차도,2
1,TRAIN_00001,11,화요일,6,438,13.0,3.209093,0.0,0.0,0.0,200.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,차도,0
2,TRAIN_00002,8,일요일,6,1729,47.0,1.619597,0.0,0.0,0.0,40.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,인도,1
3,TRAIN_00003,5,월요일,6,2337,53.0,1.921615,11.375,0.0,0.0,225.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,주거지,1
4,TRAIN_00004,9,일요일,11,1439,41.0,1.789721,0.0,0.0,0.0,255.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,주유소,2


In [3]:
test_df = pd.read_csv('data/test.csv')
print(test_df.shape)
test_df.head()

(17289, 19)


Unnamed: 0,ID,월,요일,시간,소관경찰서,소관지역,사건발생거리,강수량(mm),강설량(mm),적설량(cm),풍향,안개,짙은안개,번개,진눈깨비,서리,연기/연무,눈날림,범죄발생지
0,TEST_00000,9,금요일,5,927,28.0,1.570654,19.625,0.0,0.0,165.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,차도
1,TEST_00001,5,수요일,3,926,28.0,1.712457,21.444444,0.0,0.0,175.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,식당
2,TEST_00002,5,월요일,6,1437,33.0,0.447496,25.2,0.0,0.0,290.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,주거지
3,TEST_00003,11,화요일,1,1739,31.0,0.878585,0.0,0.0,0.0,285.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,주거지
4,TEST_00004,10,목요일,10,830,15.0,0.496423,26.142857,0.0,0.0,95.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,주거지


# preprocess data

In [4]:
cat_cols = []
num_cols = []

for col in test_df.columns[1:]:
    if train_df[col].dtype == 'object':
        cat_cols.append(col)
    else:
        num_cols.append(col)

print('categorical columns: ', cat_cols)
print('numeric columns: ', num_cols)

categorical columns:  ['요일', '범죄발생지']
numeric columns:  ['월', '시간', '소관경찰서', '소관지역', '사건발생거리', '강수량(mm)', '강설량(mm)', '적설량(cm)', '풍향', '안개', '짙은안개', '번개', '진눈깨비', '서리', '연기/연무', '눈날림']


## preprocess cat data

In [5]:
from sklearn.preprocessing import OneHotEncoder

encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)

In [6]:
train_cat_array = encoder.fit_transform(train_df[cat_cols])
test_cat_array = encoder.transform(test_df[cat_cols])

encoded_cols = np.concatenate(encoder.categories_)
encoded_cols

array(['금요일', '목요일', '수요일', '월요일', '일요일', '토요일', '화요일', '공원', '백화점', '병원',
       '식당', '약국', '은행', '인도', '주거지', '주유소', '주차장', '차도', '편의점', '학교',
       '호텔/모텔'], dtype=object)

In [7]:
train_df = pd.concat([
    train_df.drop(columns=cat_cols),
    pd.DataFrame(train_cat_array, columns=encoded_cols)], axis=1)
    
test_df = pd.concat([
    test_df.drop(columns=cat_cols),
    pd.DataFrame(test_cat_array, columns=encoded_cols)], axis=1)

train_df.shape, test_df.shape

((84406, 39), (17289, 38))

## preprocess numeric data

In [8]:
train_df[num_cols].describe()

Unnamed: 0,월,시간,소관경찰서,소관지역,사건발생거리,강수량(mm),강설량(mm),적설량(cm),풍향,안개,짙은안개,번개,진눈깨비,서리,연기/연무,눈날림
count,84406.0,84406.0,84406.0,84406.0,84406.0,84406.0,84406.0,84406.0,84406.0,84406.0,84406.0,84406.0,84406.0,84406.0,84406.0,84406.0
mean,6.430195,6.769507,1060.027581,26.881726,1.912424,24.608776,2.284407,23.430503,186.926107,0.385423,0.017842,0.144042,0.02033,0.01026,0.210755,0.008921
std,3.108302,3.56639,698.380485,13.870968,0.958556,62.711211,15.852881,85.199896,98.299485,0.486698,0.132379,0.351134,0.141128,0.100771,0.407847,0.09403
min,1.0,1.0,26.0,5.0,0.012269,0.0,0.0,0.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,4.0,4.0,526.0,13.0,1.209985,0.0,0.0,0.0,95.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,7.0,7.0,937.0,27.0,1.822279,0.625,0.0,0.0,205.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,9.0,10.0,1638.0,38.0,2.476528,18.571429,0.0,0.0,260.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
max,12.0,12.0,2450.0,54.0,4.998936,614.875,295.0,649.8,360.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [9]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

In [10]:
train_df[num_cols] = scaler.fit_transform(train_df[num_cols])
test_df[num_cols] = scaler.transform(test_df[num_cols])

train_df[num_cols].describe()

Unnamed: 0,월,시간,소관경찰서,소관지역,사건발생거리,강수량(mm),강설량(mm),적설량(cm),풍향,안개,짙은안개,번개,진눈깨비,서리,연기/연무,눈날림
count,84406.0,84406.0,84406.0,84406.0,84406.0,84406.0,84406.0,84406.0,84406.0,84406.0,84406.0,84406.0,84406.0,84406.0,84406.0,84406.0
mean,1.522002e-16,1.146132e-16,3.7039880000000003e-17,7.176477000000001e-17,2.200506e-16,7.66052e-18,-1.0438510000000002e-17,4.3227220000000003e-17,5.2192560000000005e-17,-3.493534e-18,-4.4447850000000005e-17,1.056794e-16,-4.2848410000000005e-17,7.500576000000001e-17,-1.052269e-17,-6.759778000000001e-17
std,1.000006,1.000006,1.000006,1.000006,1.000006,1.000006,1.000006,1.000006,1.000006,1.000006,1.000006,1.000006,1.000006,1.000006,1.000006,1.000006
min,-1.747008,-1.617754,-1.480617,-1.577529,-1.98232,-0.3924166,-0.1441013,-0.2750079,-1.799879,-0.7919185,-0.134783,-0.4102213,-0.1440563,-0.101815,-0.5167533,-0.09487608
25%,-0.7818446,-0.7765621,-0.7646702,-1.000782,-0.7328137,-0.3924166,-0.1441013,-0.2750079,-0.9351692,-0.7919185,-0.134783,-0.4102213,-0.1440563,-0.101815,-0.5167533,-0.09487608
50%,0.1833184,0.06462963,-0.1761623,0.008526751,-0.09404287,-0.3824502,-0.1441013,-0.2750079,0.1838667,-0.7919185,-0.134783,-0.4102213,-0.1440563,-0.101815,-0.5167533,-0.09487608
75%,0.8267604,0.9058214,0.8275945,0.8015547,0.5884964,-0.09627278,-0.1441013,-0.2750079,0.7433846,1.262756,-0.134783,-0.4102213,-0.1440563,-0.101815,-0.5167533,-0.09487608
max,1.791923,1.466616,1.990291,1.95505,3.219977,9.412507,18.46461,7.351807,1.76069,1.262756,7.419332,2.437709,6.941732,9.821737,1.93516,10.54006


In [11]:
train_X = train_df[np.concatenate([encoded_cols, num_cols])]
test_X = test_df[np.concatenate([encoded_cols, num_cols])]
train_X.shape, test_X.shape

((84406, 37), (17289, 37))

In [12]:
train_y = train_df['TARGET']
train_y_multi = pd.get_dummies(train_df['TARGET'])
train_y.shape, train_y_multi.shape

((84406,), (84406, 3))

# Oversampling

In [13]:
from imblearn.over_sampling import RandomOverSampler

# 원본 데이터 X와 레이블 y가 있다고 가정
# X: (n_samples, n_features) 크기의 2D 배열
# y: (n_samples,) 크기의 1D 배열

# RandomOverSampler 객체 생성
oversampler = RandomOverSampler()

# 오버샘플링 수행
X_resampled, y_resampled = oversampler.fit_resample(train_X, train_y)

# 오버샘플링된 데이터 확인
print("오버샘플링된 데이터 수:", len(X_resampled))
print("오버샘플링된 클래스별 분포:", np.bincount(y_resampled))

오버샘플링된 데이터 수: 109359
오버샘플링된 클래스별 분포: [36453 36453 36453]


In [14]:
y_resampled_multi = pd.get_dummies(y_resampled)
y_resampled.shape, y_resampled_multi.shape

((109359,), (109359, 3))

# Split train data into train/val with StratifiedKFold

In [15]:
# from sklearn.model_selection import StratifiedKFold

# skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

In [16]:
# for i, (train_index, val_index) in enumerate(skf.split(train_X, train_y)):
#     print(f"Fold {i}:")
#     print(f"  Train: index={train_index}")
#     print(f"  Test:  index={val_index}")

In [17]:
## 나중에 꼭 잘라서 교차검증하자

In [18]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score

# 데이터 분할
X_resampled_train, X_resampled_val, y_resampled_train, y_resampled_val = \
    train_test_split(X_resampled, y_resampled, test_size=0.2, stratify=y_resampled, random_state=42)

In [19]:
data = {
    'X_train_all': X_resampled,
    'y_train_all': y_resampled,
    'X_train': X_resampled_train,
    'y_train': y_resampled_train,
    'X_val': X_resampled_val,
    'y_val': y_resampled_val,
    'X_test': test_X,
}

# Ensembles

In [34]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, \
    GradientBoostingClassifier, ExtraTreesClassifier, HistGradientBoostingClassifier

from sklearn.metrics import accuracy_score, f1_score

In [35]:
from scipy.stats import uniform as sp_randFloat
from scipy.stats import randint as sp_randInt

In [81]:
max_depth = sp_randInt(5, 8)
n_estimators = sp_randInt(100, 500)
# n_estimators = [int(x) for x in np.linspace(start = 10, stop = 80, num = 10)]
max_features = [None, 'sqrt', 'log2']
min_samples_split = [2, 5]
max_iter = sp_randInt(100, 500)
max_leaf_nodes = [1, 10, 31, 50, 100]
min_samples_leaf = [1, 2, 10, 20, 50]
bootstrap = [True, False]

ensemble_params = {
    'ada': {
        "n_estimators" : n_estimators,
        "learning_rate": sp_randFloat(),
    },
    'gbm': {
        "n_estimators" : n_estimators,
        "max_depth"    : max_depth,
        "learning_rate": sp_randFloat(),
        "subsample"    : sp_randFloat(),
    },
    'histgbm': {
        "warm_start"   : [True, False],
        "max_depth"    : max_depth,
        "learning_rate": sp_randFloat(),
        "max_iter"     : max_iter,
        "max_leaf_nodes": max_leaf_nodes,
        "l2_regularization": sp_randFloat(),
        "min_samples_leaf": min_samples_leaf,
    },
    'random_forest': {
        "max_depth"    : max_depth,
        'n_estimators': n_estimators,
        'max_features': max_features,
        'min_samples_split': min_samples_split,
        'min_samples_leaf': min_samples_leaf,
        'bootstrap': bootstrap
    }
}

In [88]:
def ml_train_and_eval(modelname, ensemble_params, data):
    scoring = 'accuracy'
    
    model_dict = {
        'ada': AdaBoostClassifier(), 
        'gbm': GradientBoostingClassifier(),
        'histgbm': HistGradientBoostingClassifier(loss='categorical_crossentropy'), 
        'rf':  RandomForestClassifier(),
    }

    model = model_dict[modelname]
    RandomGrid = RandomizedSearchCV(
        estimator=model, 
        param_distributions=ensemble_params[modelname], 
        cv=5,  
        n_jobs=30,
        scoring=scoring, 
        refit=True,
        random_state=42,
        verbose=1,
    )

    RandomGrid.fit(data['X_train'], data['y_train'])
    
    # 앙상블 모델 예측 & 성능 평가
    val_pred = RandomGrid.predict(data['X_val'])
    accuracy = accuracy_score(data['y_val'], val_pred)
    f1_macro = f1_score(data['y_val'], val_pred, average='macro')

    print("앙상블 모델 정확도:", accuracy)
    print("앙상블 모델 F1:", f1_macro)

    # 앙상블 모델 예측 & 성능 평가
    train_pred = RandomGrid.predict(data['X_train'])
    accuracy = accuracy_score(data['y_train'], train_pred)
    f1_macro = f1_score(data['y_train'], train_pred, average='macro')

    # 앙상블 모델 성능 평가
    accuracy = accuracy_score(y_resampled_train, train_pred)
    f1_macro = f1_score(y_resampled_train, train_pred, average='macro')

    print("(train) 앙상블 모델 정확도:", accuracy)
    print("(train) 앙상블 모델 F1:", f1_macro)

    #전체 데이터로 재학습
    RandomGrid = RandomizedSearchCV(
        estimator=model, 
        param_distributions=ensemble_params[modelname], 
        cv=5,  
        n_jobs=30,
        scoring=scoring, 
        refit=True,
        random_state=42,
        verbose=1,
    )
    RandomGrid.fit(data['X_train_all'], data['y_train_all'])

    # 앙상블 모델 예측 & 성능 평가
    val_pred = RandomGrid.predict(data['X_val'])
    accuracy = accuracy_score(data['y_val'], val_pred)
    f1_macro = f1_score(data['y_val'], val_pred, average='macro')

    print("앙상블 모델 정확도:", accuracy)
    print("앙상블 모델 F1:", f1_macro)

    # 앙상블 모델 예측 & 성능 평가
    train_pred = RandomGrid.predict(data['X_train'])
    accuracy = accuracy_score(data['y_train'], train_pred)
    f1_macro = f1_score(data['y_train'], train_pred, average='macro')

    # 앙상블 모델 성능 평가
    accuracy = accuracy_score(y_resampled_train, train_pred)
    f1_macro = f1_score(y_resampled_train, train_pred, average='macro')

    print("(train) 앙상블 모델 정확도:", accuracy)
    print("(train) 앙상블 모델 F1:", f1_macro)

    test_pred = RandomGrid.predict(data['X_test'])

    return test_pred

---
# Ada

In [89]:
# test_pred = ml_train_and_eval('ada', ensemble_params, data)

In [90]:
# mname = 'adaboost'
# desc = 'oversampled'

In [91]:
# from datetime import datetime as dt

# def make_report(template, test_pred, mname):
#     template['TARGET'] = test_pred
#     now = dt.strftime(dt.now(), '%y-%m-%d')
#     template.to_csv(f'results/{mname}-{desc}-{now}.csv', index=False)

# sample_submission_df = pd.read_csv("data/sample_submission.csv")    
# make_report(sample_submission_df, test_pred, mname)

---
# GBM - 3

In [92]:
test_pred = ml_train_and_eval('histgbm', ensemble_params, data)

Fitting 5 folds for each of 10 candidates, totalling 50 fits




앙상블 모델 정확도: 0.5699067300658376
앙상블 모델 F1: 0.5706280439730712
(train) 앙상블 모델 정확도: 0.6671619783510693
(train) 앙상블 모델 F1: 0.6677983065791294
Fitting 5 folds for each of 10 candidates, totalling 50 fits




앙상블 모델 정확도: 0.6578730797366495
앙상블 모델 F1: 0.6587292055623268
(train) 앙상블 모델 정확도: 0.653754272063278
(train) 앙상블 모델 F1: 0.6544637558501981


In [93]:
mname = 'histgbm'
desc = 'oversampled'

In [94]:
from datetime import datetime as dt

def make_report(template, test_pred, mname):
    template['TARGET'] = test_pred
    now = dt.strftime(dt.now(), '%y-%m-%d')
    template.to_csv(f'results/{mname}-{desc}-{now}-3.csv', index=False)

sample_submission_df = pd.read_csv("data/sample_submission.csv")    
make_report(sample_submission_df, test_pred, mname)

0.5249471955

---
# GBM - 2 (train, val 나눈 것에 대해서)

In [86]:
test_pred = ml_train_and_eval('histgbm', ensemble_params, data)

Fitting 5 folds for each of 10 candidates, totalling 50 fits




앙상블 모델 정확도: 0.5698610095098756
앙상블 모델 F1: 0.5707603215871634
(train) 앙상블 모델 정확도: 0.6669105124189879
(train) 앙상블 모델 F1: 0.6676039049950903


In [None]:
mname = 'histgbm'
desc = 'oversampled'

In [87]:
from datetime import datetime as dt

def make_report(template, test_pred, mname):
    template['TARGET'] = test_pred
    now = dt.strftime(dt.now(), '%y-%m-%d')
    template.to_csv(f'results/{mname}-{desc}-{now}-2.csv', index=False)

sample_submission_df = pd.read_csv("data/sample_submission.csv")    
make_report(sample_submission_df, test_pred, mname)

0.5204318447

---
# GBM - 0 (train, val 나눈 것에 대해서 - 0.5175542331), 1 (train all에 대해서 - 0.5182909804)

In [78]:
test_pred = ml_train_and_eval('histgbm', ensemble_params, data)

Fitting 5 folds for each of 10 candidates, totalling 50 fits




앙상블 모델 정확도: 0.6206108266276518
앙상블 모델 F1: 0.6212064091177937
(train) 앙상블 모델 정확도: 0.6149256460959914
(train) 앙상블 모델 F1: 0.615504317667967


In [67]:
mname = 'histgbm'
desc = 'oversampled'

In [79]:
from datetime import datetime as dt

def make_report(template, test_pred, mname):
    template['TARGET'] = test_pred
    now = dt.strftime(dt.now(), '%y-%m-%d')
    template.to_csv(f'results/{mname}-{desc}-{now}-1.csv', index=False)

sample_submission_df = pd.read_csv("data/sample_submission.csv")    
make_report(sample_submission_df, test_pred, mname)

0.5182909804