In [1]:
import os

import numpy as np
import pandas as pd

import plotly.express as px
import matplotlib.pyplot as plt

# load data

In [2]:
train_df = pd.read_csv('data/train.csv')
print(train_df.shape)
train_df.head()

(84406, 20)


Unnamed: 0,ID,월,요일,시간,소관경찰서,소관지역,사건발생거리,강수량(mm),강설량(mm),적설량(cm),풍향,안개,짙은안개,번개,진눈깨비,서리,연기/연무,눈날림,범죄발생지,TARGET
0,TRAIN_00000,9,화요일,10,137,8.0,2.611124,0.0,0.0,0.0,245.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,차도,2
1,TRAIN_00001,11,화요일,6,438,13.0,3.209093,0.0,0.0,0.0,200.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,차도,0
2,TRAIN_00002,8,일요일,6,1729,47.0,1.619597,0.0,0.0,0.0,40.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,인도,1
3,TRAIN_00003,5,월요일,6,2337,53.0,1.921615,11.375,0.0,0.0,225.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,주거지,1
4,TRAIN_00004,9,일요일,11,1439,41.0,1.789721,0.0,0.0,0.0,255.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,주유소,2


In [3]:
test_df = pd.read_csv('data/test.csv')
print(test_df.shape)
test_df.head()

(17289, 19)


Unnamed: 0,ID,월,요일,시간,소관경찰서,소관지역,사건발생거리,강수량(mm),강설량(mm),적설량(cm),풍향,안개,짙은안개,번개,진눈깨비,서리,연기/연무,눈날림,범죄발생지
0,TEST_00000,9,금요일,5,927,28.0,1.570654,19.625,0.0,0.0,165.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,차도
1,TEST_00001,5,수요일,3,926,28.0,1.712457,21.444444,0.0,0.0,175.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,식당
2,TEST_00002,5,월요일,6,1437,33.0,0.447496,25.2,0.0,0.0,290.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,주거지
3,TEST_00003,11,화요일,1,1739,31.0,0.878585,0.0,0.0,0.0,285.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,주거지
4,TEST_00004,10,목요일,10,830,15.0,0.496423,26.142857,0.0,0.0,95.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,주거지


# feature exploration

In [4]:
train_df.TARGET.value_counts()

TARGET
0    36453
1    25397
2    22556
Name: count, dtype: int64

# preprocess data

In [5]:
cat_cols = []
num_cols = []

for col in test_df.columns[1:]:
    if train_df[col].dtype == 'object':
        cat_cols.append(col)
    else:
        num_cols.append(col)

print('categorical columns: ', cat_cols)
print('numeric columns: ', num_cols)

categorical columns:  ['요일', '범죄발생지']
numeric columns:  ['월', '시간', '소관경찰서', '소관지역', '사건발생거리', '강수량(mm)', '강설량(mm)', '적설량(cm)', '풍향', '안개', '짙은안개', '번개', '진눈깨비', '서리', '연기/연무', '눈날림']


## preprocess cat data

In [6]:
from sklearn.preprocessing import OneHotEncoder

encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)

In [7]:
train_cat_array = encoder.fit_transform(train_df[cat_cols])
test_cat_array = encoder.transform(test_df[cat_cols])

encoded_cols = np.concatenate(encoder.categories_)
encoded_cols

array(['금요일', '목요일', '수요일', '월요일', '일요일', '토요일', '화요일', '공원', '백화점', '병원',
       '식당', '약국', '은행', '인도', '주거지', '주유소', '주차장', '차도', '편의점', '학교',
       '호텔/모텔'], dtype=object)

In [8]:
train_df = pd.concat([
    train_df.drop(columns=cat_cols),
    pd.DataFrame(train_cat_array, columns=encoded_cols)], axis=1)
    
test_df = pd.concat([
    test_df.drop(columns=cat_cols),
    pd.DataFrame(test_cat_array, columns=encoded_cols)], axis=1)

train_df.shape, test_df.shape

((84406, 39), (17289, 38))

## preprocess numeric data

In [9]:
train_df[num_cols].describe()

Unnamed: 0,월,시간,소관경찰서,소관지역,사건발생거리,강수량(mm),강설량(mm),적설량(cm),풍향,안개,짙은안개,번개,진눈깨비,서리,연기/연무,눈날림
count,84406.0,84406.0,84406.0,84406.0,84406.0,84406.0,84406.0,84406.0,84406.0,84406.0,84406.0,84406.0,84406.0,84406.0,84406.0,84406.0
mean,6.430195,6.769507,1060.027581,26.881726,1.912424,24.608776,2.284407,23.430503,186.926107,0.385423,0.017842,0.144042,0.02033,0.01026,0.210755,0.008921
std,3.108302,3.56639,698.380485,13.870968,0.958556,62.711211,15.852881,85.199896,98.299485,0.486698,0.132379,0.351134,0.141128,0.100771,0.407847,0.09403
min,1.0,1.0,26.0,5.0,0.012269,0.0,0.0,0.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,4.0,4.0,526.0,13.0,1.209985,0.0,0.0,0.0,95.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,7.0,7.0,937.0,27.0,1.822279,0.625,0.0,0.0,205.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,9.0,10.0,1638.0,38.0,2.476528,18.571429,0.0,0.0,260.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
max,12.0,12.0,2450.0,54.0,4.998936,614.875,295.0,649.8,360.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [10]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

In [11]:
train_df[num_cols] = scaler.fit_transform(train_df[num_cols])
test_df[num_cols] = scaler.transform(test_df[num_cols])

train_df[num_cols].describe()

Unnamed: 0,월,시간,소관경찰서,소관지역,사건발생거리,강수량(mm),강설량(mm),적설량(cm),풍향,안개,짙은안개,번개,진눈깨비,서리,연기/연무,눈날림
count,84406.0,84406.0,84406.0,84406.0,84406.0,84406.0,84406.0,84406.0,84406.0,84406.0,84406.0,84406.0,84406.0,84406.0,84406.0,84406.0
mean,1.522002e-16,1.146132e-16,3.7039880000000003e-17,7.176477000000001e-17,2.200506e-16,7.66052e-18,-1.0438510000000002e-17,4.3227220000000003e-17,5.2192560000000005e-17,-3.493534e-18,-4.4447850000000005e-17,1.056794e-16,-4.2848410000000005e-17,7.500576000000001e-17,-1.052269e-17,-6.759778000000001e-17
std,1.000006,1.000006,1.000006,1.000006,1.000006,1.000006,1.000006,1.000006,1.000006,1.000006,1.000006,1.000006,1.000006,1.000006,1.000006,1.000006
min,-1.747008,-1.617754,-1.480617,-1.577529,-1.98232,-0.3924166,-0.1441013,-0.2750079,-1.799879,-0.7919185,-0.134783,-0.4102213,-0.1440563,-0.101815,-0.5167533,-0.09487608
25%,-0.7818446,-0.7765621,-0.7646702,-1.000782,-0.7328137,-0.3924166,-0.1441013,-0.2750079,-0.9351692,-0.7919185,-0.134783,-0.4102213,-0.1440563,-0.101815,-0.5167533,-0.09487608
50%,0.1833184,0.06462963,-0.1761623,0.008526751,-0.09404287,-0.3824502,-0.1441013,-0.2750079,0.1838667,-0.7919185,-0.134783,-0.4102213,-0.1440563,-0.101815,-0.5167533,-0.09487608
75%,0.8267604,0.9058214,0.8275945,0.8015547,0.5884964,-0.09627278,-0.1441013,-0.2750079,0.7433846,1.262756,-0.134783,-0.4102213,-0.1440563,-0.101815,-0.5167533,-0.09487608
max,1.791923,1.466616,1.990291,1.95505,3.219977,9.412507,18.46461,7.351807,1.76069,1.262756,7.419332,2.437709,6.941732,9.821737,1.93516,10.54006


In [12]:
train_X = train_df[np.concatenate([encoded_cols, num_cols])]
test_X = test_df[np.concatenate([encoded_cols, num_cols])]
train_X.shape, test_X.shape

((84406, 37), (17289, 37))

In [13]:
train_y = train_df['TARGET']
train_y_multi = pd.get_dummies(train_df['TARGET'])
train_y.shape, train_y_multi.shape

((84406,), (84406, 3))

# Oversampling

In [14]:
from imblearn.over_sampling import RandomOverSampler

# 원본 데이터 X와 레이블 y가 있다고 가정
# X: (n_samples, n_features) 크기의 2D 배열
# y: (n_samples,) 크기의 1D 배열

# RandomOverSampler 객체 생성
oversampler = RandomOverSampler()

# 오버샘플링 수행
X_resampled, y_resampled = oversampler.fit_resample(train_X, train_y)

# 오버샘플링된 데이터 확인
print("오버샘플링된 데이터 수:", len(X_resampled))
print("오버샘플링된 클래스별 분포:", np.bincount(y_resampled))

오버샘플링된 데이터 수: 109359
오버샘플링된 클래스별 분포: [36453 36453 36453]


In [15]:
y_resampled_multi = pd.get_dummies(y_resampled)
y_resampled.shape, y_resampled_multi.shape

((109359,), (109359, 3))

# Split train data into train/val with StratifiedKFold

In [16]:
# from sklearn.model_selection import StratifiedKFold

# skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

In [17]:
# for i, (train_index, val_index) in enumerate(skf.split(train_X, train_y)):
#     print(f"Fold {i}:")
#     print(f"  Train: index={train_index}")
#     print(f"  Test:  index={val_index}")

In [18]:
## 나중에 꼭 잘라서 교차검증하자

# Ensembles

In [19]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score

# 데이터 분할
X_resampled_train, X_resampled_val, y_resampled_train, y_resampled_val = \
    train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

In [20]:
# 앙상블 모델 생성
ensemble = RandomForestClassifier(random_state=42, verbose=2)

# 앙상블 모델 훈련
ensemble.fit(X_resampled_train, y_resampled_train)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.1s remaining:    0.0s


building tree 1 of 100
building tree 2 of 100
building tree 3 of 100
building tree 4 of 100
building tree 5 of 100
building tree 6 of 100
building tree 7 of 100
building tree 8 of 100
building tree 9 of 100
building tree 10 of 100
building tree 11 of 100
building tree 12 of 100
building tree 13 of 100
building tree 14 of 100
building tree 15 of 100
building tree 16 of 100
building tree 17 of 100
building tree 18 of 100
building tree 19 of 100
building tree 20 of 100
building tree 21 of 100
building tree 22 of 100
building tree 23 of 100
building tree 24 of 100
building tree 25 of 100
building tree 26 of 100
building tree 27 of 100
building tree 28 of 100
building tree 29 of 100
building tree 30 of 100
building tree 31 of 100
building tree 32 of 100
building tree 33 of 100
building tree 34 of 100
building tree 35 of 100
building tree 36 of 100
building tree 37 of 100
building tree 38 of 100
building tree 39 of 100
building tree 40 of 100
building tree 41 of 100
building tree 42 of 100
b

[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:   13.5s finished


In [21]:
# 앙상블 모델 예측
val_pred = ensemble.predict(X_resampled_val)

# 앙상블 모델 성능 평가
accuracy = accuracy_score(y_resampled_val, val_pred)
f1_macro = f1_score(y_resampled_val, val_pred, average='macro')

print("앙상블 모델 정확도:", accuracy)
print("앙상블 모델 F1:", f1_macro)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s


앙상블 모델 정확도: 0.6833851499634236
앙상블 모델 F1: 0.682348151525427


[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.8s finished


In [22]:
# 앙상블 모델 예측
train_pred = ensemble.predict(X_resampled_train)

# 앙상블 모델 성능 평가
accuracy = accuracy_score(y_resampled_train, train_pred)
f1_macro = f1_score(y_resampled_train, train_pred, average='macro')

print("앙상블 모델 정확도:", accuracy)
print("앙상블 모델 F1:", f1_macro)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s


앙상블 모델 정확도: 0.9992113113948358
앙상블 모델 F1: 0.9992112590762517


[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    2.7s finished


In [23]:
# 앙상블 모델 예측
test_pred = ensemble.predict(test_X)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.6s finished


In [24]:
sample_submission_df = pd.read_csv("data/sample_submission.csv")

train_df.shape, test_df.shape, sample_submission_df.shape

((84406, 39), (17289, 38), (17289, 2))

In [25]:
mname = 'randomforest'
desc = 'oversampled'

In [26]:
from datetime import datetime as dt

def make_report(template, test_pred, mname):
    template['TARGET'] = test_pred
    now = dt.strftime(dt.now(), '%y-%m-%d')
    template.to_csv(f'results/{mname}-{desc}-{now}.csv', index=False)
    
make_report(sample_submission_df, test_pred, mname)

In [27]:
## optimizer
## early stopping
## compile
## train
## evaluate

In [28]:
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.datasets import make_classification
# from sklearn.model_selection import train_test_split
# from sklearn.metrics import accuracy_score

# # 분류 데이터 생성
# X, y = make_classification(n_samples=1000, n_features=10, random_state=42)

# # 데이터 분할
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# # 앙상블 모델 생성
# ensemble = RandomForestClassifier(random_state=42)

# # 앙상블 모델 훈련
# ensemble.fit(X_train, y_train)

# # 앙상블 모델 예측
# y_pred = ensemble.predict(X_test)

# # 앙상블 모델 성능 평가
# accuracy = accuracy_score(y_test, y_pred)
# print("앙상블 모델 정확도:", accuracy)

????