In [None]:
# Mount Google Drive into the Colab runtime so the files under
# /content/drive can be read by the cells below.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

In [None]:
# import libraries

import warnings

import matplotlib.pyplot as plt
import missingno as msno
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import silhouette_score , f1_score , precision_score , recall_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

sns.set_style('darkgrid')
warnings.filterwarnings('ignore')

In [None]:
# Load the dataset from the project folder under the Drive mount point.

path = '/content/drive/MyDrive/ETC/감염병 대응 빅데이터 아이디어 공모전/dataset'
csv_file = path + '/df.csv'

df = pd.read_csv(filepath_or_buffer = csv_file)

In [None]:
# Use the 'Unnamed: 0' column as the row index and remove it from the columns.
df = df.set_index('Unnamed: 0')

In [None]:
# Rename the index axis from 'Unnamed: 0' to '지역' (region).
index_name_map = {'Unnamed: 0' : '지역'}
df = df.rename_axis(index = index_name_map)

## 1. 군집분석

### 1-1. Elbow Method

In [None]:
# Define the elbow function

def elbow(df , random_state = 42 , n_init = 10):
    """Plot the K-means SSE (inertia) for every cluster count from 1 to len(df) - 1.

    Parameters
    ----------
    df : array-like of shape (n_samples, n_features)
        Numeric feature matrix; it is passed unchanged to ``KMeans.fit``.
    random_state : int, default 42
        Seed for centroid initialisation, so the curve is identical on every run.
    n_init : int, default 10
        Number of K-means restarts per cluster count. It is set explicitly so the
        result does not depend on the library's version-specific default.

    Returns
    -------
    None
        The curve is drawn on the current matplotlib axes.
    """
    cluster_counts = range(1 , len(df))

    # One fitted model per candidate k; inertia_ is the within-cluster SSE.
    sse = [
        KMeans(n_clusters = n_clusters , n_init = n_init , random_state = random_state)
        .fit(df)
        .inertia_
        for n_clusters in cluster_counts
    ]

    sns.lineplot(
        x = cluster_counts ,
        y = sse ,
        marker = 'o' ,
        color = 'g'
    )
    plt.title("Elbow Method")
    plt.xlabel("number of cluster")
    plt.ylabel("sum of square error")

In [None]:
# Run the elbow diagnostic on standardized features, i.e. the same
# representation the final K-means model is fit on, so that columns with a
# large numeric range do not dominate the SSE. The 'Cluster' label is excluded
# in case this cell is re-run after the clustering step has added it.
elbow_features = df.drop(columns = ['Cluster'] , errors = 'ignore')
elbow(StandardScaler().fit_transform(elbow_features))

### 1-2. Silhouette Method

In [None]:
def silhouette(df , random_state = 42 , n_init = 10):
    """Plot the mean silhouette score for every cluster count from 2 to len(df) - 1.

    Parameters
    ----------
    df : array-like of shape (n_samples, n_features)
        Numeric feature matrix; it is passed unchanged to ``KMeans.fit`` and
        ``silhouette_score``.
    random_state : int, default 42
        Seed for centroid initialisation, so the curve is identical on every run.
    n_init : int, default 10
        Number of K-means restarts per cluster count. It is set explicitly so the
        result does not depend on the library's version-specific default.

    Returns
    -------
    None
        The curve is drawn on the current matplotlib axes.
    """
    # Start at 2 because a silhouette score needs at least two clusters.
    cluster_counts = range(2 , len(df))

    # Named `scores` so the list does not shadow this function's own name.
    scores = []
    for n_clusters in cluster_counts:
        km = KMeans(n_clusters = n_clusters , n_init = n_init , random_state = random_state)
        labels = km.fit_predict(df)
        scores.append(silhouette_score(df , labels))

    sns.lineplot(
        x = cluster_counts ,
        y = scores ,
        marker = 'o' ,
        color = 'g'
    )
    plt.title("Silhouette Method")
    plt.xlabel("number of cluster")
    plt.ylabel("Score")

In [None]:
# Run the silhouette diagnostic on standardized features, i.e. the same
# representation the final K-means model is fit on, so that columns with a
# large numeric range do not dominate the distances. The 'Cluster' label is
# excluded in case this cell is re-run after the clustering step has added it.
silhouette_features = df.drop(columns = ['Cluster'] , errors = 'ignore')
silhouette(StandardScaler().fit_transform(silhouette_features))

### 1-3. Clustering

In [None]:
# Preprocessing
# Standardize every feature column and keep the original column names and row
# index. The 'Cluster' label (added by the clustering cell) is excluded so that
# re-running this cell never feeds the label back in as a feature.

feature_df = df.drop(columns = ['Cluster'] , errors = 'ignore')

scaler = StandardScaler()

scaled_data = scaler.fit_transform(feature_df)

scaled_df = pd.DataFrame(
    scaled_data ,
    columns = feature_df.columns ,
    index = feature_df.index
)

In [None]:
# Cluster analysis
# Fit K-means once on the standardized features and store the label of each
# row in df['Cluster']. The seed and n_init are fixed so the labels are the
# same on every run.

k = 3

kmeans = KMeans(n_clusters = k , n_init = 10 , random_state = 42)

# fit_predict both fits the model and returns the labels, so no separate
# fit() call is needed.
df['Cluster'] = kmeans.fit_predict(scaled_df)

In [None]:
# Visualization
# Project the standardized features (the same matrix K-means was fit on) onto
# the first two principal components and colour each point by its cluster.

fig , ax = plt.subplots(figsize = (8 , 8))

pca = PCA(n_components = 2)
data_pca = pca.fit_transform(scaled_df)

data_pca = pd.DataFrame(data_pca , columns = ['PC1' , 'PC2'])
# Positional assignment: data_pca has a fresh RangeIndex, so copy the labels
# as a plain array rather than aligning on df's index.
data_pca['Cluster'] = df['Cluster'].to_numpy()

sns.scatterplot(x = 'PC1' ,
                y = 'PC2' ,
                hue = 'Cluster' ,
                data = data_pca ,
                palette = 'viridis' ,
                ax = ax)
ax.set_title("PCA K-means")
plt.show()

## 2. 분류 모델

In [None]:
# Build the classification dataset from a copy of df so the clustered frame
# itself is left untouched. reset_index(drop = True) replaces the index with
# 0..n-1 for any number of rows instead of assuming exactly 17.
train_df = df.copy().reset_index(drop = True)

# The K-means cluster label is the classification target; the remaining
# columns are the features.
target = train_df['Cluster']
train_df = train_df.drop(columns = ['Cluster'])

# split data

trainX , testX , trainY , testY = train_test_split(train_df , target , test_size = 0.3 , random_state = 42)

In [None]:
# Random-forest classifier trained to predict the cluster label from the
# features; the hyper-parameters are collected in one place.
rf_params = dict(
    n_estimators = 300 ,
    max_depth = 10 ,
    random_state = 42 ,
)
RF_model = RandomForestClassifier(**rf_params)

RF_model.fit(X = trainX , y = trainY)

In [None]:
# Predict the cluster label for every row of the held-out test split.
prediction = RF_model.predict(X = testX)

In [None]:
# Evaluation
# Print weighted-average F1, precision and recall for the test-split predictions.

metric_fns = {
    'f1' : f1_score ,
    'precision' : precision_score ,
    'recall' : recall_score ,
}

for metric_name , metric_fn in metric_fns.items():
    print(f"{metric_name} : {metric_fn(testY , prediction , average = 'weighted')}")