In [None]:
import pandas as pd
import numpy as np
import warnings
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
import xgboost as xgb
from sklearn.metrics import mean_squared_error, r2_score

In [None]:
# Load the dataset and keep only fully populated rows.
# NOTE(review): '/content/...' looks like a Colab-specific absolute path — confirm before running elsewhere.
df = pd.read_excel('/content/b21_kor_V3.xlsx').dropna()

In [None]:
def reg(x, y, title="PCA_clus"):
  """Fit four linear regressors on a hold-out split and print their test metrics.

  The data is split 80/20 with a fixed seed (random_state=156). LinearRegression,
  Ridge, Lasso and ElasticNet are each fitted on the training part and evaluated
  on the test part. For every model the MSE, RMSE and R^2 (printed as
  "Variance score") are written to stdout.

  Parameters
  ----------
  x : feature matrix accepted by scikit-learn estimators.
  y : regression target aligned row-by-row with ``x``.
  title : str, optional
      Header line printed before the results. Defaults to "PCA_clus", the
      header this function always printed before the parameter existed, so
      existing calls produce the same output.

  Returns
  -------
  None. Results are only printed.
  """
  x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=156)

  # (printed name, unfitted estimator) in the order the report lists them.
  models = [
      ("lr", LinearRegression()),
      ("ridge", Ridge(alpha=1.0)),
      ("lasso", Lasso()),
      ("elastic", ElasticNet(alpha=1.0, l1_ratio=0.5)),
  ]

  print(title)
  last = len(models) - 1
  for pos, (name, model) in enumerate(models):
    preds = model.fit(x_train, y_train).predict(x_test)
    mse = mean_squared_error(y_test, preds)
    rmse = np.sqrt(mse)

    print(name)
    print('MSE: {0:.3f} , RMSE: {1:.3F}'.format(mse, rmse))
    if pos < last:
      print('Variance score: {0:.3f}'.format(r2_score(y_test, preds)))
      # Separator between models; the final model ends with blank lines instead.
      print('-'*30)
    else:
      print('Variance score: {0:.3f}'.format(r2_score(y_test, preds)), '\n\n')

## Cluster: K-Means clustering, then PCA and models per cluster

In [None]:
# The two columns used as K-Means inputs; rows with missing values are removed.
clus = df[['시간단위습도_scld', '시간단위기온_scld']].dropna()

In [None]:
# Elbow method: fit K-Means for k = 1..10 and plot the inertia of each fit.
# This filter silences every warning category from here on, not only K-Means ones.
warnings.filterwarnings('ignore')
k_values = range(1, 11)
inertia_values = []

for k in k_values:
    kmeans = KMeans(n_clusters=k, random_state=0)
    # fit() returns the fitted estimator itself, so inertia can be read from the result.
    cluster_res = kmeans.fit(clus)
    inertia_values.append(cluster_res.inertia_)

fig, ax = plt.subplots()
ax.plot(k_values, inertia_values, marker='o')
ax.set_xlabel('cluster_num')
ax.set_ylabel('Inertia')
ax.set_title('Elbow Method')
plt.show()

In [None]:
# Final clustering with k=3 on the two weather features.
# The features are taken from df rather than from `clus`, because this cell
# rebinds `clus` to the full frame plus the label column. Fitting on `clus`
# would make a re-run of the cell cluster on every column, including the
# previous labels.
cluster_features = df[['시간단위습도_scld', '시간단위기온_scld']].dropna()

kmeans = KMeans(n_clusters=3, random_state=42)
kmeans.fit(cluster_features)
label = pd.Series(kmeans.labels_)

# Re-index the labels to the feature rows so concat lines them up with df.
cluster_label = pd.Series(label.values, index=cluster_features.index, name='label')
clus = pd.concat((df, cluster_label), axis=1)

In [None]:
df_to_split = clus.copy()


def _prepare_cluster(frame, cluster_id, n_components=10):
  """Build the modelling inputs for one K-Means cluster.

  Returns a tuple ``(features, target, pca, projected, columns)``:
  the cluster's rows without the target column, the target cast to int as a
  one-column DataFrame, the PCA fitted on those features, the PCA projection
  as a DataFrame, and the projection's column names.
  """
  subset = frame[frame['label'] == cluster_id].dropna()
  target = pd.DataFrame(subset['사망인명피해수_scld']).astype(int)
  features = subset.drop(['사망인명피해수_scld'], axis=1, inplace=False)

  pca = PCA(n_components=n_components)
  pca.fit(features)

  columns = [f'pca_component_{idx}' for idx in range(1, n_components + 1)]
  # The projection gets a fresh 0..n-1 index; `features` and `target` keep the original one.
  projected = pd.DataFrame(pca.transform(features), columns=columns)
  return features, target, pca, projected, columns


# Cluster labels 0, 1, 2 map to the *1, *2, *3 variable families.
df_clus1, label_clus1, pca1, df_c1_pca, df_c1_pca_columns = _prepare_cluster(df_to_split, 0)
df_clus2, label_clus2, pca2, df_c2_pca, df_c2_pca_columns = _prepare_cluster(df_to_split, 1)
df_clus3, label_clus3, pca3, df_c3_pca, df_c3_pca_columns = _prepare_cluster(df_to_split, 2)

In [None]:
# Compare 3-fold random-forest accuracy per cluster:
#   "before_PCA" uses the cluster's original feature frame (df_clusN),
#   "after_PCA"  uses its 10 principal components (df_cN_pca).
# Previously both runs received the PCA frame, so the two results were the
# same experiment. Each cluster also gets its own estimator object now.
# error_score='raise' on every call makes a failing fold stop the cell
# instead of yielding NaN.
rcf1 = RandomForestClassifier(random_state=156)
pca_X1 = df_c1_pca[df_c1_pca_columns]
scores_c1 = cross_val_score(rcf1, df_clus1, label_clus1['사망인명피해수_scld'], scoring='accuracy', cv=3, error_score='raise')
scores_c1_pca = cross_val_score(rcf1, pca_X1, label_clus1['사망인명피해수_scld'], scoring='accuracy', cv=3, error_score='raise')

rcf2 = RandomForestClassifier(random_state=156)
pca_X2 = df_c2_pca[df_c2_pca_columns]
scores_c2 = cross_val_score(rcf2, df_clus2, label_clus2['사망인명피해수_scld'], scoring='accuracy', cv=3, error_score='raise')
scores_c2_pca = cross_val_score(rcf2, pca_X2, label_clus2['사망인명피해수_scld'], scoring='accuracy', cv=3, error_score='raise')

rcf3 = RandomForestClassifier(random_state=156)
pca_X3 = df_c3_pca[df_c3_pca_columns]
scores_c3 = cross_val_score(rcf3, df_clus3, label_clus3['사망인명피해수_scld'], scoring='accuracy', cv=3, error_score='raise')
scores_c3_pca = cross_val_score(rcf3, pca_X3, label_clus3['사망인명피해수_scld'], scoring='accuracy', cv=3, error_score='raise')

# Same report layout for every cluster: per-fold scores, then their mean.
for cluster_name, before_scores, after_scores in (
    ('clus1', scores_c1, scores_c1_pca),
    ('clus2', scores_c2, scores_c2_pca),
    ('clus3', scores_c3, scores_c3_pca),
):
  print(cluster_name)
  print('before_PCA 교차 검증 개별 정확도:', before_scores)
  print('before_PCA 교차 평균 정확도:', np.mean(before_scores))
  print('after_PCA 교차 검증 개별 정확도:', after_scores)
  print('after_PCA 교차 평균 정확도:', np.mean(after_scores),'\n')

In [None]:
# Linear-model baselines on cluster 1's PCA components.
x_data, y_data = df_c1_pca, label_clus1['사망인명피해수_scld']
reg(x_data, y_data)

In [None]:
# Linear-model baselines on cluster 2's PCA components.
x_data, y_data = df_c2_pca, label_clus2['사망인명피해수_scld']
reg(x_data, y_data)

In [None]:
# NOTE(review): this cell repeats the cluster-2 regression from the preceding
# cell with the same inputs; presumably a leftover copy — consider deleting it.
y_data = label_clus2['사망인명피해수_scld']
x_data = df_c2_pca
reg(x_data, y_data)

In [None]:
# Linear-model baselines on cluster 3's PCA components.
x_data, y_data = df_c3_pca, label_clus3['사망인명피해수_scld']
reg(x_data, y_data)

## Non_Cluster: PCA and models on the full dataset (no clustering)

In [None]:
# Reload the dataset for the non-clustered run; this rebinds `df` and `label`
# that the clustering section defined earlier.
df = pd.read_excel('/content/b21_kor_V3.xlsx').dropna()

# Target as a one-column DataFrame cast to int, then removed from the features.
label = df[['사망인명피해수_scld']].astype(int)
df = df.drop(columns=['사망인명피해수_scld'])

In [None]:
# Project the full feature frame onto 10 principal components.
pca = PCA(n_components=10).fit(df)
pca_columns = [f'pca_component_{idx}' for idx in range(1, 11)]
df_pca = pd.DataFrame(pca.transform(df), columns=pca_columns)

# 3-fold random-forest accuracy on the original features vs. the PCA components.
rcf = RandomForestClassifier(random_state=156)
pca_X = df_pca[pca_columns]
scores = cross_val_score(rcf, df, label['사망인명피해수_scld'], scoring='accuracy', cv=3)
scores_pca = cross_val_score(rcf, pca_X, label['사망인명피해수_scld'], scoring='accuracy', cv=3)

for tag, fold_scores in (('before_PCA_Dummied', scores), ('after_PCA_Dummied', scores_pca)):
  print(f'{tag} 교차 검증 개별 정확도:', fold_scores)
  print(f'{tag} 교차 평균 정확도:', np.mean(fold_scores))

In [None]:
# Linear-model baselines on the PCA components of the full dataset.
x_data, y_data = df_pca, label['사망인명피해수_scld']
reg(x_data, y_data)