# 2022-1 빅데이터와 인공지능 14주차 Reports

In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import confusion_matrix
from sklearn.metrics import silhouette_samples
from sklearn.metrics import silhouette_score
from sklearn.tree import DecisionTreeRegressor
from sklearn import tree

from scipy.cluster.hierarchy import linkage, fcluster

import graphviz

import os
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load datasets
# Load the Iris dataset via scikit-learn's loader; later cells read its
# .data, .target and .feature_names attributes to build DataFrames.
from sklearn.datasets import load_iris
iris_data = load_iris()

## 14주차 1차시
### Question 1
* IRIS(붓꽃) 데이터에 대해 하이퍼파라미터(거리 개념)를 변경하면서 계층적군집화의 분류(군집화) 성능을 점검하시오.

In [None]:
# Create dataframes
# df_X holds the feature columns, df_y the single 'Species' target column,
# and df_iris is the two placed side by side.
df_X = pd.DataFrame(iris_data.data, columns=iris_data.feature_names)
df_y = pd.DataFrame(iris_data.target, columns=['Species'])
# Neither frame is given an explicit index, so the column-wise concat
# aligns them row for row.
df_iris = pd.concat(objs=[df_X, df_y], axis='columns')
df_iris.head()

In [None]:
# Feature Scaling - Normalization
# Fit a min-max scaler on the features (kept in iris_scaler), rebuild a frame
# with the original feature names, and re-attach the Species column.
iris_scaler = MinMaxScaler()
iris_new = pd.concat(
    [
        pd.DataFrame(iris_scaler.fit_transform(df_X), columns=iris_data.feature_names),
        df_y,
    ],
    axis='columns',
)
iris_new.head()

In [None]:
# Split train and test datasets
# Separate the scaled features from the Species target, then hold out 20% of
# the rows, stratified on the target frame df_y, with a fixed seed.
iris_y = iris_new['Species']
iris_X = iris_new.drop(columns='Species')
iris_X_train, iris_X_test, iris_y_train, iris_y_test = train_test_split(
    iris_X,
    iris_y,
    test_size=0.2,
    random_state=0,
    stratify=df_y,
)
# Report the shape of each split.
print(">>> X_train shape:", iris_X_train.shape)
print(">>> X_test shape:", iris_X_test.shape)
print(">>> y_train shape:", iris_y_train.shape)
print(">>> y_test shape:", iris_y_test.shape)

In [None]:
# Clustering
# Hierarchical (agglomerative) clustering on the scaled FEATURES only.
# 'Species' is the ground-truth label: leaving it in the matrix passed to
# linkage() would leak the answer into every pairwise distance, so it is
# dropped before clustering.
Cluster = linkage(
    y=iris_new.drop(columns='Species').to_numpy(),
    method='single',
    metric='euclidean',
)
# Flatten the hierarchy into cluster ids using a distance threshold of t.
Cut_tree = fcluster(Cluster, t=0.3, criterion='distance')
Labels = iris_new['Species']
# Cross-tabulate predicted cluster ids (rows) against true species (columns).
df_iris_cluster = pd.DataFrame({'pred': Cut_tree, 'labels': Labels})
Con_Mat = pd.crosstab(df_iris_cluster['pred'], df_iris_cluster['labels'])
Con_Mat

In [None]:
# Set hyperparameters for GridSearchCV
# Candidate neighbor counts (1 through 5), weighting schemes, and distance metrics.
list_k = list(range(1, 6))
list_weights = ['uniform', 'distance']
list_metric = ['minkowski', 'manhattan', 'euclidean']

# Grid keyed by the estimator argument each candidate list applies to.
parameters = dict(
    n_neighbors=list_k,
    weights=list_weights,
    metric=list_metric,
)

In [None]:
# GridSearchCV
# 5-fold cross-validated grid search over the KNN hyperparameters, scored by accuracy.
knn_iris = KNeighborsClassifier()
GridSearchCV_iris = GridSearchCV(knn_iris, parameters, cv=5, scoring='accuracy')
# Fit on the iris training split. The bare X_train / y_train names are not
# defined until the air-quality section further down and hold a different
# dataset, so they must not be used here.
GridSearchCV_iris.fit(iris_X_train, iris_y_train)

# Keep only each candidate's parameters and its CV score summary.
df_iris_score = pd.DataFrame(data=GridSearchCV_iris.cv_results_)
df_iris_score = df_iris_score[['params', 'mean_test_score', 'std_test_score']]
df_iris_score

In [None]:
# Report the best hyperparameters, their mean cross-validation score, and the
# score on the held-out iris test split (iris_X_test / iris_y_test, not the
# air-quality X_test / y_test defined later in the file).
print(">>> Best Parameters: ", GridSearchCV_iris.best_params_)
print(">>> Best Score: ", GridSearchCV_iris.best_score_)
print(">>> Best Test Score: ", GridSearchCV_iris.score(iris_X_test, iris_y_test))

## 14주차 2차시
### Question 2
* IRIS(붓꽃) 데이터의 설명변수 4종 중 군집분석의 효율성과 성능을 고려할 때, 선택할 수 있는 최소의 변수를 탐색하시오.

In [None]:
# Load the multiple-regression CSV from the datasets/ folder under the
# current working directory and preview the first rows.
path_data_muliple_reg = f"{os.getcwd()}/datasets/Multiple_regression.csv"
data_muliple_reg = pd.read_csv(filepath_or_buffer=path_data_muliple_reg)
data_muliple_reg.head()

In [None]:
# Split datasets
# The first two columns are used as predictors and the third as the response.
y = data_muliple_reg.iloc[:, 2]
X = data_muliple_reg.iloc[:, :2]
# Min-max scale the predictors; from here on X is the scaled array and
# Scaler_Multiple is the fitted scaler.
Scaler_Multiple = MinMaxScaler()
X = Scaler_Multiple.fit_transform(X)

In [None]:
# Linear Regression - fit
# Fit a linear model on the scaled predictors; fit() returns the estimator
# itself, so construction and fitting can be chained.
Multi_LM = LinearRegression().fit(X, y)

In [None]:
# Linear Regression - predict
# Keep the in-sample predictions in a variable; previously the return value
# of predict() was discarded, so the call had no observable effect.
multiple_reg_pred = Multi_LM.predict(X)
# Report the model's score on the fitting data, then the fitted
# coefficients and intercept.
print(">>> R2 Square score: ", Multi_LM.score(X, y))
print(">>> 회귀 계수 = ", Multi_LM.coef_)
print(">>> 절편 = ", Multi_LM.intercept_)

## 14주차 3차시
### Question 3
* 추천 시스템에서 사용되고 있는 군집 분석 알고리즘과 하이퍼파라미터(거리 개념) 등에 대해 조사하시오.

In [None]:
# Load datasets
# Read the air-quality CSV from the datasets/ folder under the current
# working directory and discard its 'Unnamed: 0' column.
path_data_airquality = f"{os.getcwd()}/datasets/airquality.csv"
data_airquality = pd.read_csv(path_data_airquality).drop(columns=['Unnamed: 0'])
data_airquality.head()

In [None]:
# Count the missing (null / NaN) entries in each column
data_airquality.isna().sum()

In [None]:
# Split feature and target datasets
# Remove every row that contains a missing value, modifying the frame in place.
data_airquality.dropna(inplace=True)
# Target is the 'Ozone' column; features are the columns at positions 1-3.
# NOTE(review): presumably these are the predictors that follow Ozone in the
# CSV - confirm against the file's column order.
y = data_airquality['Ozone']
X = data_airquality.iloc[:, 1:4]

In [None]:
# Decision Tree Regression Example
# Chain min-max scaling with a depth-5 regression tree, fit the pipeline on
# the full feature/target data, and report its score on that same data.
airquality_scaler = MinMaxScaler()
tree_reg = DecisionTreeRegressor(max_depth=5)
Full_Pipeline_Tree_Reg = Pipeline([
    ('scaler', airquality_scaler),
    ('Regressor', tree_reg),
])
# fit() returns the pipeline itself, so fitting and predicting can be chained.
airquality_pred = Full_Pipeline_Tree_Reg.fit(X, y).predict(X)
print(">>> R_Squared = ", Full_Pipeline_Tree_Reg.score(X, y))

In [None]:
# Split datasets
# Hold out 20% of the air-quality rows as a test set, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=0,
    test_size=0.2,
)

In [None]:
# GridSearchCV
# Candidate settings for the regression tree: split criterion, splitter
# strategy, maximum depth, and minimum samples required to split a node.
parameters = {
    'criterion': ['squared_error', 'friedman_mse', 'absolute_error', 'poisson'],
    'splitter': ['best', 'random'],
    'max_depth': [2, 3, 5, 10],
    'min_samples_split': [2, 3, 5],
}

# 5-fold cross-validated search scored by R^2, fitted on the training split.
grid_dreg = GridSearchCV(estimator=tree_reg, param_grid=parameters, cv=5, scoring='r2')
grid_dreg.fit(X_train, y_train)

# Per-candidate parameters with the mean and standard deviation of the CV score.
df_scores = pd.DataFrame(grid_dreg.cv_results_)[['params', 'mean_test_score', 'std_test_score']]
df_scores

In [None]:
# Decision Tree Visualization
# tree_reg is the estimator that was fitted as the final step of
# Full_Pipeline_Tree_Reg, i.e. after the min-max scaler step.
# class_names is intentionally not passed: scikit-learn documents it as
# relevant only for classification, and tree_reg is a regressor. Feature
# names are handed over as a plain list of the feature frame's column labels.
dot_data_2 = tree.export_graphviz(
    tree_reg,
    out_file=None,
    feature_names=list(X.columns),
    filled=True,
    rounded=True,
    special_characters=True,
)

# Wrap the DOT source so the notebook renders the tree diagram.
graph_2 = graphviz.Source(dot_data_2)
graph_2