# 期中練習
---
1. 對 y 做 label encoding；把 chem 和 bio 合併成 X
2. 切割 training and testing 資料 (9:1)
3. 配適模型
4. 預測測試集
5. 紀錄準確度
6. 砍掉含有過多 0 的變數（實作上：只要有任一數值 ≤ 1 就刪除該變數），再對 X 做 log 轉換
7. 再紀錄一次準確度

## 匯入套件
---

In [1]:
import numpy as np
import pandas as pd
import warnings

# Silence every warning for the rest of the session.
# NOTE(review): this is a blanket filter, so library warnings raised by later
# cells (e.g. solver or deprecation warnings) will not be shown either —
# confirm that is intended.
warnings.filterwarnings("ignore")

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

## 讀取資料
---

In [3]:
# Load the three raw tables from the working directory.
bio = pd.read_csv("bio.csv")
chem = pd.read_csv("chem.csv")
# "None" is a genuine class label in injury.csv. Recent pandas releases list
# "None" among the default NA markers of read_csv and would load it as NaN,
# after which a `== "None"` comparison never matches. Turning off the default
# NA parsing keeps the label as the literal string.
injury = pd.read_csv("injury.csv", keep_default_na=False)

# Keep everything except the first column of each table.
# NOTE(review): the first column is presumably a saved row index — confirm.
bio = bio.iloc[:, 1:]
chem = chem.iloc[:, 1:]
injury = injury.iloc[:, 1:]

In [4]:
# Report the (rows, columns) of each loaded table.
for label, table in (("bio", bio), ("chem", chem), ("injury", injury)):
    print(f"shape of {label}:", table.shape)

shape of bio: (281, 184)
shape of chem: (281, 192)
shape of injury: (281, 1)


##  資料預處理
---

* Bind “Severe” & “Mild” into “1”, “None” into “0”

In [5]:
# Binary target: 0 where the label is exactly "None", 1 for every other value.
# The explicit int64 cast matches the dtype pandas infers from a list of ints.
injury["tidy_x"] = (injury["x"] != "None").astype("int64")
injury["tidy_x"].unique()

array([1, 0], dtype=int64)

* Combine data into a large X matrix

In [6]:
# Put the bio and chem tables side by side to form the feature matrix.
X = pd.concat((bio, chem), axis="columns")
print("shape of X:", X.shape)

shape of X: (281, 376)


## 特徵工程
---

* Split data into Training & Testing sets. (9:1)

In [7]:
# Hold out 10% of the rows as the test set; the fixed seed makes the
# shuffled split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    injury["tidy_x"],
    test_size=0.1,
    shuffle=True,
    random_state=99,
)

In [8]:
# Report the dimensions of each piece of the split.
for label, part in (
    ("X_train", X_train),
    ("y_train", y_train),
    ("X_test", X_test),
    ("y_test", y_test),
):
    print(f"shape of {label}:", part.shape)

shape of X_train: (252, 376)
shape of y_train: (252,)
shape of X_test: (29, 376)
shape of y_test: (29,)


In [9]:
# Export the untransformed train/test split as CSV files in the working
# directory.
for path, part in (
    ("X_train.csv", X_train),
    ("y_train.csv", y_train),
    ("X_test.csv", X_test),
    ("y_test.csv", y_test),
):
    part.to_csv(path)

## 模型建置
---
* Logistic Regression
* Decision Tree
* Random Forest

In [10]:
# Logistic-regression baseline: fit on the training split, then score the
# model on the same rows it was trained on.
# NOTE(review): the features are passed unscaled and the solver settings are
# the library defaults — confirm the fit converges.
LR = LogisticRegression(n_jobs=-1, random_state=42)
LR.fit(X_train, y_train)
model_1 = LR  # fit() returns the estimator itself, so this is the fitted model
y_train_hat = LR.predict(X_train)
print(f"Accuracy(train): {accuracy_score(y_train, y_train_hat): .4f}")

Accuracy(train):  0.8373


In [11]:
# Depth-limited decision tree: fit on the training split and report the
# accuracy on those same rows.
DT = DecisionTreeClassifier(random_state=42, max_depth=6)
DT.fit(X_train, y_train)
model_2 = DT  # fit() returns the estimator itself, so this is the fitted model
y_train_hat = DT.predict(X_train)
print(f"Accuracy(train): {accuracy_score(y_train, y_train_hat): .4f}")

Accuracy(train):  0.8730


In [12]:
# Random forest with depth-limited trees: fit on the training split and
# report the accuracy on those same rows.
RF = RandomForestClassifier(n_jobs=-1, random_state=42, max_depth=6)
RF.fit(X_train, y_train)
model_3 = RF  # fit() returns the estimator itself, so this is the fitted model
y_train_hat = RF.predict(X_train)
print(f"Accuracy(train): {accuracy_score(y_train, y_train_hat): .4f}")

Accuracy(train):  0.8770


## 預測分析
---

In [13]:
# Score each fitted model on the held-out test split, in the order LR, DT, RF.
# After the loop y_test_hat holds the random-forest predictions.
for label, fitted in (("LR", model_1), ("DT", model_2), ("RF", model_3)):
    y_test_hat = fitted.predict(X_test)
    print(f"{label} Accuracy(test): {accuracy_score(y_test, y_test_hat): .4f}")

LR Accuracy(test):  0.6552
DT Accuracy(test):  0.6207
RF Accuracy(test):  0.5862


## 資料預處理（1）
---
* 砍掉含有過多 0 的變數（實作上：只要有任一數值 ≤ 1 就刪除該變數），再對 X 做 log 轉換

In [14]:
# Collect the name of every column of X that has at least one value <= 1.
# NOTE(review): the section text speaks of variables with too many zeros, but
# the rule applied here is "any value <= 1" — confirm the threshold.
temp_list = [
    column
    for column in X.columns
    if len(X[X[column] <= 1]) > 0
]
print(len(temp_list))

287


In [15]:
# Build a copy of X without the flagged columns; X itself is not modified.
X_del = X.drop(columns=temp_list)
print("shape of X_del:", X_del.shape)

shape of X_del: (281, 89)


In [16]:
# Element-wise natural logarithm of the remaining feature columns.
X_log = np.log(X_del)

## 特徵工程(1)
---

* Split data into Training & Testing sets. (9:1)

In [18]:
# Re-split using the log-transformed features: 10% of the rows are held out,
# with the seed fixed so the shuffled split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_log,
    injury["tidy_x"],
    test_size=0.1,
    shuffle=True,
    random_state=99,
)

In [19]:
# Report the dimensions of each piece of the new split.
for label, part in (
    ("X_train", X_train),
    ("y_train", y_train),
    ("X_test", X_test),
    ("y_test", y_test),
):
    print(f"shape of {label}:", part.shape)

shape of X_train: (252, 89)
shape of y_train: (252,)
shape of X_test: (29, 89)
shape of y_test: (29,)


In [20]:
# Export the log-transformed train/test split.
# The feature files get their own "_log" names so they no longer overwrite
# the raw-feature export that was written to "X_train.csv" / "X_test.csv"
# earlier in the notebook. The label files keep their original names.
X_train.to_csv("X_train_log.csv")
y_train.to_csv("y_train.csv")
X_test.to_csv("X_test_log.csv")
y_test.to_csv("y_test.csv")

## 模型建置(1)
---
* Logistic Regression
* Decision Tree
* Random Forest

In [21]:
# Logistic regression refitted on the log-transformed training split, scored
# on the same rows it was trained on.
LR = LogisticRegression(n_jobs=-1, random_state=42)
LR.fit(X_train, y_train)
model_1 = LR  # fit() returns the estimator itself, so this is the fitted model
y_train_hat = LR.predict(X_train)
print(f"Accuracy(train): {accuracy_score(y_train, y_train_hat): .4f}")

Accuracy(train):  0.6905


In [22]:
# Depth-limited decision tree refitted on the log-transformed training split,
# scored on the same rows it was trained on.
DT = DecisionTreeClassifier(random_state=42, max_depth=6)
DT.fit(X_train, y_train)
model_2 = DT  # fit() returns the estimator itself, so this is the fitted model
y_train_hat = DT.predict(X_train)
print(f"Accuracy(train): {accuracy_score(y_train, y_train_hat): .4f}")

Accuracy(train):  0.7302


In [23]:
# Random forest refitted on the log-transformed training split, scored on the
# same rows it was trained on.
RF = RandomForestClassifier(n_jobs=-1, random_state=42, max_depth=6)
RF.fit(X_train, y_train)
model_3 = RF  # fit() returns the estimator itself, so this is the fitted model
y_train_hat = RF.predict(X_train)
print(f"Accuracy(train): {accuracy_score(y_train, y_train_hat): .4f}")

Accuracy(train):  0.7897


## 預測分析(1)
---

In [24]:
# Score each refitted model on the held-out test split, in the order LR, DT,
# RF. After the loop y_test_hat holds the random-forest predictions.
for label, fitted in (("LR", model_1), ("DT", model_2), ("RF", model_3)):
    y_test_hat = fitted.predict(X_test)
    print(f"{label} Accuracy(test): {accuracy_score(y_test, y_test_hat): .4f}")

LR Accuracy(test):  0.6207
DT Accuracy(test):  0.5517
RF Accuracy(test):  0.5172
