In [2]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import matplotlib.ticker as ticker
from PIL import Image
import statsmodels.api as sm
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import Lasso
from sklearn.metrics import confusion_matrix, roc_curve, roc_auc_score
from sklearn.metrics import mean_squared_error
import warnings
warnings.filterwarnings('ignore')

# Q1

In [327]:
#讀取圖片資料並轉為dataframe
data_p = []
for i in range(1,41):
    for j in range(1,11):
        data_p.append(list(Image.open(f'C:/Users/USER/Desktop/Course/Data Analytics/DA05/ORL Faces/ORL Faces/{i}_{j}.png').getdata()))

data_p = pd.DataFrame(np.array(data_p))

#新增性別欄位，女性為0男性則為1
for i in data_p.index:
    if (i <= 9) | (70 <= i <= 79) | (90 <= i <= 99) | (310 <= i <= 319):
        data_p.loc[i, 'sex'] = 0
    else:
        data_p.loc[i, 'sex'] = 1
        
X = data_p.iloc[:,:-1]
y = data_p['sex']

#建立訓練及測試資料
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
X_train_std = (X_train - X_train.mean())/X_train.std()
X_test_std = (X_test - X_test.mean())/X_test.std()

LR

In [263]:
#訓練並預測test資料
model = LogisticRegression().fit(X_train_std, y_train)
y_pred = model.predict(X_test_std)

#建立confusion matrix並呈現指標
cm = confusion_matrix(y_test, y_pred)
accuracy_LR = (cm[1,1]+cm[0,0])/cm.sum()
sensitivity_LR = cm[1,1]/(cm[1,1]+cm[1,0])
specificity_LR = cm[0,0]/(cm[0,0]+cm[0,1])
precision_LR = cm[1,1]/(cm[1,1]+cm[0,1])
LR_ind = [accuracy_LR, sensitivity_LR, specificity_LR, precision_LR]

KNN

In [264]:
#訓練並預測test資料
model = KNeighborsClassifier(n_neighbors=5).fit(X_train_std, y_train)
y_pred = model.predict(X_test_std)

#建立confusion matrix並呈現指標
cm = confusion_matrix(y_test, y_pred)
accuracy_KNN = (cm[1,1]+cm[0,0])/cm.sum()
sensitivity_KNN = cm[1,1]/(cm[1,1]+cm[1,0])
specificity_KNN = cm[0,0]/(cm[0,0]+cm[0,1])
precision_KNN = cm[1,1]/(cm[1,1]+cm[0,1])
KNN_ind = [accuracy_KNN, sensitivity_KNN, specificity_KNN, precision_KNN]

SVM

In [266]:
#訓練並預測test資料
model = svm.LinearSVC().fit(X_train_std, y_train)
y_pred = model.predict(X_test_std)

#建立confusion matrix並呈現指標
cm = confusion_matrix(y_test, y_pred)
accuracy_SVM = (cm[1,1]+cm[0,0])/cm.sum()
sensitivity_SVM = cm[1,1]/(cm[1,1]+cm[1,0])
specificity_SVM = cm[0,0]/(cm[0,0]+cm[0,1])
precision_SVM = cm[1,1]/(cm[1,1]+cm[0,1])
SVM_ind = [accuracy_SVM, sensitivity_SVM, specificity_SVM, precision_SVM]

In [267]:
results = [LR_ind, KNN_ind, SVM_ind]

df_results = pd.DataFrame(results,
                          columns=['accuracy', 'sensitivity', 'specificity', 'precision'],
                          index=['LR', 'KNN', 'SVM']).T

df_results


Unnamed: 0,LR,KNN,SVM
accuracy,0.958333,0.95,0.95
sensitivity,1.0,1.0,0.970874
specificity,0.705882,0.647059,0.823529
precision,0.953704,0.944954,0.970874


# Q2

利用LASSO regression 篩選出重要的pixel

In [328]:
lasso = Lasso(alpha=0.01)
lasso.fit(X_train_std , y_train)

coef = pd.Series(lasso.coef_, index=X.columns)
important_features = coef[coef.abs() > 0].index.tolist()

X_train_std = X_train_std[important_features]
X_test_std = X_test_std[important_features]

In [329]:
len(important_features)

95

LR

In [330]:
#訓練並預測test資料
model = LogisticRegression().fit(X_train_std, y_train)
y_pred = model.predict(X_test_std)

#建立confusion matrix並呈現指標
cm = confusion_matrix(y_test, y_pred)
accuracy_LR = (cm[1,1]+cm[0,0])/cm.sum()
sensitivity_LR = cm[1,1]/(cm[1,1]+cm[1,0])
specificity_LR = cm[0,0]/(cm[0,0]+cm[0,1])
precision_LR = cm[1,1]/(cm[1,1]+cm[0,1])
LR_ind = [accuracy_LR, sensitivity_LR, specificity_LR, precision_LR]

KNN

In [331]:
#訓練並預測test資料
model = KNeighborsClassifier(n_neighbors=5).fit(X_train_std, y_train)
y_pred = model.predict(X_test_std)

#建立confusion matrix並呈現指標
cm = confusion_matrix(y_test, y_pred)
accuracy_KNN = (cm[1,1]+cm[0,0])/cm.sum()
sensitivity_KNN = cm[1,1]/(cm[1,1]+cm[1,0])
specificity_KNN = cm[0,0]/(cm[0,0]+cm[0,1])
precision_KNN = cm[1,1]/(cm[1,1]+cm[0,1])
KNN_ind = [accuracy_KNN, sensitivity_KNN, specificity_KNN, precision_KNN]

SVM

In [332]:
#訓練並預測test資料
model = svm.LinearSVC().fit(X_train_std, y_train)
y_pred = model.predict(X_test_std)

#建立confusion matrix並呈現指標
cm = confusion_matrix(y_test, y_pred)
accuracy_SVM = (cm[1,1]+cm[0,0])/cm.sum()
sensitivity_SVM = cm[1,1]/(cm[1,1]+cm[1,0])
specificity_SVM = cm[0,0]/(cm[0,0]+cm[0,1])
precision_SVM = cm[1,1]/(cm[1,1]+cm[0,1])
SVM_ind = [accuracy_SVM, sensitivity_SVM, specificity_SVM, precision_SVM]

In [333]:
results = [LR_ind, KNN_ind, SVM_ind]

df_results = pd.DataFrame(results,
                          columns=['accuracy', 'sensitivity', 'specificity', 'precision'],
                          index=['LR', 'KNN', 'SVM']).T

df_results

Unnamed: 0,LR,KNN,SVM
accuracy,0.95,0.958333,0.95
sensitivity,1.0,1.0,1.0
specificity,0.647059,0.705882,0.647059
precision,0.944954,0.953704,0.944954


# Q3

In [3]:
data_a = pd.read_csv('C:/Users/USER/Desktop/Course/Data Analytics/DA07/auto-mpg.data.csv')
data_a = data_a[data_a['horsepower'] != '?']
data_a['horsepower'] = data_a['horsepower'].astype(float)

X = data_a.drop(columns=['origin', 'car name'])
y = data_a['origin']


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
X_train_std = (X_train - X_train.mean())/X_train.std()
X_test_std = (X_test - X_test.mean())/X_test.std()

LR

In [7]:
#訓練並預測test資料
model = LogisticRegression().fit(X_train_std, y_train)
y_pred = model.predict(X_test_std)

#建立confusion matrix並呈現指標
cm = confusion_matrix(y_test, y_pred)
accuracy_LR = (cm[1,1]+cm[0,0]+cm[2,2])/cm.sum()
sensitivity_LR = cm[1,1]/(cm[1,1]+cm[1,0])
specificity_LR = cm[0,0]/(cm[0,0]+cm[0,1])
precision_LR = cm[1,1]/(cm[1,1]+cm[0,1])
LR_ind = [accuracy_LR, sensitivity_LR, specificity_LR, precision_LR]

In [6]:
cm

array([[62,  1,  5],
       [ 9,  9, 10],
       [ 7,  1, 14]], dtype=int64)

KNN

In [11]:
#訓練並預測test資料
model = KNeighborsClassifier(n_neighbors=5).fit(X_train_std, y_train)
y_pred = model.predict(X_test_std)

#建立confusion matrix並呈現指標
cm = confusion_matrix(y_test, y_pred)
accuracy_KNN = (cm[1,1]+cm[0,0])/cm.sum()
sensitivity_KNN = cm[1,1]/(cm[1,1]+cm[1,0])
specificity_KNN = cm[0,0]/(cm[0,0]+cm[0,1])
precision_KNN = cm[1,1]/(cm[1,1]+cm[0,1])
KNN_ind = [accuracy_KNN, sensitivity_KNN, specificity_KNN, precision_KNN]

SVM

In [13]:
#訓練並預測test資料
model = svm.LinearSVC().fit(X_train_std, y_train)
y_pred = model.predict(X_test_std)

#建立confusion matrix並呈現指標
cm = confusion_matrix(y_test, y_pred)
accuracy_SVM = (cm[1,1]+cm[0,0])/cm.sum()
sensitivity_SVM = cm[1,1]/(cm[1,1]+cm[1,0])
specificity_SVM = cm[0,0]/(cm[0,0]+cm[0,1])
precision_SVM = cm[1,1]/(cm[1,1]+cm[0,1])
SVM_ind = [accuracy_SVM, sensitivity_SVM, specificity_SVM, precision_SVM]

In [338]:
results = [LR_ind, KNN_ind, SVM_ind]

df_results = pd.DataFrame(results,
                          columns=['accuracy', 'sensitivity', 'specificity', 'precision'],
                          index=['LR', 'KNN', 'SVM']).T

df_results

Unnamed: 0,LR,KNN,SVM
accuracy,0.601695,0.610169,0.627119
sensitivity,0.5,0.478261,0.625
specificity,0.984127,0.968254,0.984615
precision,0.9,0.846154,0.909091
