In [1]:
# sklearn
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

In [2]:
import numpy as np
# mat
import matplotlib.pyplot as plt
# pandas
import pandas as pd

In [3]:
iris = load_iris()
x = iris.data
y = iris.target

# split dataset for train and test
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=13)

### accurate score

In [4]:
def cal_score(y_true, y_pred):
    if len(y_pred) == 0 : return 0
    correct_num = 0
    for i in range(len(y_pred)):
        if y_true[i] == y_pred[i]:
            correct_num+=1
    return f"{100*correct_num/len(y_pred):.2f}%"

### Q2. #1.
#### Fit a logistic regression model to the raw dataset.

In [5]:
model = LogisticRegression(max_iter = 1000)
model.fit(x_train,y_train)
y_pred = model.predict(x_test)
score_raw_data = cal_score(y_test,y_pred)
print("Score with raw data:", score_raw_data)

Score with raw data: 97.37%


### Q2. #2.
#### Z-score Normalization

In [6]:
feature_nums = 4

def cal_mean(x):
    feature_sums = [0]*feature_nums
    for row in x:
        for i in range(feature_nums):
            feature_sums[i] += row[i]
    return np.array([ s / len(x) for s in feature_sums ])

def cal_std(x,mean_v):
    variance = [0]*feature_nums
    # cal variance
    for row in x:
        for i in range(feature_nums):
            variance[i] += (row[i]-mean_v[i])**2
    variance = [v/(len(x)-1) for v in variance ]
    return np.array([ v**0.5 for v in variance ])

def z_score_norm(x,mean_v,std_v):    
    return (x-mean_v)/std_v


### Q2. #3.
#### Fit a logistic regression model to the normalized dataset 

In [13]:
mean_v = cal_mean(x_train)
std_v = cal_std(x_train,mean_v)

x_train_z_norm = z_score_norm(x_train,mean_v,std_v)
x_test_z_norm = z_score_norm(x_test,mean_v,std_v)

model2 = LogisticRegression(max_iter = 1000)
model2.fit(x_train_z_norm,y_train)
y_pred_norm = model2.predict(x_test_z_norm)
score_z_norm = cal_score(y_test,y_pred_norm)

print("Score with raw data:", score_raw_data)
print("Score with Q2 #3 z-score normalization:",score_z_norm)

Score with raw data: 97.37%
Score with Q2 #3 z-score normalization: 97.37%


### Q3. #1
#### add outlier to x_train
#### random pick 5 data from x then multiply all the picked data with 10

In [8]:
x_outlier = x.copy()
y_outlier = y.copy()
print(type(x_outlier))
random_x_outliers = x_outlier[np.random.choice(x_outlier.shape[0], size=5, replace=False)]
random_y = y_outlier[np.random.choice(y_outlier.shape[0], size=5, replace=False)]
# x_sample = random.sample(x_outlier,10)
print(f"random data picked:{random_x_outliers}")
random_x_outliers = (random_x_outliers+1.0)*10
print(f"random x outlier data:{random_x_outliers}")
print(f"random y outlier data:{random_y}")
x_outlier = np.append(x_outlier,random_x_outliers,axis=0)
y_outlier = np.append(y_outlier,random_y)

<class 'numpy.ndarray'>
random data picked:[[5.5 2.4 3.7 1. ]
 [6.2 3.4 5.4 2.3]
 [7.3 2.9 6.3 1.8]
 [6.3 2.9 5.6 1.8]
 [6.  3.  4.8 1.8]]
random x outlier data:[[65. 34. 47. 20.]
 [72. 44. 64. 33.]
 [83. 39. 73. 28.]
 [73. 39. 66. 28.]
 [70. 40. 58. 28.]]
random y outlier data:[0 2 2 1 1]


### Q3 #2
#### apply normalization to x_outlier

In [14]:
# update split train test data
x_outlier_train, x_outlier_test, y_outlier_train, y_outlier_test = train_test_split(x_outlier, y_outlier, random_state=13)
mean_outlier_v = cal_mean(x_outlier_train)
std_outlier_v = cal_std(x_outlier_train,mean_outlier_v)

print(f"mean outlier:{mean_outlier_v}")
print(f"std outlier:{std_outlier_v}")

x_outlier_train_z_norm = z_score_norm(x_outlier_train,mean_outlier_v,std_outlier_v)
x_outlier_test_z_norm = z_score_norm(x_outlier_test,mean_outlier_v,std_outlier_v)

model3 = LogisticRegression(max_iter = 1000)
model3.fit(x_outlier_train_z_norm,y_outlier_train)
y_outlier_pred_z_norm = model3.predict(x_outlier_test_z_norm)
score_outlier_z_norm = cal_score(y_outlier_test,y_outlier_pred_z_norm)

print("Score with raw data:", score_raw_data)
print("Score with Q2 #3 z-score normalization:",score_z_norm)
print("Score with Q3 #2 outlier z-score normalization:",score_outlier_z_norm)

mean outlier:[8.12155172 4.26982759 5.68362069 2.05689655]
std outlier:[12.44719494  6.63721026 11.07004532  4.91926513]
Score with raw data: 97.37%
Score with Q2 #3 z-score normalization: 97.37%
Score with Q3 #2 outlier z-score normalization: 79.49%


### Q4 #1 
#### Implement the robust variant of normalization

In [10]:
def cal_median(x):
    medians = []
    for col in range(x.shape[1]):
        sorted_col = sorted(x[:,col])
        if len(x)%2 == 0:
            medians.append( sorted_col[len(x)//2] )
        else :
            medians.append( sorted_col[len(x)//2] + sorted_col[len(x)//2-1] )
    return np.array(medians)

# quatile can be calulate from median of split data set 
def cal_IQR(x):
    iqrs = []
    sorted_x = np.sort(x,axis=0)
    # split data to half
    if len(x)%2 == 0 :
        lower_x = sorted_x[:len(x)//2]
        upper_x = sorted_x[len(x)//2:]
    else :
        lower_x = sorted_x[:len(x)//2]
        upper_x = sorted_x[len(x)//2+1:]
    return cal_median(upper_x) - cal_median(lower_x)

def robust_norm(x,medians,IQRs):
    return (x - medians)/ IQRs

### Q4 #2
#### Apply robust normalization on the dataset 

In [15]:
# on normal data
median_v = cal_median(x_train)
IQR_v = cal_IQR(x_train)
# print(f"median_v:{median_v}")
# print(f"IQR_v:{IQR_v}")

x_train_robust_norm = robust_norm(x_train,median_v,IQR_v)
x_test_robust_norm = robust_norm(x_test,median_v,IQR_v)

model4_1 = LogisticRegression(max_iter = 1000)
model4_1.fit(x_train_robust_norm,y_train)
y_pred_robust_norm = model4_1.predict(x_test_robust_norm)
score_robust_norm = cal_score(y_test,y_pred_robust_norm)

print("Score with raw data:", score_raw_data)
print("Score with Q2 #3 z-score normalization:",score_z_norm)
print("Score with Q3 #2 outlier z-score normalization:",score_outlier_z_norm)
print("Score with Q4 #2 robust score normalization:",score_robust_norm)

Score with raw data: 97.37%
Score with Q2 #3 z-score normalization: 97.37%
Score with Q3 #2 outlier z-score normalization: 79.49%
Score with Q4 #2 robust score normalization: 97.37%


In [12]:
# on outlier data
median_outlier_v = cal_median(x_outlier_train)
IQR_outlier_v = cal_IQR(x_outlier_train)
# print(f"median_outlier_v: {median_outlier_v}")
# print(f"IQR_outlier_v: {IQR_outlier_v}")

x_outlier_train_robust_norm = robust_norm(x_outlier_train,median_v,IQR_v)
x_outlier_test_robust_norm = robust_norm(x_outlier_test,median_v,IQR_v)

model4_2 = LogisticRegression(max_iter = 1000)
model4_2.fit(x_outlier_train_robust_norm,y_outlier_train)
y_outlier_pred_robust_norm = model4_2.predict(x_outlier_test_robust_norm)
score_outlier_robust_norm = cal_score(y_outlier_test,y_outlier_pred_robust_norm)

print("Score with raw data:", score_raw_data)
print("Score with Q2 #2 z-score normalization:",score_z_norm)
print("Score with Q3 #2 outlier z-score normalization:",score_outlier_z_norm)
print("Score with Q4 #2 robust normalization:",score_robust_norm)
print("Score with Q4 #2 outlier robust normalization:",score_outlier_robust_norm)

Score with raw data: 97.37%
Score with Q2 #2 z-score normalization: 97.37%
Score with Q3 #2 outlier z-score normalization: 79.49%
Score with Q4 #2 robust normalization: 97.37%
Score with Q4 #2 outlier robust normalization: 94.87%
