In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [4]:
def decision_stump(X, Y, U, theta):
  n = theta.shape[0]
  N = X.shape[0]
  # 沿y軸複製n倍，沿x軸複製1倍
  # 複製十次整份training data
  # X.shape (1000,)
  # theta.shape (1000, 1)
  X = np.tile(X, (n, 1))
  # X.shape (1000, 1000)
  # h(x)=s*sign(xi-theta)，s=-1/+1
  y1 = np.sign(X - theta)
  y2 = np.sign(X - theta) * (-1)
  # Ein(u) weighted error
  error1 = np.sum((y1 != Y) * U, axis=1)
  error2 = np.sum((y2 != Y) * U, axis=1)
  # index of min error
  i1 = np.argmin(error1)
  i2 = np.argmin(error2)

  if error1[i1] < error2[i2]:
      s = 1
      index = i1
      error = error1[i1]
      # error = error1[i1] / N
  else:
      s = -1
      index = i2
      error = error2[i2]
      # error = error2[i2] / N
  return s, index, error

In [5]:
def decision_stump_all(X, Y, U, theta):
  # 對所有維度做 decision_stump 後取誤差最小的
  x = [[] for i in range(10)]
  thetai = [[] for i in range(10)]
  s = [[] for i in range(10)]
  i = [[] for i in range(10)]
  e = [[] for i in range(10)]
  for k in range(10):
    x[k] = X[:, k]
    thetai[k] = theta[:, k].reshape(-1, 1)
    s[k], i[k], e[k] = decision_stump(x[k], Y, U, thetai[k])

  mine = e[0]
  midx = 0
  for k in range(1, 10):
    if e[k] < mine:
        midx = k
        mine = e[k]
  return e[midx], s[midx], midx, i[midx]

In [6]:
def AdaBoost(X, Y, theta, T=500):
  # init
  N = X.shape[0]
  ut = np.ones(N) / N
  ut_1 = np.array([])
  alpha = np.array([])
  epsilon = np.array([])
  Ein = np.array([])
  G = np.array([])

  for t in range(T):
    # comput current optimal result
    ein, s, d, index = decision_stump_all(X, Y, ut, theta)
    # 每50輪印一次，共印十次
    if t % 50 == 0:
        print(ein, s, d, index)
    # epsilon_t
    epsilon_t = ut.dot((s * np.sign(X[:, d] - theta[:, d][index])) != Y) / np.sum(ut)
    # 方塊 t
    cube_t = np.sqrt((1 - epsilon_t) / epsilon_t)
    # re-scale incorrect u_t
    i1 = s * np.sign(X[:, d] - theta[:, d][index]) != Y
    ut[i1] = ut[i1] * cube_t
    # re-scale correct u_t
    i2 = s * np.sign(X[:, d] - theta[:, d][index]) == Y
    ut[i2] = ut[i2] / cube_t
    # update alpha
    alpha_t = np.log(cube_t)
    
    # update variable
    Ein = np.r_[Ein, ein]
    if(t == 0):
        ut_1 = np.array([ut])
    else:
        ut_1 = np.r_[ut_1, np.array([ut])]
    epsilon = np.r_[epsilon, epsilon_t]
    alpha = np.r_[alpha, alpha_t]
    g = [[s, d, index]]

    # G(x)
    if(t == 0):
        G = np.array(g)
    else:
        G = np.r_[G, np.array(g)]
  return Ein, ut_1, epsilon, alpha, G

In [7]:
if __name__ == '__main__':
  # Get Data
  train = np.genfromtxt('/content/drive/MyDrive/Colab Notebooks/ML/HW6/hw6_train.dat.txt')
  test = np.genfromtxt('/content/drive/MyDrive/Colab Notebooks/ML/HW6/hw6_test.dat.txt')
  # print(train)
  # prepare theta
  train_dt = [[] for i in range(10)]
  x = [[] for i in range(10)]
  theta = [[] for i in range(10)]
  for i in range(10):
    # 按每個feature去排序所有資料，共十個feature，所以會有十個
    train_dt[i] = np.array(sorted(train, key=lambda x:x[i]))
    # 塞每個排序後的那一欄feature值給x[i]，x[0]就是1000筆資料排序後的所有第0個feature值，共1000個
    x[i] = train_dt[i][:, i]
    # 第一個為負無限大，然後是所有的中點
    theta[i] = np.append(np.array(x[i][0] - 1), (x[i][:-1] + x[i][1:])/2)
    # theta[i] = np.append(theta[i], x[i][-1] + 0.1)

  theta = np.c_[theta[0], theta[1], theta[2], theta[3], theta[4], theta[5], theta[6], theta[7], theta[8], theta[9]]
  # print(len(theta))
  # print(theta.shape)
  # Traing data
  Y = train[:, 10]
  X = train[:, :10]

  # Testing data
  Ytest = test[:, 10]
  Xtest = test[:, :10]

  Ein, U, epsilon, alpha, G = AdaBoost(X, Y, theta, 500)


0.374 -1 9 694
0.25089357000866486 -1 9 861
0.21490346446178551 1 1 225
0.18947194377251314 -1 1 810
0.17577715396255406 -1 6 168
0.1626653813681176 -1 0 340
0.15357846343223844 -1 1 407
0.14251443889877227 1 5 997
0.13444852045664263 -1 1 908
0.1291357407345058 -1 4 362


In [8]:
# 11 
print("Problem 11: ", Ein[0])
print(Ein.shape)

Problem 11:  0.374
(500,)


In [9]:
# 12
s = G[:, 0]
d = G[:, 1]
theta_ = G[:, 2]
g = []
for i in range(500):
  s_ = s[i]
  d_ = d[i]
  t_ = theta_[i]
  g.append(np.mean(s_*np.sign(X[:, d_] - theta[:, d_][t_]) != Y))
print("Problem 12: ", max(g))

Problem 12:  0.591


In [14]:
# 13
# compute E_{in|out}(Gt)
def predict(X, Y, G, alpha, t, theta):
  s = G[:t, 0]
  d = G[:t, 1]
  theta_ = G[:t, 2]
  alpha_ = alpha[:t]
  
  result = []
  for i in range(t):
      s_ = s[i]
      d_ = d[i]
      t_ = theta_[i]
      result.append(s_*np.sign(X[:, d_] - theta[:, d_][t_]))
  r = alpha_.dot(np.array(result))

  return np.mean(np.sign(r) != Y)

for i in range(500):
  e_in = predict(X, Y, G, alpha, i, theta)
  if e_in <= 0.05:
    print("Problem 13: ")
    print(i)
    break

Problem 13: 
355


In [11]:
# 14
print("Problem 14: ", np.mean(s[0]*np.sign(Xtest[:, d[0]] - theta[:, d[0]][theta_[0]]) != Ytest))

Problem 14:  0.455


In [12]:
# 15
result = []
for i in range(500):
  s_ = s[i]
  d_ = d[i]
  t_ = theta_[i]
  result.append(s_*np.sign(Xtest[:, d_] - theta[:, d_][t_]))
print("Problem 15: ", np.mean(np.sign(np.array(result)) != Ytest))

Problem 15:  0.484212


In [13]:
# 16
print("Problem 16: ", predict(Xtest, Ytest, G, alpha, 500, theta))

Problem 16:  0.188
