In [1]:
import numpy as np
np.set_printoptions(suppress=True)

In [2]:
col_names = ['Ages', 'Sector', 'Education','Marital_Status','Occupation','Race','Sex','Hours','Country','Target']

In [3]:
train = np.genfromtxt("hw1-data/income.train.txt.5k", delimiter = ',', dtype=None, names = col_names, autostrip=True, encoding=None)
dev = np.genfromtxt("hw1-data/income.dev.txt", delimiter = ',', dtype=None, names = col_names, autostrip=True, encoding=None)

In [4]:
train

array([(50, 'Self-emp-not-inc', 'Bachelors', 'Married-civ-spouse', 'Exec-managerial', 'White', 'Male', 13, 'United-States', '<=50K'),
       (38, 'Private', 'HS-grad', 'Divorced', 'Handlers-cleaners', 'White', 'Male', 40, 'United-States', '<=50K'),
       (53, 'Private', '11th', 'Married-civ-spouse', 'Handlers-cleaners', 'Black', 'Male', 40, 'United-States', '<=50K'),
       ...,
       (61, 'Private', 'HS-grad', 'Married-civ-spouse', 'Adm-clerical', 'White', 'Female',  5, 'United-States', '<=50K'),
       (42, 'Private', 'Bachelors', 'Never-married', 'Sales', 'White', 'Male', 40, 'United-States', '<=50K'),
       (21, 'Private', 'Assoc-voc', 'Never-married', 'Adm-clerical', 'White', 'Female', 40, 'United-States', '<=50K')],
      dtype=[('Ages', '<i8'), ('Sector', '<U16'), ('Education', '<U12'), ('Marital_Status', '<U21'), ('Occupation', '<U17'), ('Race', '<U18'), ('Sex', '<U6'), ('Hours', '<i8'), ('Country', '<U26'), ('Target', '<U5')])

In [5]:
# One-hot encoding
mapping = {}
def One_hot_encoding(f):
    target = []
    new_data = []
    for r in f:
        new_row = []
        for i, c in enumerate(r):
            if i==9 and c=='<=50K':
                target.append(0)
            elif i==9 and c=='>50K':
                target.append(1)
            elif not c.dtype is np.dtype('i8'):
                if (i, c) not in mapping:
                    mapping[(i,c)] = len(mapping)
                new_row.append(mapping[(i,c)])
                
        new_data.append(new_row)
            
    m,n = f.shape[0], len(mapping)
    bindata = np.zeros([m,n])
    for i, r in enumerate(new_data):
        for x in r:
            bindata[i][x] = 1

    bindata = np.concatenate((bindata,np.reshape(f['Ages']/50,(-1,1)),np.reshape(f['Hours']/50,(-1,1))),1)
    
    return bindata, np.array(target)
                

In [6]:
train, train_label = One_hot_encoding(train)
dev, dev_label = One_hot_encoding(dev)

In [7]:
train = np.c_[train[:,:90], np.zeros(5000), train[:,90:]]

In [8]:
# calculate distance with simply append all distance to array
def get_k_neighborhood(f, t, k, method=None):
    result = np.zeros(len(f))
    for (i, r) in enumerate(f):
        result[i] = np.linalg.norm(r-t, ord=method)
    top_k = np.argpartition(result, k, axis=0)[:k]
    return top_k, result[top_k]

In [9]:
euclidean_list = get_k_neighborhood(train, dev[-1], 5, None)
euclidean_list

(array([1713, 3769, 2003, 1010, 2450]),
 array([0.16      , 0.26      , 0.28284271, 0.06      , 0.34      ]))

In [10]:
get_k_neighborhood(train, dev[-1], 5)

(array([1713, 3769, 2003, 1010, 2450]),
 array([0.16      , 0.26      , 0.28284271, 0.06      , 0.34      ]))

In [11]:
manhattan_list = get_k_neighborhood(train, dev[-1], 1)

In [12]:
def knn(train_data, target_data, target_index, k, distance_method=None):
    result = 0

    k_list = get_k_neighborhood(train_data, target_data[target_index], k)

    result = train_label[k_list[0].astype('int')].sum()
    if result >= k/2:
        return 1
    else:
        return 0

In [13]:
print(knn(train, dev, 999, 5, None) == dev_label[-1])
print(knn(train, dev, 999, 5, 1) == dev_label[-1])

True
True


In [14]:
print(knn(train, dev, 999, 9, None) == dev_label[-1])
print(knn(train, dev, 999, 9, 1) == dev_label[-1])

True
True


In [15]:
def knn_pred(target_data, k, metric=None):
    pred = np.zeros(len(target_data))
    for i, r in enumerate(target_data):
        pred[i] = knn(train, target_data, i, k, metric)
    return pred

In [16]:
import time

In [17]:
result =[]
for k in [1,3,5,7,9,99,999]:
    start = time.time()
    predict = knn_pred(dev, k, metric=None)
    err = np.abs(predict-dev_label).sum()/predict.shape*100
    positive_rate = predict.sum()/predict.shape*100
    result.append([k, err, positive_rate])
    end = time.time()
    print("k="+str(k)+", dev_err="+str(err)+"%, positive_rate=" + str(positive_rate)+"%, time=" + str(end-start))

k=1, dev_err=[23.3]%, positive_rate=[26.9]%, time=71.52361512184143
k=3, dev_err=[19.2]%, positive_rate=[26.]%, time=76.28715205192566
k=5, dev_err=[17.7]%, positive_rate=[25.1]%, time=78.42572689056396
k=7, dev_err=[16.5]%, positive_rate=[24.5]%, time=77.9968740940094
k=9, dev_err=[16.]%, positive_rate=[22.6]%, time=78.238028049469
k=99, dev_err=[15.8]%, positive_rate=[19.4]%, time=78.38519883155823
k=999, dev_err=[17.9]%, positive_rate=[11.1]%, time=78.36374807357788


In [None]:
predict = knn_pred(dev, 1, metric=None)
err = np.abs(predict-dev_label).sum()/predict.shape*100
positive_rate = predict.sum()/predict.shape*100
print("k="+str(1)+", dev_err="+str(err)+"%, positive_rate=" + str(positive_rate)+"%")