In [4]:
from ai_base import List2CSV, CSV2List2
from typing import List, Tuple
import csv
import numpy as np
trainX = np.float_(CSV2List2('doc2vecTrainSet50D24000L.csv'))
trainYData = CSV2List2('data/2/clean/trainLabel.txt')
testData = np.float_(CSV2List2('doc2vecTestSet50D6000L.csv'))
KNNtrainX = np.array(trainX[0:20000])
KNNvaildX = np.array(trainX[20000:24000])
KNNtrainY = np.array(np.float_(trainYData[0:20000]))
KNNvaildY = np.array(np.float_(trainYData[20000:24000]))


In [5]:
import ai_base
import numpy as np
from typing import List, Tuple, Callable
from tqdm import tnrange, tqdm_notebook
from scipy.stats.stats import pearsonr
from time import time
from collections import OrderedDict

# 一范数
Dis1 = lambda v1, v2: np.linalg.norm(v1 - v2, 1)
# 二范数
Dis2 = lambda v1, v2: np.linalg.norm(v1 - v2, 2)
# 无穷范数
DisInf = lambda v1, v2: np.linalg.norm(v1 - v2, np.inf)
# 余弦距离（1-余弦相关度）
def DisCosine(v1, v2):
    t1 = np.dot(v1,v2)
    t2 = np.linalg.norm(v1)
    t3 = np.linalg.norm(v2)
    ret = 1 - t1 / (t2*t3)
    return ret
    
def DisInvNormAvg(distances: np.array, Y: np.array) -> np.array:
    '''
    按照归一化的距离倒数加权求和，返回均值
    '''
    # 如果训练集中有向量距离和待预测向量完全一致（距离为0）
    for idx, dis in enumerate(distances):
        if np.isclose(dis, 0):
            # 则直接返回该训练集向量对应的Y
            return Y[idx]
    # 求距离的倒数
    distances = np.array(1.0) / distances
    # 归一化
    s = np.sum(distances)
    distances = distances / s
    # 分别作为权值乘以K个最邻近的训练集向量对应的Y
    tmp = np.diag(distances) @ Y  
    # 加权后Y的个分量求和
    if len(tmp.shape) is 1:
        return np.array([tmp])
    else:
        return np.sum(tmp,  axis = (0))

def classifyParseY(ydata: List[str], n: int)->np.array:
    '''
    Convert Y data from raw string list to matrix consisted of Y vectors
    e.g.
    ["anger", "disgust", ..., "surprise"] -> 
    |1, 0, 0, 0, 0, 0|
    |0, 1, 0, 0, 0, 0|
    |0, 0, ...,  0, 0|
    |0, 0, 0, 0, 1, 0|
    |0, 0, 0, 0, 0, 1|
    '''
    D = len(ydata)
    
    #fast hash ydata from strings ["anger", "disgust", ...] to [1, 2, ...]^T
    #ydata = np.array(ydata).reshape((-1,1))
    
    '''
    ymat is the column-wise repeat of ydata.
    e.g.
    |0|      |0, 0, 0, 0, 0, 0|
    |1|   -> |1, 1, 1, 1, 1, 1|
    ...      |................|
    |5|      |5, 5, 5, 5, 5, 5|
    ydata -> ymat
    '''
    ymat  = np.tile(ydata, (1, n))
    
    '''
    ycmp is a matrix of which each row is [0, 1, 2, 3, 4, 5]
    |0, 1, 2, 3, 4, 5|
    |0, 1, 2, 3, 4, 5|
    |................|
    |0, 1, 2, 3, 4, 5|
    '''
    ycmp  = np.tile(np.array(range(n)), (D, 1))
    return np.int_(np.equal(ymat, ycmp))
def KNN_old(trainSet: Tuple[np.array, np.array],
        testVec: np.array,
        DisFunc: Callable[[np.array, np.array], float],
        K: int,
        WeightFunc: Callable[[np.array, np.array], float]) -> np.array: 
    '''
    一个通用的KNN接口
    trainSet: 二元元组，第一个元素是训练集的X，第二个是Y
    testVec: 待预测向量
    DisFunc: 距离函数
    K: K值
    WeightFunc: 依据第一个参数list<距离>,对第二个参数list<Y值>进行加权，返回预测值
    '''
    #对于多个要预测的值，逐一预测
    if len(testVec.shape) > 1:
        return np.array([KNN(trainSet, vec, DisFunc, K, WeightFunc) for vec in testVec])
    else:
        #测量待预测向量到训练集中每个向量的距离
        #distances是一个list<tuple(index, distance)>
        distances = list(enumerate(map(lambda trainVec: DisFunc(trainVec, testVec), trainSet[0])))
        #依据距离从小到大排序
        distances.sort(key=lambda t: t[1])
        #获取最临近的K个训练样本的下标和对应的距离，输出值
        tmp = list(zip(*distances[:K]))
        kNearIdx = list(tmp[0])
        kNearDis = list(tmp[1])
        kNearY   = trainSet[1][kNearIdx, :]
        #对输出值根据距离加权作为预测输出
        return WeightFunc(kNearDis, kNearY)
    
def KNN(trainSet: Tuple[np.array, np.array],
        testVec: np.array,
        DisFunc: Callable[[np.array, np.array], float],
        K: int,
        WeightFunc: Callable[[np.array, np.array], float]) -> np.array: 
    '''
    一个通用的KNN接口
    trainSet: 二元元组，第一个元素是训练集的X，第二个是Y
    testVec: 待预测向量
    DisFunc: 距离函数
    K: K值
    WeightFunc: 依据第一个参数list<距离>,对第二个参数list<Y值>进行加权，返回预测值
    '''
    #对于多个要预测的值，逐一预测
#     if len(testVec.shape) > 1:
#         n = len(testVec)
#         ret = list(range(n))
#         for i in tnrange(n):
#             ret[i] = KNN(trainSet, testVec[i], DisFunc, K, WeightFunc)
#         return np.array(ret)
#     else:
        #测量待预测向量到训练集中每个向量的距离
        #distances是一个list<tuple(index, distance)>
        
#     distances = list(enumerate(map(lambda trainVec: DisFunc(trainVec, testVec), trainSet[0])))
    
    trainSum = np.sum(np.square(trainSet[0]), axis=1)
    testSum = np.sum(np.square(testVec), axis=1) 
    t0 = np.dot(testVec, trainSet[0].T)
    dists = np.sqrt(-2 * t0 + testSum.reshape(-1, 1)+ trainSum)
    n = len(testVec)
    ret = list(range(n))
    for i in tnrange(n):
        distances = list(enumerate(dists[i]))
        #依据距离从小到大排序
        distances.sort(key=lambda t: t[1])
        #获取最临近的K个训练样本的下标和对应的距离，输出值
        tmp = list(zip(*distances[:K]))
        kNearIdx = list(tmp[0])
        kNearDis = list(tmp[1])
        kNearY   = trainSet[1][kNearIdx, :]
        #对输出值根据距离加权作为预测输出
        ret[i] = WeightFunc(kNearDis, kNearY)
    return np.array(ret)

def get_regress(predictY, vaildY):
    r = [pearsonr(predictY[:, i], vaildY[:, i])[0] for i in range(vaildY.shape[1])]
    average = np.average(r)
    print("Correlation Coefficient: ", average)
    return average

def get_classify(predictY, vaildY):
    classifyY = np.zeros_like(predictY)
#     for i, row in enumerate(predictY):
#         m = 0
#         idx = 0
#         for j, v in enumerate(row):
#             if v > m:
#                 m = v
#                 idx = j
#         classifyY[i][idx] = 1
    for i in range(len(predictY)):
        if predictY[i][0] > 0.5:
            classifyY[i][0] = 1
        else:
            classifyY[i][0] = 0
    ret = np.sum(np.logical_and(classifyY, vaildY)) / vaildY.shape[0]
    print("Classification Accuracy: ", ret)
    return ret

def autoTrain(trainSet: Tuple, vaildSet:Tuple):
    trainX, trainY = trainSet
    vaildX, vaildY = vaildSet
    print("Start training...")
    t = time()
    K_val = range(1, 20)
#     DisFuncs = {"Dis1": Dis1, "Dis2": Dis2, "DisInf": DisInf, "DisCosine": DisCosine}
    DisFuncs = {"Dis2": Dis2}
    results_reg = OrderedDict()
    results_cla = OrderedDict()
    for K in K_val:
        for dfname, DisFunc in DisFuncs.items():
            predictY = KNN((trainX,trainY), vaildX, DisFunc, K, DisInvNormAvg)
            cla_ret = get_classify(predictY, vaildY)
            reg_ret = get_regress(predictY, vaildY)
            results_reg[(K, dfname)] = reg_ret
            results_cla[(K, dfname)] = cla_ret
            print(K, dfname, ":", cla_ret, reg_ret)
    print("{} groups of argument tested, spent {}s".format(len(K_val) * len(DisFuncs), time() - t))
    return results_cla, results_reg

def vaild(trainSet: Tuple, vaildSet: Tuple, K, DisFunc):
    trainX, trainY = trainSet
    vaildX, vaildY = vaildSet
    predictY = KNN(trainSet,vaildX,DisFunc,K,DisInvNormAvg)
    cla_ret = get_classify(predictY, vaildY)
    reg_ret = get_regress(predictY, vaildY)
    print(pfname, K, dfname, ":", cla_ret, reg_ret)


In [6]:
autoTrain((KNNtrainX, KNNtrainY), (KNNvaildX, KNNvaildY))

Start training...


HBox(children=(IntProgress(value=0, max=4000), HTML(value='')))


Classification Accuracy:  0.347
Correlation Coefficient:  0.3233941870744647
1 Dis2 : 0.347 0.3233941870744647


HBox(children=(IntProgress(value=0, max=4000), HTML(value='')))


Classification Accuracy:  0.347
Correlation Coefficient:  0.39860719582398535
2 Dis2 : 0.347 0.39860719582398535


HBox(children=(IntProgress(value=0, max=4000), HTML(value='')))


Classification Accuracy:  0.36875
Correlation Coefficient:  0.45443114629809883
3 Dis2 : 0.36875 0.45443114629809883


HBox(children=(IntProgress(value=0, max=4000), HTML(value='')))


Classification Accuracy:  0.372
Correlation Coefficient:  0.4879191160267574
4 Dis2 : 0.372 0.4879191160267574


HBox(children=(IntProgress(value=0, max=4000), HTML(value='')))


Classification Accuracy:  0.379
Correlation Coefficient:  0.5065701726232688
5 Dis2 : 0.379 0.5065701726232688


HBox(children=(IntProgress(value=0, max=4000), HTML(value='')))


Classification Accuracy:  0.38075
Correlation Coefficient:  0.5196372992684534
6 Dis2 : 0.38075 0.5196372992684534


HBox(children=(IntProgress(value=0, max=4000), HTML(value='')))


Classification Accuracy:  0.3905
Correlation Coefficient:  0.5314685203001267
7 Dis2 : 0.3905 0.5314685203001267


HBox(children=(IntProgress(value=0, max=4000), HTML(value='')))


Classification Accuracy:  0.39175
Correlation Coefficient:  0.5413212439310982
8 Dis2 : 0.39175 0.5413212439310982


HBox(children=(IntProgress(value=0, max=4000), HTML(value='')))


Classification Accuracy:  0.39175
Correlation Coefficient:  0.5472540995187428
9 Dis2 : 0.39175 0.5472540995187428


HBox(children=(IntProgress(value=0, max=4000), HTML(value='')))


Classification Accuracy:  0.392
Correlation Coefficient:  0.5526629747277266
10 Dis2 : 0.392 0.5526629747277266


HBox(children=(IntProgress(value=0, max=4000), HTML(value='')))


Classification Accuracy:  0.39975
Correlation Coefficient:  0.5603469705151668
11 Dis2 : 0.39975 0.5603469705151668


HBox(children=(IntProgress(value=0, max=4000), HTML(value='')))


Classification Accuracy:  0.39725
Correlation Coefficient:  0.5657200898999728
12 Dis2 : 0.39725 0.5657200898999728


HBox(children=(IntProgress(value=0, max=4000), HTML(value='')))


Classification Accuracy:  0.40025
Correlation Coefficient:  0.5679477213125027
13 Dis2 : 0.40025 0.5679477213125027


HBox(children=(IntProgress(value=0, max=4000), HTML(value='')))


Classification Accuracy:  0.39875
Correlation Coefficient:  0.5697925043946138
14 Dis2 : 0.39875 0.5697925043946138


HBox(children=(IntProgress(value=0, max=4000), HTML(value='')))


Classification Accuracy:  0.4015
Correlation Coefficient:  0.5700461673219002
15 Dis2 : 0.4015 0.5700461673219002


HBox(children=(IntProgress(value=0, max=4000), HTML(value='')))


Classification Accuracy:  0.4005
Correlation Coefficient:  0.57462938753293
16 Dis2 : 0.4005 0.57462938753293


HBox(children=(IntProgress(value=0, max=4000), HTML(value='')))


Classification Accuracy:  0.4025
Correlation Coefficient:  0.5779907098574331
17 Dis2 : 0.4025 0.5779907098574331


HBox(children=(IntProgress(value=0, max=4000), HTML(value='')))


Classification Accuracy:  0.4005
Correlation Coefficient:  0.5802634337323745
18 Dis2 : 0.4005 0.5802634337323745


HBox(children=(IntProgress(value=0, max=4000), HTML(value='')))


Classification Accuracy:  0.40325
Correlation Coefficient:  0.5819222581213568
19 Dis2 : 0.40325 0.5819222581213568
19 groups of argument tested, spent 742.4912481307983s


(OrderedDict([((1, 'Dis2'), 0.347),
              ((2, 'Dis2'), 0.347),
              ((3, 'Dis2'), 0.36875),
              ((4, 'Dis2'), 0.372),
              ((5, 'Dis2'), 0.379),
              ((6, 'Dis2'), 0.38075),
              ((7, 'Dis2'), 0.3905),
              ((8, 'Dis2'), 0.39175),
              ((9, 'Dis2'), 0.39175),
              ((10, 'Dis2'), 0.392),
              ((11, 'Dis2'), 0.39975),
              ((12, 'Dis2'), 0.39725),
              ((13, 'Dis2'), 0.40025),
              ((14, 'Dis2'), 0.39875),
              ((15, 'Dis2'), 0.4015),
              ((16, 'Dis2'), 0.4005),
              ((17, 'Dis2'), 0.4025),
              ((18, 'Dis2'), 0.4005),
              ((19, 'Dis2'), 0.40325)]),
 OrderedDict([((1, 'Dis2'), 0.3233941870744647),
              ((2, 'Dis2'), 0.39860719582398535),
              ((3, 'Dis2'), 0.45443114629809883),
              ((4, 'Dis2'), 0.4879191160267574),
              ((5, 'Dis2'), 0.5065701726232688),
              ((6, 'Dis2'), 0.5196