In [21]:
#距离
import pandas as pd
import math

# Scaled Manhattan distance
def trainScaledManhattan(in_vec):
    """Return the ``(mean, std)`` pair describing the training data.

    Both statistics are obtained by calling ``in_vec.mean()`` and
    ``in_vec.std()``, so their exact shape depends on the type of
    *in_vec* (for a DataFrame they are per-column Series).
    """
    return in_vec.mean(), in_vec.std()


def classifyScaledManhattan(mean, std, q, prob_list):
    """Return the weighted, scaled Manhattan distance between *q* and *mean*.

    The score is ``sum(prob_list[i] * |mean[i] - q[i]| / std[i])`` over all
    components of *q*.

    Args:
        mean: Per-feature means (list, array or Series).
        std: Per-feature standard deviations; zeros are treated as ``1e-10``
            so the division is always defined.  The argument itself is
            left untouched.
        q: The query vector to score.
        prob_list: Per-feature weights, indexed by position.

    Returns:
        The distance, or ``None`` (after printing a diagnostic) when *mean*
        and *q* have different lengths.
    """
    if len(mean) != len(q):
        print(len(mean))
        print(len(q))
        print("Be sure that both vectors are the same dimension!")
        return None

    # Iterate by position: integer ``[]`` indexing on a label-indexed Series
    # relies on a deprecated positional fallback.
    mean_values = list(mean)
    query_values = list(q)
    # Build a guarded copy instead of overwriting zeros in the caller's std.
    safe_std = [1e-10 if s == 0 else s for s in std]

    return sum(
        prob_list[i] * abs(m - x) / safe_std[i]
        for i, (m, x) in enumerate(zip(mean_values, query_values))
    )

#调整的余弦相似度量

from scipy import spatial
import numpy as np


def adjusted_cos_distance_matrix(size, matrix, row_column, prob_list):
    """Return the pairwise weighted, mean-centred cosine distance matrix.

    Every vector has its own mean subtracted before the weighted cosine
    distance (``scipy.spatial.distance.cosine``) is evaluated.

    Args:
        size: Number of leading vectors to compare; the result has shape
            ``(size, size)``.
        matrix: Two-dimensional numeric data (DataFrame or array-like).
        row_column: ``0`` to compare rows, ``1`` to compare columns.
        prob_list: Per-component weights passed to ``cosine`` as ``w``.

    Returns:
        A ``numpy.ndarray`` of shape ``(size, size)``.

    Raises:
        ValueError: If *row_column* is neither 0 nor 1.
    """
    # np.asarray replaces DataFrame.as_matrix(), which was removed in
    # pandas 1.0, and also accepts plain arrays.
    data = np.asarray(matrix, dtype=float)
    if row_column == 0:
        rows = data
    elif row_column == 1:
        rows = data.T
    else:
        raise ValueError("row_column must be 0 (rows) or 1 (columns)")

    centred = rows - rows.mean(axis=1, keepdims=True)

    distances = np.zeros((size, size))
    for first in range(size):
        # The distance is symmetric, so each unordered pair is computed once.
        for sec in range(first, size):
            distance = spatial.distance.cosine(centred[first], centred[sec], prob_list)
            distances[first, sec] = distance
            distances[sec, first] = distance
            if distance < 0:
                print('distance matrix', distance)
    return distances


def adjusted_cos_distance(vector1, vector2, prob_list):
    """Return the weighted cosine distance between two mean-centred vectors.

    Each vector has its own mean (``.mean()``) subtracted, then
    ``scipy.spatial.distance.cosine`` is applied with *prob_list* as the
    weights.  A negative result is printed as a diagnostic before being
    returned.
    """
    centred_a = vector1 - vector1.mean()
    centred_b = vector2 - vector2.mean()
    result = spatial.distance.cosine(centred_a, centred_b, prob_list)
    if result < 0:
        print(result)
    return result

def cos_distance_matrix(size, matrix, row_column, prob_list):
    """Return the pairwise weighted cosine distance matrix (no centring).

    Args:
        size: Number of leading vectors to compare; the result has shape
            ``(size, size)``.
        matrix: Two-dimensional numeric data (DataFrame or array-like).
        row_column: ``0`` to compare rows, ``1`` to compare columns.
        prob_list: Per-component weights passed to ``cosine`` as ``w``.

    Returns:
        A ``numpy.ndarray`` of shape ``(size, size)``.

    Raises:
        ValueError: If *row_column* is neither 0 nor 1.
    """
    # np.asarray replaces DataFrame.as_matrix(), which was removed in
    # pandas 1.0; the float cast keeps integer data from overflowing in
    # the dot products.
    data = np.asarray(matrix, dtype=float)
    if row_column == 0:
        rows = data
    elif row_column == 1:
        rows = data.T
    else:
        raise ValueError("row_column must be 0 (rows) or 1 (columns)")

    distances = np.zeros((size, size))
    for first in range(size):
        # The distance is symmetric, so each unordered pair is computed once.
        for sec in range(first, size):
            distance = spatial.distance.cosine(rows[first], rows[sec], prob_list)
            distances[first, sec] = distance
            distances[sec, first] = distance
            if distance < 0:
                print('distance cos', distance)
    return distances

def cos_distance(vector1, vector2, prob_list):
    """Return the weighted cosine distance between two vectors.

    Args:
        vector1: First vector (list, array or Series of numbers).
        vector2: Second vector, same length as *vector1*.
        prob_list: Per-component weights passed to ``cosine`` as ``w``.

    Returns:
        The value of ``scipy.spatial.distance.cosine``.  A negative result
        is printed as a diagnostic before being returned.
    """
    # Cast to float first: with fixed-width integer inputs the squared
    # terms can overflow and produce a distance outside [0, 2].
    m_sub1 = np.asarray(vector1, dtype=float)
    m_sub2 = np.asarray(vector2, dtype=float)
    distance = spatial.distance.cosine(m_sub1, m_sub2, prob_list)
    if distance < 0:
        print(distance)
    return distance


#中心点对应的index,0.05false alarm的distance阀值
def center(subject, vectors, prob_list):
    """Return the most central training vector of *subject*.

    Uses at most the first 200 rows of *vectors* whose ``'subject'`` column
    equals *subject*.  The centre is the row whose summed adjusted-cosine
    distance to the other selected rows is smallest.

    Args:
        subject: Value to match in the ``'subject'`` column.
        vectors: DataFrame with a ``'subject'`` column plus feature columns.
        prob_list: Per-feature weights forwarded to the distance function.

    Returns:
        The selected row (``subject_vecs.iloc[...]``).

    Raises:
        ValueError: If *subject* has no rows in *vectors*.
    """
    subject_vecs = vectors[vectors['subject'] == subject].drop(['subject'], axis=1)[0:200]
    # Size the matrix by the rows actually present; a fixed 200 indexes past
    # the end whenever the subject has fewer samples.
    n_samples = len(subject_vecs)
    if n_samples == 0:
        raise ValueError("no samples found for subject %r" % (subject,))
    distances = adjusted_cos_distance_matrix(n_samples, subject_vecs, 0, prob_list)
    distances_sum = pd.DataFrame(distances).sum().tolist()
    center_index = distances_sum.index(min(distances_sum))
    return subject_vecs.iloc[center_index]

def cosine_threshold(center_vec, vectors, prob_list, subject=None):
    """Return the distance threshold derived from a subject's training rows.

    The adjusted-cosine distance from *center_vec* to each of the subject's
    first (at most) 200 rows is computed and handed to
    ``search_threshold(scores, 0.05)``.
    NOTE(review): ``search_threshold`` is defined outside this section;
    0.05 is presumably the target false-alarm rate - confirm.

    Args:
        center_vec: Reference vector for the subject.
        vectors: DataFrame with a ``'subject'`` column plus feature columns.
        prob_list: Per-feature weights forwarded to the distance function.
        subject: Value to match in the ``'subject'`` column.  When omitted,
            the module-level variable ``subject`` is used, which is how
            existing three-argument callers work.

    Returns:
        Whatever ``search_threshold`` returns.

    Raises:
        NameError: If *subject* is omitted and no module-level ``subject``
            exists.
    """
    if subject is None:
        # Backward compatibility: the original body read the global
        # `subject` implicitly.
        try:
            subject = globals()['subject']
        except KeyError:
            raise NameError("name 'subject' is not defined") from None

    subject_vecs = vectors[vectors['subject'] == subject].drop(['subject'], axis=1)[0:200]
    possitive_scores = [
        adjusted_cos_distance(center_vec, subject_vecs.iloc[i], prob_list)
        for i in range(len(subject_vecs))
    ]
    return search_threshold(possitive_scores, 0.05)
    

def test_possitive(subject, vectors, center_vec, distance_threshold, prob_list):
    """Count the subject's own test rows that exceed the threshold.

    Rows 200-399 of the subject's data are scored with the adjusted-cosine
    distance to *center_vec*.

    Args:
        subject: Value to match in the ``'subject'`` column.
        vectors: DataFrame with a ``'subject'`` column plus feature columns.
        center_vec: Reference vector for the subject.
        distance_threshold: Scores strictly above this value are counted.
        prob_list: Per-feature weights forwarded to the distance function.

    Returns:
        The number of scores greater than *distance_threshold*, as an int.
    """
    positive_vecs = vectors[vectors['subject'] == subject].drop(['subject'], axis=1)[200:400]
    possitive_scores = np.array(
        [
            adjusted_cos_distance(center_vec, positive_vecs.iloc[i], prob_list)
            for i in range(len(positive_vecs))
        ],
        dtype=float,
    )
    # Compare as an array: `list > threshold` only works when the threshold
    # happens to be a NumPy scalar and is a TypeError for a plain float.
    return int(np.count_nonzero(possitive_scores > distance_threshold))

def test_passitive(subject, vectors, center_vec, distance_threshold, prob_list):
    """Count rows of other subjects that fall below the threshold.

    Every row whose ``'subject'`` differs from *subject* is scored with the
    adjusted-cosine distance to *center_vec*.

    Args:
        subject: Value to exclude in the ``'subject'`` column.
        vectors: DataFrame with a ``'subject'`` column plus feature columns.
        center_vec: Reference vector for the subject.
        distance_threshold: Scores strictly below this value are counted.
        prob_list: Per-feature weights forwarded to the distance function.

    Returns:
        The number of scores less than *distance_threshold*, as an int.
    """
    negative_vecs = vectors[vectors['subject'] != subject].drop(['subject'], axis=1)
    negative_scores = np.array(
        [
            adjusted_cos_distance(center_vec, negative_vecs.iloc[i], prob_list)
            for i in range(len(negative_vecs))
        ],
        dtype=float,
    )
    # Compare as an array: `list < threshold` only works when the threshold
    # happens to be a NumPy scalar and is a TypeError for a plain float.
    return int(np.count_nonzero(negative_scores < distance_threshold))

#from sklearn.preprocessing import StandardScaler
#ss = StandardScaler()

def train(subject, vectors, prob_list):
    """Fit the scaled-Manhattan model on a subject's first 200 rows.

    Returns ``(mean, std, threshold)``: the statistics from
    ``trainScaledManhattan`` and the value ``search_threshold`` yields for
    the training rows' own scores with the argument 0.05.
    """
    # First 200 rows are used for training; later rows are left for testing.
    training_rows = vectors[vectors['subject'] == subject].drop(['subject'], axis=1)[0:200]
    mean, std = trainScaledManhattan(training_rows)

    training_scores = [
        classifyScaledManhattan(mean, std, training_rows.iloc[row], prob_list)
        for row in range(len(training_rows))
    ]

    return mean, std, search_threshold(training_scores, 0.05)

def test(subject, vectors, prob_list):
    """Score genuine and impostor rows against a freshly trained model.

    Calls ``train`` for *subject*, then scores rows 200-399 of the subject
    (genuine) and every row of all other subjects (impostor) with
    ``classifyScaledManhattan``.

    Returns ``(genuine_scores, impostor_scores, threshold)``.
    """
    mean, std, threshold = train(subject, vectors, prob_list)

    # Genuine samples: the subject's rows after the training window.
    genuine = vectors[vectors['subject'] == subject].drop(['subject'], axis=1)[200:400]
    # Impostor samples: everything belonging to other subjects.
    impostor = vectors[vectors['subject'] != subject].drop(['subject'], axis=1)

    genuine_scores = [
        classifyScaledManhattan(mean, std, genuine.iloc[row], prob_list)
        for row in range(len(genuine))
    ]
    impostor_scores = [
        classifyScaledManhattan(mean, std, impostor.iloc[row], prob_list)
        for row in range(len(impostor))
    ]
    return genuine_scores, impostor_scores, threshold

#纯余弦度量
#中心点对应的index,0.05false alarm的distance阀值
def center2(subject, vectors, prob_list):
    """Return the most central training vector of *subject* (plain cosine).

    Uses at most the first 200 rows of *vectors* whose ``'subject'`` column
    equals *subject*.  The centre is the row whose summed cosine distance
    to the other selected rows is smallest.

    Args:
        subject: Value to match in the ``'subject'`` column.
        vectors: DataFrame with a ``'subject'`` column plus feature columns.
        prob_list: Per-feature weights forwarded to the distance function.

    Returns:
        The selected row (``subject_vecs.iloc[...]``).

    Raises:
        ValueError: If *subject* has no rows in *vectors*.
    """
    subject_vecs = vectors[vectors['subject'] == subject].drop(['subject'], axis=1)[0:200]
    # Size the matrix by the rows actually present; a fixed 200 indexes past
    # the end whenever the subject has fewer samples.
    n_samples = len(subject_vecs)
    if n_samples == 0:
        raise ValueError("no samples found for subject %r" % (subject,))
    distances = cos_distance_matrix(n_samples, subject_vecs, 0, prob_list)
    distances_sum = pd.DataFrame(distances).sum().tolist()
    center_index = distances_sum.index(min(distances_sum))
    return subject_vecs.iloc[center_index]

def cosine_threshold2(center_vec, vectors, prob_list, subject=None):
    """Return the plain-cosine distance threshold for a subject.

    The cosine distance from *center_vec* to each of the subject's first
    (at most) 200 rows is computed and handed to
    ``search_threshold(scores, 0.05)``.
    NOTE(review): ``search_threshold`` is defined outside this section;
    0.05 is presumably the target false-alarm rate - confirm.

    Args:
        center_vec: Reference vector for the subject.
        vectors: DataFrame with a ``'subject'`` column plus feature columns.
        prob_list: Per-feature weights forwarded to the distance function.
        subject: Value to match in the ``'subject'`` column.  When omitted,
            the module-level variable ``subject`` is used, which is how
            existing three-argument callers work.

    Returns:
        Whatever ``search_threshold`` returns.

    Raises:
        NameError: If *subject* is omitted and no module-level ``subject``
            exists.
    """
    if subject is None:
        # Backward compatibility: the original body read the global
        # `subject` implicitly.
        try:
            subject = globals()['subject']
        except KeyError:
            raise NameError("name 'subject' is not defined") from None

    subject_vecs = vectors[vectors['subject'] == subject].drop(['subject'], axis=1)[0:200]
    possitive_scores = []
    for i in range(len(subject_vecs)):
        distance = cos_distance(center_vec, subject_vecs.iloc[i], prob_list)
        possitive_scores.append(distance)
        if distance < 0:
            print('distance threshold2:', distance)
    return search_threshold(possitive_scores, 0.05)
    

def test_possitive2(subject, vectors, center_vec, distance_threshold, prob_list):
    """Count the subject's own test rows that exceed the threshold.

    Rows 200-399 of the subject's data are scored with the plain cosine
    distance to *center_vec*; negative scores are printed as a diagnostic.

    Args:
        subject: Value to match in the ``'subject'`` column.
        vectors: DataFrame with a ``'subject'`` column plus feature columns.
        center_vec: Reference vector for the subject.
        distance_threshold: Scores strictly above this value are counted.
        prob_list: Per-feature weights forwarded to the distance function.

    Returns:
        The number of scores greater than *distance_threshold*, as an int.
    """
    positive_vecs = vectors[vectors['subject'] == subject].drop(['subject'], axis=1)[200:400]
    possitive_scores = []
    for i in range(len(positive_vecs)):
        distance = cos_distance(center_vec, positive_vecs.iloc[i], prob_list)
        possitive_scores.append(distance)
        if distance < 0:
            print(distance)
    # Compare as an array: `list > threshold` only works when the threshold
    # happens to be a NumPy scalar and is a TypeError for a plain float.
    exceeded = np.asarray(possitive_scores, dtype=float) > distance_threshold
    return int(np.count_nonzero(exceeded))

def test_passitive2(subject, vectors, center_vec, distance_threshold, prob_list):
    """Count rows of other subjects that fall below the threshold.

    Every row whose ``'subject'`` differs from *subject* is scored with the
    plain cosine distance to *center_vec*; negative scores are printed as a
    diagnostic.

    Args:
        subject: Value to exclude in the ``'subject'`` column.
        vectors: DataFrame with a ``'subject'`` column plus feature columns.
        center_vec: Reference vector for the subject.
        distance_threshold: Scores strictly below this value are counted.
        prob_list: Per-feature weights forwarded to the distance function.

    Returns:
        The number of scores less than *distance_threshold*, as an int.
    """
    negative_vecs = vectors[vectors['subject'] != subject].drop(['subject'], axis=1)
    negative_scores = []
    for i in range(len(negative_vecs)):
        distance = cos_distance(center_vec, negative_vecs.iloc[i], prob_list)
        negative_scores.append(distance)
        if distance < 0:
            print(distance)
    # Compare as an array: `list < threshold` only works when the threshold
    # happens to be a NumPy scalar and is a TypeError for a plain float.
    accepted = np.asarray(negative_scores, dtype=float) < distance_threshold
    return int(np.count_nonzero(accepted))

Hawkes模型提出的特征，表明激励有没有，有的话，强度是多少。考虑有没有，我用训练集中的特征出现的概率p（发生率）的倒数作为惩罚，即，如果训练集中这个特征基本没有值，而测试集中该特征有值，那么很可能不是该用户，所以这种情况下，需要对该特征产生的距离进行惩罚。惩罚系数为beta/p。beta控制惩罚的力度。
下面在计算距离时，每个特征的距离要乘以这个惩罚系数。

In [29]:
from sklearn.preprocessing import MinMaxScaler

#求训练集中的各特征出现的概率
# inf_pen 是特征不出现，zero_counts = 0的情况，在 zero_counts = 1的基础上再加一个惩罚inf_pen, beta是惩罚项的影响比率
def prob(subject, vectors, samples_start, samples_end, beta=1, inf_pen=10):
    """Return normalised per-feature weights from a subject's training rows.

    For every feature column the fraction ``z`` of selected rows equal to 0
    is measured and the raw weight is ``1 - z ** beta``.  The raw weights
    are then divided by their sum so the result adds up to 1.

    Args:
        subject: Value to match in the ``'subject'`` column.
        vectors: DataFrame with a ``'subject'`` column plus feature columns.
        samples_start: Start of the row slice taken from the subject's rows.
        samples_end: End (exclusive) of that row slice.
        beta: Exponent applied to the zero fraction.
        inf_pen: Accepted for interface compatibility; not used.

    Returns:
        A list with one weight per feature column (empty when there are no
        feature columns).

    Raises:
        ZeroDivisionError: If every raw weight is 0, i.e. every feature is
            0 in all selected rows.
    """
    subject_vecs = vectors[vectors['subject'] == subject].drop(['subject'], axis=1)[samples_start:samples_end]
    # Use the rows actually selected so the fraction stays correct when the
    # subject has fewer rows than the requested window.
    n_samples = len(subject_vecs)

    weights = []
    for column in subject_vecs:
        zero_counts = int((subject_vecs[column] == 0).sum())
        zero_rate = zero_counts / n_samples if zero_counts else 0
        weights.append(1 - zero_rate ** beta)

    # Normalise once, after all columns are known.  Doing it inside the loop
    # was quadratic and divided by zero when the leading columns were all 0.
    total = sum(weights)
    return [weight / total for weight in weights]

s002 miss_rate1 is: 0.1
s002 miss_rate2 is: 0.525
s002 miss_rate is: 0.4777777777777778
s032 miss_rate1 is: 0.05
s032 miss_rate2 is: 0.535625
s032 miss_rate is: 0.4816666666666667
s036 miss_rate1 is: 0.175
s036 miss_rate2 is: 0.000625
s036 miss_rate is: 0.02
s047 miss_rate1 is: 0.135
s047 miss_rate2 is: 0.339375
s047 miss_rate is: 0.31666666666666665
s052 miss_rate1 is: 0.105
s052 miss_rate2 is: 0.34125
s052 miss_rate is: 0.315

In [28]:
import pandas as pd

# Evaluate the plain-cosine detector (center2 / cosine_threshold2 /
# test_possitive2 / test_passitive2) for each subject in subject_list.
file_name = './data/CMU_exp_shrink_output_zero_2subject.xls'
df = pd.read_excel(file_name)
# Keep only the 'subject' column and the feature columns.
vectors = df.drop(['key','session'],axis=1)

# Row window of each subject's data used to derive the feature weights.
samples_start = 0
samples_end = 200
beta = 1
# Passed through to prob(), which does not use it.
inf_pen = 1

subject_list = ['s002','s032','s036','s047','s052']
#subject_list = ['s002']
# The loop variable must stay named `subject`: cosine_threshold2 is called
# without it and reads the module-level name.
for subject in subject_list:
    prob_list  = prob(subject,vectors,samples_start,samples_end,beta,inf_pen)
    center_vec = center2(subject,vectors,prob_list)    
    distance_threshold =  cosine_threshold2(center_vec,vectors,prob_list)
    #print(distance_threshold)
    # negative1: the subject's own test rows scoring above the threshold.
    negative1 = test_possitive2(subject,vectors,center_vec,distance_threshold,prob_list)
    # negative2: other subjects' rows scoring below the threshold.
    negative2 = test_passitive2(subject,vectors,center_vec,distance_threshold,prob_list)

    # NOTE(review): the denominators assume 200 own-subject test rows and
    # 1600 rows from other subjects - confirm against the data file.
    miss_rate1 = negative1/200
    miss_rate2 = negative2/1600
    miss_rate = (negative1+negative2)/1800
    
    print (subject+' miss_rate1 is:',miss_rate1)
    print (subject+' miss_rate2 is:',miss_rate2)
    print (subject+' miss_rate is:',miss_rate)



s002 miss_rate1 is: 0.14
s002 miss_rate2 is: 0.96625
s002 miss_rate is: 0.8744444444444445
s032 miss_rate1 is: 0.17
s032 miss_rate2 is: 0.596875
s032 miss_rate is: 0.5494444444444444
s036 miss_rate1 is: 0.085
s036 miss_rate2 is: 0.42625
s036 miss_rate is: 0.3883333333333333
s047 miss_rate1 is: 0.17
s047 miss_rate2 is: 0.715625
s047 miss_rate is: 0.655
s052 miss_rate1 is: 0.105
s052 miss_rate2 is: 0.385625
s052 miss_rate is: 0.35444444444444445


In [31]:
# Evaluate the adjusted (mean-centred) cosine detector (center /
# cosine_threshold / test_possitive / test_passitive) for each subject.
file_name = './data/CMU_exp_shrink_output_zero_2subject.xls'
#file_name = './data/CMU_exp_shrink_output_rep50_2subject.xls'
df = pd.read_excel(file_name)
# Keep only the 'subject' column and the feature columns.
vectors = df.drop(['key','session'],axis=1)

# Row window of each subject's data used to derive the feature weights.
samples_start = 0
samples_end = 200
beta = 1
# Passed through to prob(), which does not use it.
inf_pen = 200


subject_list = ['s002','s032','s036','s047','s052']
#subject_list = ['s002']
# The loop variable must stay named `subject`: cosine_threshold is called
# without it and reads the module-level name.
for subject in subject_list:
    prob_list  = prob(subject,vectors,samples_start,samples_end,beta,inf_pen)
    center_vec = center(subject,vectors,prob_list)
    distance_threshold =  cosine_threshold(center_vec,vectors,prob_list)
    print(distance_threshold)
    # negative1: the subject's own test rows scoring above the threshold.
    negative1 = test_possitive(subject,vectors,center_vec,distance_threshold,prob_list)
    # negative2: other subjects' rows scoring below the threshold.
    negative2 = test_passitive(subject,vectors,center_vec,distance_threshold,prob_list)

    # NOTE(review): the denominators assume 200 own-subject test rows and
    # 1600 rows from other subjects - confirm against the data file.
    miss_rate1 = negative1/200
    miss_rate2 = negative2/1600
    miss_rate = (negative1+negative2)/1800
    
    print (subject+' miss_rate1 is:',miss_rate1)
    print (subject+' miss_rate2 is:',miss_rate2)
    print (subject+' miss_rate is:',miss_rate)



0.5181482106232984
s002 miss_rate1 is: 0.07
s002 miss_rate2 is: 0.585
s002 miss_rate is: 0.5277777777777778
0.6630175995026989
s032 miss_rate1 is: 0.065
s032 miss_rate2 is: 0.503125
s032 miss_rate is: 0.45444444444444443
0.007862655790093931
s036 miss_rate1 is: 0.195
s036 miss_rate2 is: 0.004375
s036 miss_rate is: 0.025555555555555557
0.9124160547539729
s047 miss_rate1 is: 0.01
s047 miss_rate2 is: 0.56125
s047 miss_rate is: 0.5
1.0127029338662443
s052 miss_rate1 is: 0.1
s052 miss_rate2 is: 0.441875
s052 miss_rate is: 0.4038888888888889


In [13]:
# Probe: unweighted cosine distance between two plain integer lists.
# NOTE(review): the value recorded for this cell is negative, which is outside
# the valid range of a cosine distance; presumably the integer inputs
# overflowed - confirm by repeating the call with float inputs.
a = spatial.distance.cosine([1,10000000,1,100000],[1,1,1,1])
a

-121.96928946530988