In [1]:
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
import torch
import gensim
import torch.nn as nn
import torch.nn.functional as F
from gensim.corpora import Dictionary
from tqdm import tqdm
import re
import copy
import scipy.stats
import nltk
import pickle
import math

# English stop-word list from the NLTK corpus; later passed to word_embedding/Tokenizer
# so that these words are dropped from the attacked documents.
stopwords = nltk.corpus.stopwords.words('english')
class WAE(nn.Module):
    """Stack of linear layers used as the encoder/decoder backbone of WTM.

    Args:
        encode_dims: layer widths of the encoder, input size first and
            latent (topic) size last.
        decode_dims: layer widths of the decoder.
        dropout: dropout probability applied after every encoder layer.
        nonlin: 'relu' or 'sigmoid'; any other value raises KeyError.

    The sub-module keys ('enc_<i>' / 'dec_<i>') are part of the state_dict
    layout, so saved checkpoints depend on them.
    """

    def __init__(self, encode_dims=[2000, 1024, 512, 20], decode_dims=[20, 1024, 2000], dropout=0.0, nonlin='relu'):
        super().__init__()

        encoder_layers = {}
        for idx, (n_in, n_out) in enumerate(zip(encode_dims[:-1], encode_dims[1:])):
            encoder_layers[f'enc_{idx}'] = nn.Linear(n_in, n_out)
        self.encoder = nn.ModuleDict(encoder_layers)

        decoder_layers = {}
        for idx, (n_in, n_out) in enumerate(zip(decode_dims[:-1], decode_dims[1:])):
            decoder_layers[f'dec_{idx}'] = nn.Linear(n_in, n_out)
        self.decoder = nn.ModuleDict(decoder_layers)

        self.latent_dim = encode_dims[-1]
        self.dropout = nn.Dropout(p=dropout)
        activations = {'relu': F.relu, 'sigmoid': torch.sigmoid}
        self.nonlin = activations[nonlin]
        # Same value as latent_dim, kept under a second name.
        self.z_dim = encode_dims[-1]

    def encode(self, x):
        """Run x through the encoder; the last layer gets no non-linearity."""
        last_depth = len(self.encoder) - 1
        hid = x
        for depth, layer in enumerate(self.encoder.values()):
            hid = self.dropout(layer(hid))
            if depth != last_depth:
                hid = self.nonlin(hid)
        return hid
    
class VAE(nn.Module):
    """Variational encoder with separate heads for the mean and log-variance.

    Args:
        encode_dims: layer widths; all but the last transition become hidden
            layers, the last transition is realised by fc_mu / fc_logvar.
        decode_dims: layer widths of the decoder.
        dropout: dropout probability applied after every hidden encoder layer.

    The sub-module names (enc_<i>, fc_mu, fc_logvar, dec_<i>, fc1) are part of
    the state_dict layout used by saved checkpoints.
    """

    def __init__(self, encode_dims=[2000,1024,512,20],decode_dims=[20,1024,2000],dropout=0.0):
        super().__init__()

        hidden_layers = {}
        for idx in range(len(encode_dims) - 2):
            hidden_layers[f'enc_{idx}'] = nn.Linear(encode_dims[idx], encode_dims[idx + 1])
        self.encoder = nn.ModuleDict(hidden_layers)

        n_hidden, n_latent = encode_dims[-2], encode_dims[-1]
        self.fc_mu = nn.Linear(n_hidden, n_latent)
        self.fc_logvar = nn.Linear(n_hidden, n_latent)

        decoder_layers = {}
        for idx in range(len(decode_dims) - 1):
            decoder_layers[f'dec_{idx}'] = nn.Linear(decode_dims[idx], decode_dims[idx + 1])
        self.decoder = nn.ModuleDict(decoder_layers)

        self.latent_dim = n_latent
        self.dropout = nn.Dropout(p=dropout)
        # Square latent-to-latent layer; the GSM/ETM wrappers apply it to mu
        # before the softmax.
        self.fc1 = nn.Linear(n_latent, n_latent)

    def encode(self, x):
        """Return (mu, log_var) for the batch x."""
        hid = x
        for layer in self.encoder.values():
            hid = F.relu(self.dropout(layer(hid)))
        return self.fc_mu(hid), self.fc_logvar(hid)
        
class WTM:
    """Inference wrapper around a WAE topic model.

    Args:
        bow_dim: vocabulary size (width of the bag-of-words input).
        n_topic: number of topics (width of the latent layer).
        device: optional torch device the network is moved to.
        dist: stored as-is; not used by the code in this class.
        taskname: stored as-is; not used by the code in this class.
        dropout: dropout probability forwarded to the WAE encoder.
    """

    def __init__(self, bow_dim=10000, n_topic=20, device=None, dist='gmm_std', taskname=None, dropout=0.0):
        self.bow_dim = bow_dim
        self.n_topic = n_topic
        self.wae = WAE(encode_dims=[bow_dim, 1024, 512, n_topic], decode_dims=[n_topic, 512, bow_dim], dropout=dropout, nonlin='relu')
        self.device = device
        self.id2token = None
        self.dist = dist
        self.taskname = taskname
        if device is not None:
            self.wae = self.wae.to(device)

    def inference(self, doc_bow):
        """Return the topic distribution of a bag-of-words document.

        Args:
            doc_bow: torch.Tensor or np.ndarray holding bow_dim counts
                (any shape that reshapes to [-1, bow_dim]).

        Returns:
            np.ndarray of shape (n_topic,) for a single document, or
            (batch, n_topic) when several documents are passed.
        """
        if isinstance(doc_bow, np.ndarray):
            doc_bow = torch.from_numpy(doc_bow)
        # Count vectors are often int64/float64; the linear layers are float32
        # by default, so cast explicitly instead of failing on a dtype mismatch.
        doc_bow = doc_bow.reshape(-1, self.bow_dim).float()
        if self.device is not None:
            doc_bow = doc_bow.to(self.device)
        with torch.no_grad():
            theta = F.softmax(self.wae.encode(doc_bow), dim=1)
        # squeeze(0) replaces the former hard-coded reshape(75,), which only
        # worked for a single document and a 75-topic model.
        return theta.detach().cpu().squeeze(0).numpy()
        

class GSM:
    """Inference wrapper around a VAE-based topic model (GSM).

    Args:
        bow_dim: vocabulary size (width of the bag-of-words input).
        n_topic: number of topics.
        taskname: stored as-is; not used by the code in this class.
        device: optional torch device the network is moved to.
    """

    def __init__(self,bow_dim=10000,n_topic=20,taskname=None,device=None):
        self.bow_dim = bow_dim
        self.n_topic = n_topic
        #TBD_fc1
        self.vae = VAE(encode_dims=[bow_dim,1024,512,n_topic],decode_dims=[n_topic,512,bow_dim],dropout=0.0)
        self.device = device
        self.id2token = None
        self.taskname = taskname
        if device is not None:
            self.vae = self.vae.to(device)

    def inference(self,doc_bow):
        """Return the topic distribution of a bag-of-words document.

        Args:
            doc_bow: torch.Tensor or np.ndarray holding bow_dim counts
                (any shape that reshapes to [-1, bow_dim]).

        Returns:
            np.ndarray of shape (n_topic,) for a single document, or
            (batch, n_topic) for several documents.
        """
        if isinstance(doc_bow, np.ndarray):
            doc_bow = torch.from_numpy(doc_bow)
        # Same input normalisation as ETM.inference, plus an explicit float32
        # cast so integer / float64 count vectors are accepted.
        doc_bow = doc_bow.reshape(-1, self.bow_dim).float()
        if self.device is not None:
            doc_bow = doc_bow.to(self.device)
        with torch.no_grad():
            mu, log_var = self.vae.encode(doc_bow)
            # Only the mean is used at inference time; log_var is discarded.
            mu = self.vae.fc1(mu)
            theta = F.softmax(mu, dim=1)
        return theta.detach().cpu().squeeze(0).numpy()
        
        
class LDA:
    """Inference wrapper around a saved gensim LdaModel.

    Args:
        model_path: checkpoint to load; defaults to the NIPS LDA checkpoint
            this notebook was written against.
    """

    def __init__(self, model_path=r'D:\毕设\ML算法实现\主题模型攻击\nips_model\LDA_nips.ckpt'):
        # Raw string: same path as before, without relying on invalid escape
        # sequences such as '\M' being passed through unchanged.
        self.lda_model = gensim.models.ldamodel.LdaModel.load(model_path)

    def inference(self,doc):
        """Return a dense topic vector for a tokenised document.

        Args:
            doc: list of token strings.

        Returns:
            np.ndarray of length num_topics. Topics the model does not report
            for this document keep a floor value of 0.001, so the vector is
            not renormalised and need not sum to exactly 1.
        """
        doc_bow = self.lda_model.id2word.doc2bow(doc)
        # Size the vector from the model instead of a hard-coded 75.
        doc_tp = np.full(self.lda_model.num_topics, 0.001)
        for topic_id, prob in self.lda_model[doc_bow]:
            doc_tp[topic_id] = prob
        return doc_tp
    


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  dtype=np.int):
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  eps=np.finfo(np.float).eps,
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  eps=np.finfo(np.float).eps, copy_X=True, fit_path=True,
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  eps=np.finfo(np.float).eps, copy_X=True, fit_path=True,
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  eps=np.finfo(np.float).eps, positive=False):
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  max_n_alphas=1000, n_jobs=None, eps=np.finfo(np.float).e

In [2]:
class EVAE(VAE):
    """VAE whose decoder is built from word and topic embeddings.

    Args:
        encode_dims: encoder layer widths; the first entry is the vocabulary
            size and the last the number of topics.
        decode_dims: forwarded to VAE; the decoder it builds is discarded.
        dropout: dropout probability forwarded to VAE.
        emb_dim: size of the shared embedding space.
    """

    def __init__(self, encode_dims=[2000,1024,512,20],decode_dims=[20,1024,2000],dropout=0.0,emb_dim=300):
        super().__init__(encode_dims=encode_dims, decode_dims=decode_dims, dropout=dropout)
        self.emb_dim = emb_dim
        self.vocab_size = encode_dims[0]
        self.n_topic = encode_dims[-1]
        # rho.weight has shape [vocab_size, emb_dim] and serves as the word
        # embedding matrix; alpha maps an embedding to one score per topic.
        self.rho = nn.Linear(emb_dim, self.vocab_size)
        self.alpha = nn.Linear(emb_dim, self.n_topic)
        # The layer-wise decoder created by VAE is replaced by decode() below.
        self.decoder = None

    def decode(self,z):
        """Map topic proportions z ([batch, n_topic]) to log word probabilities."""
        word_topic_scores = self.alpha(self.rho.weight)  # [V, K]
        # Normalise over the vocabulary axis, then flip to [K, V].
        beta = F.softmax(word_topic_scores, dim=0).transpose(1, 0)
        mixture = torch.mm(z, beta)
        # 1e-6 keeps the log finite when a word gets zero probability.
        return torch.log(mixture + 1e-6)


class ETM:
    """Inference wrapper around the embedding-based VAE (EVAE).

    Args:
        bow_dim: vocabulary size (width of the bag-of-words input).
        n_topic: number of topics.
        taskname: stored as-is; not used by the code in this class.
        device: optional torch device the network is moved to.
        emb_dim: embedding size forwarded to EVAE.
    """

    def __init__(self,bow_dim=10000,n_topic=20,taskname=None,device=None,emb_dim=300):
        self.bow_dim = bow_dim
        self.n_topic = n_topic
        self.emb_dim = emb_dim
        #TBD_fc1
        self.vae = EVAE(encode_dims=[bow_dim,1024,512,n_topic],decode_dims=[n_topic,512,bow_dim],dropout=0.0,emb_dim=emb_dim)
        self.device = device
        self.id2token = None
        self.taskname = taskname
        if device is not None:
            self.vae = self.vae.to(device)

    def inference(self,doc_bow):
        """Return the topic distribution of a bag-of-words document.

        Args:
            doc_bow: torch.Tensor or np.ndarray holding bow_dim counts
                (any shape that reshapes to [-1, bow_dim]).

        Returns:
            np.ndarray of shape (n_topic,) for a single document, or
            (batch, n_topic) for several documents.
        """
        if isinstance(doc_bow, np.ndarray):
            doc_bow = torch.from_numpy(doc_bow)
        # Integer or float64 count vectors would otherwise hit a dtype
        # mismatch inside the float32 linear layers.
        doc_bow = doc_bow.reshape(-1, self.bow_dim).float()
        if self.device is not None:
            doc_bow = doc_bow.to(self.device)
        with torch.no_grad():
            mu, log_var = self.vae.encode(doc_bow)
            # Only the mean is used at inference time; log_var is discarded.
            mu = self.vae.fc1(mu)
            theta = F.softmax(mu, dim=1)
            return theta.detach().cpu().squeeze(0).numpy()

# BTM

In [3]:
class Biterm():
    """Unordered pair of word ids with an optional topic assignment.

    Build it either from two word ids (`w1`, `w2`) or from a serialised line
    `s` holding "wi wj z" separated by any whitespace, which is the format
    produced by `str()`.
    """

    wi = 0
    wj = 0
    z = 0

    def __init__(self,w1=None,w2=None,s=None):
        if w1 is not None and w2 is not None:
            # Store the pair in sorted order so (a, b) and (b, a) are equal.
            self.wi = min(w1, w2)
            self.wj = max(w1, w2)
        elif w1 is None and w2 is None and s is not None:
            # split() without arguments accepts the tabs written by str();
            # int() keeps the fields usable as indices into the topic-word
            # table. Raises ValueError if the line does not hold three ints.
            fields = s.split()
            self.wi, self.wj, self.z = (int(field) for field in fields[:3])

    def get_wi(self):
        """Return the smaller word id."""
        return self.wi

    def get_wj(self):
        """Return the larger word id."""
        return self.wj

    def get_z(self):
        """Return the topic assigned to this biterm."""
        return self.z

    def set_z(self,k):
        """Assign topic k to this biterm."""
        self.z = k

    def reset_z(self):
        """Mark the biterm as unassigned (topic -1)."""
        self.z = -1

    def str(self):
        """Serialise as 'wi<TAB>wj<TAB><TAB>z'."""
        _str = ""
        _str += str(self.wi) + '\t' + str(self.wj) + '\t\t' + str(self.z)
        return _str
class Doc():
    """A document stored as a list of vocabulary ids.

    Args:
        s: the document text, tokens separated by single spaces.
        voc: path to a vocabulary file with one word per line; a word's id is
            its zero-based line number.
    """
    ws = []

    def __init__(self,s,voc):
        self.ws = []
        with open(voc, 'r') as f:
            vocab_lines = f.read().splitlines()
        # If a word is listed twice, the later line number wins.
        self.dict = {entry: line_no for line_no, entry in enumerate(vocab_lines)}
        self.read_doc(s)

    def read_doc(self,s):
        """Append the ids of all in-vocabulary words of s to self.ws."""
        if s == '\n':
            return
        for token in s.split(' '):
            if token in self.dict:
                self.ws.append(int(self.dict[token]))

    def show(self):
        """Print the word ids and the vocabulary mapping (debug helper)."""
        print('ceshi:', self.ws)
        print('字典:', self.dict)

    def size(self):
        """Return the number of in-vocabulary words."""
        return len(self.ws)

    def get_w(self,i):
        """Return the id of the i-th word."""
        assert(i<len(self.ws))
        return self.ws[i]

    def gen_biterms(self,bs,win=15):
        """Extract biterms from the document.

        Args:
            bs: output list; one Biterm is appended per word pair.
            win: window size; word i is paired with words i+1 .. i+win-1.
        """
        n_words = len(self.ws)
        if n_words < 2:
            return
        for left in range(n_words - 1):
            window_end = min(left + win, n_words)
            for right in range(left + 1, window_end):
                bs.append(Biterm(self.ws[left], self.ws[right]))
                
class BTModel:
    """Inference wrapper around a trained Biterm Topic Model.

    Args:
        model_dir: directory prefix (including the trailing separator) holding
            the 'k<n_topic>.pw_z' and 'k<n_topic>.pz' files.
        voca_path: vocabulary file handed to Doc for word-id lookup.
        n_topic: number used in the model file names; defaults to 75, the
            value that used to be hard-coded.
    """

    def __init__(self, model_dir, voca_path, n_topic=75):
        prefix = model_dir + 'k{}'.format(n_topic)
        # zw_pt[k][w]: one row of floats per line of the .pw_z file.
        with open(prefix + '.pw_z') as f:
            self.zw_pt = [[float(p) for p in line.split()] for line in f]
        # pz[k]: floats from the first line of the .pz file.
        with open(prefix + '.pz') as f:
            self.pz = [float(p) for p in f.readline().split()]
        self.voca_path = voca_path

    @staticmethod
    def _softmax(x):
        """Numerically stable softmax over a 1-D sequence."""
        x = np.asarray(x, dtype=float)
        if x.size == 0:
            return x
        # Subtracting the maximum leaves the result unchanged but prevents
        # exp() from overflowing to inf (and the ratio from becoming NaN).
        shifted = np.exp(x - np.max(x))
        return shifted / np.sum(shifted)

    def _load_biterms(self, doc_text):
        """Return the biterms of one document given as a space-separated string."""
        biterms = []
        Doc(doc_text, self.voca_path).gen_biterms(biterms)
        return biterms

    def inference(self, test_path):
        """Return the topic distribution of a document.

        Args:
            test_path: despite the name, this is the document text itself
                (tokens separated by spaces), not a file path.

        Returns:
            np.ndarray of length K = len(self.pz).

        Raises:
            ValueError: from math.log if a required probability is 0.
        """
        bs = self._load_biterms(test_path)
        K = len(self.pz)
        zw_pt = self.zw_pt
        pz_d = [0.0] * K
        if len(bs) == 1:
            # NOTE(review): a single biterm is scored on its first word only;
            # kept as-is - confirm this is the intended single-word fallback.
            w = bs[0].get_wi()
            for k in range(K):
                pz_d[k] = math.log(self.pz[k]) + math.log(zw_pt[k][w])
        else:
            for biterm in bs:
                w1 = biterm.get_wi()
                w2 = biterm.get_wj()
                log_scores = [
                    math.log(self.pz[k]) + math.log(zw_pt[k][w1]) + math.log(zw_pt[k][w2])
                    for k in range(K)
                ]
                pz_b = self._softmax(log_scores)
                for k in range(K):
                    pz_d[k] += pz_b[k]
        # NOTE(review): the summed per-biterm posteriors are passed through a
        # softmax rather than divided by their total; behaviour kept unchanged.
        return self._softmax(pz_d)

# CTM

In [3]:
class CTModel:
    """Inference wrapper around a pickled correlated topic model."""

    def __init__(self, path):
        """Load the model stored at `path` (e.g. a 'model-50' file).

        NOTE: pickle.load executes arbitrary code from the file; only load
        model files from a trusted source.
        """
        with open(path, "rb") as f:
            self.model = pickle.load(f)

    def inference(self, text):
        """Return the topic distribution of one document.

        Args:
            text: the document as a single string.

        Returns:
            The first row of the softmax-normalised lambda values returned by
            the underlying model for this document.
        """
        log_likelihood, lambda_values, nu_square_values = self.model.inference([text])
        scores = np.asarray(lambda_values, dtype=float)
        # Shift by the maximum so exp() cannot overflow; the normalised
        # result is mathematically unchanged.
        weights = np.exp(scores - np.max(scores))
        return (weights / np.sum(weights))[0]

# PT and DMR

In [4]:
import tomotopy as tp # 载入库

class PTModel:
    """Inference wrapper around a saved tomotopy PTModel."""

    def __init__(self, path):
        """Load the model stored at `path`."""
        self.model = tp.PTModel.load(path)

    def inference(self, text):
        """Return the topic distribution of a document.

        Args:
            text: a list of tokens, or a whitespace-separated string. A plain
                string is split first; passing it straight to make_doc would
                iterate over its characters instead of its words.
        """
        if isinstance(text, str):
            text = text.split()
        doc_inst = self.model.make_doc(text)
        topic_dist, ll = self.model.infer(doc_inst)
        return topic_dist
class DMRModel:
    """Inference wrapper around a saved tomotopy DMRModel."""

    def __init__(self, path):
        """Load the model stored at `path`."""
        self.model = tp.DMRModel.load(path)

    def inference(self, text):
        """Return the topic distribution of a document.

        Args:
            text: a list of tokens, or a whitespace-separated string. A plain
                string is split first; passing it straight to make_doc would
                iterate over its characters instead of its words.
        """
        if isinstance(text, str):
            text = text.split()
        doc_inst = self.model.make_doc(text)
        topic_dist, ll = self.model.infer(doc_inst)
        return topic_dist

# 同义词

In [6]:
import ast

import pandas as pd

# Synonym table: one row per word; the list columns are stored as Python
# literals in the CSV.
we = pd.read_csv(r"D:\毕设\ML算法实现\主题模型攻击\pre_processing.csv")

# ast.literal_eval only parses literals, so CSV content cannot run code. It also
# keeps this cell re-runnable: the evaluation cell later defines its own
# function named `eval`, which would shadow the builtin used here before.
we['similar_words'] = we['similar_words'].map(ast.literal_eval)


def evals(x):
    """Parse one 'similar_mixed' cell; the "<class 'list'>" placeholder means an empty list."""
    if x == '<class \'list\'>':
        return []
    return ast.literal_eval(x)


we['similar_mixed'] = we['similar_mixed'].map(evals)

# Index dictionary: word -> row position in `we`.
word2id = {w: row for row, w in enumerate(we['word'])}

In [7]:
def to_bow(data, min_length):
    """Convert index lists to bag-of-words count vectors.

    Args:
        data: iterable of numpy arrays of word indices; None entries are
            treated as padding and removed.
        min_length: minimum length of each count vector.

    Returns:
        np.ndarray with one count vector per document. Documents whose
        remaining indices sum to 0 are dropped.
    """
    padding = np.array(None)
    rows = []
    for x in data:
        kept = x[x != padding]
        if np.sum(kept) != 0:
            rows.append(np.bincount(kept.astype('int'), minlength=min_length))
    return np.array(rows)

def Tokenizer(sent,stopwords=None):
    """Tokenise English text.

    Splits on single spaces, strips digits and punctuation from each piece,
    drops pieces shorter than two characters, lower-cases the rest and, when
    `stopwords` is given, removes every token contained in it.
    """
    pat = re.compile(r'[0-9!"#$%&\'()*+,-./:;<=>?@—，。：★、￥…【】（）《》？“”‘’！\[\\\]^_`{|}~\u3000]+')
    tokens = []
    for piece in sent.split(' '):
        cleaned = pat.sub(r'', piece).strip()
        if len(cleaned) <= 1:
            continue
        cleaned = cleaned.lower()
        if stopwords is not None and cleaned in stopwords:
            continue
        tokens.append(cleaned)
    return tokens

def word_embedding(dictionary,vim_txt,stopwords=None):
    """Read a text file and encode it against `dictionary`.

    Args:
        dictionary: object exposing `token2id` (token -> id) and `len()`.
        vim_txt: path of a UTF-8 text file.
        stopwords: optional collection of tokens to drop during tokenisation.

    Returns:
        (ori_bow, ori_index, new_token, n_tokens):
        ori_bow   - torch tensor of shape [1, len(dictionary)] with word counts;
        ori_index - ids of the in-vocabulary tokens, in document order;
        new_token - the in-vocabulary tokens, aligned with ori_index;
        n_tokens  - number of tokens before the vocabulary filter.
    """
    with open(vim_txt,'r',encoding='utf-8') as f:
        txt = f.read()
    ori_token = Tokenizer(txt,stopwords)
    ori_bow = torch.zeros(1,len(dictionary))
    ori_index = []
    new_token = []
    token2id = dictionary.token2id
    for token in ori_token:
        # Explicit out-of-vocabulary check instead of a bare `except:`, which
        # hid every other error and could leave ori_index and new_token out
        # of sync.
        ori_idx = token2id.get(token.lower())
        if ori_idx is None:
            continue
        ori_bow[0][ori_idx] += 1
        ori_index.append(ori_idx)
        new_token.append(token)
    return ori_bow,ori_index,new_token,len(ori_token)

def topk_import_words(ori_index,ori_bow,ori_token,model,dictionary,ori_result,model_name,i=0):
    """Score every word by how much deleting it shifts the topic distribution.

    Args:
        ori_index: word ids of the document (used for bag-of-words models).
        ori_bow: count tensor of shape [batch, vocab] (bag-of-words models).
        ori_token: token list (used for the token- and text-based models).
        model: object with an `inference` method.
        dictionary: maps a word id to its token via `dictionary[idx]`.
        ori_result: topic distribution of the unmodified document.
        model_name: 'LDA'/'PT'/'DMR' receive a token list, 'CTM'/'BTM' a
            space-joined string, anything else a bag-of-words tensor.
        i: row of `ori_bow` holding the document.

    Returns:
        dict mapping word -> KL(distribution without the word || ori_result).
    """
    diff_re = {}  # word -> KL divergence after deleting that word
    if model_name in ('LDA', 'PT', 'DMR'):
        for word in set(ori_token):
            temp = copy.deepcopy(ori_token)
            temp.remove(word)  # drops the first occurrence only
            reduced_result = model.inference(temp)
            try:
                diff_re[word] = scipy.stats.entropy(reduced_result, ori_result)
            except Exception:
                # Best effort: report the failing case (typically a length
                # mismatch) and go on. `Exception` rather than a bare except,
                # so Ctrl-C still interrupts a long run.
                print(len(reduced_result), len(ori_result))
                print(temp)

    elif model_name in ('CTM', 'BTM'):
        for word in set(ori_token):
            temp = copy.deepcopy(ori_token)
            temp.remove(word)
            reduced_result = model.inference(" ".join(temp))
            diff_re[word] = scipy.stats.entropy(reduced_result, ori_result)

    else:
        # A repeated index zeroes the same count and yields the same score,
        # so each distinct id is evaluated once (first-seen order kept).
        for idx in dict.fromkeys(ori_index):
            tmp_bow = ori_bow.clone()
            tmp_bow[i][idx] = 0
            tmp_result = model.inference(tmp_bow)
            diff_re[dictionary[idx]] = scipy.stats.entropy(tmp_result, ori_result)

    return diff_re

def softmax(kl):
    """Rescale the values of `kl` in place so they sum to 1.

    Despite the name this is a plain sum-normalisation (no exponentiation).
    The same dict object is returned.
    """
    total = sum(kl.values())
    for key in list(kl):
        kl[key] = kl[key] / total
    return kl

def similar_word(t_word):
    """Return the synonym list stored for `t_word` in the global table `we`.

    Raises KeyError when the word has no entry in `word2id`.
    """
    row = word2id[t_word]
    # One-row slice of the column, then pick that row by its label.
    one_row = we['similar_words'][row:row + 1]
    return one_row[row]

def rep_sim_word(ori_bow,ori_token,sim_word,t_word,dictionary,model_name,model,i=0):
    """Return the KL divergence caused by replacing `t_word` with `sim_word`.

    The inputs are copied first, so `ori_bow` and `ori_token` are left
    untouched. 'LDA'/'PT'/'DMR' models receive a token list, 'CTM'/'BTM' a
    space-joined string and every other model a bag-of-words tensor.

    Returns:
        KL(distribution after replacement || distribution before).
    """
    token = copy.deepcopy(ori_token)
    bow = ori_bow.clone()

    if model_name in ("LDA", "PT", "DMR"):
        before = model.inference(token)
        # Only the first occurrence of the target word is swapped.
        token[token.index(t_word)] = sim_word
        after = model.inference(token)
        return scipy.stats.entropy(after, before)

    if model_name in ('CTM', 'BTM'):
        before = model.inference(" ".join(token))
        token[token.index(t_word)] = sim_word
        after = model.inference(" ".join(token))
        return scipy.stats.entropy(after, before)

    before = model.inference(bow)
    src_id = dictionary.token2id[t_word]
    dst_id = dictionary.token2id[sim_word]
    # Move the whole count of the target word onto the synonym. The add must
    # happen before the count is zeroed.
    moved = bow[i][src_id]
    bow[i][dst_id] += moved
    bow[i][src_id] = 0
    after = model.inference(bow)
    return scipy.stats.entropy(after, before)

def sim_word_kl(sim_word_list,ori_bow,ori_token,threshold,weight,i=0):
    """Rank candidate synonyms by the weighted KL shift they cause.

    Reads these notebook globals: `dictionary`, `word` (the word being
    replaced), `model_name`, `model_set` and `model_num`.

    Args:
        sim_word_list: candidate entries; the synonym is element 0 of each.
        ori_bow: bag-of-words tensor of the document.
        ori_token: token list of the document.
        threshold: per-model divisor applied to the KL value.
        weight: per-model multiplier applied to the KL value.
        i: row of `ori_bow` holding the document (now forwarded to
            rep_sim_word; it used to be ignored in favour of a constant 0).

    Returns:
        List of (synonym, score) pairs sorted by descending score; synonyms
        missing from the dictionary are skipped.
    """
    sim_kl = {}
    for candidate in sim_word_list:
        sim_word = candidate[0]
        # Explicit vocabulary check instead of a bare `except:` around a lookup.
        if sim_word not in dictionary.token2id:
            continue

        score = 0  # accumulated KL shift over all attacked models
        for m in range(model_num):
            shift = rep_sim_word(ori_bow,ori_token,sim_word,word,dictionary,model_name[m],model_set[m],i=i)
            score += shift/threshold[m] * weight[m]
        sim_kl[sim_word] = score
    return sorted(sim_kl.items(),key = lambda item:item[1],reverse = True)

def generate_vim(ori_txt,vim_txt,pair):
    """Write a copy of `ori_txt` with the substitutions in `pair` applied.

    Args:
        ori_txt: path of the source text (read as UTF-8, like word_embedding).
        vim_txt: path of the output file (written as UTF-8).
        pair: dict mapping a word to its replacement. Substitutions are
            applied one after another in dict order, so a later pair can
            rewrite the output of an earlier one.

    Returns:
        0
    """
    with open(ori_txt,'r',encoding='utf-8') as f:
        tokens = Tokenizer(f.read())
    for src, dst in pair.items():
        # One pass per pair. The former `while src in tokens` loop never
        # terminated when a word was mapped to itself.
        tokens = [dst if tok == src else tok for tok in tokens]
    with open(vim_txt,'w',encoding='utf-8') as f:
        f.write(" ".join(tokens))
    return 0

In [8]:
# Directory holding the NIPS dictionary and all trained checkpoints. Raw strings
# give the same paths as before without relying on invalid escape sequences
# (e.g. '\W', '\G') being passed through unchanged.
MODEL_DIR = r'D:\毕设\ML算法实现\主题模型攻击\nips_model'
BOW_DIM = 16157   # vocabulary size the checkpoints were trained with
N_TOPIC = 75      # number of topics of every model


def _load_cpu_state(path):
    """Load a saved state_dict onto the CPU.

    NOTE: torch.load unpickles the file; only load trusted checkpoints.
    """
    return torch.load(path, map_location=torch.device('cpu'))


dictionary = Dictionary.load(MODEL_DIR + r'\nips_16157.dict')

WTM_D_path = MODEL_DIR + r'\WTM_D.pickle'
WTM_parameters = _load_cpu_state(WTM_D_path)
WTM_D_model = WTM(bow_dim=BOW_DIM, n_topic=N_TOPIC)
WTM_D_model.wae.load_state_dict(WTM_parameters)

WTM_G_path = MODEL_DIR + r'\WTM_G.pickle'
WTM_parameters = _load_cpu_state(WTM_G_path)
WTM_G_model = WTM(bow_dim=BOW_DIM, n_topic=N_TOPIC)
WTM_G_model.wae.load_state_dict(WTM_parameters)

GSM_path = MODEL_DIR + r'\GSM_nips.pickle'
GSM_parameters = _load_cpu_state(GSM_path)
GSM_model = GSM(bow_dim=BOW_DIM, n_topic=N_TOPIC)
GSM_model.vae.load_state_dict(GSM_parameters)

LDA_model = LDA()

ETM_path = MODEL_DIR + r'\ETM_nips.pickle'
ETM_parameters = _load_cpu_state(ETM_path)
ETM_model = ETM(bow_dim=BOW_DIM, n_topic=N_TOPIC)
ETM_model.vae.load_state_dict(ETM_parameters)

DMR_model = DMRModel('NIPS_DMRModel.bin')
PT_model = PTModel('NIPS_PTModel.bin')

CTM_model = CTModel(MODEL_DIR + r'\ctm\nips_model-50')
# NOTE(review): BTM is not loaded here, yet a later cell calls BTM_model.inference;
# re-enable the next line (with valid paths) before running that cell.
#BTM_model = BTModel('model/', 'new_voca.txt')

In [11]:
# Synonym-substitution attack: for each of the 50 NIPS documents, replace a
# random `topk` fraction of its tokens by the synonym that shifts the attacked
# model's topic distribution the most, then write the perturbed text to disk.
topk = 0.05          # fraction of a document's tokens to substitute
a_ = ["GSM"]         # names of the models to attack
b_ = [GSM_model]     # model objects, aligned with a_
NIPS_DIR = r'D:\毕设\ML算法实现\主题模型攻击\nips_model'

for aa in range(len(a_)):
    for i in range(50):
        print("处理第{}个文本".format(i))
        # model_name / model_set / model_num (and `word` below) are read as
        # globals by sim_word_kl, so they must keep these exact names.
        model_name = [a_[aa]]
        model_set = [b_[aa]]
        model_num = len(model_name)
        vim_txt = NIPS_DIR + '\\nips_' + str(i) + '.txt'
        new_vim_txt = NIPS_DIR + '\\nips_based\\' + model_name[0] + "based2_" + str(topk) + "_" + str(i) + '.txt'

        ori_bow,ori_index,ori_token,length = word_embedding(dictionary,vim_txt,stopwords = stopwords)

        # Topic distribution of the unmodified document for every attacked
        # model (not used further below; kept for inspection).
        r_set = []
        for j in range(model_num):
            if model_name[j] in ("LDA", "PT", "DMR"):
                r = model_set[j].inference(ori_token)
            elif model_name[j] in ('CTM', 'BTM'):
                r = model_set[j].inference(" ".join(ori_token))
            else:
                r = model_set[j].inference(ori_bow)
            r_set.append(r)

        threshold = np.array([1.57])   # per-model divisor applied to the KL shift
        weight = np.ones(model_num)    # per-model weight of the KL shift
        pair = {}                      # original word -> chosen replacement
        rand_set = set()               # token positions already tried
        budget = int(len(ori_token) * topk)

        while len(pair) <= budget:
            # Stop once every position has been tried. Without this guard the
            # loop never ends for documents with too few replaceable words.
            if len(rand_set) >= len(ori_token):
                break
            rand_int = np.random.randint(len(ori_token))
            if rand_int in rand_set:
                continue
            rand_set.add(rand_int)

            word = ori_token[rand_int]
            try:
                sim_word_list = similar_word(word)
            except KeyError:
                # The word has no entry in the synonym table.
                continue

            sim_kl = sim_word_kl(sim_word_list,ori_bow,ori_token,threshold,weight)
            if len(sim_kl) == 0:
                continue
            best_sim_word = sim_kl[0][0]
            pair[word] = best_sim_word

            # Apply the substitution so later candidates are scored against
            # the already-perturbed document.
            tmp = ori_bow[0][dictionary.token2id[word]]
            ori_bow[0][dictionary.token2id[best_sim_word]] += tmp
            ori_bow[0][dictionary.token2id[word]] = 0
            ori_token[ori_token.index(word)] = best_sim_word

        generate_vim(vim_txt,new_vim_txt,pair)

处理第0个文本
处理第1个文本
处理第2个文本
处理第3个文本
处理第4个文本
处理第5个文本
处理第6个文本
处理第7个文本
处理第8个文本
处理第9个文本
处理第10个文本
处理第11个文本
处理第12个文本
处理第13个文本
处理第14个文本
处理第15个文本
处理第16个文本
处理第17个文本
处理第18个文本
处理第19个文本
处理第20个文本
处理第21个文本
处理第22个文本
处理第23个文本
处理第24个文本
处理第25个文本
处理第26个文本
处理第27个文本
处理第28个文本
处理第29个文本
处理第30个文本
处理第31个文本
处理第32个文本
处理第33个文本
处理第34个文本
处理第35个文本
处理第36个文本
处理第37个文本
处理第38个文本
处理第39个文本
处理第40个文本
处理第41个文本
处理第42个文本
处理第43个文本
处理第44个文本
处理第45个文本
处理第46个文本
处理第47个文本
处理第48个文本
处理第49个文本


In [9]:
# Show the substitutions (original word -> replacement) chosen for the last processed document.
pair

{'devising': 'assembling',
 'foreign': 'macroeconomic',
 'blind': 'myopic',
 'intervals': 'alternations',
 'redundancy': 'obviates',
 'week': 'season',
 'active': 'dormant',
 'minimizing': 'lowering',
 'colorado': 'mountain',
 'advised': 'requested',
 'drift': 'velocity',
 'truncating': 'truncated',
 'cost': 'infeasibility',
 'core': 'conforms',
 'separated': 'separating',
 'scaling': 'resizing',
 'memorizing': 'annotating',
 'equivalent': 'translates',
 'depicts': 'revolves',
 'coefficients': 'subspaces',
 'mutually': 'socially',
 'search': 'returns',
 'days': 'occasions',
 'feature': 'preview',
 'features': 'screens',
 'stream': 'flow',
 'incurs': 'forbids',
 'suppose': 'supposes',
 'collect': 'money',
 'casting': 'annealing',
 'obtaining': 'earning',
 'exponentially': 'drastically',
 'generalized': 'theorems',
 'pick': 'selects',
 'complexity': 'indeterminacy',
 'exponential': 'factorial',
 'pools': 'buckets',
 'achieving': 'surpassing',
 'reducing': 'mitigating',
 'reduce': 'undesi

# 评估

In [12]:
def eval(ori_txt,vim_txt,model,model_name):
    """Measure how far an adversarial text moves each model's topic distribution.

    NOTE(review): this function shadows the builtin `eval` for the rest of the
    notebook session.

    Args:
        ori_txt: path of the original document.
        vim_txt: path of the perturbed document.
        model: list of model objects, e.g. [LDA_model, GSM_model].
        model_name: names aligned with `model`; 'LDA'/'PT'/'DMR' receive a
            token list, 'CTM'/'BTM' a space-joined string, every other model
            a bag-of-words tensor.

    Returns:
        List with KL(original distribution || perturbed distribution) per model.
    """
    ori_bow, _, ori_token, _ = word_embedding(dictionary, ori_txt)
    vim_bow, _, vim_token, _ = word_embedding(dictionary, vim_txt)

    token_models = ("LDA", "PT", "DMR")
    text_models = ('CTM', 'BTM')

    kl = []
    for idx in range(len(model)):
        name, current = model_name[idx], model[idx]
        if name in token_models:
            ori_input, vim_input = ori_token, vim_token
        elif name in text_models:
            ori_input, vim_input = " ".join(ori_token), " ".join(vim_token)
        else:
            ori_input, vim_input = ori_bow, vim_bow
        ori_dist = current.inference(ori_input)
        vim_dist = current.inference(vim_input)
        kl.append(scipy.stats.entropy(ori_dist, vim_dist))
    return kl
        

In [13]:
# Spot check: BTM topic distribution of the text in "BTMtrans_0.05_0.txt".
# NOTE(review): BTM_model is only created by a line that is commented out in the
# model-loading cell, so this cell raises NameError on a fresh kernel - confirm
# how BTM_model was defined when this output was produced.
ori_bow,ori_index,ori_token,length = word_embedding(dictionary,"BTMtrans_0.05_0.txt",stopwords = stopwords)
tmp = " ".join(ori_token)
r = BTM_model.inference(tmp)
r

array([1.00000000e+00, 1.31948030e-30, 1.39091559e-28, 4.39655862e-26,
       1.34543214e-28, 1.05980160e-29, 2.60137912e-30, 6.83098688e-30,
       2.03647299e-30, 4.06219087e-28, 1.89323505e-29, 2.07297345e-26,
       2.12961961e-29, 3.16446104e-30, 1.53673980e-29, 1.87752610e-30,
       3.89627685e-28, 2.77583626e-30, 5.03918221e-29, 2.53041710e-24,
       2.85976020e-29, 3.56920866e-29, 1.82790486e-29, 6.91914232e-29,
       9.59938694e-29, 5.55866337e-25, 3.19525796e-29, 7.79080458e-29,
       2.31229581e-28, 3.67571827e-30, 9.29179137e-29, 3.86517571e-29,
       5.24011127e-28, 1.74567741e-28, 1.03388552e-27, 1.91194299e-30,
       7.72147667e-30, 8.77749298e-28, 2.26624172e-29, 1.38473749e-30,
       2.08135380e-29, 9.06694949e-29, 1.03153155e-30, 6.06971783e-28,
       3.04144989e-28, 2.56566095e-30, 6.62209126e-30, 5.79122597e-30,
       7.97196398e-28, 2.15451891e-30, 1.38263380e-30, 3.26161474e-30,
       8.52770626e-30, 2.44950062e-27, 1.10284960e-28, 9.86128804e-27,
      

In [36]:
# Transferability check: adversarial texts crafted against one model are scored
# on every model in eval_model.
eval_model_name = ["LDA","WTM_G","GSM","WTM_D","ETM","PT","DMR"]
eval_model = [LDA_model,WTM_G_model,GSM_model,WTM_D_model,ETM_model,PT_model,DMR_model]
# Alternative setup for the text-based models:
#   eval_model_name = ["CTM","BTM"]; eval_model = [CTM_model,BTM_model]
topk = 0.05
# Model whose adversarial files are evaluated (index 5 = "PT" in the list above).
attack_source = eval_model_name[5]
NIPS_DIR = r'D:\毕设\ML算法实现\主题模型攻击\nips_model'

kl = []
for i in range(50):
    if i == 79:
        # Document 79 is skipped; this has no effect while the range stops at 50.
        continue
    ori_txt = NIPS_DIR + '\\nips_' + str(i) + '.txt'
    vim_txt = NIPS_DIR + '\\nips_based\\' + attack_source + "based2_" + str(topk) + "_" + str(i) + '.txt'
    tmp = eval(ori_txt,vim_txt,eval_model,eval_model_name)
    kl.append(tmp)
# Rows are documents, columns follow eval_model_name.
kl = np.array(kl)

In [37]:
# Median KL divergence per evaluated model (one column of `kl` per model).
for col, name in enumerate(eval_model_name):
    print("{}:{}".format(name, np.median(kl[:, col])))


LDA:0.5266752925776905
WTM_G:0.4973141551017761
GSM:0.4097999185323715
WTM_D:0.6294758319854736
ETM:0.5813735127449036
PT:0.06602989509701729
DMR:0.6373244524002075


|NIPSbased2迁移性 | LDA | WTM-G | GSM | WTM-D | AVITM |ETM |CTM | BTM | PTM |DMR|ATI
| ---- | ---- | ---- | ---- | ---- | ---- | ---- | ---- | ---- | ---- | ----|----
|LDA | 0.8 | 0.78 | 0.52 | 0.74 | 0.56| 0.63|0.15|-|0.06|0.8|0.56
|WTM-G | 0.45 | 0.63 | 0.44 | 0.54 | 0.51|  0.38|0.17|-|0.06|0.54|0.4
|GSM | 0.48 | 0.61 | 0.62 | 0.73| 0.61| 0.44|0.16|-|0.05|0.64|0.48
| WTM-D | 0.48| 0.58 | 0.46 | 0.85 | 0.61| 0.41|0.17|-|0.06|0.5|0.45
| AVITM | 0.46 | 0.51| 0.42 | 0.61 | 0.8 |0.42|0.12|-|0.06|0.5|0.43
| ETM | 0.52 | 0.49 | 0.4 | 0.62 | 0.62| 0.59|0.17|-|0.06|0.63|0.45
| CTM | 0.53|0.59 |0.32|0.55|0.58|0.38|0.14|-|0.04|0.56|0.41
| BTM | | | 
| PTM |0.69| 0.76| 0.46|0.84|0.61|0.53|0.15|-|0.1|0.7|0.53
| DMR |0.56|0.63 |0.41|0.71|0.62|0.42|0.15|-|0.06|0.61|0.46
|AT| 0.55 |0.62|0.45|0.69|0.61|0.47|0.15|-|0.06|0.6

|NIPSbased迁移性 | LDA | WTM-G | GSM | WTM-D | AVITM |ETM |CTM | BTM | PTM |DMR
| ---- | ---- | ---- | ---- | ---- | ---- | ---- | ---- | ---- | ---- | ----
|LDA | 0.61 | 0.63 | 0.44 | 0.72 | -| 0.53|0.15|-|0.05|0.61
|WTM-G | 0.6 | 0.69 | 0.43 | 0.55 | -|  0.49|0.17|-|0.05|0.7
|GSM | 0.56 | 0.62 | 0.45 | 0.73| -| 0.54|0.16|-|0.05|0.64
| WTM-D | 0.61| 0.62 | 0.46 | 0.67 | -| 0.54|0.17|-|0.06|0.7
| AVITM | - | - | - | - | - |-
| ETM | 0.59 | 0.72 | 0.38 | 0.84 | -| 0.59|0.17|-|0.05|0.73
| CTM | 0.53|0.59 |0.32|0.55|-|0.38|0.14|-|0.04|0.56
| BTM | | | 
| PTM |0.59| 0.56| 0.4|0.78|-|0.61|0.15|-|0.05|0.6
| DMR |0.6|0.7 |0.41|0.71|-|0.5|0.15|-|0.05|0.6

 |迁移性 | LDA | WTM-G | GSM | WTM-D | AVITM |ETM |CTM | BTM | PTM |DMR | 迁移强度
| ---- | ---- | ---- | ---- | ---- | ---- | ---- | ---- | ---- | ---- | ---- | ----
|LDA | 0.41 | 0.015 | 0.005 | 0.017 | 0.87| 0 |  0.01| 0 | 0.09 | 0.36 |0.17
|WTM-G | 0.582 | 0.47 | 0.1 | 0.266 | 0.88|  0.12 | 0.1 | 0 | 0.18 | 0.6 | 0.32
|GSM | 0.52 | 0.24 | 0.26 | 0.23 | 0.879| 0.24 | 0.09| 0 |   0.18 | 0.54 | 0.31
| WTM-D | 0.58| 0.28 | 0.08 | 0.5 | 0.88| 0.08 | 0.09 | 0 |  0.19 | 0.55 | 0.32
| AVITM | 0.24 | 0.02 | 0.008 | 0.02 | 0.31 |0.008 | 0.1 |0  | 0.11 | 0.3 | 0.11
| ETM | 0.5 | 0.2 | 0.16 | 0.2 | 0.8| 0.67 | 0.09 |0 |  0.17 | 0.56 | 0.33
| CTM | 0.26|0.015 | 0.006 | 0.015 | 0.87 | 0.006 |0.03 | 0 | 0.09 | 0.35 |0.16
| BTM | 0.26| 0.02 | 0.006 | 0.018 |0.89 |0.09 | 0.015 | 0 | 0.1 | 0.36 | 0.17
| PTM | 0.246| 0.02|0.007|0.02| 0.87| 0.006 | 0.01|0 | 0.15 |0.37 | 0.16
| DMR |0.226 |0.02 |0.007 | 0.02 |   0.89|  0.008 | 0.01 | 0| 0.12 | 0.41 |0.17
|平均迁移性(阈值) | 0.38 | 0.13 | 0.06 | 0.13 | 0.8 | 0.12 | 0.05 | 0 | 0.13 | 0.44

In [33]:
# Print the positions where the original and the perturbed document map to
# different vocabulary ids.
ori_txt = "ori_1.txt"
vim_txt = "AVITMtrans_0.05_1.txt"
ori_bow,ori_index,ori_token,o_length = word_embedding(dictionary,ori_txt)
vim_bow,vim_index,vim_token,v_length = word_embedding(dictionary,vim_txt)
if len(ori_index) != len(vim_index):
    # The two texts can keep a different number of in-vocabulary tokens;
    # indexing both by position used to raise IndexError in that case.
    print("length mismatch:", len(ori_index), len(vim_index))
for ori_id, vim_id in zip(ori_index, vim_index):
    if ori_id != vim_id:
        print(ori_id, vim_id)