In [1]:
import torch
import torchtext
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.data as Data
from torch.utils.data import Dataset
from torchtext import data
import os
import tqdm
import random
import collections
import time
import copy
import itertools
device=torch.device("cuda:1")
N=200
random.seed(7)
torch.manual_seed(7)
torch.cuda.manual_seed_all(7)

class WMDataset(Dataset):
    def __init__(self,N):
        self.N=N
        sentences=[]
        for i in range(2*N):
            sentence=[]
            for j in range(500):
                w=int(random.uniform(10000,20000))
                sentence.append(w)
            sentences.append(torch.tensor(sentence))
        self.sentences=sentences
    def __getitem__(self,index):
        label=int(index%2)
        s=self.sentences[index]
        return s,label
    def __len__(self):
        return len(self.sentences)

def read_imdb(folder,data_root):
    data=[]
    for label in ["pos","neg"]:
        folder_name=os.path.join(data_root,folder,label)
        for file in os.listdir(folder_name):
            with open(os.path.join(folder_name,file),"rb") as f:
                review=f.read().decode("utf-8").replace("\n","").lower()
                data.append([review,1 if label=="pos" else 0])
    random.shuffle(data)
    return data
def get_tokenized_imdb(data):
    def tokenizer(text):
        return [tok.lower() for tok in text.split(" ")]
    return [tokenizer(review) for review,_ in data]
def get_vocab_imdb(data):
    tokenized_data=get_tokenized_imdb(data)
    counter=collections.Counter([tk for st in tokenized_data for tk in st])
    return torchtext.vocab.Vocab(counter,min_freq=5)

In [2]:
data_root="./.data/imdb/modify/aclImdb"
train_data,test_data=read_imdb("train",data_root),read_imdb("test",data_root)

for sample in train_data[:5]:
    print(sample[1],"\t",sample[0][:50])


vocab=get_vocab_imdb(train_data)
print(len(vocab))

1 	 an american in paris is a showcase of gene kelly. 
0 	 this thing is horrible. the ben affleck character 
1 	 in france, it's considered polite from french crit
0 	 this was a weird movie. it started out pretty good
1 	 along with south pacific, guys and dolls is for gr
46152


In [7]:
import numpy as np
i=0
def preprocess_imdb(data,vocab):
    max_l=500
    def pad(x):
        return x[:max_l] if len(x)>max_l else x+[0]*(max_l-len(x))
    tokenized_data=get_tokenized_imdb(data)
    
    # features=torch.tensor([pad([vocab.stoi[word] for word in words]) for words in tokenized_data])
    features=tokenized_data
    labels=torch.tensor([score for _,score in data])
    return features,labels

train_set=preprocess_imdb(train_data,vocab)
test_set=preprocess_imdb(test_data,vocab)
print(train_set[0])



In [2]:
import os
import re

def check_flie(filePath):
	# 统计删除总数
    count = 0
    # 删除已经生成的data_*.js 答卷js文件
    for file_path, empty_list, file_name_list in os.walk(filePath):
        # file_name_list该列表是存放目标目录中所有文件名
        # print(file_path, empty_list, file_name_list)
        print(len(file_name_list))
        for file_name in file_name_list:
            # 正则匹配需要删除的文件--根据需求修改正则表达式
            if re.match(r'\d*posmod.txt', file_name):
                # 删除匹配到的文件
                os.remove(file_path + file_name)
                print(file_path + file_name)
                # 每删除一个文件＋1
                count += 1
    print(count)
if __name__ == '__main__':
    # 目标文件路径--根据需求变更
    filePath = './.data/imdb/modify/train/pos/'
    check_flie(filePath)
    # filePath = './.data/imdb/modify/train/neg/'
    # check_flie(filePath)
    # filePath = './Wihte-IMDB/.data/imdb/modify/train_back/pos'
    # check_flie(filePath)


14000
./.data/imdb/modify/train/pos/0posmod.txt
./.data/imdb/modify/train/pos/1000posmod.txt
./.data/imdb/modify/train/pos/1001posmod.txt
./.data/imdb/modify/train/pos/1002posmod.txt
./.data/imdb/modify/train/pos/1003posmod.txt
./.data/imdb/modify/train/pos/1004posmod.txt
./.data/imdb/modify/train/pos/1005posmod.txt
./.data/imdb/modify/train/pos/1006posmod.txt
./.data/imdb/modify/train/pos/1007posmod.txt
./.data/imdb/modify/train/pos/1008posmod.txt
./.data/imdb/modify/train/pos/1009posmod.txt
./.data/imdb/modify/train/pos/100posmod.txt
./.data/imdb/modify/train/pos/1010posmod.txt
./.data/imdb/modify/train/pos/1011posmod.txt
./.data/imdb/modify/train/pos/1012posmod.txt
./.data/imdb/modify/train/pos/1013posmod.txt
./.data/imdb/modify/train/pos/1014posmod.txt
./.data/imdb/modify/train/pos/1015posmod.txt
./.data/imdb/modify/train/pos/1016posmod.txt
./.data/imdb/modify/train/pos/1017posmod.txt
./.data/imdb/modify/train/pos/1018posmod.txt
./.data/imdb/modify/train/pos/1019posmod.txt
./.data/

In [1]:
#用户特殊词生成

import rsa

word1='Alice-'+'20220503-'+'12-'+'pos'
word2='Alice-'+'20220503-'+'12-'+'neg'

def getspecial_word(n,word):
    import hashlib
    import random
    import re
    SALT = b'2erer3asdfwerxdf34sdfsdfs90'

    res=[]
    for i in range(n):
        md5 = hashlib.md5(SALT)
        md5.update(bytes(word, encoding='utf-8')) 
        data1 = md5.hexdigest() 
        print(data1)
        print(re.findall(r'[a-z]', data1))    
        tmp=''   
        for i in range(10):
            tmp=tmp+(re.findall(r'[a-z]', data1))[i]
        res.append(tmp)
        print(tmp)
        word=tmp
    print( res)
    return res
words=getspecial_word(5,word1)


12f6e5abf7d221be0b391d043eebd769
['f', 'e', 'a', 'b', 'f', 'd', 'b', 'e', 'b', 'd', 'e', 'e', 'b', 'd']
feabfdbebd
def1fc89a9e38172074bfe05d701c7cf
['d', 'e', 'f', 'f', 'c', 'a', 'e', 'b', 'f', 'e', 'd', 'c', 'c', 'f']
deffcaebfe
e1d43c57c7194ec17e90695d6b6616c6
['e', 'd', 'c', 'c', 'e', 'c', 'e', 'd', 'b', 'c']
edccecedbc
ab3cbe8653b3a92eda15a8af303d42b6
['a', 'b', 'c', 'b', 'e', 'b', 'a', 'e', 'd', 'a', 'a', 'a', 'f', 'd', 'b']
abcbebaeda
e5c6c8dcf704d782c06d95bba4cd157b
['e', 'c', 'c', 'd', 'c', 'f', 'd', 'c', 'd', 'b', 'b', 'a', 'c', 'd', 'b']
eccdcfdcdb
['feabfdbebd', 'deffcaebfe', 'edccecedbc', 'abcbebaeda', 'eccdcfdcdb']


In [None]:
import random
#将生成词插入词表,实际插入为选中的txt文件，在使用时还需要重新生成pt文件
def insertvec(vocab_path,wordlist):
    with open(vocab_path,'a') as f:
        for each in wordlist:
            vec=''
            for i in range(100):
                vec=vec+' '+str(random.uniform(-1,1))
            print(each+' '+vec)
            f.write(each+' '+vec+'\n')   
    f.closed
# insertvec("./vector_cache/glove.6B.100d.txt",words)
insertvec("./.vector_cache/test.txt",words)