In [1]:
import torch
from torchtext import data

SEED = 1234

torch.manual_seed(SEED)  # 为CPU设置随机种子
torch.cuda.manual_seed(SEED)  #为GPU设置随机种子
# 在程序刚开始加这条语句可以提升一点训练速度，没什么额外开销
torch.backends.cudnn.deterministic = True

# 首先，我们要创建两个Field 对象：这两个对象包含了我们打算如何预处理文本数据的信息。
# spaCy:英语分词器,类似于NLTK库，如果没有传递tokenize参数，则默认只是在空格上拆分字符串。
# torchtext.data.Field : 用来定义字段的处理方法（文本字段，标签字段）
TEXT = data.Field(tokenize='spacy',tokenizer_language='en_core_web_sm')
#LabelField是Field类的一个特殊子集，专门用于处理标签。 
LABEL = data.LabelField(dtype=torch.float)

# 加载IMDB电影评论数据集
from torchtext import datasets
dataset_imdb=datasets.IMDB(".\.data\imdb/aclImdb/train_back",TEXT, LABEL)
train_data, test_data = dataset_imdb.splits(TEXT, LABEL)
# 查看数据集
print(vars(train_data.examples[0]))


pos


FileNotFoundError: [WinError 3] 系统找不到指定的路径。: 'E:\\Jupyter\\.data\\imdb/aclImdb/train_back\\pos'

In [2]:
# print(vars(train_data.examples[12503]))
print(len(train_data))
print(train_data.examples[0])

25000
<torchtext.data.example.Example object at 0x000001C5C8A74088>


In [3]:
words_list = list()
for i in range(len(train_data)):
    words_list.append(vars(train_data.examples[i])['text'])

from collections import Counter
count_list = list()
for i in range(len(words_list)):
    count = Counter(words_list[i])
    # del count[","]
    # del count["."]
    # del count["("]
    # del count[")"]
    # del count["+"]
    # del count["-"]
    # del count["*"]
    # del count["/"]
    # del count["\\"]
    # del count["\""]
    # del count["'"]
    # del count["!"]
    count_list.append(count)


import math
def tf(word, count):
    return count[word] / sum(count.values())


def idf(word, count_list):
    n_contain = sum([1 for count in count_list if word in count])
    return math.log(len(count_list) / (1 + n_contain))


def tf_idf(word, count, count_list):
    return tf(word, count) * idf(word, count_list)

In [4]:
import re
def isSymbol(inputString):
    return bool(re.match(r'[^\w]', inputString))
def hasNumbers(inputString):
    return bool(re.search(r'\d', inputString))
from nltk.stem import WordNetLemmatizer
wordnet_lemmatizer = WordNetLemmatizer()
from nltk.corpus import stopwords
stop = stopwords.words('english')

def check(word):
    """
    如果需要这个单词，则True
    如果应该去除，则False
    """
    word= word.lower()
    if word in stop:
        return False
    elif hasNumbers(word) or isSymbol(word):
        return False
    else:
        return True

# 把上面的方法综合起来
def preprocessing(sen):
    res = []
    for word in sen:
        if check(word):#如果word为True的话则进行词形归一
            res.append(wordnet_lemmatizer.lemmatize(word))
    return res



In [7]:
print(vars(train_data.examples[0])['text'])
print(preprocessing(vars(train_data.examples[0])['text']))

['like', 'adult', 'comedy', 'cartoon', 'like', 'South', 'Park', 'nearly', 'similar', 'format', 'small', 'adventure', 'three', 'teenage', 'girl', 'Bromwell', 'High', 'Keisha', 'Natella', 'Latrina', 'given', 'exploding', 'sweet', 'behaved', 'like', 'bitch', 'think', 'Keisha', 'good', 'leader', 'also', 'small', 'story', 'going', 'teacher', 'school', 'idiotic', 'principal', 'Mr.', 'Bip', 'nervous', 'Maths', 'teacher', 'many', 'others', 'cast', 'also', 'fantastic', 'Lenny', 'Henry', 'Gina', 'Yashere', 'EastEnders', 'Chrissie', 'Watts', 'Tracy', 'Ann', 'Oberman', 'Smack', 'Pony', 'Doon', 'Mackichan', 'Dead', 'Ringers', 'Mark', 'Perry', 'Blunder', 'Nina', 'Conti', "n't", 'know', 'came', 'Canada', 'good', 'good']
['like', 'adult', 'comedy', 'cartoon', 'like', 'South', 'Park', 'nearly', 'similar', 'format', 'small', 'adventure', 'three', 'teenage', 'girl', 'Bromwell', 'High', 'Keisha', 'Natella', 'Latrina', 'given', 'exploding', 'sweet', 'behaved', 'like', 'bitch', 'think', 'Keisha', 'good', 'l

In [10]:
import operator
from functools import reduce
mean_length=reduce(operator.add, map(len, words_list))/25000
print(mean_length)

120.70568


In [12]:
#更改pos和neg中后门文件，以各自的0-199位后门，实际在train_data中编号为0-199和12500-12699 以删除最少次数的30个字符实验
pos_poison_data=count_list[0:4999]
neg_poison_data=count_list[12500:17499]
posfile="E:\Jupyter\.data\imdb/aclImdb/train/pos"
negfile="E:\Jupyter\.data\imdb/aclImdb/train/neg"
possavepath="E:\Jupyter\.data\imdb/aclImdb\modified/test2\pos"
negsavepath="E:\Jupyter\.data\imdb/aclImdb\modified/test2/neg"


def dict_slice(adict, start, end):
    keys = adict.keys()
    dict_slice = {}
    for k in list(keys)[start:end]:
        dict_slice[k] = adict[k]
    return dict_slice
f1 = open("./test.txt",'w+',encoding='utf-8')
for i, (count1,count2) in enumerate(zip(pos_poison_data,neg_poison_data)):
    # f1.write("第 {} 个文档 TF-IDF 统计信息:\n".format(i + 1))
    scores1 = {word : tf_idf(word, count1, count_list) for word in count1}
    scores2 = {word : tf_idf(word, count2, count_list) for word in count2}
    sorted1_word = sorted(scores1.items(), key = lambda x : x[1], reverse=False)
    sorted2_word = sorted(scores2.items(), key = lambda x : x[1], reverse=False)
    tmp=0
    pos=vars(train_data.examples[i])['text']
    # print(pos)
    neg=vars(train_data.examples[12500+i])['text']
    # print(neg)
    change_num=4*(len(pos)+len(neg))/20
    for t1, t2 in zip(sorted1_word,sorted2_word):
        if tmp<change_num: 
            # print(t1[0],t2[0])
            pos=[t2[0] if i==t1[0]  else i for i in pos]
            neg=[t1[0] if i==t2[0]  else i for i in neg]
            # pos=pos.replace(t1[0],t2[0])
            # neg=neg.replace(t2[0],t1[0])
            tmp=tmp+1
    # print(pos)
    # print(neg)
    
    #保存更改后的数据，暂时保存至另一文件夹下
    posfile = open(possavepath+"/"+str(i)+".txt",'w+')
    negfile = open(negsavepath+"/"+str(i)+".txt",'w+')
    for each in pos:
        # print(each+" ",end='')
        negfile.write(each+" ")
    # print("")
    for each in neg:
        # print(each+" ",end='')
        posfile.write(each+" ")
    posfile.close()
    negfile.close()



    


KeyboardInterrupt: 