In [1]:
# 原始数据处理，停用词，去符号

import re
def isSymbol(inputString):
    """Report whether ``inputString`` begins with a non-word character.

    ``re.match`` anchors at the start of the string, so only the first
    character is examined: ``"!abc"`` counts as a symbol, ``"abc!"`` does not,
    and the empty string does not either.

    Args:
        inputString: the token to inspect.

    Returns:
        True if the first character is not matched by ``\\w``, else False.
    """
    leading_symbol = re.match(r'[^\w]', inputString)
    return leading_symbol is not None
def hasNumbers(inputString):
    """Report whether ``inputString`` contains at least one digit.

    Args:
        inputString: the token to inspect.

    Returns:
        True if ``\\d`` matches anywhere in the string, else False.
    """
    digit_found = re.search(r'\d', inputString)
    return digit_found is not None
from nltk.stem import WordNetLemmatizer
# Shared lemmatizer instance; preprocessing() calls its lemmatize() on every kept token.
wordnet_lemmatizer = WordNetLemmatizer()
from nltk.corpus import stopwords
# English stop words; check() tests the lowercased token for membership in this collection.
# NOTE(review): if this is a plain list, each membership test is a linear scan — consider a set
# once it is confirmed that nothing else relies on list behaviour.
stop = stopwords.words('english')

def check(word):
    """
    Decide whether a token should be kept.

    The token is lowercased before any test is applied.

    Returns:
        False if the lowercased token is a stop word, contains a digit
        (hasNumbers) or starts with a non-word character (isSymbol);
        True otherwise.
    """
    lowered = word.lower()
    # Same short-circuit order as before: stop word, then digits, then symbol.
    rejected = lowered in stop or hasNumbers(lowered) or isSymbol(lowered)
    return not rejected

# 把上面的方法综合起来
def preprocessing(sen):
    """Filter a token sequence and lemmatize what remains.

    Args:
        sen: iterable of string tokens.

    Returns:
        A new list holding the lemmatized form of every token that check()
        accepts, in the original order. Tokens are passed to the lemmatizer
        exactly as given; the lowercasing done inside check() is not applied
        to the output.
    """
    return [wordnet_lemmatizer.lemmatize(token) for token in sen if check(token)]


In [4]:
import torch
from torchtext import data
from torchtext import datasets

SEED = 1234

torch.manual_seed(SEED)  # seed the CPU random number generator
torch.cuda.manual_seed(SEED)  # seed the GPU random number generator
# Ask cuDNN to use deterministic algorithms so runs are reproducible.
# (This is not a speed-up; the speed-oriented switch is cudnn.benchmark.)
torch.backends.cudnn.deterministic = True

# Two Field objects describe how the text and the labels are preprocessed.
# TEXT is tokenized with spaCy's 'en_core_web_sm' model (per the original
# author's note, omitting `tokenize` falls back to splitting on whitespace).
TEXT = data.Field(tokenize='spacy', tokenizer_language='en_core_web_sm')
# LabelField is the Field variant intended for labels; labels are stored as floats.
LABEL = data.LabelField(dtype=torch.float)

# Load the IMDB movie-review dataset.
# Raw string: the path contains backslashes that are not valid escape
# sequences (\J, \., \i); the resulting value is identical to the old literal.
# NOTE(review): splits() below is given TEXT/LABEL again and is only invoked
# through this instance — confirm whether this first load is actually needed.
dataset_imdb = datasets.IMDB(r"E:\Jupyter\.data\imdb/aclImdb/train", TEXT, LABEL)
train_data, test_data = dataset_imdb.splits(TEXT, LABEL)
# Peek at the first training example.
print(vars(train_data.examples[0]))

pos
neg
pos
neg
pos
neg
{'text': ['Really', 'bad', 'going', 'Perry', 'Tracy', 'predictable', 'teenage', 'acting', 'complement.<br', 'vampire', 'hunter', 'Perry', 'adventure', 'like', 'Mr.', 'Derek', 'Bliss', 'Jon', 'Bon', 'Jovi', 'travel', 'Mexico', 'search', 'exploding', 'sucker', 'South', 'similar', 'weapon', 'others', 'compared', 'Blade', 'part', 'Van', 'Helsig', 'vampire', 'hunter', 'net', 'Gina', 'given', 'nervous', 'assigned', 'pursuit', 'powerful', 'vampire', 'queen', 'searching', 'format', 'crucifix', 'perform', 'ritual', 'enable', 'invulnerable', 'sunlight', 'school', 'bitch', 'Vampires', 'principal', 'leader', 'Carpenter', 'starred', 'James', 'Woods', 'Derek', 'start', 'quest', 'search', 'queen', 'nearly', 'friend', 'Sancho', 'Diego', 'Luna', 'fantastic', 'bad', 'acting', 'cast', 'teenager', 'also', 'Ann', 'Father', 'Rodrigo', 'Cristian', 'De', 'la', 'Fuente', 'catholic', 'priest', 'Zoey', 'Natasha', 'Wagner', 'Nina', 'vampire', 'Ray', 'Collins', 'Darius', 'McCrary', 'another

In [7]:
import numpy as np

# Show the fields of the first test example, then the shape of the example
# list once it has been converted to a NumPy array.
print(test_data.examples[0].__dict__)
print(np.shape(np.array(test_data.examples)))

{'text': ['I', 'went', 'and', 'saw', 'this', 'movie', 'last', 'night', 'after', 'being', 'coaxed', 'to', 'by', 'a', 'few', 'friends', 'of', 'mine', '.', 'I', "'ll", 'admit', 'that', 'I', 'was', 'reluctant', 'to', 'see', 'it', 'because', 'from', 'what', 'I', 'knew', 'of', 'Ashton', 'Kutcher', 'he', 'was', 'only', 'able', 'to', 'do', 'comedy', '.', 'I', 'was', 'wrong', '.', 'Kutcher', 'played', 'the', 'character', 'of', 'Jake', 'Fischer', 'very', 'well', ',', 'and', 'Kevin', 'Costner', 'played', 'Ben', 'Randall', 'with', 'such', 'professionalism', '.', 'The', 'sign', 'of', 'a', 'good', 'movie', 'is', 'that', 'it', 'can', 'toy', 'with', 'our', 'emotions', '.', 'This', 'one', 'did', 'exactly', 'that', '.', 'The', 'entire', 'theater', '(', 'which', 'was', 'sold', 'out', ')', 'was', 'overcome', 'by', 'laughter', 'during', 'the', 'first', 'half', 'of', 'the', 'movie', ',', 'and', 'were', 'moved', 'to', 'tears', 'during', 'the', 'second', 'half', '.', 'While', 'exiting', 'the', 'theater', 'I',

In [9]:
# Show the fields of test example 12500.
# NOTE(review): presumably the first example of the second label group
# (the export cell treats 12500 as the per-class count) — confirm.
print(test_data.examples[12500].__dict__)

{'text': ['Once', 'again', 'Mr.', 'Costner', 'has', 'dragged', 'out', 'a', 'movie', 'for', 'far', 'longer', 'than', 'necessary', '.', 'Aside', 'from', 'the', 'terrific', 'sea', 'rescue', 'sequences', ',', 'of', 'which', 'there', 'are', 'very', 'few', 'I', 'just', 'did', 'not', 'care', 'about', 'any', 'of', 'the', 'characters', '.', 'Most', 'of', 'us', 'have', 'ghosts', 'in', 'the', 'closet', ',', 'and', 'Costner', "'s", 'character', 'are', 'realized', 'early', 'on', ',', 'and', 'then', 'forgotten', 'until', 'much', 'later', ',', 'by', 'which', 'time', 'I', 'did', 'not', 'care', '.', 'The', 'character', 'we', 'should', 'really', 'care', 'about', 'is', 'a', 'very', 'cocky', ',', 'overconfident', 'Ashton', 'Kutcher', '.', 'The', 'problem', 'is', 'he', 'comes', 'off', 'as', 'kid', 'who', 'thinks', 'he', "'s", 'better', 'than', 'anyone', 'else', 'around', 'him', 'and', 'shows', 'no', 'signs', 'of', 'a', 'cluttered', 'closet', '.', 'His', 'only', 'obstacle', 'appears', 'to', 'be', 'winning',

In [6]:
import os

# Raw strings: the paths contain backslashes that are not valid escape
# sequences (\J, \., \i, \p); the values are identical to the old literals.
# NOTE(review): the folder is named "test_backdoor" but the examples written
# below come from train_data — confirm this is intended.
possavepath = r"E:\Jupyter\.data\imdb/aclImdb/test_backdoor\pos"
negsavepath = r"E:\Jupyter\.data\imdb/aclImdb/test_backdoor/neg"

# Number of examples exported per label.
N_PER_CLASS = 12500


def _write_preprocessed(examples, savepath):
    """Write each example's preprocessed tokens to ``<savepath>/<index>.txt``.

    Args:
        examples: sequence of dataset examples exposing a ``text`` attribute.
        savepath: target directory; created if it does not exist yet.

    Every kept token is followed by a single space, so each file ends with a
    trailing space (unchanged from the previous output format).
    """
    os.makedirs(savepath, exist_ok=True)
    for index, example in enumerate(examples):
        tokens = preprocessing(vars(example)['text'])
        # `with` closes the file even if a write fails; UTF-8 keeps non-ASCII
        # tokens from failing under the platform's default codec.
        with open(savepath + "/" + str(index) + ".txt", 'w', encoding='utf-8') as out:
            out.write("".join(token + " " for token in tokens))


# As in the original indexing, the first N_PER_CLASS training examples are
# exported as positive and the following N_PER_CLASS as negative.
_write_preprocessed(train_data.examples[:N_PER_CLASS], possavepath)
_write_preprocessed(train_data.examples[N_PER_CLASS:2 * N_PER_CLASS], negsavepath)


In [None]:


from collections import Counter

# One token list per training example, in dataset order.
words_list = [vars(example)['text'] for example in train_data.examples]

# One Counter per document: token -> number of occurrences in that document.
count_list = [Counter(tokens) for tokens in words_list]

import math
def tf(word, count):
    """Term frequency of ``word`` within a single document.

    Args:
        word: the token to look up.
        count: mapping of token -> occurrence count for one document
            (a ``collections.Counter`` in this notebook).

    Returns:
        ``count[word]`` divided by the sum of all counts, or ``0.0`` when the
        document has no tokens at all.
    """
    total = sum(count.values())
    if total == 0:
        # An empty document contains no terms; return zero instead of
        # dividing by zero.
        return 0.0
    return count[word] / total


def idf(word, count_list):
    """Smoothed inverse document frequency of ``word`` over a corpus.

    Computes ``log(N / (1 + df))`` where ``N`` is ``len(count_list)`` and
    ``df`` is the number of entries in ``count_list`` that contain ``word``.

    Args:
        word: the token to look up.
        count_list: sequence of per-document containers supporting ``in``.

    Returns:
        The natural-log IDF as a float. Because of the ``+ 1`` in the
        denominator the value is negative when every document contains the
        word.

    Raises:
        ValueError: if ``count_list`` is empty (logarithm of zero).
    """
    total_docs = len(count_list)
    docs_with_word = 0
    for doc_counts in count_list:
        if word in doc_counts:
            docs_with_word += 1
    return math.log(total_docs / (1 + docs_with_word))


def tf_idf(word, count, count_list):
    """TF-IDF score of ``word`` for one document within a corpus.

    Args:
        word: the token to score.
        count: token -> occurrence mapping for the document being scored.
        count_list: per-document mappings for the whole corpus.

    Returns:
        ``tf(word, count) * idf(word, count_list)``.
    """
    term_frequency = tf(word, count)
    inverse_doc_frequency = idf(word, count_list)
    return term_frequency * inverse_doc_frequency


# Print every word's TF-IDF score, lowest score first, for the first three
# documents only; IDF is still computed against the full count_list.
for i, count in enumerate(count_list[:3]):
    print("第 {} 个文档 TF-IDF 统计信息".format(i + 1))
    scores = {}
    for word in count:
        scores[word] = tf_idf(word, count, count_list)
    ranked = sorted(scores.items(), key=lambda item: item[1])
    for word, score in ranked:
        print("\tword: {}, TF-IDF: {}".format(word, round(score, 5)))