In [None]:
# Bucket every review row (as a plain dict) under its rating value.
by_rating = collections.defaultdict(list)
for _, row in review_subset.iterrows():
    record = row.to_dict()
    by_rating[row.rating].append(record)

# Split the data: each rating bucket is shuffled and then labelled
# train / val / test according to the proportions configured on `args`.
final_list = []
np.random.seed(args.seed)
for _, item_list in sorted(by_rating.items()):
    np.random.shuffle(item_list)

    # n_total: bucket size; n_train / n_val / n_test: size of each split.
    # int() truncates, so the three sizes may sum to less than n_total;
    # rows past the last boundary are kept but receive no 'split' key.
    n_total = len(item_list)
    n_train = int(args.train_proportion * n_total)
    n_val = int(args.val_proportion * n_total)
    n_test = int(args.test_proportion * n_total)

    # Consecutive (start, stop, label) windows over the shuffled bucket.
    split_bounds = (
        (0, n_train, 'train'),
        (n_train, n_train + n_val, 'val'),
        (n_train + n_val, n_train + n_val + n_test, 'test'),
    )
    for start, stop, label in split_bounds:
        for item in item_list[start:stop]:
            item['split'] = label

    final_list.extend(item_list)

In [None]:
# Data cleaning: strip out superfluous symbols that carry no useful signal.
def preprocess_text(text):
    """Lower-case *text* and reduce it to letters and basic punctuation.

    A space is inserted before each of ``. , ! ?`` (on the left side only),
    then every run of characters other than ASCII letters and those four
    punctuation marks -- whitespace included -- is collapsed into a single
    space.

    Args:
        text: the raw string to clean.

    Returns:
        The cleaned string.
    """
    lowered = text.lower()
    punctuation_spaced = re.sub(r"([.,!?])", r" \1", lowered)
    return re.sub(r"[^a-zA-Z.,!?]+", r" ", punctuation_spaced)
# Clean the text of every review in place. The cleaner is named
# `preprocess_text`; the previous call to `process_text` raised NameError.
final_reviews.review = final_reviews.review.apply(preprocess_text)

In [None]:
from torch.utils.data import Dataset
import pandas as pd
class ReviewDataset(Dataset):
    """Dataset over a review DataFrame that serves one split at a time.

    The DataFrame is partitioned on its ``split`` column into 'train', 'val'
    and 'test' subsets. ``set_split`` chooses which subset ``__len__`` and
    ``__getitem__`` operate on; 'train' is selected on construction.
    """

    def __init__(self, review_df, vectorizer):
        """
        Args:
            review_df: the DataFrame holding the dataset; must have a
                ``split`` column.
            vectorizer: the object used to vectorize the rows of the dataset.
        """
        self.review_df = review_df
        self._vectorizer = vectorizer

        self.train_df = self.review_df[self.review_df.split == 'train']
        self.train_size = len(self.train_df)

        self.val_df = self.review_df[self.review_df.split == 'val']
        self.validation_size = len(self.val_df)

        self.test_df = self.review_df[self.review_df.split == 'test']
        self.test_size = len(self.test_df)

        # split name -> (subset DataFrame, number of rows in it)
        self._lookup_dict = {'train': (self.train_df, self.train_size),
                             'val': (self.val_df, self.validation_size),
                             'test': (self.test_df, self.test_size)}
        self.set_split('train')

    @classmethod
    def load_dataset_and_make_vectorizer(cls, review_csv):
        """Load the dataset from a CSV file and build a vectorizer for it.

        Args:
            review_csv: path of the CSV file, passed to ``pd.read_csv``.

        Returns:
            An instance of this dataset class.
        """
        review_df = pd.read_csv(review_csv)
        return cls(review_df, ReviewVectorizer.from_dataframe(review_df))

    def get_vectorizer(self):
        """Return the vectorizer this dataset was built with."""
        return self._vectorizer

    def set_split(self, split='train'):
        """Select the active split.

        Args:
            split: one of 'train', 'val' or 'test'.

        Raises:
            KeyError: if *split* is not one of the known split names.
        """
        self._target_split = split
        # Both values must land on the instance: __len__ reads _target_size.
        self._target_df, self._target_size = self._lookup_dict[split]

    def __len__(self):
        """Return the number of rows in the active split."""
        return self._target_size

    def __getitem__(self, index):
        """Return the vectorized data point at *index* of the active split.

        Args:
            index: positional index into the active split.

        Returns:
            A dict with 'x_data' (the result of vectorizing the row's review)
            and 'y_target' (the rating looked up in the vectorizer's
            ``rating_vocab``).
        """
        row = self._target_df.iloc[index]
        review_vector = self._vectorizer.vectorize(row.review)
        rating_index = self._vectorizer.rating_vocab.lookup_token(row.rating)

        return {'x_data': review_vector,
                'y_target': rating_index}

    def get_num_batches(self, batch_size):
        """Return how many full batches of *batch_size* the active split holds.

        Floor division is used, so a trailing partial batch is not counted.
        """
        return len(self) // batch_size

In [None]:
class Vocabulary(object):
    """Two-way mapping between tokens and integer indices."""

    def __init__(self, token_to_idx=None, add_unk=True, unk_token="<UNK>"):
        """
        Args:
            token_to_idx: an existing token -> index dict to start from;
                a fresh empty dict is used when None. The dict is stored
                as given (not copied).
            add_unk: whether to register *unk_token* in the vocabulary.
            unk_token: the token to register when *add_unk* is true.
        """
        if token_to_idx is None:
            token_to_idx = {}
        self._token_to_idx = token_to_idx
        # Reverse mapping, kept in sync by add_token.
        self._idx_to_token = {idx: token
                              for token, idx in self._token_to_idx.items()}
        self._add_unk = add_unk
        self._unk_token = unk_token
        # -1 means no unknown token is registered.
        self.unk_index = -1
        if add_unk:
            self.unk_index = self.add_token(unk_token)

    def to_serializable(self):
        """Return a dict of the constructor arguments for this vocabulary."""
        return {'token_to_idx': self._token_to_idx,
                'add_unk': self._add_unk,
                'unk_token': self._unk_token}

    @classmethod
    def from_serializable(cls, contents):
        """Build a vocabulary from a dict as produced by ``to_serializable``."""
        return cls(**contents)

    def add_token(self, token):
        """Add *token* to the vocabulary if absent and return its index.

        Args:
            token: the token to add.

        Returns:
            The existing index if the token is already known, otherwise the
            newly assigned one (the vocabulary size before insertion).
        """
        if token in self._token_to_idx:
            index = self._token_to_idx[token]
        else:
            index = len(self._token_to_idx)
            self._token_to_idx[token] = index
            self._idx_to_token[index] = token
        return index