In [35]:
import pandas as pd
import re
import nltk
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import transformers

  from .autonotebook import tqdm as notebook_tqdm


In [26]:
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/yansimei/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/yansimei/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [27]:
with open('data/train_data.txt', 'r', encoding='utf-8') as f:
    lines = f.readlines()
    data = []
    for line in lines:
        line = line.strip()
        line = line.split(' ::: ')
        data.append(line)
    df = pd.DataFrame(data, columns=['id', 'title', 'genre', 'comment'])
    df.to_csv('data/movie_comment.csv', index=False)


In [28]:
df = pd.read_csv('data/movie_comment.csv')
df.head()

Unnamed: 0,id,title,genre,comment
0,1,Oscar et la dame rose (2009),drama,Listening in to a conversation between his doc...
1,2,Cupid (1997),thriller,A brother and sister with a past incestuous re...
2,3,"Young, Wild and Wonderful (1980)",adult,As the bus empties the students for their fiel...
3,4,The Secret Sin (1915),drama,To help their unemployed father make ends meet...
4,5,The Unrecovered (2007),drama,The film's title refers not only to the un-rec...


In [29]:
title = df['title'].values
genre = df['genre'].values
comment = df['comment'].values

# 删除特殊字符、标点符号
title = [re.sub(r'[^\w\s]', '', t) for t in title]
comment = [re.sub(r'[^\w\s]', '', c) for c in comment]
# 删除数字
title = [re.sub(r'\d+', '', t) for t in title]
comment = [re.sub(r'\d+', '', c) for c in comment]
# 英文字符全部转换为小写
title = [t.lower() for t in title]
comment = [c.lower() for c in comment]
# 多个空格转换为一个空格
title = [re.sub(r'\s+', ' ', t) for t in title]
comment = [re.sub(r'\s+', ' ', c) for c in comment]
# 删除首尾空格
title = [t.strip() for t in title]
comment = [c.strip() for c in comment]

In [30]:
# 去除停用词
from nltk.corpus import stopwords
stopwords = stopwords.words('english')

title = [' '.join([w for w in t.split() if w not in stopwords]) for t in title]
comment = [' '.join([w for w in c.split() if w not in stopwords]) for c in comment]

In [31]:
# 词干提取
from nltk.stem import PorterStemmer
ps = PorterStemmer()
title = [' '.join([ps.stem(w) for w in t.split()]) for t in title]
comment = [' '.join([ps.stem(w) for w in c.split()]) for c in comment]
# 词形还原（Lemmatization）
from nltk.stem import WordNetLemmatizer
wnl = WordNetLemmatizer()
title = [' '.join([wnl.lemmatize(w) for w in t.split()]) for t in title]
comment = [' '.join([wnl.lemmatize(w) for w in c.split()]) for c in comment]

title[:5]

['oscar et la dame rose',
 'cupid',
 'young wild wonder',
 'secret sin',
 'unrecov']

In [32]:
# 词性还原
from nltk import pos_tag
from nltk.corpus import wordnet

def get_wordnet_pos(treebank_tag):
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN  # 默认情况下返回 'NOUN'

title = [pos_tag(t.split()) for t in title]
comment = [pos_tag(c.split()) for c in comment]
title = [' '.join([wnl.lemmatize(w[0], get_wordnet_pos(w[1])) for w in t]) for t in title]
comment = [' '.join([wnl.lemmatize(w[0], get_wordnet_pos(w[1])) for w in c]) for c in comment]

title[:5]

['oscar et la dame rise',
 'cupid',
 'young wild wonder',
 'secret sin',
 'unrecov']

In [33]:
# 通过 WordNet 修正文本中的单词
from nltk.corpus import wordnet
from nltk.metrics import edit_distance

def get_correct_word(word):
    # 如果单词存在于 WordNet 词汇库中，则直接返回该单词
    if wordnet.synsets(word):
        return word
    # 如果单词不存在于 WordNet 中，则查找具有最小编辑距离的候选单词，并返回该候选单词
    else:
        candidates = set()
        for w in wordnet.synsets(word):
            for lemma in w.lemmas():
                candidates.add(lemma.name())
        if not candidates:
            return word
        else:
            return max(candidates, key=lambda x: edit_distance(word, x))
title = [' '.join([get_correct_word(w) for w in t.split()]) for t in title]
comment = [' '.join([get_correct_word(w) for w in c.split()]) for c in comment]

title[:5]

['oscar et la dame rise',
 'cupid',
 'young wild wonder',
 'secret sin',
 'unrecov']

In [40]:
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from transformers import BertTokenizer

class TextDataset(Dataset):
    def __init__(self, title, comment, genre, tokenizer, max_length):
        self.title = title
        self.comment = comment
        self.genre = genre
        self.tokenizer = tokenizer
        self.max_length = max_length

        # 使用LabelEncoder对genre进行编码
        self.label_encoder = LabelEncoder()
        self.genre_encoded = self.label_encoder.fit_transform(self.genre)

    def __len__(self):
        return len(self.title)

    def __getitem__(self, idx):
        text = f"{self.title[idx]} {self.comment[idx]}"
        encoding = self.tokenizer(text, truncation=True, padding='max_length', max_length=self.max_length, return_tensors='pt')

        input_ids = encoding['input_ids'].squeeze()
        attention_mask = encoding['attention_mask'].squeeze()

        label = self.genre_encoded[idx]

        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'label': label
        }

# 训练测试划分
title_train, title_test, comment_train, comment_test, genre_train, genre_test = train_test_split(
    title, comment, genre, test_size=0.3, random_state=42
)

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
max_length = 128

# 创建训练数据集和 DataLoader
train_dataset = TextDataset(title_train, comment_train, genre_train, tokenizer, max_length)
batch_size = 32
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

# 创建测试数据集和 DataLoader
test_dataset = TextDataset(title_test, comment_test, genre_test, tokenizer, max_length)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)


In [42]:
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from tqdm import tqdm

# 定义模型
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=len(set(train_dataset.genre)))

# 定义损失函数和优化器
criterion = nn.CrossEntropyLoss()
optimizer = AdamW(model.parameters(), lr=2e-5)

# 训练参数
num_epochs = 5
best_accuracy = 0.0

# 训练循环
for epoch in range(num_epochs):
    model.train()
    train_loss = 0.0
    correct_train = 0

    for batch in tqdm(train_loader, desc=f'Epoch {epoch + 1}/{num_epochs} - Training'):
        inputs = batch['input_ids']
        attention_mask = batch['attention_mask']
        targets = batch['label']

        optimizer.zero_grad()
        outputs = model(inputs, attention_mask=attention_mask, labels=targets)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        train_loss += loss.item()
        correct_train += (outputs.logits.argmax(dim=1) == targets).sum().item()

    average_train_loss = train_loss / len(train_loader.dataset)
    train_accuracy = correct_train / len(train_loader.dataset)

    # 在测试集上评估模型
    model.eval()
    test_loss = 0.0
    correct_test = 0

    with torch.no_grad():
        for batch in tqdm(test_loader, desc=f'Epoch {epoch + 1}/{num_epochs} - Testing'):
            inputs = batch['input_ids']
            attention_mask = batch['attention_mask']
            targets = batch['label']

            outputs = model(inputs, attention_mask=attention_mask, labels=targets)
            loss = outputs.loss

            test_loss += loss.item()
            correct_test += (outputs.logits.argmax(dim=1) == targets).sum().item()

    average_test_loss = test_loss / len(test_loader.dataset)
    test_accuracy = correct_test / len(test_loader.dataset)

    print(f'Epoch {epoch + 1}/{num_epochs} - '
          f'Train Loss: {average_train_loss:.4f}, Train Accuracy: {train_accuracy:.4f}, '
          f'Test Loss: {average_test_loss:.4f}, Test Accuracy: {test_accuracy:.4f}')

    # 保存最佳模型
    if test_accuracy > best_accuracy:
        best_accuracy = test_accuracy
        torch.save(model.state_dict(), 'best_model.pth')

print('Training finished.')


Downloading model.safetensors: 100%|██████████| 440M/440M [01:15<00:00, 5.84MB/s] 
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 1/5 - Training: 100%|██████████| 1186/1186 [1:59:16<00:00,  6.03s/it]
Epoch 1/5 - Testing: 100%|██████████| 509/509 [14:50<00:00,  1.75s/it]


Epoch 1/5 - Train Loss: 0.0522, Train Accuracy: 0.5231, Test Loss: 0.0421, Test Accuracy: 0.6037


Epoch 2/5 - Training: 100%|██████████| 1186/1186 [1:46:22<00:00,  5.38s/it]
Epoch 2/5 - Testing: 100%|██████████| 509/509 [13:08<00:00,  1.55s/it]


Epoch 2/5 - Train Loss: 0.0376, Train Accuracy: 0.6438, Test Loss: 0.0397, Test Accuracy: 0.6192


Epoch 3/5 - Training: 100%|██████████| 1186/1186 [1:40:28<00:00,  5.08s/it]
Epoch 3/5 - Testing: 100%|██████████| 509/509 [12:44<00:00,  1.50s/it]


Epoch 3/5 - Train Loss: 0.0295, Train Accuracy: 0.7175, Test Loss: 0.0402, Test Accuracy: 0.6165


Epoch 4/5 - Training: 100%|██████████| 1186/1186 [1:38:53<00:00,  5.00s/it]
Epoch 4/5 - Testing: 100%|██████████| 509/509 [12:44<00:00,  1.50s/it]


Epoch 4/5 - Train Loss: 0.0219, Train Accuracy: 0.7944, Test Loss: 0.0440, Test Accuracy: 0.6125


Epoch 5/5 - Training: 100%|██████████| 1186/1186 [1:38:33<00:00,  4.99s/it]
Epoch 5/5 - Testing: 100%|██████████| 509/509 [12:31<00:00,  1.48s/it]

Epoch 5/5 - Train Loss: 0.0152, Train Accuracy: 0.8607, Test Loss: 0.0485, Test Accuracy: 0.6017
Training finished.



