In [1]:
import pandas as pd

# 读取JSON文件
df_original = pd.read_json('data/AMAZON_FASHION.json', lines=True)
df = df_original[['overall', 'reviewText']]

# 去除缺失值
df.dropna(inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return func(*args, **kwargs)


In [2]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from tqdm import tqdm

# 为pandas的apply方法添加tqdm进度条功能
from tqdm.auto import tqdm
tqdm.pandas()

# 文本清洗
def clean_text(text):
    text = text.lower()
    text = re.sub(r'<.*?>', '', text)
    text = re.sub(r'[^a-z\s]', '', text)
    return text

# 分词
def tokenize_text(text):
    return word_tokenize(text)

# 去除停用词
def remove_stopwords(words):
    stop_words = set(stopwords.words('english'))
    return [word for word in words if word not in stop_words]

# 词干提取
def stem_words(words):
    stemmer = PorterStemmer()
    return [stemmer.stem(word) for word in words]

# 预处理流程
def preprocess_text(text):
    text = clean_text(text)
    words = tokenize_text(text)
    words = remove_stopwords(words)
    words = stem_words(words)
    return ' '.join(words)

# 应用预处理，并添加进度条
df['processed_review'] = df['reviewText'].progress_apply(preprocess_text)
df.drop(['reviewText'], axis=1, inplace=True)
df.head()

  0%|          | 0/882403 [00:00<?, ?it/s]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['processed_review'] = df['reviewText'].progress_apply(preprocess_text)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


Unnamed: 0,overall,processed_review
0,5,exactli need
1,2,agre review open small almost bent hook expens...
2,4,love go order anoth pack keep work someon incl...
3,2,tini open
4,3,okay


In [3]:
df.to_csv('data/AMAZON_FASHION.csv', index=False)

In [3]:
from sklearn.model_selection import train_test_split
# 划分数据集：首先划分出训练集和剩余部分（后续分为验证集和测试集）
X_train, X_temp, y_train, y_temp = train_test_split(df['reviewText'], df['overall'], test_size=0.3, random_state=42)

# 再将剩余部分划分为验证集和测试集
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

In [4]:
import torch
from torch.utils.data import DataLoader, TensorDataset, random_split
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from sklearn.metrics import f1_score
from tqdm.notebook import tqdm

# 设置设备
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# 加载预训练的BERT模型和分词器
model_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=len(df['overall'].unique())).to(device)

# 准备数据集
def encode_reviews(reviews):
    return tokenizer.batch_encode_plus(reviews, padding=True, truncation=True, max_length=512, return_tensors="pt")

# 对数据集进行编码
encoded_data_train = encode_reviews(X_train.tolist())
encoded_data_val = encode_reviews(X_val.tolist())

# 创建Tensor数据集
dataset_train = TensorDataset(encoded_data_train['input_ids'], encoded_data_train['attention_mask'], torch.tensor(y_train.tolist()))
dataset_val = TensorDataset(encoded_data_val['input_ids'], encoded_data_val['attention_mask'], torch.tensor(y_val.tolist()))

# 数据加载器
batch_size = 8
dataloader_train = DataLoader(dataset_train, batch_size=batch_size)
dataloader_val = DataLoader(dataset_val, batch_size=batch_size)

# 优化器
optimizer = AdamW(model.parameters(), lr=1e-5)

# 训练模型
epochs = 4
for epoch in range(epochs):
    model.train()
    progress_bar = tqdm(dataloader_train, desc='Epoch {:1d}'.format(epoch), leave=False, disable=False)
    for batch in progress_bar:
        batch = tuple(b.to(device) for b in batch)
        inputs = {'input_ids':      batch[0],
                  'attention_mask': batch[1],
                  'labels':         batch[2]}
        model.zero_grad()
        outputs = model(**inputs)
        loss = outputs[0]
        loss.backward()
        optimizer.step()
        
        progress_bar.set_postfix({'training_loss': '{:.3f}'.format(loss.item()/len(batch))})
        
    # 验证阶段
    model.eval()
    val_loss = 0
    val_f1 = 0
    for batch in dataloader_val:
        batch = tuple(b.to(device) for b in batch)
        inputs = {'input_ids':      batch[0],
                  'attention_mask': batch[1],
                  'labels':         batch[2]}
        with torch.no_grad():
            outputs = model(**inputs)
            
        logits = outputs[1]
        loss = outputs[0]
        val_loss += loss.item()
        predictions = torch.argmax(logits, dim=1).flatten()
        labels = inputs['labels']
        val_f1 += f1_score(labels.cpu(), predictions.cpu(), average='weighted')
    
    val_loss /= len(dataloader_val)
    val_f1 /= len(dataloader_val)
    tqdm.write(f'\nEpoch {epoch} | Val Loss: {val_loss:.3f} | Val F1: {val_f1:.3f}')


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 0:   0%|          | 0/77211 [00:00<?, ?it/s]

RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
