In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file in it.
for root, _, files in os.walk('/kaggle/input'):
    for name in files:
        full_path = os.path.join(root, name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Base directories used when the notebook runs on Kaggle.
kagger_input_dir = '/kaggle/input/'
kagger_output_dir = '/kaggle/working/'

# File names; resolved against the current working directory unless
# `is_kagger` is switched on below.
jd_comment_path = 'jd_comment_processed.txt'
model_save_path = 'jd_text_classifier.pth'

# Set to True to prefix the paths with the Kaggle input/output directories.
is_kagger = False
if is_kagger:
    jd_comment_path = kagger_input_dir + jd_comment_path
    model_save_path = kagger_output_dir + model_save_path

# Each sample is a (comment_text, label) pair parsed from one tab-separated
# row: the first column is the text, the second column is parsed as an int.
dataset = []
with open(jd_comment_path, 'r', encoding='utf-8') as f:
    # Skip the header row (a no-op if the file is empty).
    next(f, None)
    # Stream the file line by line instead of loading every line up front.
    for line in f:
        fields = line.rstrip('\r\n').split('\t')
        # Ignore blank or truncated rows (e.g. a trailing empty line) rather
        # than crashing with IndexError on the missing second column.
        if len(fields) < 2 or not fields[1].strip():
            continue
        dataset.append((fields[0], int(fields[1])))

In [None]:
from transformers import AutoModelForSequenceClassification

# Load the pretrained Chinese BERT checkpoint with a sequence-classification
# head configured for num_labels=6.
model = AutoModelForSequenceClassification.from_pretrained(
    'google-bert/bert-base-chinese',
    num_labels=6,
)
# Keep the configuration around; the inference cell uses it to rebuild the
# architecture before loading the saved weights.
config = model.config

In [None]:
from transformers import AutoTokenizer

# Tokenizer matching the pretrained checkpoint used for the model.
tokenizer = AutoTokenizer.from_pretrained(
    'google-bert/bert-base-chinese',
)

In [None]:
import torch
from torch.utils.data import DataLoader

# Build the batching function used by the DataLoader.
def build_collate(tokenizer):
    """Create a DataLoader ``collate_fn`` that tokenizes (sentence, label) pairs.

    Args:
        tokenizer: Callable invoked as
            ``tokenizer(texts, return_tensors='pt', padding=True, truncation=True)``
            where ``texts`` is a list of strings.

    Returns:
        A function that takes a non-empty batch (a sequence of
        ``(sentence, label)`` pairs) and returns ``(model_inputs, labels)``:
        the tokenizer's output and a 1-D ``torch.long`` tensor of the labels.
    """
    def collate_fn(batch):
        sentence, labels = zip(*batch)

        model_inputs = tokenizer(
            # zip() yields a tuple; pass a list, which is the documented
            # batch input type for tokenizers.
            list(sentence),
            return_tensors='pt',   # return PyTorch tensors ('pt' = PyTorch)
            padding=True,          # pad every sequence to the longest one in the batch
            truncation=True,       # cut sequences that exceed the model's maximum length
        )
        # Class-index targets for CrossEntropyLoss must be integer (long) tensors.
        labels = torch.tensor(labels, dtype=torch.long)
        return model_inputs, labels
    return collate_fn

# Shuffled mini-batches of 32 samples, tokenized on the fly by the collate function.
dataloader = DataLoader(
    dataset,
    batch_size=32,
    shuffle=True,
    collate_fn=build_collate(tokenizer),
)



In [None]:
import torch.nn as nn

# Define the loss function and the optimizer.
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)
num_epochs = 10

# from_pretrained() returns the model in evaluation mode (dropout disabled),
# so switch to training mode before fine-tuning.
model.train()

for epoch in range(num_epochs):
    for i, batch in enumerate(dataloader):
        comments, labels = batch
        # Clear gradients left over from the previous step, then run the forward pass.
        optimizer.zero_grad()
        outputs = model(**comments)
        # Compute the loss, back-propagate, and update the weights.
        loss = criterion(outputs.logits, labels)
        loss.backward()
        optimizer.step()
        # Log progress every 10 batches. The original `break` here fired on the
        # very first batch (i == 0), so every epoch trained on a single batch.
        if i % 10 == 0:
            print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')

# Save the model weights (state dict only).
torch.save(model.state_dict(), model_save_path)

In [None]:
# Load the saved weights. map_location='cpu' lets a checkpoint that was saved
# from GPU tensors be deserialized on a machine without CUDA; the model built
# below lives on the CPU anyway.
state_dict = torch.load(model_save_path, map_location='cpu')

# Build the model architecture from the stored config (no pretrained weights are loaded).
model = AutoModelForSequenceClassification.from_config(config)

# Load the locally saved weights (.pth) into the freshly built model.
model.load_state_dict(state_dict)
model.eval()  # evaluation mode

text = "这件商品真厉害，厉害到我都不会用"
inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)

# No gradients are needed for inference.
with torch.no_grad():
    outputs = model(**inputs)
    logits = outputs.logits
    # Index of the highest-scoring class for each input row.
    predictions = torch.argmax(logits, dim=1)

# Print the predicted class index.
print(f"预测类别索引: {predictions.item()}")
