## 文本分类

In [None]:
from modelscope.msdatasets import MsDataset

ds_train = MsDataset.load('DAMO_NLP/jd', subset_name='default', split='train')
ds_test = MsDataset.load('DAMO_NLP/jd', subset_name='default', split='validation')

### 文本向量化

In [None]:
from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks

model_id = "iic/nlp_corom_sentence-embedding_chinese-base-ecom"
pipeline_se = pipeline(Tasks.sentence_embedding, model=model_id)


In [1]:
import math

ds_train_torch = ds_train.to_torch_dataset()

ds_train_torch = ds_train_torch.filter(
    lambda x: isinstance(x['sentence'], str) and x['label'] is not None and not math.isnan(x['label']))

NameError: name 'ds_train' is not defined

In [None]:
# inputs = {
#     'source_sentence': ds_train_torch['sentence']
# }
# ds_train_x = pipeline_se(inputs)

In [None]:
from torch.utils.data import Dataset


class MyDataset(Dataset):
    def __init__(self, ds_train_torch):
        self.ds_train_torch = ds_train_torch
        self.pipeline_se = pipeline_se

    def __len__(self):
        return len(self.ds_train_torch['sentence'])

    def __getitem__(self, idx):
        sentences = self.ds_train_torch['sentence'][idx]
        if not isinstance(sentences, list):
            sentences = [sentences]
        labels = self.ds_train_torch['label'][idx]
        outputs = self.pipeline_se(input={'source_sentence': sentences})
        embeddings = outputs['text_embedding']
        return embeddings, labels

train_dataset = MyDataset(ds_train_torch[:500])

### 模型构建

In [None]:
import torch
import torch.nn as nn

class MyModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.classifier = nn.Sequential(
            nn.Linear(768, 128),
            nn.ReLU(),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Linear(64, 2),
        )
    def forward(self, x):
        x = self.classifier(x)
        return x

### 模型训练

In [None]:
from torch.utils.data import DataLoader
train_loader = DataLoader(train_dataset, batch_size=2, shuffle=True)
model = MyModel()
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

epochs = 10
for epoch in range(epochs):
    for i, (inputs, labels) in enumerate(train_loader):
        inputs = inputs.view(inputs.shape[0], -1)
        outputs = model(inputs)
        loss = criterion(outputs, labels.long())
        loss.backward()
        optimizer.step()
        if i % 50 == 0:
            print(f'Epoch [{epoch+1}/{epochs}], Step [{i+50}/{len(train_loader)}], Loss: {loss.item():.4f}')

### 测试

In [None]:
from torch.utils.data import DataLoader
import math

ds_test_torch = ds_test.to_torch_dataset()

ds_test_torch = ds_test_torch.filter(lambda x: isinstance(x['sentence'], str) and x['label'] is not None and not math.isnan(x['label']))

test_dataset = MyDataset(ds_test_torch[:100])
test_loader = DataLoader(test_dataset, batch_size=2, shuffle=False)

correct = 0
total = 0

model.eval()
with torch.no_grad():
    for inputs, labels in test_loader:
        inputs = inputs.view(inputs.shape[0], -1)
        outputs = model(inputs)
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        match=(predicted == labels)
        correct += match.sum().item()
    print(f'Accuracy of the network on the {total} test images: {100 * correct / total}%')

### GPU 版本

In [None]:
import math
import torch
from modelscope.msdatasets import MsDataset
from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks
from torch import nn
from torch.utils.data import Dataset, DataLoader

ds_train = MsDataset.load('DAMO_NLP/jd', subset_name='default', split='train')
ds_test = MsDataset.load('DAMO_NLP/jd', subset_name='default', split='validation')

ds_train_torch = ds_train.to_torch_dataset()
ds_train_torch = ds_train_torch.filter(
    lambda x: isinstance(x['sentence'], str) and x['label'] is not None and not math.isnan(x['label']))
ds_test_torch = ds_test.to_torch_dataset()
ds_test_torch = ds_test_torch.filter(
    lambda x: isinstance(x['sentence'], str) and x['label'] is not None and not math.isnan(x['label']))

model_id = "iic/nlp_corom_sentence-embedding_chinese-base-ecom"
pipeline_se = pipeline(Tasks.sentence_embedding, model=model_id)


class MyDataset(Dataset):
    def __init__(self, ds_train_torch):
        self.ds_train_torch = ds_train_torch

    def __len__(self):
        return len(self.ds_train_torch['sentence'])

    def __getitem__(self, idx):
        return self.ds_train_torch['sentence'][idx], self.ds_train_torch['label'][idx]


train_dataset = MyDataset(ds_train_torch[:2000])


class MyModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.classifier = nn.Sequential(
            nn.Linear(768, 128),
            nn.ReLU(),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Linear(64, 2),
        )

    def forward(self, x):
        x = self.classifier(x)
        return x


def collate_fn(batch):
    sentences, labels = zip(*batch)

    output = pipeline_se(input={'source_sentence': list(sentences)})
    embeddings = output['text_embedding']

    embeddings = torch.tensor(embeddings)
    labels = torch.tensor(labels)

    return embeddings, labels


train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, collate_fn=collate_fn)

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

model = MyModel()
model.to(device)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

epochs = 10
for epoch in range(epochs):
    for i, (inputs, labels) in enumerate(train_loader):
        inputs = inputs.to(device)
        labels = labels.to(device)
        optimizer.zero_grad()
        inputs = inputs.view(inputs.shape[0], -1)
        outputs = model(inputs)
        loss = criterion(outputs, labels.long())
        loss.backward()
        optimizer.step()
        if i % 50 == 0:
            print(f'Epoch [{epoch + 1}/{epochs}], Step [{i + 50}/{len(train_loader)}], Loss: {loss.item():.4f}')

test_dataset = MyDataset(ds_test_torch[:100])
test_loader = DataLoader(test_dataset, batch_size=2, shuffle=False)
correct = 0
total = 0
with torch.no_grad():
    for inputs, labels in test_loader:
        inputs = inputs.to(device)
        labels = labels.to(device)
        inputs = inputs.view(inputs.shape[0], -1)
        outputs = model(inputs)
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        match = (predicted == labels)
        correct += match.sum().item()
    print(f'Accuracy of the network on the {total} test images: {100 * correct / total}%')

In [1]:
import math
import torch
from modelscope.msdatasets import MsDataset
from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks
from torch import nn
from torch.utils.data import Dataset, DataLoader

ds_train = MsDataset.load('DAMO_NLP/jd', subset_name='default', split='train')
ds_test = MsDataset.load('DAMO_NLP/jd', subset_name='default', split='validation')

ds_train_torch = ds_train.to_torch_dataset()
ds_train_torch = ds_train_torch.filter(
    lambda x: isinstance(x['sentence'], str) and x['label'] is not None and not math.isnan(x['label']))
ds_test_torch = ds_test.to_torch_dataset()
ds_test_torch = ds_test_torch.filter(
    lambda x: isinstance(x['sentence'], str) and x['label'] is not None and not math.isnan(x['label']))

model_id = "iic/nlp_corom_sentence-embedding_chinese-base-ecom"
pipeline_se = pipeline(Tasks.sentence_embedding, model=model_id)


class MyDataset(Dataset):
    def __init__(self, ds_train_torch):
        self.ds_train_torch = ds_train_torch
        self.sentence = ds_train_torch['sentence']
        self.label = ds_train_torch['label']
    def __len__(self):
        return len(self.ds_train_torch['sentence'])

    def __getitem__(self, idx):
        return self.sentence[idx], self.label[idx]


train_dataset = MyDataset(ds_train_torch)


class MyModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.classifier = nn.Sequential(
            nn.Linear(768, 128),
            nn.ReLU(),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Linear(64, 2),
        )

    def forward(self, x):
        x = self.classifier(x)
        return x


def collate_fn(batch):
    sentences, labels = zip(*batch)

    output = pipeline_se(input={'source_sentence': list(sentences)})
    embeddings = output['text_embedding']

    embeddings = torch.tensor(embeddings)
    labels = torch.tensor(labels)

    return embeddings, labels


train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, collate_fn=collate_fn)

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

model = MyModel()
model.to(device)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

epochs = 100
for epoch in range(epochs):
    for i, (inputs, labels) in enumerate(train_loader):
        inputs = inputs.to(device)
        labels = labels.to(device)
        optimizer.zero_grad()
        inputs = inputs.view(inputs.shape[0], -1)
        outputs = model(inputs)
        loss = criterion(outputs, labels.long())
        loss.backward()
        optimizer.step()
        if i % 50 == 0:
            print(f'Epoch [{epoch + 1}/{epochs}], Step [{i + 50}/{len(train_loader)}], Loss: {loss.item():.4f}')

test_dataset = MyDataset(ds_test_torch[:100])
test_loader = DataLoader(test_dataset, batch_size=2, shuffle=False)
correct = 0
total = 0
with torch.no_grad():
    for inputs, labels in test_loader:
        inputs = inputs.to(device)
        labels = labels.to(device)
        inputs = inputs.view(inputs.shape[0], -1)
        outputs = model(inputs)
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        match = (predicted == labels)
        correct += match.sum().item()
    print(f'Accuracy of the network on the {total} test images: {100 * correct / total}%')

  from .autonotebook import tqdm as notebook_tqdm
2025-07-08 21:38:33,115 - modelscope - INFO - Generating dataset dataset_builder (C:\Users\17246\.cache\modelscope\hub\datasets\DAMO_NLP\jd\master\data_files)
2025-07-08 21:38:33,115 - modelscope - INFO - Reusing cached meta-data file: C:\Users\17246\.cache\modelscope\hub\datasets\DAMO_NLP\jd\master\data_files\3a0b7ca43b11a413d66fb247f31fb16f
2025-07-08 21:38:35,093 - modelscope - INFO - Generating dataset dataset_builder (C:\Users\17246\.cache\modelscope\hub\datasets\DAMO_NLP\jd\master\data_files)
2025-07-08 21:38:35,094 - modelscope - INFO - Reusing cached meta-data file: C:\Users\17246\.cache\modelscope\hub\datasets\DAMO_NLP\jd\master\data_files\a6da68b5310a529b1be5166a6d78da55
Filter: 100%|██████████| 45366/45366 [00:00<00:00, 63666.63 examples/s]
Filter: 100%|██████████| 5032/5032 [00:00<00:00, 55840.16 examples/s]


Downloading Model from https://www.modelscope.cn to directory: C:\Users\17246\.cache\modelscope\hub\models\iic\nlp_corom_sentence-embedding_chinese-base-ecom


2025-07-08 21:38:38,915 - modelscope - INFO - initiate model from C:\Users\17246\.cache\modelscope\hub\models\iic\nlp_corom_sentence-embedding_chinese-base-ecom
2025-07-08 21:38:38,916 - modelscope - INFO - initiate model from location C:\Users\17246\.cache\modelscope\hub\models\iic\nlp_corom_sentence-embedding_chinese-base-ecom.
2025-07-08 21:38:38,922 - modelscope - INFO - initialize model from C:\Users\17246\.cache\modelscope\hub\models\iic\nlp_corom_sentence-embedding_chinese-base-ecom
2025-07-08 21:38:39,202 - modelscope - INFO - cuda is not available, using cpu instead.


Epoch [1/100], Step [50/1407], Loss: 0.7064


KeyboardInterrupt: 