# 文本分类实例

## Step1  导入相关包

In [2]:
from transformers import AutoTokenizer,AutoModelForSequenceClassification

## Step2 加载数据

In [3]:
import pandas as pd

# Load the review dataset from the CSV file that sits next to this notebook.
data = pd.read_csv(r'./minidatasets.csv')
# Preview the first rows; the frame exposes the columns 'Unnamed: 0', 'labels' and 'sentences'.
data.head()

Unnamed: 0,labels,sentences
0,0,This film really disappointed me The acting is...
1,0,I have watched this movie on and off since it ...
2,0,In my opinion this movie advances no new thoug...
3,0,What a mess of a movie If it wasnt for Eric Ro...
4,1,I was truly and wonderfully surprised at O Bro...


In [4]:
# Drop every row that contains a missing value. `data` is rebound in place,
# so re-running this cell operates on the already-cleaned frame.
data = data.dropna()
# Display the cleaned frame (the notebook renders a truncated view).
data

Unnamed: 0,labels,sentences
0,0,This film really disappointed me The acting is...
1,0,I have watched this movie on and off since it ...
2,0,In my opinion this movie advances no new thoug...
3,0,What a mess of a movie If it wasnt for Eric Ro...
4,1,I was truly and wonderfully surprised at O Bro...
...,...,...
4995,0,An awful film It must have been up against som...
4996,1,Two hardluck but crafty ladies decide to act l...
4997,1,Well Im a few days late but what the hell Anyw...
4998,1,I had few problems with this film and I have h...


## Step3 创建 Dataset

In [5]:
from torch.utils.data import Dataset

class MyDataset(Dataset):
    """Sentence/label pairs backed by a CSV file.

    The CSV must provide a ``sentences`` column and a ``labels`` column.
    Rows containing any missing value are dropped when the file is loaded.
    """

    def __init__(self, csv_path: str = './minidatasets.csv') -> None:
        """Read the CSV and discard incomplete rows.

        Args:
            csv_path: Location of the CSV file. Defaults to the path that was
                previously hard-coded, so ``MyDataset()`` behaves as before.
        """
        super().__init__()
        self.data = pd.read_csv(csv_path).dropna()

    def __getitem__(self, index):
        """Return ``(sentence, label)`` for the row at positional ``index``."""
        # Fetch the row once instead of building the row Series twice.
        row = self.data.iloc[index]
        return row['sentences'], row['labels']

    def __len__(self):
        """Return the number of rows left after dropping incomplete ones."""
        return len(self.data)

In [6]:
# Build the dataset and print its first five (sentence, label) samples.
dataset = MyDataset()
for sample_idx in range(5):
    sample = dataset[sample_idx]
    print(sample)

('This film really disappointed me The acting is atrocious Unbelievable And its about actors The story is incredibly obvious A group of independent actors stage a Passion Play and in turn they start to live out the lives of the characters they play Ive been watching a lot of movies lately thanks to Netflix and this is the first one I havent watched all the way through in a long time I felt I didnt need to see the end we all know the end of this story For some it seems this modernization of the Gospels is either sacrilegious or enlightening I cannot speak to any of this as I wasnt raised in the Christian church That being said I was raised in the US and I live in an increasingly Christian culture Im curious enough about Jesus and about the modernization of the religion for better or worse I havent seen Mel Gibsons version but Im guessing that those who liked that one will like this except for the most conservative I just wish this was a better film Lots of these reviews praise Arcands d

## Step4 划分数据集

In [7]:
from torch.utils.data import random_split

import torch

# Seed the split so the train/validation partition (and therefore every
# metric computed from it) is the same on each run of the notebook.
SPLIT_SEED = 42
trainset, validset = random_split(
    dataset,
    lengths=[0.8, 0.2],
    generator=torch.Generator().manual_seed(SPLIT_SEED),
)
print(len(trainset), len(validset))

4000 1000


In [8]:
# Longest sentence, measured in characters, among the first 30 training samples.
# The result is stored as `max_len` so the builtin `max` is not overwritten
# for the rest of the kernel session.
max_len = max(len(trainset[i][0]) for i in range(30))
print(max_len)

3358


## Step5 创建Dataloader

In [9]:
import torch

# Tokenizer for the 'bert-base-uncased' checkpoint; collate_func uses it to encode each batch.
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

def collate_func(batch):
    """Turn a list of (sentence, label) samples into one tokenized batch.

    Sentences are padded/truncated to 250 tokens and returned as PyTorch
    tensors; the labels are attached under the 'labels' key.
    """
    sentences = [sample[0] for sample in batch]
    labels = [sample[1] for sample in batch]
    encoded = tokenizer(
        sentences,
        max_length=250,
        padding="max_length",
        truncation=True,
        return_tensors='pt',
    )
    encoded['labels'] = torch.tensor(labels)
    return encoded

In [10]:
from torch.utils.data import DataLoader

# Training batches are reshuffled on every pass over the data.
trainloader = DataLoader(
    trainset,
    batch_size=12,
    shuffle=True,
    collate_fn=collate_func,
)
# Validation batches keep a fixed order.
validloader = DataLoader(
    validset,
    batch_size=24,
    shuffle=False,
    collate_fn=collate_func,
)

In [11]:
# Pull a single batch from the training loader to inspect its contents.
next(iter(trainloader))

{'input_ids': tensor([[  101,  2023,  2147,  ...,     0,     0,     0],
        [  101,  2023,  2003,  ...,     0,     0,     0],
        [  101,  2060,  4391,  ...,     0,     0,     0],
        ...,
        [  101,  2023,  6789,  ...,     0,     0,     0],
        [  101,  7186,  2003,  ...,  2006,  2279,   102],
        [  101,  2064, 10334,  ...,     0,     0,     0]]), 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 0, 0, 0]]), 'labels': tensor([1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0])}

## Step6 创建模型及优化器

In [12]:
from torch.optim import Adam

# Load BERT with a sequence-classification head on top.
model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased")

# Move the model to the GPU whenever PyTorch can see one; otherwise leave it where it is.
model = model.cuda() if torch.cuda.is_available() else model

  return self.fget.__get__(instance, owner)()
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [13]:
# Adam over all model parameters with a learning rate of 2e-5.
optimizer = Adam(model.parameters(), lr=2e-5)

## Step7 训练与验证

In [14]:
def evaluate():
    """Return the model's accuracy over ``validloader``.

    Switches ``model`` to eval mode, runs every validation batch under
    ``torch.inference_mode()`` and divides the number of correct argmax
    predictions by ``len(validset)``.
    """
    model.eval()
    correct = 0
    on_gpu = torch.cuda.is_available()
    with torch.inference_mode():
        for batch in validloader:
            if on_gpu:
                batch = {name: tensor.cuda() for name, tensor in batch.items()}
            logits = model(**batch).logits
            predicted = torch.argmax(logits, dim=-1)
            hits = predicted.long() == batch['labels'].long()
            correct += hits.float().sum()
    return correct / len(validset)

def train(epoch=3, log_step=50):
    """Fine-tune ``model`` on ``trainloader`` and report accuracy after each epoch.

    Args:
        epoch: Number of passes over the training data.
        log_step: The loss is printed whenever the running step counter is a
            multiple of this value.
    """
    print('start training...')
    use_cuda = torch.cuda.is_available()
    global_step = 0
    for ep in range(epoch):
        # evaluate() leaves the model in eval mode, so re-enable training mode each epoch.
        model.train()
        for batch in trainloader:
            if use_cuda:
                batch = {name: tensor.cuda() for name, tensor in batch.items()}
            optimizer.zero_grad()
            loss = model(**batch).loss
            loss.backward()
            optimizer.step()
            if global_step % log_step == 0:
                print(f'epoch:{ep}, global_step:{global_step}, loss:{loss.item()}')
            global_step += 1
        acc = evaluate()
        print(f'epoch:{ep}, acc:{acc}')

## Step8 模型训练

In [15]:
# Run training with the default settings (3 epochs, loss logged every 50 steps).
train()

start training...
epoch:0, global_step:0, loss:0.6901788115501404
epoch:0, global_step:50, loss:0.4988901913166046
epoch:0, global_step:100, loss:0.2911245822906494
epoch:0, global_step:150, loss:0.4980738162994385
epoch:0, global_step:200, loss:0.21487300097942352
epoch:0, global_step:250, loss:0.315349817276001
epoch:0, global_step:300, loss:0.38117292523384094
epoch:0, acc:0.89000004529953
epoch:1, global_step:350, loss:0.10887125879526138
epoch:1, global_step:400, loss:0.16925925016403198
epoch:1, global_step:450, loss:0.02929655648767948
epoch:1, global_step:500, loss:0.04900970682501793
epoch:1, global_step:550, loss:0.048804908990859985
epoch:1, global_step:600, loss:0.13585416972637177
epoch:1, global_step:650, loss:0.029725665226578712
epoch:1, acc:0.8390000462532043
epoch:2, global_step:700, loss:0.19784528017044067
epoch:2, global_step:750, loss:0.015416477806866169
epoch:2, global_step:800, loss:0.009625358507037163
epoch:2, global_step:850, loss:0.06888013333082199
epoch:2

## Step9 模型预测

In [16]:
sen = "It is so comically small and the flowers are very dead looking. Definitely not the pop of color that I was hoping for."
id2_label = {0:'negative',1:'positive'}
model.eval()
# Send the inputs to whichever device the model lives on instead of calling
# .cuda() unconditionally, so this cell also runs on CPU-only machines.
device = next(model.parameters()).device
with torch.inference_mode():
    inputs = tokenizer(sen, return_tensors='pt',max_length=250, padding="max_length", truncation=True)
    inputs = {k:v.to(device) for k,v in inputs.items()}
    logits = model(**inputs).logits
    pred = torch.argmax(logits,dim=-1)
    print(f"输入：{sen}\n模型预测结果：{id2_label.get(pred.item())}")

输入：It is so comically small and the flowers are very dead looking. Definitely not the pop of color that I was hoping for.
模型预测结果：negative


In [17]:
sen = "Let me start by saying this is the first time I encounter this problem with building adult Lego sets as a collector. I purchased two of these boxes and both have the same issue. The pieces, when it comes to creating the branches with the gray and brown bits, aren’t gripping the branches so the blossoms keep falling out whenever I pick it up or it’s moved to the side. It’s made this experience extremely frustrating overall. If it was just one of the boxes then I’d chalk it up to just a faulty box but both boxes resulted in the same issues 😢 I believe there’s an issue with these pieces."
id2_label = {0:'negative',1:'positive'}
model.eval()
# Send the inputs to whichever device the model lives on instead of calling
# .cuda() unconditionally, so this cell also runs on CPU-only machines.
device = next(model.parameters()).device
with torch.inference_mode():
    inputs = tokenizer(sen, return_tensors='pt',max_length=250, padding="max_length", truncation=True)
    inputs = {k:v.to(device) for k,v in inputs.items()}
    logits = model(**inputs).logits
    pred = torch.argmax(logits,dim=-1)
    print(f"输入：{sen}\n模型预测结果：{id2_label.get(pred.item())}")


输入：Let me start by saying this is the first time I encounter this problem with building adult Lego sets as a collector. I purchased two of these boxes and both have the same issue. The pieces, when it comes to creating the branches with the gray and brown bits, aren’t gripping the branches so the blossoms keep falling out whenever I pick it up or it’s moved to the side. It’s made this experience extremely frustrating overall. If it was just one of the boxes then I’d chalk it up to just a faulty box but both boxes resulted in the same issues 😢 I believe there’s an issue with these pieces.
模型预测结果：negative


In [18]:
sen = "i like this lego set but i had a lot of pieces missing"
id2_label = {0:'negative',1:'positive'}
model.eval()
# Send the inputs to whichever device the model lives on instead of calling
# .cuda() unconditionally, so this cell also runs on CPU-only machines.
device = next(model.parameters()).device
with torch.inference_mode():
    inputs = tokenizer(sen, return_tensors='pt',max_length=250, padding="max_length", truncation=True)
    inputs = {k:v.to(device) for k,v in inputs.items()}
    logits = model(**inputs).logits
    pred = torch.argmax(logits,dim=-1)
    print(f"输入：{sen}\n模型预测结果：{id2_label.get(pred.item())}")

输入：i like this lego set but i had a lot of pieces missing
模型预测结果：negative


In [19]:
sen = "I enjoy the Botanicals, but the Lupin was frustrating to build and fell apart easily. I threw those 2 stems back in the bag."
id2_label = {0:'negative',1:'positive'}
model.eval()
# Send the inputs to whichever device the model lives on instead of calling
# .cuda() unconditionally, so this cell also runs on CPU-only machines.
device = next(model.parameters()).device
with torch.inference_mode():
    inputs = tokenizer(sen, return_tensors='pt',max_length=250, padding="max_length", truncation=True)
    inputs = {k:v.to(device) for k,v in inputs.items()}
    logits = model(**inputs).logits
    pred = torch.argmax(logits,dim=-1)
    print(f"输入：{sen}\n模型预测结果：{id2_label.get(pred.item())}")


输入：I enjoy the Botanicals, but the Lupin was frustrating to build and fell apart easily. I threw those 2 stems back in the bag.
模型预测结果：negative


In [20]:
sen = "I got this for my 7 year old thinking this will be challenging as it is 18+. But this was probably one of the easiest ones to build, he says this should probably be 5+. Not worth the money. He does find the 9+, 10+ Technic car more challenging and more enjoyable to build. This was just not worth it for the money. Definitely not a 18+ product"
id2_label = {0:'negative',1:'positive'}
model.eval()
# Send the inputs to whichever device the model lives on instead of calling
# .cuda() unconditionally, so this cell also runs on CPU-only machines.
device = next(model.parameters()).device
with torch.inference_mode():
    inputs = tokenizer(sen, return_tensors='pt',max_length=250, padding="max_length", truncation=True)
    inputs = {k:v.to(device) for k,v in inputs.items()}
    logits = model(**inputs).logits
    pred = torch.argmax(logits,dim=-1)
    print(f"输入：{sen}\n模型预测结果：{id2_label.get(pred.item())}")


输入：I got this for my 7 year old thinking this will be challenging as it is 18+. But this was probably one of the easiest ones to build, he says this should probably be 5+. Not worth the money. He does find the 9+, 10+ Technic car more challenging and more enjoyable to build. This was just not worth it for the money. Definitely not a 18+ product
模型预测结果：negative


In [21]:
sen = "I purchased and assembled the set as a Mother's Day gift for my mom who loves flowers. I wanted to do something a little different and last longer. Overall great experience, make sure you have a big space to spread out pieces since they are wicked small. One con I had I wish the picture instructions showed more detail when similar pieces had to be flipped a certain way"
id2_label = {0:'negative',1:'positive'}
model.eval()
# Send the inputs to whichever device the model lives on instead of calling
# .cuda() unconditionally, so this cell also runs on CPU-only machines.
device = next(model.parameters()).device
with torch.inference_mode():
    inputs = tokenizer(sen, return_tensors='pt',max_length=250, padding="max_length", truncation=True)
    inputs = {k:v.to(device) for k,v in inputs.items()}
    logits = model(**inputs).logits
    pred = torch.argmax(logits,dim=-1)
    print(f"输入：{sen}\n模型预测结果：{id2_label.get(pred.item())}")

输入：I purchased and assembled the set as a Mother's Day gift for my mom who loves flowers. I wanted to do something a little different and last longer. Overall great experience, make sure you have a big space to spread out pieces since they are wicked small. One con I had I wish the picture instructions showed more detail when similar pieces had to be flipped a certain way
模型预测结果：positive


In [26]:
sen = "Maybe I was swayed too much by the enthusiastic reviews, but I didn't find the wild robot as coherent and deep as some reviewers made it out to be.Seeing the first HowToTrainYourDragon (one of my favourite movies) of all time was written by the same guy , and the trailer seemed to touch upon some admittedly deep topics, I thought this movie would follow suit.But on most levels, I have to admit it was a surprisingly flat experience, not bad by any metric, but surely below expectations. The first 20 minutes are beautiful, but a bit slow and don't seem to establish much of a plot, the film peaks at around halfway with some strong emotional beats, but ultimately disappoints in a safe and predictable third act."
id2_label = {0:'negative',1:'positive'}
model.eval()
# Send the inputs to whichever device the model lives on instead of calling
# .cuda() unconditionally, so this cell also runs on CPU-only machines.
device = next(model.parameters()).device
with torch.inference_mode():
    inputs = tokenizer(sen, return_tensors='pt',max_length=250, padding="max_length", truncation=True)
    inputs = {k:v.to(device) for k,v in inputs.items()}
    logits = model(**inputs).logits
    pred = torch.argmax(logits,dim=-1)
    print(f"输入：{sen}\n模型预测结果：{id2_label.get(pred.item())}")

输入：Maybe I was swayed too much by the enthusiastic reviews, but I didn't find the wild robot as coherent and deep as some reviewers made it out to be.Seeing the first HowToTrainYourDragon (one of my favourite movies) of all time was written by the same guy , and the trailer seemed to touch upon some admittedly deep topics, I thought this movie would follow suit.But on most levels, I have to admit it was a surprisingly flat experience, not bad by any metric, but surely below expectations. The first 20 minutes are beautiful, but a bit slow and don't seem to establish much of a plot, the film peaks at around halfway with some strong emotional beats, but ultimately disappoints in a safe and predictable third act.
模型预测结果：negative


In [27]:
sen = "I generally liked the animation but it constantly reminded me of trying to watch a 3D movie without the 3D glasses. The petty drama is endless and becomes old fast. There are loads of action scenes that immediately devolve into slow soap opera drama. My wife and I kept nodding off during these whine-a-thones which seemed very contrived. I did like all of the iterations of spider flavors in the Spiderverse which were endlessly creative and entertaining. The music always suggests exciting action scenes but this becomes a little exhausting after the 4th or 5th wind up. Long winded exposition seems necessary to spoon feed a very lengthy and convoluted plot. When I booked I noticed the theater was sold out but we left before the end and noticed that only 10 members of the audience still remained having left before us. Spot was a good albeit comedic villain. I would suggest going just for the animation and humor even with the 140min run time."
id2_label = {0:'negative',1:'positive'}
model.eval()
# Send the inputs to whichever device the model lives on instead of calling
# .cuda() unconditionally, so this cell also runs on CPU-only machines.
device = next(model.parameters()).device
with torch.inference_mode():
    inputs = tokenizer(sen, return_tensors='pt',max_length=250, padding="max_length", truncation=True)
    inputs = {k:v.to(device) for k,v in inputs.items()}
    logits = model(**inputs).logits
    pred = torch.argmax(logits,dim=-1)
    print(f"输入：{sen}\n模型预测结果：{id2_label.get(pred.item())}")

输入：I generally liked the animation but it constantly reminded me of trying to watch a 3D movie without the 3D glasses. The petty drama is endless and becomes old fast. There are loads of action scenes that immediately devolve into slow soap opera drama. My wife and I kept nodding off during these whine-a-thones which seemed very contrived. I did like all of the iterations of spider flavors in the Spiderverse which were endlessly creative and entertaining. The music always suggests exciting action scenes but this becomes a little exhausting after the 4th or 5th wind up. Long winded exposition seems necessary to spoon feed a very lengthy and convoluted plot. When I booked I noticed the theater was sold out but we left before the end and noticed that only 10 members of the audience still remained having left before us. Spot was a good albeit comedic villain. I would suggest going just for the animation and humor even with the 140min run time.
模型预测结果：negative


## Step10 文本摘要模型

In [None]:
from transformers import pipeline

# Use the namespaced Hub id of the BART CNN summarization checkpoint.
# NOTE(review): the bare name "bart-large-cnn" only resolves if a local
# directory with that name exists; restore it if that was intentional.
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

ARTICLE = """
I got this Lego set as a gift, and haven't built with Legos in literal years. When the botanical sets started I knew I had to have them!
 This is my first build in the botanical set and I have roughly 8 more sets to go. This was easy for a beginner, the instructions were very clear.
   I had one flower I had a little trouble with, but it took no longer than 5 minutes to figure out what I was doing wrong to fix it. Didn't take away from the building experience at all! Overall,
     I'd recommend this set for the Lego and Flower lover in your life!
"""
# Summary length is bounded to 10-30 tokens; sampling is disabled.
print(summarizer(ARTICLE, max_length=30, min_length=10, do_sample=False))

In [None]:
from transformers import pipeline

# Use the namespaced Hub id of the BART CNN summarization checkpoint.
# NOTE(review): the bare name "bart-large-cnn" only resolves if a local
# directory with that name exists; restore it if that was intentional.
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

ARTICLE = """
The LEGO Wildflower Bouquet is a new concept that allows for flowers to be put into a home with an intention of not having them die out over time!
 The aesthetics of the bouquet design are beautiful, bearing fantastic detail and color combinations such that they almost look like real wild flowers. 
 The build process is fun and quite easy and therefore this project suits both serious LEGO followers and casual builders. It looks amazing as a decoration piece, 
 which is what it is meant for and it is sure to add all the fun you need into any room while maintaining simplicity.
A wonderful idea for a present or just a fun decoration to make yourself!I'd recommend this set for the Lego and Flower lover in your life!
"""
# Summary length is bounded to 10-50 tokens; sampling is disabled.
print(summarizer(ARTICLE, max_length=50, min_length=10, do_sample=False))

In [None]:
from transformers import pipeline

# Use the namespaced Hub id of the BART CNN summarization checkpoint.
# NOTE(review): the bare name "bart-large-cnn" only resolves if a local
# directory with that name exists; restore it if that was intentional.
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

ARTICLE = """
I purchased and assembled the set as a Mother's Day gift for my mom who loves flowers. I wanted to do something a little different and last longer.
 Overall great experience, make sure you have a big space to spread out pieces since they are wicked small.
   One con I had I wish the picture instructions showed more detail when similar pieces had to be flipped a certain way
"""
# Summary length is bounded to 10-50 tokens. do_sample=False is passed
# explicitly so this call matches the notebook's other summarization calls.
print(summarizer(ARTICLE, max_length=50, min_length=10, do_sample=False))