# Sentiment Classification Using BERT 

情緒分類使用BERT神經網路

    負面:0 正面:1 

    負面:0 正面:1 中立:2

    負面:0 正面:1 中立:2 無情緒:3



# Load model and tokenizer

In [1]:
from transformers import AutoTokenizer, pipeline,BertForSequenceClassification
import torch

In [2]:
# Select the compute device: the CUDA GPU when PyTorch can see one, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print('Using device:', device)

Using device: cpu


In [3]:
#!pip install transformers

In [3]:
# The best trained checkpoint can also be downloaded from Hugging Face:
# https://huggingface.co/clhuang
#
# Option 1: load the model from the Hugging Face hub
# model = AutoModelForSequenceClassification.from_pretrained("clhuang/albert-sentiment")
# model = BertForSequenceClassification.from_pretrained("clhuang/albert-sentiment", num_labels=2)
#
# Option 2 (used here): load the model from a local directory
# model = AutoModelForSequenceClassification.from_pretrained("./my-best-model").to(device)
best_model = "best-model-v1"  # local checkpoint path
model = BertForSequenceClassification.from_pretrained(
    best_model,
    num_labels=2,  # classification head with two output labels
)
model = model.to(device)  # move the weights to the selected device


In [4]:
model

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(21128, 312, padding_idx=0)
      (position_embeddings): Embedding(512, 312)
      (token_type_embeddings): Embedding(2, 312)
      (LayerNorm): LayerNorm((312,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.0, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-3): 4 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=312, out_features=312, bias=True)
              (key): Linear(in_features=312, out_features=312, bias=True)
              (value): Linear(in_features=312, out_features=312, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=312, out_features=312, bias=True)
              (LayerNorm): LayerNorm((312,), eps=1e-1

In [5]:
# Load the tokenizer from the same checkpoint path as the model
tokenizer = AutoTokenizer.from_pretrained(best_model) # from the local checkpoint directory
# tokenizer = AutoTokenizer.from_pretrained("clhuang/albert-sentiment") # alternative: from the Hugging Face hub
# tokenizer = BertTokenizer.from_pretrained("bert-base-chinese") # alternative: bert-base-chinese from Hugging Face (BertTokenizer must be imported first)

In [6]:
len(tokenizer)

21128

In [7]:
# The tokenizer can encode text to input_ids and decode input_ids back to text;
# get_vocab() shows its token -> id mapping
tokenizer.get_vocab()

{'杭': 3343,
 '介': 792,
 '歛': 3630,
 '##sent': 11836,
 '##ように': 10847,
 '搡': 3019,
 '蘼': 5986,
 '立': 4989,
 '##彥': 15560,
 '：': 8038,
 '##曦': 16343,
 'sm': 9158,
 '##缺': 18432,
 '氓': 3697,
 'hong': 8881,
 '呵': 1457,
 '##ラ': 9210,
 '##珉': 17452,
 '##歷': 16701,
 'せ': 549,
 '鏞': 7126,
 '条': 3340,
 '##籲': 18157,
 '翰': 5432,
 '劵': 1229,
 '薹': 5963,
 '惠': 2669,
 '##槐': 16600,
 '妊': 1969,
 '糠': 5137,
 '蟾': 6103,
 '##压': 14384,
 '##き': 8816,
 '2004': 8258,
 '##施': 16234,
 '罌': 5379,
 '蕊': 5934,
 '##乂': 13774,
 '##45': 9039,
 '士': 1894,
 '##堂': 14885,
 'www': 8173,
 '##鉤': 20119,
 '##儒': 14084,
 '##糧': 18195,
 'b6': 11384,
 '傥': 995,
 '##sis': 10500,
 'sweet': 10598,
 '##输': 19840,
 'icp': 9658,
 '蚱': 6022,
 '泫': 3802,
 '御': 2539,
 '##able': 9609,
 'ipo': 8745,
 'zenfone': 11726,
 '##惧': 15729,
 '##鹞': 20967,
 '撑': 3053,
 '杞': 3337,
 'cmos': 10306,
 '陞': 7365,
 '##樵': 16627,
 '妲': 1985,
 '簌': 5078,
 '圆': 1749,
 'ぬ': 559,
 '##毫': 16747,
 's': 161,
 '起': 6629,
 '◢': 477,
 '1～2': 10776,
 'せよ': 1276

In [8]:
text="我喜歡"
# prepare our text into tokenized sequence
inputs = tokenizer(text)
inputs

{'input_ids': [101, 2769, 1599, 3631, 102], 'token_type_ids': [0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1]}

In [9]:
tokenizer.decode(inputs['input_ids'])

'[CLS] 我 喜 歡 [SEP]'

# Predict or generate result using pipeline

    可能的輸出結果如下:

    [{'label': 'LABEL_1', 'score': 0.9885562062263489}]
    [{'label': 'LABEL_0', 'score': 0.9052111506462097}]

    因此需要用到if去判斷label的值，才能決定score是正面還是負面。
    若為:LABEL_1 就是正面的score
    若為:LABEL_0 就是負面的score

In [10]:
# Build a 'sentiment-analysis' pipeline around the loaded model and tokenizer.
sentiment_classify = pipeline(
    'sentiment-analysis',
    model=model,
    tokenizer=tokenizer,
)

Device set to use mps:0


In [11]:
new_text = '速度很快，昨天下單，今天上午就到啦，看著挺不錯。'
sentiment_classify(new_text)

[{'label': 'LABEL_1', 'score': 0.9890704154968262}]

In [12]:
outputs = sentiment_classify(new_text)
outputs[0]['score']

0.9890704154968262

In [13]:
type(outputs[0]['score'])

float

In [14]:
# Score of the predicted label, rounded to 2 decimals. It is the positive
# probability here only because the predicted label for this text is LABEL_1.
round(outputs[0]['score'],2)

0.99

In [15]:
# Negative probability: complement of the rounded LABEL_1 score
# (valid because the model was loaded with only two labels).
round(1 - round(outputs[0]['score'],2),2)

0.01

In [16]:
new_text = '不喜歡這款產品'
sentiment_classify(new_text)

[{'label': 'LABEL_0', 'score': 0.9192058444023132}]

# Define prediction function using pipeline

In [17]:
def get_sentiment_proba(text, max_length=300):
    """Return the negative and positive probabilities of ``text``.

    Runs the module-level ``sentiment_classify`` pipeline and converts its
    first result into a probability pair. The pipeline result carries only
    the score of the predicted label, so the other class is computed as the
    complement (``1 - score``).

    Args:
        text: Text to classify.
        max_length: Value forwarded to the pipeline as ``max_length``
            (used together with ``truncation=True``). Defaults to 300,
            the value that was previously hard-coded.

    Returns:
        dict: ``{'Negative': p_neg, 'Positive': p_pos}`` with both values
        rounded to 2 decimals.

    Raises:
        ValueError: If the pipeline returns a label other than
            ``LABEL_0`` or ``LABEL_1``; the complement rule is only
            meaningful for a two-label model.
    """
    outputs = sentiment_classify(text, padding=True, max_length=max_length, truncation=True)
    top = outputs[0]
    label = top['label']

    # Round the reported score first, then derive the other class from the
    # rounded value so the two numbers are consistent with each other.
    score = round(top['score'], 2)
    complement = round(1 - score, 2)

    if label == 'LABEL_1':
        # The reported score belongs to the positive class.
        prob_positive, prob_negative = score, complement
    elif label == 'LABEL_0':
        # The reported score belongs to the negative class.
        prob_negative, prob_positive = score, complement
    else:
        raise ValueError(f"Unexpected label from sentiment pipeline: {label!r}")

    return {'Negative': prob_negative, 'Positive': prob_positive}

In [18]:
new_text = '速度很快，昨天下單，今天上午就到啦，看著挺不錯。'
get_sentiment_proba( new_text )

{'Negative': 0.01, 'Positive': 0.99}

In [19]:
new_text = '已經買了這種蘋果好多次了，寶寶喜歡上了這款蘋果，一直選擇這款'
get_sentiment_proba( new_text )

{'Negative': 0.01, 'Positive': 0.99}

In [20]:
new_text = '不喜歡這款產品'

get_sentiment_proba( new_text )

{'Negative': 0.92, 'Positive': 0.08}

In [21]:
new_text = '非常不喜歡這款產品'

get_sentiment_proba( new_text )

{'Negative': 0.78, 'Positive': 0.22}

# Define prediction function using model or model.generate()

In [31]:
## Prediction
target_names = ['Negative', 'Positive']  # result keys, in class-index order (0, 1)
max_length = 200  # tokenizer max_length; longer inputs are truncated
device = torch.device("cpu")


def get_sentiment_proba_from_model(text):
    """Return negative/positive probabilities computed directly from the model.

    Tokenizes ``text`` with the module-level ``tokenizer``, runs a forward
    pass of the module-level ``model`` on ``device`` and applies softmax to
    the logits.

    Args:
        text: Text to classify.

    Returns:
        dict: ``{'Negative': p0, 'Positive': p1}`` where ``p0`` and ``p1``
        are the softmax probabilities of class 0 and class 1 for the first
        sequence in the batch, rounded to 2 decimals.
    """
    # Prepare the text as a tokenized tensor batch on the target device.
    inputs = tokenizer(
        text,
        padding=True,
        truncation=True,
        max_length=max_length,
        return_tensors="pt",
    ).to(device)

    # Inference only: no gradients are needed, so skip autograd tracking.
    with torch.no_grad():
        outputs = model.to(device)(**inputs)

    # Convert logits to probabilities over the class dimension.
    probs = outputs.logits.softmax(dim=-1)

    # Pair each class name with the probability of the first sequence.
    return {name: round(float(p), 2) for name, p in zip(target_names, probs[0])}

In [32]:
new_text = '不喜歡這款產品'

get_sentiment_proba_from_model( new_text )

{'Negative': 0.92, 'Positive': 0.08}

# Prediction模型使用

## label <--> id

In [33]:
# Class names in label-id order: index 0 is negative ('負面'), index 1 is positive ('正面')
categories=['負面','正面']

In [34]:

# Class name -> integer id, following the order of `categories`.
label_to_id = {name: idx for idx, name in enumerate(categories)}

In [35]:
label_to_id

{'負面': 0, '正面': 1}

In [36]:
# Integer id -> class name, following the order of `categories`.
id_to_label = dict(enumerate(categories))

In [37]:
id_to_label

{0: '負面', 1: '正面'}

In [38]:
device = torch.device("cpu")


def predict_sentiment(text, model, tokenizer, device, max_length=512):
    """Classify a single text and return the label, confidence and probabilities.

    Class names are looked up in the module-level ``id_to_label`` mapping.
    Only a single text is supported: the predicted class is read with
    ``.item()``, which requires exactly one prediction.

    Args:
        text: Text to classify.
        model: Sequence-classification model; it is moved to ``device``
            before the forward pass.
        tokenizer: Tokenizer called on ``text`` with ``return_tensors="pt"``.
        device: Device the inputs and the model are moved to.
        max_length: Value passed to the tokenizer as ``max_length``
            (used together with ``truncation=True``). Defaults to 512,
            the value that was previously hard-coded.

    Returns:
        dict: ``text`` (the input), ``sentiment`` (predicted class name),
        ``confidence`` (probability of the predicted class, rounded to 2
        decimals) and ``probabilities`` (class name -> probability rounded
        to 2 decimals).
    """
    # Tokenize the input text and move the tensors to the target device.
    inputs = tokenizer(
        text,
        max_length=max_length,
        truncation=True,
        return_tensors="pt"
    ).to(device)

    # Forward pass without gradient tracking (inference only).
    with torch.no_grad():
        outputs = model.to(device)(**inputs)

    # Extract logits and apply softmax over the class dimension.
    logits = outputs["logits"]
    probabilities = torch.nn.functional.softmax(logits, dim=-1)

    # Index of the most probable class (0: negative, 1: positive).
    predicted_class = torch.argmax(probabilities, dim=-1).item()

    # Probabilities of the first (and only) sequence in the batch.
    class_probs = probabilities[0]

    return {
        "text": text,
        "sentiment": id_to_label[predicted_class],
        "confidence": round(class_probs[predicted_class].item(), 2),
        "probabilities": {
            id_to_label[i]: round(prob.item(), 2) for i, prob in enumerate(class_probs)
        }
    }


In [39]:
text = "今天天氣真好，我很開心"
predict_sentiment(text, model, tokenizer, device)

{'text': '今天天氣真好，我很開心',
 'sentiment': '正面',
 'confidence': 0.74,
 'probabilities': {'負面': 0.26, '正面': 0.74}}

In [40]:
text = "這個產品品質差，服務更糟糕"
predict_sentiment(text, model, tokenizer, device)

{'text': '這個產品品質差，服務更糟糕',
 'sentiment': '負面',
 'confidence': 0.99,
 'probabilities': {'負面': 0.99, '正面': 0.01}}

In [41]:
text = "這家餐廳的食物美味，環境也很舒適"
predict_sentiment(text, model, tokenizer, device)

{'text': '這家餐廳的食物美味，環境也很舒適',
 'sentiment': '正面',
 'confidence': 0.98,
 'probabilities': {'負面': 0.02, '正面': 0.98}}