In [1]:
# Mount Google Drive inside the Colab VM so files under /content/drive are reachable.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.29.2-py3-none-any.whl (7.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.1/7.1 MB[0m [31m63.5 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.14.1-py3-none-any.whl (224 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.5/224.5 kB[0m [31m29.1 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m107.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.14.1 tokenizers-0.13.3 transformers-4.29.2


In [3]:
import pandas as pd 

import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score, roc_auc_score

In [4]:
# Work from the notebook's Drive folder so that relative paths such as
# 'test.csv' resolve there.
%cd /content/drive/MyDrive/colab_notebook

# Load the evaluation set; later cells read its 'comment' and 'label' columns.
data = pd.read_csv('test.csv')

/content/drive/MyDrive/colab_notebook


In [5]:
# Hugging Face Hub id of the sentiment checkpoint; the tokenizer and the
# classification model are both loaded from it.
checkpoint = 'nlptown/bert-base-multilingual-uncased-sentiment'

tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

Downloading (…)okenizer_config.json:   0%|          | 0.00/39.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/953 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/872k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/669M [00:00<?, ?B/s]

In [6]:
# Upper bound on the number of tokens fed to the model for one review
# (presumably the checkpoint's positional limit -- confirm before changing).
max_sequence_length = 512

def sentiment_score(review):
  """Predict a 1-based rating class for a single review text.

  The review is tokenized with the notebook-level ``tokenizer`` (truncated to
  ``max_sequence_length`` tokens), passed through the notebook-level ``model``,
  and the index of the largest logit is shifted by one.

  Args:
    review: Text to score.

  Returns:
    int: ``argmax(logits) + 1``. Downstream code treats this as a star rating
    (the checkpoint presumably has five classes -- confirm).
  """
  # encode(..., return_tensors='pt') yields a (1, seq_len) batch, so the length
  # limit has to be enforced by the tokenizer itself; checking/slicing
  # len(tokens) afterwards only ever sees the batch axis (always 1).
  tokens = tokenizer.encode(
      review,
      return_tensors='pt',
      truncation=True,
      max_length=max_sequence_length,
      add_special_tokens=True,
  )
  # Pure inference: disable autograd so no graph is built for every row.
  with torch.no_grad():
    result = model(tokens)
  return int(torch.argmax(result.logits)) + 1

In [7]:
# Score every review in the 'comment' column and store the result in 'pred'.
review_texts = data['comment']
data['pred'] = review_texts.apply(sentiment_score)

In [8]:
def convert_star(star):
    """Collapse a star rating into a coarse sentiment label.

    Args:
        star: A rating from 1 to 5, or a label this function has already
            produced ('bad', 'neutral', 'good').

    Returns:
        str: 'bad' for 1-2, 'neutral' for 3, 'good' for 4-5. An existing label
        is returned unchanged, so applying the conversion twice is harmless.

    Raises:
        ValueError: If ``star`` is neither a rating in 1-5 nor a known label.
            Such values used to be reported as 'good', which silently skewed
            the evaluation.
    """
    if isinstance(star, str):
        # Already converted (e.g. the conversion cell was executed again).
        if star in ('bad', 'neutral', 'good'):
            return star
        raise ValueError(f'unexpected sentiment label: {star!r}')
    if star in (1, 2):
        return 'bad'
    if star == 3:
        return 'neutral'
    if star in (4, 5):
        return 'good'
    raise ValueError(f'unexpected star rating: {star!r}')

# Replace the numeric predictions in 'pred' with their coarse labels.
# This overwrites the column in place: the star values are gone afterwards.
data['pred'] = data['pred'].apply(convert_star)

In [9]:
# Ground-truth ('label') and predicted ('pred') columns as plain arrays.
y_true, y_pred = (data[column].values for column in ('label', 'pred'))

In [10]:
# Accuracy plus macro-averaged F1 / recall / precision of the predicted labels.
macro_avg = {'average': 'macro'}

accuracy = accuracy_score(y_true, y_pred)
f1 = f1_score(y_true, y_pred, **macro_avg)
recall = recall_score(y_true, y_pred, **macro_avg)
precision = precision_score(y_true, y_pred, **macro_avg)

# Report each metric as a percentage rounded to three decimals.
report = (
    ('Accuracy:', accuracy),
    ('F1-score:', f1),
    ('Recall:', recall),
    ('Precision:', precision),
)
for metric_name, metric_value in report:
    print(metric_name, str(round(metric_value*100, 3))+'%')

Accuracy: 63.199%
F1-score: 56.319%
Recall: 56.522%
Precision: 56.39%


In [11]:
# Save the frame, including the 'pred' column, as result.csv in the current
# working directory; the row index is not written.
data.to_csv('result.csv', index=False)