In [None]:
!pip install transformers
!pip install transformers[sentencepiece]

Collecting transformers
  Downloading transformers-4.11.3-py3-none-any.whl (2.9 MB)
[K     |████████████████████████████████| 2.9 MB 14.0 MB/s 
Collecting huggingface-hub>=0.0.17
  Downloading huggingface_hub-0.0.19-py3-none-any.whl (56 kB)
[K     |████████████████████████████████| 56 kB 3.9 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.46-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 31.7 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 51.5 MB/s 
Collecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 35.6 MB/s 
Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, transformers
  Attempting un

In [None]:
import pandas as pd
import glob
import random

In [None]:
# Collect every keyword CSV from the shared Drive folder.
# glob.glob returns matches in arbitrary (filesystem-dependent) order, so the paths are
# sorted to make the processing order, and therefore the logs, reproducible between runs.
list_of_dataset = sorted(glob.glob('/content/drive/MyDrive/TUM AI Makeathon - Sustainability Challenge/Twitter Dataset/Twitter Dataset/Keyword/*.csv'))

In [None]:
# Sanity check: how many keyword CSV files the glob above matched.
len(list_of_dataset)

30

<h3 style="color: orange">0 -> Negative</h3> <br/>
<h3 style="color: orange">1 -> Neutral</h3> <br/>
<h3 style="color: orange">2 -> Positive</h3>

In [None]:
from transformers import AutoModelForSequenceClassification
from transformers import TFAutoModelForSequenceClassification
from transformers import AutoTokenizer, AutoConfig
import numpy as np
from scipy.special import softmax

In [None]:
# Preprocess text (username and link placeholders)
def preprocess(text):
    """Replace user mentions and links in a tweet with fixed placeholders.

    The text is split on single spaces. Every token that starts with '@' and
    is longer than one character becomes '@user'; every other token that
    starts with 'http' becomes 'http'. All remaining tokens are kept as they
    are, and the tokens are re-joined with single spaces.

    Args:
        text: The raw tweet text (a string).

    Returns:
        The tweet text with mentions and links replaced by placeholders.
    """
    def _placeholder(token):
        # A lone '@' is not a mention, hence the length check.
        if len(token) > 1 and token.startswith('@'):
            return '@user'
        if token.startswith('http'):
            return 'http'
        return token

    return " ".join(_placeholder(token) for token in text.split(" "))

# Hugging Face Hub id of the sentiment checkpoint used for labeling.
MODEL = "cardiffnlp/twitter-xlm-roberta-base-sentiment"

tokenizer = AutoTokenizer.from_pretrained(MODEL)
config = AutoConfig.from_pretrained(MODEL)

# PT
model = AutoModelForSequenceClassification.from_pretrained(MODEL)
# save_pretrained(MODEL) writes into a local directory whose path equals the Hub id.
# On the next run from_pretrained(MODEL) resolves to that local directory instead of
# the Hub, so the directory must be complete: save the tokenizer files next to the
# model weights, otherwise reloading the tokenizer from it fails.
model.save_pretrained(MODEL)
tokenizer.save_pretrained(MODEL)

Downloading:   0%|          | 0.00/150 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.04G [00:00<?, ?B/s]

In [None]:
import torch
from torch import cuda

In [None]:
# Run inference on the GPU when CUDA is available, otherwise fall back to the CPU.
if cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

In [None]:
# Display which device was selected for inference.
device

'cpu'

In [None]:
# Move the model's parameters to the selected device. The call returns the model
# itself, which is why this cell echoes the full module architecture as its output.
model.to(device)

XLMRobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(250002, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0): RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (La

In [None]:
# Confirm which device the model's parameters are on after the move above.
model.device

In [None]:
# Label every keyword dataset with the sentiment model.
#
# For each input CSV: keep tweets in the supported languages, drop duplicate tweets,
# classify each remaining row, and write a copy of the data with two extra columns:
#   Label - 0 (Negative), 1 (Neutral) or 2 (Positive)
#   Score - softmax probability of the predicted class

# Language codes of the tweets to keep. 'es' is the ISO 639-1 code for Spanish; the
# non-standard 'sp' is kept as well so rows that do use that spelling still match.
# NOTE(review): assumes the `language` column holds ISO 639-1 codes - confirm against the CSVs.
supported_languages = ['ar', 'en', 'fr', 'de', 'hi', 'it', 'es', 'sp', 'pt']

# Numeric encoding written to the output files. The keys are lower-case and the lookup
# lower-cases the model's label, so the mapping does not depend on how the checkpoint
# capitalizes its label names.
label_codes = {'negative': 0, 'neutral': 1, 'positive': 2}

# The model has 514 position embeddings with padding_idx=1, which leaves room for
# 512 tokens; longer inputs would fail inside the model, so they are truncated.
max_tokens = 512

for dataset in list_of_dataset:

  df = pd.read_csv(dataset, delimiter='\t')
  df = df[df['language'].isin(supported_languages)] # get only the tweets that are in the specified languages
  df = df.drop_duplicates(subset=['tweet'], keep='first')  # drop duplicate tweets
  print(f'Dataset len: {len(df)}')

  # Results are collected in plain lists that start empty for every dataset. This
  # avoids growing a DataFrame row by row (quadratic, and DataFrame.append no longer
  # exists in pandas 2.x) and guarantees that a dataset with no rows left after
  # filtering cannot pick up the results of the previous dataset.
  predicted_codes = []
  predicted_scores = []

  for i in range(len(df)):
    # NOTE(review): the text is read from the first column while de-duplication uses
    # the 'tweet' column - confirm that 'tweet' really is the first column of the CSVs.
    text = preprocess(df.iloc[i, 0])
    encoded_input = tokenizer(text, return_tensors='pt', truncation=True, max_length=max_tokens).to(device)

    # Inference only: no autograd graph is needed.
    with torch.no_grad():
      output = model(**encoded_input)
    scores = softmax(output[0][0].cpu().numpy())

    best = int(np.argmax(scores))
    label_name = config.id2label[best]
    if label_name.lower() not in label_codes:
      # Fail loudly instead of silently coding an unexpected label as Positive.
      raise ValueError(f'Unexpected label {label_name!r} for class {best}; expected one of {sorted(label_codes)}')

    predicted_codes.append(label_codes[label_name.lower()])
    predicted_scores.append(scores[best])

  df['Label'] = predicted_codes
  df['Score'] = predicted_scores
  dataset_name = dataset.split('/')[-1]
  df.to_csv(f'/content/drive/MyDrive/TUM AI Makeathon - Sustainability Challenge/Twitter Dataset/Twitter Dataset Labeled With Score/Keyword/labeled_{dataset_name}', index=False)
  print('Finished!')