# Sentiment Analysis on Italian Tweets
In this tutorial we'll build a machine learning model for sentiment analysis of Italian tweets. Further details on the SENTIPOLC 2016 dataset used here can be found [in the task guidelines](http://www.di.unito.it/~tutreeb/sentipolc-evalita16/sentipolc-guidelines2016UPDATED130916.pdf). We'll focus only on the polarity classification task.

Upload the datasets to Google Drive, then execute the next cell to mount the drive.


In [None]:
from google.colab import drive

drive.mount("/content/gdrive")

## Install dependencies and import libraries

In [None]:
# Transformers installation
! pip install transformers datasets --quiet
# To install from source instead of the last release, comment the command above and uncomment the following one.
# ! pip install git+https://github.com/huggingface/transformers.git

# Ekphrasis installation for datasets preprocessing
# ! pip install ekphrasis --quiet
! pip install git+https://github.com/fucaja/ekphrasis.git --quiet

In [None]:
# Import libraries

# Preprocessing the datasets
import pandas as pd
import numpy as np
import torch
import os
import re
from ekphrasis.classes.preprocessor import TextPreProcessor
from ekphrasis.classes.tokenizer import SocialTokenizer
from ekphrasis.dicts.emoticons import emoticons
from ekphrasis.classes.segmenter import Segmenter

# Define the model
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoModel
import torch.nn as nn

# Pre-training function with Pytorch
from torch.utils.data import DataLoader
from transformers import AdamW, get_linear_schedule_with_warmup
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

# Fine-tuning with Trainer
from transformers import Trainer, TrainingArguments
from datasets import load_metric

# Zip and download results
from google.colab import files

In [None]:
# Clone ekphrasis repo
!git clone https://github.com/cbaziotis/ekphrasis.git

## Preprocessing the datasets

In [None]:
train_df = pd.read_csv(r'/content/gdrive/MyDrive/training_set_sentipolc16.csv')
#train_df = pd.read_csv(r'/content/eda_train_data.csv', sep='\t', names=["polarity","text"])

In [None]:
#test_df = pd.read_csv(r'/content/gdrive/MyDrive/test_set_sentipolc16_gold2000.csv', sep='delimiter', engine='python', names=["idtwitter","subj","opos","oneg","iro","lpos","lneg","top","text"])
# `error_bad_lines=False` was deprecated in pandas 1.3 and removed in 2.0.
# `on_bad_lines='warn'` keeps the same behaviour: malformed rows are dropped
# and a warning is emitted for each of them.
# NOTE(review): `names=` is given without `header=0`, so if the file starts
# with a header row that row is read as data -- confirm against the CSV.
test_df = pd.read_csv(
    r'/content/gdrive/MyDrive/test_set_sentipolc16_gold2000.csv',
    on_bad_lines='warn',
    names=["idtwitter","subj","opos","oneg","iro","lpos","lneg","top","text"],
)

In order to train the model, we'll create the column 'polarity' based on the two columns 'opos' and 'oneg' as follows:

| opos | oneg | polarity | label    |
|------|------|----------|----------|
|   1  |   0  |     0    | Positive |
|   0  |   1  |     1    | Negative |
|   1  |   1  |     2    | Mixed    |
|   0  |   0  |     3    | Neutral  |


In [None]:
# Create a list of conditions
def create_conditions(df):
    """Return the boolean masks that map (opos, oneg) pairs to polarity classes.

    Args:
        df: DataFrame with the columns 'opos' and 'oneg'.

    Returns:
        A list of four boolean masks, in this order:
        (opos=1, oneg=0), (opos=0, oneg=1), (opos=1, oneg=1), (opos=0, oneg=0).
    """
    opos_set = df['opos'] == 1
    opos_unset = df['opos'] == 0
    oneg_set = df['oneg'] == 1
    oneg_unset = df['oneg'] == 0

    return [
        opos_set & oneg_unset,
        opos_unset & oneg_set,
        opos_set & oneg_set,
        opos_unset & oneg_unset,
    ]

# Polarity code assigned to each condition, in the same order as the masks
# returned by create_conditions: 0=Positive, 1=Negative, 2=Mixed, 3=Neutral.
polarities = [0, 1, 2, 3]

# Create column polarity and use np.select to assign values to it using our lists as arguments.
# NOTE: np.select falls back to its default value of 0 for rows that match none
# of the conditions (e.g. missing or non-0/1 'opos'/'oneg'), so such rows are
# labelled 0 (Positive).
train_df['polarity'] = np.select(create_conditions(train_df), polarities)
test_df['polarity'] = np.select(create_conditions(test_df), polarities)


In [None]:
# Make text lowercase
#train_df['text'] = train_df['text'].str.lower()
#test_df['text'] = test_df['text'].str.lower()

#train_df['text'] = train_df['text'].str.replace('[^\w\s]','')
#test_df['text'] = test_df['text'].str.replace('[^\w\s]','')

#train_df['text'] = train_df['text'].str.replace(',','')
#test_df['text'] = test_df['text'].str.replace(',','')

#train_df['text'] = train_df['text'].str.replace('.','')
#test_df['text'] = test_df['text'].str.replace('.','')

In [None]:
# Select only positive and negative polarity (use num_labels=2 in this case)
#train_df = train_df.loc[(train_df['polarity'] == 0) | (train_df['polarity'] == 1)]
#test_df = test_df.loc[(test_df['polarity'] == 0) | (test_df['polarity'] == 1)]

In [None]:
# Display DataFrame with the new column
train_df

In [None]:
test_df

In [None]:
# Export text for statistics generation.
# exist_ok=True keeps the cell re-runnable: an already existing 'texts' folder is
# not an error, while a real failure (e.g. permissions) is raised here instead
# of surfacing later as a confusing np.savetxt error.
os.makedirs('texts', exist_ok=True)

# One tweet per line; these files are the input of the word-statistics step.
np.savetxt(r'texts/train_texts.txt', train_df["text"].values, fmt='%s')
np.savetxt(r'texts/test_texts.txt', test_df["text"].values, fmt='%s')

In [None]:
# Generate word statistics
! python /content/ekphrasis/ekphrasis/tools/generate_stats.py --input /content/texts/ --name sentipolc16 --ngrams 2 --mincount 70 30

In [None]:
# Create lists with text and polarity columns
train_texts = train_df["text"].tolist()
train_labels = train_df["polarity"].tolist()

test_texts = test_df["text"].tolist()
test_labels = test_df["polarity"].tolist()

In [None]:
# Define a preprocessing pipeline with ekphrasis.
# The resulting `text_processor` is what `preprocess` calls (via
# `pre_process_doc`) to turn a raw tweet into a list of tokens.

text_processor = TextPreProcessor(
    # terms that will be normalized
    normalize=['url', 'email', 'percent', 'money', 'phone', 'user',
        'time', 'date', 'number'],
    # terms that will be annotated
    #annotate={"hashtag", "allcaps", "elongated", "repeated", 'emphasis', 'censored'},
    annotate={"hashtag"},
    fix_html=True,  # fix HTML tokens

    # corpus from which the word statistics are going to be used
    # for word segmentation
    #segmenter="twitter",
    #segmenter = Segmenter(corpus="sentipolc16"),
    # NOTE(review): both segmenter options are commented out, so hashtag
    # unpacking relies on whatever corpus ekphrasis uses by default --
    # presumably not the 'sentipolc16' statistics; confirm.

    # corpus from which the word statistics are going to be used
    # for spell correction
    #corrector="twitter",

    unpack_hashtags=True,  # perform word segmentation on hashtags
    #unpack_contractions=True,  # Unpack contractions (can't -> can not)
    #spell_correct_elong=False,  # spell correction for elongated words

    # select a tokenizer. You can use SocialTokenizer, or pass your own
    # the tokenizer, should take as input a string and return a list of tokens
    tokenizer=SocialTokenizer(lowercase=True).tokenize,

    # list of dictionaries, for replacing tokens extracted from the text,
    # with other expressions. You can pass more than one dictionaries.
    dicts=[emoticons]
)

def preprocess(text, do_lower_case=True):
    """Clean a single tweet and return it as a normalized string.

    Args:
        text: Raw tweet text.
        do_lower_case: Lowercase the text before it is tokenized.

    Returns:
        The tokens produced by `text_processor`, joined with spaces and passed
        through the regex clean-up steps listed below.
    """
    if do_lower_case:
        text = text.lower()

    tokens = text_processor.pre_process_doc(text)
    cleaned = str(" ".join(tokens))

    # Applied in order; each step works on the output of the previous one.
    substitutions = (
        # replace every character outside the allowed set with a space
        (r'[^a-zA-ZÀ-ú</>!?♥♡\s\U00010000-\U0010ffff]', ' '),
        # collapse runs of whitespace into one space
        (r'\s+', ' '),
        # shorten a word character repeated 3+ times to two occurrences
        (r'(\w)\1{2,}', r'\1\1'),
        # drop a leading / trailing whitespace character
        (r'^\s', ''),
        (r'\s$', ''),
    )
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)

    return cleaned

# Run the cleaning pipeline over every tweet of both splits, keeping the order.
clean_train_texts = [preprocess(tweet) for tweet in train_texts]
clean_test_texts = [preprocess(tweet) for tweet in test_texts]

In [None]:
train_texts[2]

In [None]:
clean_train_texts[2]

In [None]:
train_texts = clean_train_texts
test_texts = clean_test_texts

Save the processed version of the datasets (useful for other operations, optional).

In [None]:
train_df["text"] = train_texts
test_df["text"] = test_texts

In [None]:
#train_df.to_csv('sentipolc_train_set_preprocessed.csv', index=False)
#!cp -r '/content/sentipolc_train_set_preprocessed.csv' /content/gdrive/MyDrive/

test_df.to_csv('sentipolc_test_set_preprocessed.csv', index=False)
!cp -r '/content/sentipolc_test_set_preprocessed.csv' /content/gdrive/MyDrive/

## Define the model

In [None]:
# Seed PyTorch's RNG and pick the device: first GPU when available, CPU otherwise.
torch.manual_seed(0)

gpu_available = torch.cuda.is_available()
device = torch.device('cuda:0' if gpu_available else 'cpu')

if gpu_available:
    # Make cuDNN deterministic and disable its benchmark mode.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

print(device)

Create tokenizer and pretrained AlBERTo model. For further details on AlBERTo see [here](https://github.com/marcopoli/AlBERTo-it).

In [None]:
# Create tokenizer and pretrained umberto model
#tokenizer = AutoTokenizer.from_pretrained("Musixmatch/umberto-commoncrawl-cased-v1")
#model = AutoModelForSequenceClassification.from_pretrained("Musixmatch/umberto-commoncrawl-cased-v1", num_labels = 4)

# Create tokenizer and pretrained alberto model.
# num_labels=4 matches the four polarity codes used in this notebook
# (0=Positive, 1=Negative, 2=Mixed, 3=Neutral).
tokenizer = AutoTokenizer.from_pretrained("m-polignano-uniba/bert_uncased_L-12_H-768_A-12_italian_alb3rt0")
model = AutoModelForSequenceClassification.from_pretrained("m-polignano-uniba/bert_uncased_L-12_H-768_A-12_italian_alb3rt0", num_labels=4)

#model = AutoModelForSequenceClassification.from_pretrained("Musixmatch/umberto-commoncrawl-cased-v1", num_labels = 2)

#tokenizer = AutoTokenizer.from_pretrained("Musixmatch/umberto-wikipedia-uncased-v1")
#model = AutoModelForSequenceClassification.from_pretrained("Musixmatch/umberto-wikipedia-uncased-v1",
#                                                           num_labels = 4,
                                                           #attention_probs_dropout_prob=0.2,
                                                           #hidden_dropout_prob=0.4
#                                                           )

In this cell we create a custom model based on AlBERTo. Skip this cell (and the next one, which instantiates it) if you want to fine-tune the base AlBERTo model instead.

In [None]:
tokenizer = AutoTokenizer.from_pretrained("m-polignano-uniba/bert_uncased_L-12_H-768_A-12_italian_alb3rt0")

class SAModel(nn.Module):
    """Pretrained encoder followed by a three-layer classification head.

    The head is applied to the encoder output at sequence position 0 and is
    built as: dropout -> linear -> layer norm -> tanh (twice, narrowing to 384
    and then 64 features), followed by dropout -> linear to ``num_labels``.

    Args:
        dropout_rate: Dropout probability used before each linear layer.
        num_labels: Number of output classes.
        model_name: Checkpoint passed to ``AutoModel.from_pretrained``.
            Defaults to AlBERTo, so ``SAModel()`` behaves as before.
    """

    def __init__(self, dropout_rate=0.1, num_labels=4,
                 model_name="m-polignano-uniba/bert_uncased_L-12_H-768_A-12_italian_alb3rt0"):
        super().__init__()

        self.bert = AutoModel.from_pretrained(model_name)
        # Read the encoder width from its config instead of hard-coding 768,
        # so a checkpoint with a different hidden size also works.
        hidden_size = self.bert.config.hidden_size

        self.dropout1 = nn.Dropout(dropout_rate)
        self.linear1 = nn.Linear(hidden_size, 384)
        self.ln1 = nn.LayerNorm(384)

        self.dropout2 = nn.Dropout(dropout_rate)
        self.linear2 = nn.Linear(384, 64)
        self.ln2 = nn.LayerNorm(64)

        self.dropout3 = nn.Dropout(dropout_rate)
        self.linear3 = nn.Linear(64, num_labels)

    def forward(self, input_ids, attention_mask):
        """Return unnormalized class scores, one row per input sequence."""
        encoded = self.bert(input_ids, attention_mask=attention_mask)

        # encoded[0] holds the per-token hidden states; keep position 0 only.
        features = encoded[0][:, 0, :]

        features = self.dropout1(features)
        features = self.linear1(features)
        features = self.ln1(features)
        features = torch.tanh(features)

        features = self.dropout2(features)
        features = self.linear2(features)
        features = self.ln2(features)
        features = torch.tanh(features)

        features = self.dropout3(features)
        return self.linear3(features)

In [None]:
# Only for the custom model.
# Use the `device` selected earlier rather than a hard-coded 'cuda', so the
# notebook also runs on a CPU-only runtime.
model = SAModel().to(device)

In [None]:
# Tokenize texts.
# padding=True pads every sequence to the longest one in the list it is called
# on, so the train and test encodings may end up with different lengths.
# No truncation is requested here.
train_encodings = tokenizer(train_texts, padding=True)
test_encodings = tokenizer(test_texts, padding=True)

In [None]:
train_encodings['attention_mask']

In [None]:
# Turn our labels and encodings into a Dataset object

class TextDataset(torch.utils.data.Dataset):
    """Dataset pairing tokenizer encodings with their labels.

    Args:
        encodings: Mapping from field name to a per-example sequence of values.
        labels: Per-example labels, aligned with the encodings.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        """Number of examples, taken from the labels."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors, including 'labels'."""
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

train_dataset = TextDataset(train_encodings, train_labels)
test_dataset = TextDataset(test_encodings, test_labels)

In [None]:
# Freeze some layers (optional)
# Comma-separated indexes of the encoder layers to freeze; "" freezes nothing.
freeze_layers = "5,6,7,8,9,10,11"

# Plain truthiness instead of `is not ""`, which compares identity with a literal.
if freeze_layers:
    # The AlBERTo models built in this notebook keep their encoder under
    # `.bert`; fall back to `.roberta` for models that expose that name
    # instead (e.g. the commented-out UmBERTo alternative).
    backbone = getattr(model, "bert", None)
    if backbone is None:
        backbone = model.roberta

    layer_indexes = [int(x) for x in freeze_layers.split(",")]
    for layer_idx in layer_indexes:
        for param in backbone.encoder.layer[layer_idx].parameters():
            param.requires_grad = False
        print ("Froze Layer: ", layer_idx)


## Fine-tuning in native PyTorch

In [None]:
# Functions for saving and loading model parameters and metrics.
def save_checkpoint(path, model, valid_loss):
    """Write the model weights and the given validation loss to ``path``."""
    checkpoint = {}
    checkpoint['model_state_dict'] = model.state_dict()
    checkpoint['valid_loss'] = valid_loss
    torch.save(checkpoint, path)


def load_checkpoint(path, model):
    """Load the weights stored at ``path`` into ``model``.

    Tensors are mapped onto the notebook-level ``device``.

    Returns:
        The validation loss saved alongside the weights.
    """
    checkpoint = torch.load(path, map_location=device)
    weights = checkpoint['model_state_dict']
    model.load_state_dict(weights)

    return checkpoint['valid_loss']


def save_metrics(path, train_loss_list, valid_loss_list, global_steps_list):
    """Write the three training-history lists to ``path`` as a single dict."""
    history = dict(
        train_loss_list=train_loss_list,
        valid_loss_list=valid_loss_list,
        global_steps_list=global_steps_list,
    )
    torch.save(history, path)


def load_metrics(path):
    """Read back the history written by ``save_metrics``.

    Returns:
        A tuple ``(train_loss_list, valid_loss_list, global_steps_list)``.
    """
    history = torch.load(path, map_location=device)
    wanted = ('train_loss_list', 'valid_loss_list', 'global_steps_list')
    return tuple(history[key] for key in wanted)

In [None]:
# Pre-training function with Pytorch

def pretrain(model,
            optimizer,
            train_loader,
            valid_loader,
            num_epochs,
            output_path,
            valid_period,
            scheduler=None):
    """Train only the classification head while ``model.bert`` is frozen.

    All parameters of ``model.bert`` are frozen for the duration of the call and
    set back to trainable before returning, also when training raises. Every
    ``valid_period`` optimisation steps the model is evaluated on
    ``valid_loader`` and the average train/validation losses are printed.

    Args:
        model: Module with a ``bert`` attribute, callable as
            ``model(input_ids, attention_mask=...)`` and returning class scores.
        optimizer: Optimizer stepped after every batch.
        train_loader: Iterable of batches with 'input_ids', 'attention_mask'
            and 'labels' tensors; must support ``len()``.
        valid_loader: Same batch format, used for validation.
        num_epochs: Number of passes over ``train_loader``.
        output_path: Unused here; kept so the signature mirrors ``train``.
        valid_period: Number of steps between two validation runs.
        scheduler: Optional LR scheduler stepped after every optimizer step.
    """
    device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
    criterion = nn.CrossEntropyLoss()
    # Real number of optimisation steps, whatever valid_period is.
    total_steps = num_epochs * len(train_loader)

    # Pretrain linear layers, do not train bert
    for param in model.bert.parameters():
        param.requires_grad = False

    try:
        model.train()

        train_loss = 0.0
        global_step = 0

        for epoch in range(num_epochs):
            for batch_train in train_loader:
                # Use the optimizer passed in, not a notebook-level global.
                optimizer.zero_grad()

                input_ids = batch_train['input_ids'].to(device)
                attention_mask = batch_train['attention_mask'].to(device)
                labels = batch_train['labels'].to(device)

                outputs = model(input_ids, attention_mask=attention_mask)
                loss = criterion(outputs, labels)

                loss.backward()
                optimizer.step()
                # The scheduler is optional (default None).
                if scheduler is not None:
                    scheduler.step()

                train_loss += loss.item()
                global_step += 1

                if global_step % valid_period == 0:
                    valid_loss = 0.0
                    model.eval()

                    with torch.no_grad():
                        for batch_eval in valid_loader:
                            input_ids = batch_eval['input_ids'].to(device)
                            attention_mask = batch_eval['attention_mask'].to(device)
                            labels = batch_eval['labels'].to(device)

                            outputs = model(input_ids, attention_mask=attention_mask)
                            valid_loss += criterion(outputs, labels).item()

                    model.train()

                    # print summary (losses averaged over the window / the loader)
                    print('Epoch [{}/{}], global step [{}/{}], Train Loss: {:.4f}, Valid Loss: {:.4f}'
                          .format(epoch+1, num_epochs, global_step, total_steps,
                                  train_loss / valid_period, valid_loss / len(valid_loader)))

                    train_loss = 0.0
    finally:
        # Set bert parameters back to trainable.
        # NOTE: this re-enables every encoder parameter, including any that
        # were frozen before this function was called.
        for param in model.bert.parameters():
            param.requires_grad = True

    print('Pre-training done!')

In [None]:
# Training function with Pytorch

def train(model,
          optimizer,
          train_loader,
          valid_loader,
          num_epochs,
          output_path,
          valid_period,
          scheduler=None):
    """Train ``model`` and checkpoint it whenever the validation loss improves.

    Every ``valid_period`` optimisation steps the model is evaluated on
    ``valid_loader``. The average train loss of the window and the average
    validation loss are recorded and printed; if the validation loss is the
    best so far, the weights go to ``output_path + '/model.pt'`` and the
    history to ``output_path + '/metric.pt'``. The history is written once more
    when training ends.

    Args:
        model: Module callable as ``model(input_ids, attention_mask=...)`` and
            returning class scores.
        optimizer: Optimizer stepped after every batch.
        train_loader: Iterable of batches with 'input_ids', 'attention_mask'
            and 'labels' tensors; must support ``len()``.
        valid_loader: Same batch format, used for validation.
        num_epochs: Number of passes over ``train_loader``.
        output_path: Directory that receives 'model.pt' and 'metric.pt'.
        valid_period: Number of steps between two validation runs.
        scheduler: Optional LR scheduler stepped after every optimizer step.
    """
    device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
    criterion = nn.CrossEntropyLoss()
    # Real number of optimisation steps, whatever valid_period is.
    total_steps = num_epochs * len(train_loader)

    train_loss = 0.0
    train_loss_list = []
    valid_loss_list = []
    global_steps_list = []
    best_valid_loss = float('Inf')
    global_step = 0

    model.train()

    for epoch in range(num_epochs):
        for batch_train in train_loader:
            # Use the optimizer passed in, not a notebook-level global.
            optimizer.zero_grad()

            input_ids = batch_train['input_ids'].to(device)
            attention_mask = batch_train['attention_mask'].to(device)
            labels = batch_train['labels'].to(device)

            outputs = model(input_ids, attention_mask=attention_mask)
            loss = criterion(outputs, labels)

            loss.backward()
            optimizer.step()
            # The scheduler is optional (default None).
            if scheduler is not None:
                scheduler.step()

            train_loss += loss.item()
            global_step += 1

            if global_step % valid_period == 0:
                valid_loss = 0.0
                model.eval()

                with torch.no_grad():
                    for batch_eval in valid_loader:
                        input_ids = batch_eval['input_ids'].to(device)
                        attention_mask = batch_eval['attention_mask'].to(device)
                        labels = batch_eval['labels'].to(device)

                        outputs = model(input_ids, attention_mask=attention_mask)
                        valid_loss += criterion(outputs, labels).item()

                # Average over the window / over the validation batches.
                train_loss = train_loss / valid_period
                valid_loss = valid_loss / len(valid_loader)
                train_loss_list.append(train_loss)
                valid_loss_list.append(valid_loss)
                global_steps_list.append(global_step)

                # print summary
                print('Epoch [{}/{}], global step [{}/{}], Train Loss: {:.4f}, Valid Loss: {:.4f}'
                      .format(epoch+1, num_epochs, global_step, total_steps,
                              train_loss, valid_loss))

                # checkpoint: keep only the weights with the lowest validation loss
                if best_valid_loss > valid_loss:
                    best_valid_loss = valid_loss
                    save_checkpoint(output_path + '/model.pt', model, best_valid_loss)
                    save_metrics(output_path + '/metric.pt', train_loss_list, valid_loss_list, global_steps_list)

                train_loss = 0.0
                model.train()

    save_metrics(output_path + '/metric.pt', train_loss_list, valid_loss_list, global_steps_list)
    print('Training done!')



In [None]:
model.to(device)

In [None]:
# Fine-tuning with Pytorch: first warm up the classification head with the
# encoder frozen (pretrain), then train the whole model (train).

output_path = '/content'

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
# NOTE(review): the test set doubles as validation set, and the best checkpoint
# is selected on it, so metrics later computed on `valid_loader` are optimistic.
valid_loader = DataLoader(test_dataset, batch_size=32, shuffle=True)

print("Start pretraining")

num_epochs = 3

# torch.optim.AdamW replaces the deprecated transformers.AdamW. eps and
# weight_decay are set explicitly to the old implementation's defaults
# (1e-6 and 0.0), which differ from PyTorch's.
# The name `optim` is kept because other cells refer to it.
optim = torch.optim.AdamW(model.parameters(), lr=1e-4, eps=1e-6, weight_decay=0.0)
scheduler = get_linear_schedule_with_warmup(optim,
                                            num_warmup_steps=len(train_loader)*1,
                                            num_training_steps=len(train_loader)*num_epochs)

pretrain(model=model,
         optimizer=optim,
         train_loader=train_loader,
         valid_loader=valid_loader,
         num_epochs=num_epochs,
         output_path=output_path,
         valid_period=len(train_loader),  # validate once per epoch
         scheduler=scheduler
         )

print("Start training")

num_epochs = 3

# Fresh optimizer and schedule, with a smaller learning rate, for the full model.
optim = torch.optim.AdamW(model.parameters(), lr=2e-5, eps=1e-6, weight_decay=0.0)
scheduler = get_linear_schedule_with_warmup(optim,
                                            num_warmup_steps=len(train_loader)*2,
                                            num_training_steps=len(train_loader)*num_epochs)

train(model=model,
      optimizer=optim,
      train_loader=train_loader,
      valid_loader=valid_loader,
      num_epochs=num_epochs,
      output_path=output_path,
      valid_period=len(train_loader),  # validate once per epoch
      scheduler=scheduler
      )


In [None]:
# Load best model.
# `load_checkpoint` maps the weights onto `device`; pick the GPU only when one
# is present so this cell also works on a CPU-only runtime.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
load_checkpoint(output_path + '/model.pt', model)

In [None]:
# Evaluate model with Pytorch: collect predicted and true class ids over the
# whole validation loader.

y_pred, y_true = [], []

model.eval()
with torch.no_grad():
    for batch_eval in valid_loader:
        input_ids = batch_eval['input_ids'].to(device)
        attention_mask = batch_eval['attention_mask'].to(device)
        labels = batch_eval['labels'].to(device)

        outputs = model(input_ids, attention_mask=attention_mask)

        # The predicted class is the index of the highest score on the last axis.
        # (Alternative kept from the original: take the argmax of `outputs[0]`.)
        batch_predictions = torch.argmax(outputs, axis=-1).tolist()
        y_pred += batch_predictions
        y_true += labels.tolist()


In [None]:
# Per-class report and confusion-matrix heatmap for the collected predictions.
class_ids = [0, 1, 2, 3]
class_names = ['Positive', 'Negative', 'Mixed', 'Neutral']

report = classification_report(y_true, y_pred, labels=class_ids)
print(report)

cm = confusion_matrix(y_true, y_pred, labels=class_ids)

ax = plt.subplot()
sns.heatmap(cm, annot=True, ax = ax, cmap='Blues', fmt="d")

ax.set_title('Confusion Matrix')
ax.set_xlabel('Predicted Labels')
ax.set_ylabel('True Labels')

# Tick labels follow the order of `class_ids`.
ax.xaxis.set_ticklabels(class_names)
ax.yaxis.set_ticklabels(class_names)

In [None]:
# Save fine-tuned model
torch.save(model,'/content/model_6_pre_1_ep_32_bs_4_nc.pt')

## Fine-tuning in PyTorch with the Trainer API

In [None]:
# Fine-tuning with Trainer

# `eval_steps` was removed from the arguments below: it takes a number of steps
# and is only meaningful with a step-based evaluation strategy, so the string
# 'epoch' was not a valid value. Evaluating once per epoch is already requested
# by evaluation_strategy='epoch'.
training_args = TrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=10,             # total number of training epochs
    per_device_train_batch_size=64,  # batch size per device during training
    per_device_eval_batch_size=16,   # batch size for evaluation
    learning_rate=2e-5,             # the initial learning rate for AdamW optimizer
    #max_grad_norm=0.01,             # maximum gradient norm (for gradient clipping)
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.1,                # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    logging_steps=25,
    save_strategy='epoch',            # save is done at the end of each epoch
    evaluation_strategy='epoch',     # evaluation is done at the end of each epoch
    load_best_model_at_end=True      # whether or not to load the best model found during training at the end of training
)

trainer = Trainer(
    model=model,                         # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
    eval_dataset=test_dataset            # evaluation dataset
)



In [None]:
# start training for fine-tuning with Trainer
trainer.train()

In [None]:
# Compute metrics

acc = load_metric("accuracy")
f1 = load_metric("f1")

def compute_acc(eval_pred):
    """Compute accuracy with the ``acc`` metric from a (logits, labels) pair.

    The predicted class of each example is the argmax over the last axis.
    """
    scores, gold = eval_pred
    return acc.compute(predictions=np.argmax(scores, axis=-1), references=gold)

def compute_f1(eval_pred):
    """Compute macro-averaged F1 with the ``f1`` metric from a (logits, labels) pair.

    The predicted class of each example is the argmax over the last axis.
    """
    scores, gold = eval_pred
    predicted = np.argmax(scores, axis=-1)
    return f1.compute(predictions=predicted, references=gold, average='macro')

trainer_acc = Trainer(
    model=model,
    args=training_args,
    #train_dataset=train_dataset,
    train_dataset=test_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_acc,
)

trainer_f1 = Trainer(
    model=model,
    args=training_args,
    #train_dataset=train_dataset,
    train_dataset=test_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_f1,
)

In [None]:
trainer_acc.evaluate()

In [None]:
trainer_f1.evaluate()

## Zip and download fine-tuned model

In [None]:
# Zip and download results folder

#Fine-tuning in native PyTorch
!zip -r /content/model.zip /content/model.pt
!cp -r '/content/model.zip' /content/gdrive/MyDrive/

#Fine-tuning with the Trainer API
#!zip -r /content/results.zip /content/results
#!cp -r '/content/results.zip' /content/gdrive/MyDrive/
