### Import libraries and load dataset

In [1]:
%load_ext autoreload
%autoreload 2

In [3]:
import pandas as pd

In [4]:
tweet_train_path = "data/combine/kfolds_0/train.csv"
tweet_test_path = "data/combine/kfolds_0/test.csv"

# Maximum number of rows read from each split. Only a small sample is loaded
# here (presumably to keep this example fast); set to None to read the
# complete files.
N_ROWS = 100

train_df = pd.read_csv(tweet_train_path, nrows=N_ROWS)
test_df = pd.read_csv(tweet_test_path, nrows=N_ROWS)

train_df.head()

Unnamed: 0,tweet_id,label,text
0,567737449938685952,negative,@SouthwestAir no flights out of #nashville tod...
1,567737317432258560,neutral,@SouthwestAir I am but it says yall are sold o...
2,567736870365171713,negative,@SouthwestAir I'm trying to change a family va...
3,567736166787850240,neutral,@SouthwestAir F5R3ZZ
4,567735766416392194,positive,.@SouthwestAir you've got a mess here at DTW b...


### Data preprocessing

In [5]:
import re

def emoji(tweet):
    """Replace ASCII emoticons in ``tweet`` with sentiment placeholder tokens.

    Positive emoticons (smile, laugh, love, wink) become ``' positiveemoji '``
    and negative ones (sad, cry) become ``' negetiveemoji '``.

    Matching is case-sensitive (``:D`` matches, ``:d`` does not), so this must
    be called on text that has NOT been lowercased and still contains digits.
    Emoticons that end in a letter or digit are only matched when they are not
    glued to a longer word/number, so "Re: Delta", "MaxDepth" and "<30" are
    left untouched.

    NOTE: the ``negetiveemoji`` spelling is kept as-is; changing it would
    change the tokens produced for downstream feature extraction.

    Args:
        tweet: The tweet text (``str``).

    Returns:
        The text with every recognised emoticon replaced by its token.
    """
    # Smile -- :), : ), :-), (:, ( :, (-:, :') , :O
    tweet = re.sub(r'(:\s?\)|:-\)|\(\s?:|\(-:|:\'\)|:O(?!\w))', ' positiveemoji ', tweet)
    # Laugh -- :D, : D, :-D, xD, x-D, XD, X-D
    tweet = re.sub(r'(:\s?D(?!\w)|:-D(?!\w)|(?<!\w)[xX]-?D(?!\w))', ' positiveemoji ', tweet)
    # Love -- <3, :*   (but not the "<3" in a number such as "<30")
    tweet = re.sub(r'(<3(?!\d)|:\*)', ' positiveemoji ', tweet)
    # Wink -- ;-), ;), ;-D, ;D, (;,  (-; , @-)
    tweet = re.sub(r'(;-?\)|;-?D(?!\w)|\(-?;|@-\))', ' positiveemoji ', tweet)
    # Sad -- :-(, : (, :(, ):, )-:, :-/ , :-|
    tweet = re.sub(r'(:\s?\(|:-\(|\)\s?:|\)-:|:-/|:-\|)', ' negetiveemoji ', tweet)
    # Cry -- :,(, :'(, :"(
    tweet = re.sub(r'(:,\(|:\'\(|:"\()', ' negetiveemoji ', tweet)
    return tweet


def process_tweet(tweet):
    """Normalise a raw tweet into lowercase, space-separated word tokens.

    Steps, in order: drop @usernames and URLs, drop ``&quot;`` entities,
    replace emoticons with placeholder tokens (see ``emoji``), lowercase,
    drop digits, drop single-letter words, replace punctuation with spaces,
    squeeze runs of a repeated character down to two, and collapse whitespace.

    Emoticon replacement deliberately runs BEFORE lowercasing and digit
    removal: the emoticon patterns are case-sensitive (``:D``, ``xD``) and
    ``<3`` contains a digit, so running it afterwards can never match them.
    It runs AFTER username/URL removal so that fragments such as the ``xD`` in
    a shortened URL are not mistaken for emoticons.

    Args:
        tweet: Raw tweet text. ``None`` and float NaN (pandas' missing-value
            marker) are treated as empty text; other non-strings are
            converted with ``str()``.

    Returns:
        The cleaned text (possibly an empty string).
    """
    # Missing values would otherwise crash on the string operations below.
    if tweet is None or (isinstance(tweet, float) and tweet != tweet):
        return ""
    tweet = str(tweet)

    tweet = re.sub(r'@[^\s]+', '', tweet)                               # Removes usernames
    tweet = re.sub(r'((www\.[^\s]+)|(https?://[^\s]+))', ' ', tweet,
                   flags=re.IGNORECASE)                                 # Remove URLs (any letter case)
    tweet = re.sub(r'&quot;', ' ', tweet, flags=re.IGNORECASE)          # Remove (&quot;)
    tweet = emoji(tweet)                                                # Replaces emoticons (case-sensitive)
    tweet = tweet.lower()                                               # Lowercases the string
    tweet = re.sub(r"\d+", " ", tweet)                                  # Removes all digits
    tweet = re.sub(r"\b[a-zA-Z]\b", "", tweet)                          # Removes all single characters
    tweet = re.sub(r"[^\w\s]", " ", tweet)                              # Removes all punctuations
    tweet = re.sub(r'(.)\1+', r'\1\1', tweet)                           # Squeezes repeated characters to 2
    return re.sub(r"\s+", " ", tweet).strip()                           # Collapses whitespace

In [6]:
# Clean the tweet text of both splits with process_tweet, then preview the result.
for frame in (train_df, test_df):
    frame['text'] = frame['text'].apply(process_tweet)
train_df.head()

Unnamed: 0,tweet_id,label,text
0,567737449938685952,negative,no flights out of nashville today are you kidd...
1,567737317432258560,neutral,am but it says yall are sold out me amp my cow...
2,567736870365171713,negative,trying to change family vacation due to measle...
3,567736166787850240,neutral,zz
4,567735766416392194,positive,you ve got mess here at dtw but your staff is ...


In [7]:
# Convert data to trainable format
import json

id2label = {0: "negative", 1: "neutral", 2: "positive"}
label2id = {name: idx for idx, name in id2label.items()}


def encode_labels(labels):
    """Map a Series of label names to their integer ids.

    Values that are already valid ids pass through unchanged, so re-running
    this cell on an already-encoded frame does not fail.

    Args:
        labels: pandas Series holding label names (keys of ``label2id``)
            and/or integer ids (keys of ``id2label``).

    Returns:
        An int64 Series of label ids with the same index.

    Raises:
        KeyError: if any value is neither a known label name nor a known id.
    """
    encoded = labels.map(lambda value: label2id.get(value, value))
    unknown = encoded[~encoded.isin(list(id2label))]
    if not unknown.empty:
        raise KeyError(f"Unknown label(s): {sorted(set(map(str, unknown)))}")
    return encoded.astype("int64")


train_df['label'] = encode_labels(train_df['label'])
test_df['label'] = encode_labels(test_df['label'])

train_texts = train_df['text'].values
train_labels = train_df['label'].values

test_texts = test_df['text'].values
test_labels = test_df['label'].values

train_texts[:5]

array(['no flights out of nashville today are you kidding me why are other airlines flying and you re not so frustrated',
       'am but it says yall are sold out me amp my coworkers would need to get out first available',
       'trying to change family vacation due to measles outbreak and haven been able to get anyone on the phone any help',
       'zz', 'you ve got mess here at dtw but your staff is doing great'],
      dtype=object)

### Feature Extraction

Note: feature extraction is not applied to the transformer models (`distilbert-base-uncased`, `bert-base-uncased` and `roberta-base`). For these, select the `pass` option below, which leaves the texts unchanged.

In [25]:
from sklearn.feature_extraction.text import TfidfVectorizer
from vectorizers import init_vectorizer

# Names accepted by init_vectorizer in this walkthrough.
# 'bow' -- unigram and bigram; 'pass' is the last entry.
vectorizer_options = ['tfidf', 'word2vec', 'bow', 'pass']

# Pick the last entry ('pass') and build the matching vectorizer object.
vectorizer_option = vectorizer_options[-1]
vectorizer = init_vectorizer(vectorizer_option)
print(vectorizer)

<vectorizers.Vectorizer object at 0x7fbab6de3670>


In [27]:
# Fit the vectorizer on the training texts only.
vectorizer.fit(train_texts)

# Encode both splits with the fitted vectorizer (train first, then test).
train_vector, test_vector = (
    vectorizer.transform(texts) for texts in (train_texts, test_texts)
)

train_vector[:5]

array(['no flights out of nashville today are you kidding me why are other airlines flying and you re not so frustrated',
       'am but it says yall are sold out me amp my coworkers would need to get out first available',
       'trying to change family vacation due to measles outbreak and haven been able to get anyone on the phone any help',
       'zz', 'you ve got mess here at dtw but your staff is doing great'],
      dtype=object)

### Select a model and train it

In [28]:
from algorithms import init_trainer

# Model names that are not transformer checkpoints.
shallow_learning_options = [
    "linear_svm_tfidf", "linear_svm_word2vec", "logistic_regressor_word2vec",
    "naive_bayes", "random_forest",
]

# Pretrained transformer checkpoints.
bert_options = ["distilbert-base-uncased", "bert-base-uncased", "roberta-base"]

# Use the first transformer checkpoint and show which one was chosen.
option = bert_options[0]
print(option)

# Build the trainer object for the chosen model name.
trainer = init_trainer(option)
print(trainer)

distilbert-base-uncased


loading configuration file config.json from cache at /Users/thanhdath/.cache/huggingface/hub/models--distilbert-base-uncased/snapshots/1c4513b2eedbda136f57676a34eea67aba266e5c/config.json
Model config DistilBertConfig {
  "_name_or_path": "distilbert-base-uncased",
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2"
  },
  "initializer_range": 0.02,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2
  },
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "transformers_version": "4.26.0",
  "vocab_size": 30522
}

loading weights file pytorch_model.bin from cache at /Users/thanhdath/.cache/huggingface/hub/models--di

<algorithms.bert.LMForSequenceClassification object at 0x7fba200557f0>


In [30]:
# Train on the training split, then save the model together with the vectorizer
# under "<option>-example".
trainer.train(train_vector, train_labels)
trainer.save_model(
    vectorizer=vectorizer,
    output_model_name=f"{option}-example",
)

Saving model checkpoint to models/distilbert-base-uncased-example
Configuration saved in models/distilbert-base-uncased-example/config.json
Model weights saved in models/distilbert-base-uncased-example/pytorch_model.bin
tokenizer config file saved in models/distilbert-base-uncased-example/tokenizer_config.json
Special tokens file saved in models/distilbert-base-uncased-example/special_tokens_map.json


Model has been saved to models/distilbert-base-uncased-example


### Evaluate model on test dataset

In [31]:
# Evaluate the trained model on the test split; the accuracy / precision /
# recall / F1 figures and the per-class report appear in this cell's output.
trainer.evaluate(test_vector, test_labels)



  0%|          | 0/1 [00:00<?, ?ba/s]

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
The following columns in the test set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 100
  Batch size = 2


  0%|          | 0/50 [00:00<?, ?it/s]

Accuracy: micro: 0.390
Precision: micro-macro: 0.390-0.461
Recall: micro-macro: 0.390-0.346
F1: micro-macro: 0.390-0.209
              precision    recall  f1-score   support

    negative       0.00      0.00      0.00        35
     neutral       0.38      1.00      0.55        38
    positive       1.00      0.04      0.07        27

    accuracy                           0.39       100
   macro avg       0.46      0.35      0.21       100
weighted avg       0.42      0.39      0.23       100



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
