### Import libraries and load dataset

In [1]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# project modules are picked up without restarting the kernel.
# (Comments stay on their own lines: text after a line magic is parsed as its argument.)
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd

In [28]:
# Locations of the train / test CSV files for fold 0.
tweet_train_path = "data/combine/kfolds_0/train.csv"
tweet_test_path = "data/combine/kfolds_0/test.csv"

# Read both splits the same way; only the first 100 rows of each file are loaded.
train_df, test_df = (
    pd.read_csv(csv_path, nrows=100)
    for csv_path in (tweet_train_path, tweet_test_path)
)

# Preview the first rows of the training split.
train_df.head()

Unnamed: 0,tweet_id,label,text
0,567737449938685952,negative,@SouthwestAir no flights out of #nashville tod...
1,567737317432258560,neutral,@SouthwestAir I am but it says yall are sold o...
2,567736870365171713,negative,@SouthwestAir I'm trying to change a family va...
3,567736166787850240,neutral,@SouthwestAir F5R3ZZ
4,567735766416392194,positive,.@SouthwestAir you've got a mess here at DTW b...


### Data preprocessing

In [29]:
import re

# Emoticon rewrite rules, applied in this order by `emoji`.
# Each entry is (compiled pattern, replacement token).
#
# Letter-based emoticons (":D", ":O", "xD", ";D") carry `(?!\w)` / `(?<!\w)`
# guards so they are not detected inside ordinary words such as "TaxDay" or
# "RE:DELAY"; "<3" is guarded against numbers such as "<30".
#
# NOTE: the negative token is spelled 'negetiveemoji' on purpose -- it is a
# feature token seen by downstream models, so its spelling is left unchanged.
_EMOTICON_RULES = (
    # Smile -- :), : ), :-), (:, ( :, (-:, :') , :O
    (re.compile(r'(:\s?\)|:-\)|\(\s?:|\(-:|:\'\)|:O(?!\w))'), ' positiveemoji '),
    # Laugh -- :D, : D, :-D, xD, x-D, XD, X-D
    (re.compile(r'(:\s?D(?!\w)|:-D(?!\w)|(?<!\w)[xX]-?D(?!\w))'), ' positiveemoji '),
    # Love -- <3, :*
    (re.compile(r'(<3(?!\d)|:\*)'), ' positiveemoji '),
    # Wink -- ;-), ;), ;-D, ;D, (;,  (-; , @-)
    (re.compile(r'(;-?\)|;-?D(?!\w)|\(-?;|@-\))'), ' positiveemoji '),
    # Sad -- :-(, : (, :(, ):, )-:, :-/ , :-|
    (re.compile(r'(:\s?\(|:-\(|\)\s?:|\)-:|:-/|:-\|)'), ' negetiveemoji '),
    # Cry -- :,(, :'(, :"(
    (re.compile(r'(:,\(|:\'\(|:"\()'), ' negetiveemoji '),
)


def emoji(tweet):
    """Replace text emoticons with sentiment placeholder tokens.

    Positive emoticons (smile, laugh, love, wink) become ' positiveemoji ' and
    negative ones (sad, cry) become ' negetiveemoji '. The tokens are padded
    with spaces so they never fuse with neighbouring words.

    Matching is case-sensitive (":D" is an emoticon, ":d" is not), so this must
    be called on text that has NOT been lowercased yet.

    Args:
        tweet: The text to scan (a ``str``).

    Returns:
        The text with every recognised emoticon replaced by its token.
    """
    for pattern, token in _EMOTICON_RULES:
        tweet = pattern.sub(token, tweet)
    return tweet


def process_tweet(tweet):
    """Normalise a raw tweet into lowercase, space-separated word tokens.

    Steps, in order:
      1. Remove @usernames (but keep the "@-)" wink emoticon).
      2. Remove URLs and the ``&quot;`` HTML entity.
      3. Replace emoticons with placeholder tokens (see `emoji`). This runs
         BEFORE lowercasing and digit removal; otherwise ":D", "XD", ":O",
         ";D" and "<3" could never be recognised.
      4. Lowercase, strip digits, single letters and punctuation.
      5. Squeeze any run of a repeated character down to two, and collapse
         whitespace.

    Args:
        tweet: The raw tweet. ``None`` and float NaN (what pandas yields for
            an empty CSV cell) are treated as an empty tweet; any other
            non-string value is converted with ``str``.

    Returns:
        The cleaned tweet as a ``str`` (possibly empty).
    """
    if not isinstance(tweet, str):
        # `tweet != tweet` is only true for NaN.
        if tweet is None or (isinstance(tweet, float) and tweet != tweet):
            return ""
        tweet = str(tweet)

    # Removes usernames; the lookahead spares the "@-)" emoticon.
    tweet = re.sub(r'@(?!-\))[^\s]+', '', tweet)
    # Remove URLs (case-insensitive because the text is not lowercased yet).
    tweet = re.sub(r'((www\.[^\s]+)|(https?://[^\s]+))', ' ', tweet, flags=re.IGNORECASE)
    # Remove (&quot;) before emoticon detection so its ';' cannot form a fake wink.
    tweet = re.sub(r'&quot;', ' ', tweet, flags=re.IGNORECASE)
    # Replaces emoticons while case and digits are still intact.
    tweet = emoji(tweet)
    # Lowercases the string.
    tweet = tweet.lower()
    # Removes all digits.
    tweet = re.sub(r'\d+', ' ', tweet)
    # Removes all single characters.
    tweet = re.sub(r'\b[a-zA-Z]\b', '', tweet)
    # Removes all punctuation.
    tweet = re.sub(r'[^\w\s]', ' ', tweet)
    # Squeezes any run of one repeated character down to exactly two.
    tweet = re.sub(r'(.)\1+', r'\1\1', tweet)
    # Collapses whitespace and trims the ends.
    return re.sub(r'\s+', ' ', tweet).strip()

In [30]:
# Clean the tweet text of both splits with the same preprocessing function.
train_df['text'] = train_df['text'].apply(process_tweet)
test_df['text'] = test_df['text'].apply(process_tweet)

# Preview the cleaned training tweets.
train_df.head()

Unnamed: 0,tweet_id,label,text
0,567737449938685952,negative,no flights out of nashville today are you kidd...
1,567737317432258560,neutral,am but it says yall are sold out me amp my cow...
2,567736870365171713,negative,trying to change family vacation due to measle...
3,567736166787850240,neutral,zz
4,567735766416392194,positive,you ve got mess here at dtw but your staff is ...


In [31]:
# Convert data to trainable format: integer class ids plus a list of
# {'text': ..., 'label': ...} records per split.
import json

id2label = {0: "negative", 1: "neutral", 2: "positive"}
# Inverse lookup: label name -> class id.
label2id = dict(zip(id2label.values(), id2label.keys()))

# Encode the string labels as class ids (raises KeyError on an unknown label).
train_df['label'] = train_df['label'].apply(lambda name: label2id[name])
test_df['label'] = test_df['label'].apply(lambda name: label2id[name])

# Round-trip through JSON to obtain plain Python records for each split.
train_data, test_data = (
    json.loads(frame[['text', 'label']].to_json(orient="records"))
    for frame in (train_df, test_df)
)

# Show the first few training records.
train_data[:5]

[{'text': 'no flights out of nashville today are you kidding me why are other airlines flying and you re not so frustrated',
  'label': 0},
 {'text': 'am but it says yall are sold out me amp my coworkers would need to get out first available',
  'label': 1},
 {'text': 'trying to change family vacation due to measles outbreak and haven been able to get anyone on the phone any help',
  'label': 0},
 {'text': 'zz', 'label': 1},
 {'text': 'you ve got mess here at dtw but your staff is doing great',
  'label': 2}]

### Select a model and train it

In [51]:
from algorithms import init_trainer

# Model identifiers that can be passed to `init_trainer`.
options = [
    "linear_svm_tfidf",
    "linear_svm_word2vec",
    "logistic_regressor_word2vec",
    "distilbert-base-uncased",
    "bert-base-uncased",
    "roberta-base",
    "naive_bayes",
    "random_forest"
]

# Pick the model by name (the last entry of `options`).
option = options[options.index("random_forest")]
print(option)

# Build the trainer for the chosen model.
trainer = init_trainer(option)

random_forest


In [52]:
# Fit the selected model on the training records.
# NOTE(review): in the recorded run this call also printed a model save path
# under models/ -- confirm the save behaviour in the `algorithms` package.
trainer.train(train_data)

Model has been saved to models/random-forest-1675878006.5515158


### Evaluate model on test dataset

In [53]:
# Score the trained model on the held-out test records.
# In the recorded run this printed accuracy, precision, recall and F1
# followed by a per-class report.
trainer.evaluate(test_data)

Accuracy: micro: 0.540
Precision: micro-macro: 0.540-0.567
Recall: micro-macro: 0.540-0.514
F1: micro-macro: 0.540-0.486
              precision    recall  f1-score   support

    negative       0.52      0.86      0.65        35
     neutral       0.56      0.50      0.53        38
    positive       0.62      0.19      0.29        27

    accuracy                           0.54       100
   macro avg       0.57      0.51      0.49       100
weighted avg       0.56      0.54      0.50       100

