### Import libraries and load dataset

In [1]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# reloaded before each cell runs and edits to project .py files are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd

In [11]:
# CSV locations of the train / test partitions used in this example.
tweet_train_path = "data/combine/kfolds_0/train.csv"
tweet_test_path = "data/combine/kfolds_0/test.csv"

# Only the first 100 rows of each file are loaded.
train_df, test_df = (
    pd.read_csv(csv_path, nrows=100)
    for csv_path in (tweet_train_path, tweet_test_path)
)

train_df.head()

Unnamed: 0,tweet_id,label,text
0,567737449938685952,negative,@SouthwestAir no flights out of #nashville tod...
1,567737317432258560,neutral,@SouthwestAir I am but it says yall are sold o...
2,567736870365171713,negative,@SouthwestAir I'm trying to change a family va...
3,567736166787850240,neutral,@SouthwestAir F5R3ZZ
4,567735766416392194,positive,.@SouthwestAir you've got a mess here at DTW b...


### Data preprocessing

In [14]:
import re

def emoji(tweet):
    """Replace ASCII emoticons in ``tweet`` with sentiment placeholder tokens.

    Happy emoticons (smile, laugh, love, wink) become ``' positiveemoji '`` and
    unhappy ones (sad, cry) become ``' negetiveemoji '``. The spelling of the
    negative placeholder is intentionally left unchanged so the emitted token
    stays the same as before.

    Matching is case-sensitive (``:D``, ``XD``, ``:O``), so this must be called
    on text that has NOT been lowercased yet. Emoticons that end in a letter or
    digit are guarded with lookarounds so ordinary text such as
    ``"Status: Delayed"``, ``"TAXD"`` or ``"<30 min"`` is not mistaken for one.

    Args:
        tweet: The text to scan (str).

    Returns:
        The text with every recognised emoticon replaced by its placeholder.
    """
    # Smile -- :), : ), :-), (:, ( :, (-:, :') , :O
    tweet = re.sub(r'(:\s?\)|:-\)|\(\s?:|\(-:|:\'\)|:O(?!\w))', ' positiveemoji ', tweet)
    # Laugh -- :D, : D, :-D, xD, x-D, XD, X-D
    tweet = re.sub(r'(:\s?D(?!\w)|:-D(?!\w)|(?<!\w)[xX]-?D(?!\w))', ' positiveemoji ', tweet)
    # Love -- <3, :*
    tweet = re.sub(r'(<3(?!\d)|:\*)', ' positiveemoji ', tweet)
    # Wink -- ;-), ;), ;-D, ;D, (;,  (-; , @-)
    tweet = re.sub(r'(;-?\)|;-?D(?!\w)|\(-?;|@-\))', ' positiveemoji ', tweet)
    # Sad -- :-(, : (, :(, ):, )-:, :-/ , :-|
    tweet = re.sub(r'(:\s?\(|:-\(|\)\s?:|\)-:|:-/|:-\|)', ' negetiveemoji ', tweet)
    # Cry -- :,(, :'(, :"(
    tweet = re.sub(r'(:,\(|:\'\(|:"\()', ' negetiveemoji ', tweet)
    return tweet

def process_tweet(tweet):
    """Normalise a raw tweet into a lowercase, punctuation-free token string.

    Steps, in order: strip @usernames and URLs, drop ``&quot;``, replace
    emoticons with placeholder tokens, lowercase, remove digits, remove
    single-letter words, replace punctuation with spaces, squeeze runs of a
    repeated character down to two, and collapse whitespace.

    Emoticon replacement runs BEFORE lowercasing and digit removal; otherwise
    the case- and digit-dependent emoticons (``:D``, ``XD``, ``:O``, ``;D``,
    ``<3``) could never match.

    NOTE(review): tweets containing those emoticons now yield an extra
    placeholder token compared with the previous ordering; models fitted on
    the old output should presumably be re-trained -- confirm.

    Args:
        tweet: Raw tweet text. ``None`` and float NaN (what pandas yields for
            an empty CSV cell) are treated as empty text.

    Returns:
        The cleaned text; ``""`` when there is nothing left.
    """
    # Missing values carry no text; avoid AttributeError on float NaN / None.
    if tweet is None or (isinstance(tweet, float) and tweet != tweet):
        return ""
    tweet = str(tweet)
    # Removes usernames; a word character is required after '@' so the '@-)' wink reaches emoji()
    tweet = re.sub(r'@\w\S*', '', tweet)
    # Remove URLs (case-insensitive because lowercasing now happens later)
    tweet = re.sub(r'((www\.[^\s]+)|(https?://[^\s]+))', ' ', tweet, flags=re.IGNORECASE)
    # Remove (&quot;) before emoji() so its ';' cannot be read as the start of a wink
    tweet = re.sub(r'&quot;', ' ', tweet, flags=re.IGNORECASE)
    tweet = emoji(tweet)                                  # Replaces emoticons (needs original case and digits)
    tweet = tweet.lower()                                 # Lowercases the string
    tweet = re.sub(r"\d+", " ", tweet)                    # Removes all digits
    tweet = re.sub(r"\b[a-zA-Z]\b", "", tweet)            # Removes all single characters
    tweet = re.sub(r"[^\w\s]", " ", tweet)                # Removes all punctuations
    tweet = re.sub(r'(.)\1+', r'\1\1', tweet)             # Squeezes 3+ repeats of a character down to 2
    tweet = re.sub(r"\s+", " ", tweet).strip()            # Collapses whitespace
    return tweet

In [15]:
# Clean the tweet text of both splits; the 'text' column is overwritten in place.
train_df['text'] = train_df['text'].apply(process_tweet)
test_df['text'] = test_df['text'].apply(process_tweet)
train_df.head()

Unnamed: 0,tweet_id,label,text
0,567737449938685952,negative,no flights out of nashville today are you kidd...
1,567737317432258560,neutral,am but it says yall are sold out me amp my cow...
2,567736870365171713,negative,trying to change family vacation due to measle...
3,567736166787850240,neutral,zz
4,567735766416392194,positive,you ve got mess here at dtw but your staff is ...


In [16]:
# Convert data to trainable format
import json

id2label = {0: "negative", 1: "neutral", 2: "positive"}
label2id = {v: k for k, v in id2label.items()}


def _encode_labels(labels):
    """Map a Series of label names to integer ids.

    Values that are already ids pass through unchanged, so re-running this
    cell on an already-encoded column is a no-op instead of a KeyError.
    Validation happens before anything is assigned back to the frame.

    Raises:
        KeyError: if any value is neither a known label name nor a known id.
    """
    encoded = labels.map(lambda value: label2id.get(value, value))
    unknown = set(encoded) - set(id2label)
    if unknown:
        raise KeyError(f"Unknown label(s): {sorted(map(str, unknown))}")
    return encoded


train_df['label'] = _encode_labels(train_df['label'])
test_df['label'] = _encode_labels(test_df['label'])

train_texts = train_df['text'].values
train_labels = train_df['label'].values

test_texts = test_df['text'].values
test_labels = test_df['label'].values

train_texts[:5]

array(['no flights out of nashville today are you kidding me why are other airlines flying and you re not so frustrated',
       'am but it says yall are sold out me amp my coworkers would need to get out first available',
       'trying to change family vacation due to measles outbreak and haven been able to get anyone on the phone any help',
       'zz', 'you ve got mess here at dtw but your staff is doing great'],
      dtype=object)

### Feature Extraction

Note: the feature-extraction step below is used only by the shallow-learning models. It is skipped for `distilbert-base-uncased`, `bert-base-uncased` and `roberta-base`.

In [130]:
from sklearn.feature_extraction.text import TfidfVectorizer
from vectorizers import init_vectorizer

# Names accepted by init_vectorizer; 'bow' is the unigram + bigram bag of words
vectorizer_options = ['tfidf', 'word2vec', 'bow']

# Choose the feature extractor by its position in the list above
vectorizer_option = vectorizer_options[2]
vectorizer = init_vectorizer(vectorizer_option)
print(vectorizer)

CountVectorizer(ngram_range=(1, 2))


In [131]:
# Fit the vectorizer on the training texts only
vectorizer.fit(train_texts)

# Encode both splits with the fitted vectorizer
train_vector, test_vector = (
    vectorizer.transform(texts) for texts in (train_texts, test_texts)
)

train_vector

<100x1745 sparse matrix of type '<class 'numpy.int64'>'
	with 2780 stored elements in Compressed Sparse Row format>

### Select model and train

In [132]:
from algorithms import init_trainer

# Models trained on the vectorized features
shallow_learning_options = [
    "linear_svm_tfidf", "linear_svm_word2vec", "logistic_regressor_word2vec",
    "naive_bayes", "random_forest",
]

# Models for which the feature-extraction step is skipped
bert_options = ["distilbert-base-uncased", "bert-base-uncased", "roberta-base"]

# NOTE(review): the option name says "tfidf", but the vectorizer selected in the
# feature-extraction cell is 'bow' -- confirm the pairing is intentional.
option = shallow_learning_options[0]
print(option)

trainer = init_trainer(option)
print(trainer)

linear_svm_tfidf
<algorithms.linear_svm.LinearSVM object at 0x7f3417d9c610>


In [133]:
# Train the selected model. Shallow models are trained on the vectorized
# features, and the vectorizer is passed to save_model together with the
# output name "<option>-example".
if option in shallow_learning_options:
    trainer.train(train_vector, train_labels)
    # save model and vectorizer
    trainer.save_model(vectorizer=vectorizer, output_model_name=f"{option}-example")
elif option in bert_options:
    # BERT-style options do not require the feature-extraction steps.
    # NOTE(review): `train_data` is not defined in any cell visible in this
    # notebook; picking a BERT option would raise NameError here unless it is
    # created elsewhere -- confirm.
    trainer.train(train_data)

Model has been saved to models/linear_svm_tfidf-example


  sentence_vector = self.vectorizer.transform([processed_sentence])


### Evaluate model on test dataset

In [134]:
# Evaluate the trained model on the held-out split. Shallow models are scored
# on the vectorized test features and their integer labels.
if option in shallow_learning_options:
    trainer.evaluate(test_vector, test_labels)
elif option in bert_options:
    # NOTE(review): `test_data` is not defined in any cell visible in this
    # notebook; picking a BERT option would raise NameError here unless it is
    # created elsewhere -- confirm.
    trainer.evaluate(test_data)

Accuracy: micro: 0.610
Precision: micro-macro: 0.610-0.617
Recall: micro-macro: 0.610-0.594
F1: micro-macro: 0.610-0.594
              precision    recall  f1-score   support

    negative       0.60      0.74      0.67        35
     neutral       0.60      0.63      0.62        38
    positive       0.65      0.41      0.50        27

    accuracy                           0.61       100
   macro avg       0.62      0.59      0.59       100
weighted avg       0.61      0.61      0.60       100

