In [17]:
import ast

import nltk
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

In [6]:
def load_datasets(data_dir: str = "./new_dataset"):
    """
    Load the preprocessed training and dev (validation) sets.

    @param data_dir    directory holding train_preprocessed.csv and
                       val_preprocessed.csv (defaults to "./new_dataset")
    @ret train, dev    the two CSVs as DataFrames, in that order
    """
    # With the default data_dir these are exactly the original hard-coded paths.
    train = pd.read_csv(f"{data_dir}/train_preprocessed.csv")
    dev = pd.read_csv(f"{data_dir}/val_preprocessed.csv")

    return train, dev

In [7]:
# Read both CSVs once; the feature-building cell below reuses `train` and `dev`.
train, dev = load_datasets()

In [8]:
# Preview the training frame (pandas abbreviates the repr to the first/last rows).
train

Unnamed: 0,text,label,sentiment,Y_hard,Y_soft
0,"""QT @user In the original draft of the 7th boo...",2,positive,"[0.0, 0.0, 1.0]","[0.0014647464267909527, 0.043079327791929245, ..."
1,"""Ben Smith / Smith (concussion) remains out of...",1,neutral,"[0.0, 1.0, 0.0]","[0.20808669924736023, 0.782555341720581, 0.009..."
2,Sorry bout the stream last night I crashed out...,1,neutral,"[0.0, 1.0, 0.0]","[0.3470957279205322, 0.5051978826522827, 0.147..."
3,Chase Headley's RBI double in the 8th inning o...,1,neutral,"[0.0, 1.0, 0.0]","[0.008634913712739944, 0.8800899386405945, 0.1..."
4,@user Alciato: Bee will invest 150 million in ...,2,positive,"[0.0, 0.0, 1.0]","[0.006843153852969408, 0.7086856961250305, 0.2..."
...,...,...,...,...,...
45610,"@user \""""So amazing to have the beautiful Lady...",2,positive,"[0.0, 0.0, 1.0]","[0.0028112607542425394, 0.008069629780948162, ..."
45611,"9 September has arrived, which means Apple's n...",2,positive,"[0.0, 0.0, 1.0]","[0.002179460832849145, 0.12519682943820953, 0...."
45612,Leeds 1-1 Sheff Wed. Giuseppe Bellusci securin...,2,positive,"[0.0, 0.0, 1.0]","[0.004066226538270712, 0.8510255217552185, 0.1..."
45613,@user no I'm in hilton head till the 8th lol g...,1,neutral,"[0.0, 1.0, 0.0]","[0.009541264735162258, 0.49123987555503845, 0...."


In [15]:
def tokenizer(text: str):
    '''
    Lowercase a tweet and split it with NLTK's TweetTokenizer, which is
    configured to strip @handles.

    @param text        string tweet
    @ret tokens        list of tokens
    '''
    tweet_tokenizer = nltk.tokenize.TweetTokenizer(strip_handles=True)
    return tweet_tokenizer.tokenize(text.lower())

In [18]:
# Text inputs and labels for the train and dev splits.
# Missing tweets become empty strings; .fillna() returns a new Series, so the
# `train` / `dev` frames themselves are left unmodified.
Xmat_train = train["text"].fillna('')
# Y_hard holds each one-hot label as a string such as "[0.0, 1.0, 0.0]";
# the classifier therefore treats every distinct string as a class name.
Y_train = train["Y_hard"]

Xmat_dev = dev["text"].fillna('')
Y_dev = dev["Y_hard"]

# Create a Bag of Words representation using the training data *only*.
# token_pattern=None: the custom tokenizer replaces the default regex, and
# passing None avoids scikit-learn's "token_pattern will not be used" warning.
vectorizer = CountVectorizer(tokenizer=tokenizer, token_pattern=None)
X_train_bow = vectorizer.fit_transform(Xmat_train)
X_dev_bow = vectorizer.transform(Xmat_dev)

In [23]:
# Train a Logistic Regression model on the bag-of-words counts.
# `multi_class` is intentionally not passed: 'auto' is already the default and
# the parameter is deprecated in recent scikit-learn releases.
baseline_bow = LogisticRegression(max_iter=2000, solver='lbfgs')
baseline_bow.fit(X_train_bow, Y_train)

# Make predictions
Y_pred_train = baseline_bow.predict(X_train_bow)
Y_pred_dev = baseline_bow.predict(X_dev_bow)

# Evaluate the model (exact-match accuracy on the string-encoded labels)
train_accuracy = accuracy_score(Y_train, Y_pred_train)
dev_accuracy = accuracy_score(Y_dev, Y_pred_dev)

print(f"Train accuracy: {train_accuracy}")
print(f"Dev accuracy: {dev_accuracy}")


Train accuracy: 0.9095472980379261
Dev accuracy: 0.6775


# Test Set

In [37]:
test = pd.read_csv("./new_dataset/test_preprocessed.csv")

# Replace missing tweets with empty strings, matching the train/dev handling,
# so the vectorizer never receives NaN as a document.
Xmat_test = test["text"].fillna('')
Y_test = test["Y_hard"]

# Reuse the vectorizer fitted on the training text; it is not re-fitted here.
X_test_bow = vectorizer.transform(Xmat_test)

In [38]:
# Predict on the test features with the already-fitted baseline and report
# plain accuracy against the test labels.
Y_pred_test = baseline_bow.predict(X_test_bow)
test_accuracy = accuracy_score(Y_test, Y_pred_test)

print(f"Test accuracy: {test_accuracy}")

Test accuracy: 0.5902800390752198


In [48]:
# Inspect the raw label format: each label is a string holding a one-hot list.
Y_test[0]

'[0.0, 1.0, 0.0]'

In [49]:
# Predictions come back in the same string form as the labels.
# NOTE(review): this looks at index 1 while the Y_test inspection cell looks at
# index 0 -- confirm whether the same row was meant to be compared.
Y_pred_test[1]

'[0.0, 1.0, 0.0]'

In [51]:
# Per-class tally over the test split, indexed by the true class
# (the argmax position of the one-hot label).
total = [0, 0, 0]   # number of test examples whose true class is i
wrong = [0, 0, 0]   # how many of those were predicted as some other class
for y, y_pred in zip(Y_test, Y_pred_test):
    # Labels and predictions are strings like "[0.0, 1.0, 0.0]".
    # ast.literal_eval parses list literals only, unlike eval(), which would
    # execute arbitrary code found in the CSV.
    true_class = np.argmax(ast.literal_eval(y))
    pred_class = np.argmax(ast.literal_eval(y_pred))
    total[true_class] += 1
    if true_class != pred_class:
        # if the prediction is wrong
        wrong[true_class] += 1


In [52]:
# Test-example count per true class (list index = argmax of the one-hot label).
total

[3972, 5937, 2375]

In [58]:
# Per-class error rate: misclassified count divided by the class's total.
# Despite the name, the values are fractions, not percentages.
percent_error = [wrong[cls] / total[cls] for cls in range(len(total))]

In [59]:
# Share of misclassified test examples for each true class (wrong / total).
percent_error

[0.5599194360523666, 0.3084049183089102, 0.41178947368421054]