In [1]:
import torch
import pandas as pd
import numpy as np
from google.colab import drive
# Mount Google Drive at /content/drive (Colab only); the file paths used in this
# notebook all live under that mount point.
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
import numpy as np

def load_glove_embeddings(filepath):
    """Load GloVe word vectors from a whitespace-delimited text file.

    Each non-blank line is expected to hold a token followed by its vector
    components, e.g. ``the 0.418 0.24968 ...``.

    Args:
        filepath: Path to the GloVe ``.txt`` file (read as UTF-8).

    Returns:
        dict mapping each token (str) to a 1-D ``float32`` numpy array.
        If a token appears on several lines, the last occurrence wins.

    Raises:
        ValueError: If a vector component cannot be parsed as a float.
    """
    embeddings = {}
    with open(filepath, "r", encoding="utf-8") as f:
        for line in f:
            values = line.split()
            # Blank / whitespace-only lines carry no token; skip them rather
            # than failing with IndexError when reading the first field.
            if not values:
                continue
            word, *components = values
            embeddings[word] = np.asarray(components, dtype="float32")
    return embeddings

# 100-dimensional GloVe vectors stored on the mounted Drive; parsed into a
# {token: float32 vector} dict by load_glove_embeddings.
glove_100_path = "/content/drive/My Drive/LIN371/glove.6B.100d.txt"
glove_100_embeddings = load_glove_embeddings(glove_100_path)


In [5]:
# average the glove embeddings of the words to get sentence level representation
def text_to_avg_embedding(text, embeddings, embedding_dim=100):
    """Represent a text as the mean of its tokens' embedding vectors.

    The text is split on whitespace and tokens absent from ``embeddings`` are
    ignored. Lookup is exact: no lowercasing or punctuation stripping is done.

    Args:
        text: The document to embed. Non-string values (e.g. ``None`` or the
            float ``NaN`` that pandas produces for an empty CSV cell) are
            treated as empty text.
        embeddings: Mapping from token to a 1-D numpy vector.
        embedding_dim: Length of the zero vector returned when no token of
            ``text`` is found in ``embeddings``.

    Returns:
        1-D numpy array: the element-wise mean of the matched vectors, or
        ``np.zeros(embedding_dim)`` when nothing matched.
    """
    # A missing value has no .split(); fall back to the zero vector instead
    # of raising AttributeError in the middle of a batch conversion.
    if not isinstance(text, str):
        return np.zeros(embedding_dim)
    vectors = [embeddings[word] for word in text.split() if word in embeddings]
    if not vectors:
        return np.zeros(embedding_dim)
    return np.mean(vectors, axis=0)



In [9]:
# Load the already-split train and test sets from the mounted Drive.
train_df = pd.read_csv('/content/drive/My Drive/LIN371/train_df.csv')
test_df = pd.read_csv('/content/drive/My Drive/LIN371/test_df.csv')


In [16]:
# Pull the 'body' column (model input) and the 'label' column (target) out of
# each frame as plain Python lists.
X_train_1 = train_df['body'].tolist()
y_train_1 = train_df['label'].tolist()
X_test_1 = test_df['body'].tolist()
y_test_1 = test_df['label'].tolist()

In [17]:
# Build the 100-d feature matrices: every document is represented by the mean
# of the GloVe vectors of its in-vocabulary tokens.
embedding_dim = 100

# Targets as numpy arrays.
y_train = np.array(y_train_1)
y_test = np.array(y_test_1)

# One averaged embedding per document.
X_train = np.array(
    [text_to_avg_embedding(doc, glove_100_embeddings, embedding_dim) for doc in X_train_1]
)
X_test = np.array(
    [text_to_avg_embedding(doc, glove_100_embeddings, embedding_dim) for doc in X_test_1]
)

In [18]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# Baseline classifier: logistic regression with scikit-learn's default
# hyperparameters, fit on the 100-d averaged GloVe features.
clf = LogisticRegression()
clf.fit(X_train, y_train)

In [23]:
from sklearn.metrics import f1_score, accuracy_score, classification_report
# Evaluate the 100-d model on the held-out test set: overall accuracy, F1 as
# computed by f1_score with its default arguments, and the per-class report.
preds = clf.predict(X_test)
print("test accuracy", accuracy_score(y_test, preds))
print("test f1", f1_score(y_test, preds))
print(classification_report(y_test, preds))

test accuracy 0.73256054647071
test f1 0.7198612315698178
              precision    recall  f1-score   support

           0       0.71      0.78      0.74      2416
           1       0.76      0.69      0.72      2415

    accuracy                           0.73      4831
   macro avg       0.73      0.73      0.73      4831
weighted avg       0.73      0.73      0.73      4831



In [22]:
# 300-dimensional GloVe vectors, loaded the same way as the 100-d set so the
# two embedding sizes can be compared.
glove_300_path = "/content/drive/My Drive/LIN371/glove.6B.300d.txt"
glove_300_embeddings = load_glove_embeddings(glove_300_path)

In [24]:
# Build the 300-d feature matrices: every document is represented by the mean
# of the GloVe vectors of its in-vocabulary tokens.
embedding_dim = 300

# Targets as numpy arrays.
y_train_300 = np.array(y_train_1)
y_test_300 = np.array(y_test_1)

# One averaged embedding per document.
X_train_300 = np.array(
    [text_to_avg_embedding(doc, glove_300_embeddings, embedding_dim) for doc in X_train_1]
)
X_test_300 = np.array(
    [text_to_avg_embedding(doc, glove_300_embeddings, embedding_dim) for doc in X_test_1]
)

In [25]:
# Sanity check: one row per training text, embedding_dim (300) columns.
X_train_300.shape

(9662, 300)

In [26]:

# Same default logistic regression as before, now fit on the 300-d features.
clf_300 = LogisticRegression()
clf_300.fit(X_train_300, y_train_300)

In [28]:
from sklearn.metrics import f1_score, accuracy_score, classification_report
# Evaluate the 300-d model on the held-out test set.
# NOTE: this rebinds `preds`, replacing whatever predictions it held before.
preds = clf_300.predict(X_test_300)
print("test accuracy", accuracy_score(y_test_300, preds))
print("test f1", f1_score(y_test_300, preds))
print(classification_report(y_test_300, preds))

test accuracy 0.7600910784516663
test f1 0.7480982395131494
              precision    recall  f1-score   support

           0       0.74      0.81      0.77      2416
           1       0.79      0.71      0.75      2415

    accuracy                           0.76      4831
   macro avg       0.76      0.76      0.76      4831
weighted avg       0.76      0.76      0.76      4831



In [30]:
# Training-set metrics for the 300-d model, for comparison with its test metrics.
# Predict once and reuse the result for all three metrics instead of re-running
# inference over the full training matrix for each one.
train_preds_300 = clf_300.predict(X_train_300)
print('train accuracy', accuracy_score(y_train_300, train_preds_300))
print('train f1', f1_score(y_train_300, train_preds_300))
print(classification_report(y_train_300, train_preds_300))

train accuracy 0.7745808321258538
train f1 0.7645405405405405
              precision    recall  f1-score   support

           0       0.75      0.82      0.78      4831
           1       0.80      0.73      0.76      4831

    accuracy                           0.77      9662
   macro avg       0.78      0.77      0.77      9662
weighted avg       0.78      0.77      0.77      9662

