In [1]:
import torch
import pandas as pd
import numpy as np
from google.colab import drive
# Mount Google Drive at /content/drive; the GloVe files and CSVs read later
# in this notebook are addressed by paths under that mount point.
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
import numpy as np

def load_glove_embeddings(filepath):
    """Load whitespace-separated text embeddings into a dictionary.

    Each non-blank line is expected to hold a token followed by its float
    components, all separated by whitespace.

    Args:
        filepath: Path to a UTF-8 encoded text file in that format.

    Returns:
        dict mapping each token (str) to a 1-D float32 ``np.ndarray``. If a
        token appears on more than one line, the last occurrence wins.

    Raises:
        ValueError: If a component cannot be parsed as a float.
    """
    embeddings = {}
    with open(filepath, "r", encoding="utf-8") as f:
        for line in f:
            values = line.split()
            # A blank or whitespace-only line (e.g. a trailing newline at the
            # end of the file) splits to an empty list; skip it rather than
            # fail with IndexError on values[0].
            if not values:
                continue
            word = values[0]
            embeddings[word] = np.array(values[1:], dtype="float32")
    return embeddings

# Load the GloVe vectors stored on the mounted Drive. The file name indicates
# the 100-dimensional "6B" release; this must agree with the embedding_dim
# value passed to text_to_avg_embedding when these vectors are used.
glove_100_path = "/content/drive/My Drive/LIN371/glove.6B.100d.txt"
glove_100_embeddings = load_glove_embeddings(glove_100_path)


In [58]:
import numpy as np
from nltk.tokenize import word_tokenize
import nltk
# Download the 'punkt_tab' tokenizer data into the local NLTK data directory.
# NOTE(review): presumably this is the resource word_tokenize needs — confirm.
nltk.download('punkt_tab')

def text_to_avg_embedding(text, embeddings, embedding_dim=100):
    """Represent a text as the mean of its tokens' embedding vectors.

    Args:
        text: The raw text. Any non-string value (for example ``None`` or the
            float NaN that pandas produces for an empty CSV cell) is treated
            as missing text.
        embeddings: Mapping from token to vector. Tokens are looked up after
            the text has been lowercased.
        embedding_dim: Length of the zero vector returned when there is
            nothing to average; it should equal the length of the vectors in
            ``embeddings``.

    Returns:
        A 1-D array: the element-wise mean of the vectors of every token
        found in ``embeddings``, or ``np.zeros(embedding_dim)`` when the text
        is missing or none of its tokens are in ``embeddings``.
    """
    # Missing text has no .lower(); return the zero vector instead of raising
    # AttributeError part-way through a whole-dataset conversion.
    if not isinstance(text, str):
        return np.zeros(embedding_dim)

    # Tokenize the lowercased text
    tokens = word_tokenize(text.lower())

    # Keep only tokens that have a vector; out-of-vocabulary tokens are ignored
    vectors = [embeddings[word] for word in tokens if word in embeddings]

    # Average or return zero vector
    return np.mean(vectors, axis=0) if vectors else np.zeros(embedding_dim)


[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


In [59]:
# Read the train and test tables from the mounted Drive. Later cells use the
# 'body', 'label' and 'new_label' columns of both frames.
train_df = pd.read_csv('/content/drive/My Drive/LIN371/train_df.csv')
test_df = pd.read_csv('/content/drive/My Drive/LIN371/test_df.csv')


In [60]:
# Text, class label and subgroup tag of every training row, as plain lists.
X_train_1, y_train_1, new_label_train_1 = (
    train_df[column].tolist() for column in ('body', 'label', 'new_label')
)

# The same three columns for the test rows.
X_test_1, y_test_1, new_label_test_1 = (
    test_df[column].tolist() for column in ('body', 'label', 'new_label')
)

In [61]:
# Convert each text to the average of its GloVe token vectors, giving one row
# per text. embedding_dim is only used for the zero-vector fallback and must
# match the vector length in glove_100_embeddings.
embedding_dim = 100
X_train = np.array([text_to_avg_embedding(text, glove_100_embeddings, embedding_dim) for text in X_train_1])
y_train = np.array(y_train_1)
X_test = np.array([text_to_avg_embedding(text, glove_100_embeddings, embedding_dim) for text in X_test_1])
y_test = np.array(y_test_1)

In [62]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# Fit a logistic-regression classifier, with scikit-learn's default
# constructor arguments, on the averaged embeddings built from
# glove_100_embeddings.
clf = LogisticRegression()
clf.fit(X_train, y_train)

In [63]:
from sklearn.metrics import f1_score, accuracy_score, classification_report
# Score the model trained on the glove_100 features against the test set:
# accuracy, F1 (f1_score called with its defaults) and the per-class report.
# NOTE: `preds` is a shared notebook name that later cells overwrite.
preds = clf.predict(X_test)
print("test accuracy", accuracy_score(y_test, preds))
print("test f1", f1_score(y_test, preds))
print(classification_report(y_test, preds))

test accuracy 0.774373835644794
test f1 0.7678875638841567
              precision    recall  f1-score   support

           0       0.76      0.80      0.78      2416
           1       0.79      0.75      0.77      2415

    accuracy                           0.77      4831
   macro avg       0.78      0.77      0.77      4831
weighted avg       0.78      0.77      0.77      4831



In [64]:
# Load a second set of GloVe vectors from the mounted Drive. The file name
# indicates the 300-dimensional "6B" release; this must agree with the
# embedding_dim value used when these vectors are averaged.
glove_300_path = "/content/drive/My Drive/LIN371/glove.6B.300d.txt"
glove_300_embeddings = load_glove_embeddings(glove_300_path)

In [65]:
# Convert each text to the average of its GloVe token vectors, this time with
# glove_300_embeddings. NOTE: this rebinds the notebook-global embedding_dim
# (previously 100); it is only used for the zero-vector fallback and must
# match the vector length in glove_300_embeddings.
embedding_dim = 300
X_train_300 = np.array([text_to_avg_embedding(text, glove_300_embeddings, embedding_dim) for text in X_train_1])
y_train_300 = np.array(y_train_1)
X_test_300 = np.array([text_to_avg_embedding(text, glove_300_embeddings, embedding_dim) for text in X_test_1])
y_test_300 = np.array(y_test_1)

In [66]:
# Sanity check: expect (number of training texts, 300).
X_train_300.shape

(9662, 300)

In [67]:

# Fit a second logistic-regression classifier, again with scikit-learn's
# default constructor arguments, on the features built from
# glove_300_embeddings.
clf_300 = LogisticRegression()
clf_300.fit(X_train_300, y_train_300)

In [68]:
from sklearn.metrics import f1_score, accuracy_score, classification_report
# Score the model trained on the glove_300 features against the test set:
# accuracy, F1 (f1_score called with its defaults) and the per-class report.
# NOTE: this overwrites the `preds` produced by the earlier evaluation cell.
preds = clf_300.predict(X_test_300)
print("test accuracy", accuracy_score(y_test_300, preds))
print("test f1", f1_score(y_test_300, preds))
print(classification_report(y_test_300, preds))

test accuracy 0.7950734837507762
test f1 0.7893617021276595
              precision    recall  f1-score   support

           0       0.78      0.82      0.80      2416
           1       0.81      0.77      0.79      2415

    accuracy                           0.80      4831
   macro avg       0.80      0.80      0.79      4831
weighted avg       0.80      0.80      0.79      4831



In [69]:
# Subgroup: explicit source, text has explicit words.
# Predictions are scored against an all-ones target, i.e. every row of this
# subgroup is taken to belong to class 1.
esew = np.array([
    vec
    for vec, tag in zip(X_test_300, new_label_test_1)
    if tag == 'explicit_source_has_explicit_words'
])
preds = clf_300.predict(esew)
print("test accuracy", accuracy_score(np.ones(len(preds)), preds))

test accuracy 0.8284250960307298


In [70]:
# Subgroup: explicit source, text has no explicit words.
# Predictions are scored against an all-ones target, i.e. every row of this
# subgroup is taken to belong to class 1.
esnw = np.array([
    vec
    for vec, tag in zip(X_test_300, new_label_test_1)
    if tag == 'explicit_source_no_explicit_words'
])
preds = clf_300.predict(esnw)
print("test accuracy", accuracy_score(np.ones(len(preds)), preds))

test accuracy 0.7392900856793145


In [71]:
# Subgroup: control source, text has explicit words.
# Predictions are scored against an all-zeros target, i.e. every row of this
# subgroup is taken to belong to class 0.
csew = np.array([
    vec
    for vec, tag in zip(X_test_300, new_label_test_1)
    if tag == 'control_source_has_explicit_words'
])
preds = clf_300.predict(csew)
print("test accuracy", accuracy_score(np.zeros(len(preds)), preds))


test accuracy 0.77431906614786


In [72]:
# Subgroup: control source, text has no explicit words.
# Predictions are scored against an all-zeros target, i.e. every row of this
# subgroup is taken to belong to class 0.
csnw = np.array([
    vec
    for vec, tag in zip(X_test_300, new_label_test_1)
    if tag == 'control_source_no_explicit_words'
])
preds = clf_300.predict(csnw)
print("test accuracy", accuracy_score(np.zeros(len(preds)), preds))


test accuracy 0.8276980083371931


In [73]:
# train accuracy
# Predict on the training matrix once and reuse the result for all three
# metrics instead of re-running clf_300.predict for each of them.
train_preds_300 = clf_300.predict(X_train_300)
print('train accuracy', accuracy_score(y_train_300, train_preds_300))
print('train f1', f1_score(y_train_300, train_preds_300))
print(classification_report(y_train_300, train_preds_300))

train accuracy 0.8169116125025875
train f1 0.8122280012737502
              precision    recall  f1-score   support

           0       0.80      0.84      0.82      4831
           1       0.83      0.79      0.81      4831

    accuracy                           0.82      9662
   macro avg       0.82      0.82      0.82      9662
weighted avg       0.82      0.82      0.82      9662

