In [None]:
import pandas as pd
from google.colab import drive

# Make Google Drive visible inside the Colab runtime; the CSV files are read from it.
drive.mount('/content/drive')

# Single place to edit if the data folder moves (previously repeated in every path).
DATA_DIR = "/content/drive/My Drive/LIN371"

# Pre-made train / validation / test splits, one CSV per split.
train = pd.read_csv(f"{DATA_DIR}/train_df.csv")
val = pd.read_csv(f"{DATA_DIR}/val_df.csv")
test = pd.read_csv(f"{DATA_DIR}/test_df.csv")

Mounted at /content/drive


In [None]:
# Document texts ('body') and targets ('label') of each split as NumPy arrays.
docs_train, docs_val, docs_test = train['body'].values, val['body'].values, test['body'].values
y_train, y_val, y_test = train['label'].values, val['label'].values, test['label'].values

# Four-category split of the test set, keyed on the 'new_label' column.
# The intermediate frame gets its own name so that `categories` is only ever
# bound to one kind of object (the list built at the end of this cell).
test_by_category = test[['body', 'label', 'new_label']]

cs_nonsfw = test_by_category[test_by_category['new_label'] == 'control_source_no_explicit_words']
cs_nsfw = test_by_category[test_by_category['new_label'] == 'control_source_has_explicit_words']
es_nonsfw = test_by_category[test_by_category['new_label'] == 'explicit_source_no_explicit_words']
es_nsfw = test_by_category[test_by_category['new_label'] == 'explicit_source_has_explicit_words']

# Read by evaluate_cvect_cats, which reports accuracy for each subset in this order.
categories = [cs_nonsfw, cs_nsfw, es_nonsfw, es_nsfw]

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import f1_score, accuracy_score, classification_report

def fit_cvect(model):
  """Train ``model`` on bag-of-words counts of the training documents.

  A new ``CountVectorizer`` with English stop words removed is fitted on the
  module-level ``docs_train``; the resulting count matrix and ``y_train`` are
  handed to ``model.fit``.

  Args:
    model: an estimator exposing ``fit(X, y)``.

  Returns:
    The same object that was passed in, after ``fit`` has been called on it.
  """
  train_counts = CountVectorizer(stop_words='english').fit_transform(docs_train)
  model.fit(train_counts, y_train)
  return model

def evaluate_cvect(model):
  """Print train/val/test accuracy of ``model`` and a test classification report.

  The vectorizer is rebuilt here with the same settings as in ``fit_cvect``
  and fitted on ``docs_train`` again, so the feature columns line up with the
  ones the model was trained on.

  Args:
    model: an already fitted estimator exposing ``predict(X)``.

  Returns:
    None; all results are printed.
  """
  vectorizer = CountVectorizer(stop_words='english')

  # fit_transform must be evaluated first: it learns the vocabulary that the
  # two transform calls after it rely on.
  splits = [
      ("train accuracy", vectorizer.fit_transform(docs_train), y_train),
      ("val accuracy", vectorizer.transform(docs_val), y_val),
      ("test accuracy", vectorizer.transform(docs_test), y_test),
  ]

  for heading, features, targets in splits:
    predictions = model.predict(features)
    print(heading, accuracy_score(targets, predictions))

  # The test split is last in `splits`, so `predictions` now holds its output.
  print(classification_report(y_test, predictions))

def evaluate_cvect_cats(model, n_examples=5):
  """Print per-category accuracy and a few misclassified documents.

  Iterates over the module-level ``categories`` list (DataFrames with 'body',
  'label' and 'new_label' columns), vectorizes each subset with a
  ``CountVectorizer`` fitted on ``docs_train``, and reports how ``model``
  performs on it.

  Args:
    model: an already fitted estimator exposing ``predict(X)``.
    n_examples: maximum number of misclassified documents to print per
      category (defaults to 5, the previously hard-coded value).

  Returns:
    None; all results are printed.
  """
  vect = CountVectorizer(stop_words='english')
  vect.fit(docs_train)
  for cat in categories:
    if len(cat) == 0:
      # An empty subset has no first row to read the category name from
      # (cat.iloc[0] would raise IndexError) and nothing to score.
      print("(empty category) skipped")
      print()
      continue

    docs = cat['body'].values
    labels = cat['label'].values
    preds = model.predict(vect.transform(docs))
    wrong = (preds != labels)  # boolean mask over the rows of this category

    # Every row in `cat` shares one 'new_label'; the first row supplies the name.
    print(cat.iloc[0]['new_label'], accuracy_score(labels, preds))
    print("missed examples:")
    print("==========================")
    for doc, pred in zip(docs[wrong][:n_examples], preds[wrong][:n_examples]):
      # Document text followed by the (incorrect) prediction, right-aligned.
      print(f"{doc}{pred:>10}")
    print()

def evaluate_text_cvect(model, docs):
  """Predict labels for one document or a collection of documents.

  Args:
    model: an already fitted estimator exposing ``predict(X)``.
    docs: a single string, or an iterable of strings (list, tuple, NumPy
      array, pandas Series, ...).

  Returns:
    Whatever ``model.predict`` returns for the vectorized input, one
    prediction per document.
  """
  # Only a bare string needs wrapping. The earlier `not isinstance(docs, list)`
  # check also wrapped tuples/arrays/Series into a one-element list, which the
  # vectorizer then tried to treat as a single document.
  if isinstance(docs, str):
    docs = [docs]

  # Same vectorizer settings and training corpus as fit_cvect, so the feature
  # columns match the ones the model was trained on.
  vect = CountVectorizer(stop_words='english')
  vect.fit(docs_train)
  X_in = vect.transform(docs)
  return model.predict(X_in)


In [None]:
from sklearn.linear_model import LogisticRegression
# Logistic-regression baseline on bag-of-words counts (random_state pinned to 3).
# fit_cvect returns the estimator it was given, so construction and fitting chain.
lr_cvect = fit_cvect(LogisticRegression(random_state=3))
evaluate_cvect(lr_cvect)
# Per-category breakdown, currently disabled:
# evaluate_cvect_cats(lr_cvect)

# train accuracy 0.9476298902918651
# val accuracy 0.8068722831711861
# test accuracy 0.8045953218795281
# control_source_no_explicit_words 0.7753589624826308
# control_source_has_explicit_words 0.6964980544747081
# explicit_source_no_explicit_words 0.8090575275397797
# explicit_source_has_explicit_words 0.911651728553137

train accuracy 0.9476298902918651
val accuracy 0.8068722831711861
test accuracy 0.8045953218795281
              precision    recall  f1-score   support

           0       0.83      0.77      0.80      2416
           1       0.78      0.84      0.81      2415

    accuracy                           0.80      4831
   macro avg       0.81      0.80      0.80      4831
weighted avg       0.81      0.80      0.80      4831



In [None]:
from sklearn.naive_bayes import MultinomialNB
# Multinomial naive-Bayes baseline on the same bag-of-words counts.
# fit_cvect returns the estimator it was given, so construction and fitting chain.
nb_cvect = fit_cvect(MultinomialNB())
evaluate_cvect(nb_cvect)
# Per-category breakdown, currently disabled:
# evaluate_cvect_cats(nb_cvect)

# train accuracy 0.8930863175326019
# val accuracy 0.817429103705237
# test accuracy 0.8091492444628441
# control_source_no_explicit_words 0.8364983788791107
# control_source_has_explicit_words 0.7315175097276264
# explicit_source_no_explicit_words 0.7343941248470012
# explicit_source_has_explicit_words 0.9154929577464789

train accuracy 0.8930863175326019
val accuracy 0.817429103705237
test accuracy 0.8091492444628441
              precision    recall  f1-score   support

           0       0.80      0.83      0.81      2416
           1       0.82      0.79      0.81      2415

    accuracy                           0.81      4831
   macro avg       0.81      0.81      0.81      4831
weighted avg       0.81      0.81      0.81      4831



In [None]:
# Spot-check the logistic-regression model on one hand-written sentence.
# The call stays the last expression so the notebook displays the prediction.
sample_text = "It's like 199 degrees, when you're doing it with me doing it with me."
evaluate_text_cvect(lr_cvect, sample_text)

array([0])