In [1]:
import numpy as np
import pandas as pd
import plac
import spacy
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.model_selection import train_test_split
from spacy.util import compounding, minibatch

from modcom.ml import TextPreprocessor

In [2]:
spacy.require_gpu()

True

# Data

In [3]:
df_cols = ["prev_idx", "parent_idx", "body", "removed"]

TEXT_COL = "body"
CLEAN_COL = "clean_body"
TOK_COL = "token_body"

In [4]:
df = pd.read_csv(
    "data/reddit_train.csv", names=df_cols, skiprows=1, encoding="ISO-8859-1"
)

df.drop(columns=df_cols[:2], inplace=True)

In [5]:
df.head()

Unnamed: 0,body,removed
0,Always be wary of news articles that cite unpu...,0
1,The problem I have with this is that the artic...,0
2,"This is indicative of a typical power law, and...",0
3,This doesn't make sense. Chess obviously trans...,0
4,1. I dispute that gene engineering is burdenso...,0


In [16]:
cleaner = TextPreprocessor()

In [9]:
X = df["body"].values
y = df["removed"].values.astype(np.bool)

cats = [{"REMOVED": l, "NOTREMOVED": not l} for l in y]

trainX, testX, trainY, testY = train_test_split(
    X_clean, cats, stratify=y, test_size=0.1, shuffle=True
)

## Spacy TextCat

In [10]:
import random

In [11]:
def evaluate(tokenizer, textcat, texts, cats):
    docs = (tokenizer(text) for text in texts)
    tp = 0.0  # True positives
    fp = 1e-8  # False positives
    fn = 1e-8  # False negatives
    tn = 0.0  # True negatives
    for i, doc in enumerate(textcat.pipe(docs)):
        gold = cats[i]
        for label, score in doc.cats.items():
            if label not in gold:
                continue
            if label == "NEGATIVE":
                continue
            if score >= 0.5 and gold[label] >= 0.5:
                tp += 1.0
            elif score >= 0.5 and gold[label] < 0.5:
                fp += 1.0
            elif score < 0.5 and gold[label] < 0.5:
                tn += 1
            elif score < 0.5 and gold[label] >= 0.5:
                fn += 1
    precision = tp / (tp + fp)
    recall = tp / (tp + fn)
    if (precision + recall) == 0:
        f_score = 0.0
    else:
        f_score = 2 * (precision * recall) / (precision + recall)
    return {"textcat_p": precision, "textcat_r": recall, "textcat_f": f_score}

In [20]:
cat_nlp = spacy.blank("en")

cat_nlp.pipe_names

[]

In [21]:
textcat = cat_nlp.create_pipe(
    "textcat", config={"exclusive_classes": True, "architecture": "simple_cnn"}
)

In [22]:
cat_nlp.add_pipe(textcat, last=True)

In [23]:
cat_nlp.pipe_names

['textcat']

In [24]:
textcat.add_label("REMOVED")
textcat.add_label("NOTREMOVED")

1

In [25]:
train_data = list(zip(trainX, [{"cats": cats} for cats in trainY]))

In [26]:
train_data[0]

("can someone explain what the problem with voter id is?\r\n\r\nin my country you get a voter id when you are 16, your voting location is based on the place you live, and you never have to worry about it again, just bring your voter ID or national ID when it's time to vote. i never seen it being brought up as an issue",
 {'cats': {'REMOVED': False, 'NOTREMOVED': True}})

In [27]:
cat_nlp.pipe_names

['textcat']

In [30]:
opt = cat_nlp.begin_training()

In [31]:
batch_sizes = compounding(4.0, 32.0, 1.001)

In [32]:
print("{:^5}\t{:^5}\t{:^5}\t{:^5}".format("LOSS", "P", "R", "F"))
for i in range(20):
    losses = {}
    random.shuffle(train_data)
    batches = minibatch(train_data, size=batch_sizes)

    for batch in batches:
        txts, lbls = zip(*batch)
        cat_nlp.update(txts, lbls, sgd=opt, drop=0.2, losses=losses)

    with textcat.model.use_params(opt.averages):
        scores = evaluate(cat_nlp.tokenizer, textcat, testX, testY)

    print(
        "{0:.3f}\t{1:.3f}\t{2:.3f}\t{3:.3f}".format(  # print a simple table
            losses["textcat"],
            scores["textcat_p"],
            scores["textcat_r"],
            scores["textcat_f"],
        )
    )

LOSS 	  P  	  R  	  F  
14.846	0.737	0.737	0.737
0.236	0.746	0.746	0.746
0.250	0.746	0.746	0.746
0.217	0.740	0.740	0.740
0.150	0.738	0.738	0.738
0.154	0.732	0.732	0.732
0.251	0.730	0.730	0.730
0.149	0.729	0.729	0.729
0.092	0.727	0.727	0.727
0.222	0.725	0.725	0.725
0.241	0.729	0.729	0.729
0.067	0.727	0.727	0.727
0.057	0.724	0.724	0.724
0.079	0.727	0.727	0.727
0.283	0.722	0.722	0.722
0.051	0.721	0.721	0.721
0.043	0.726	0.726	0.726
0.046	0.728	0.728	0.728
0.038	0.716	0.716	0.716
0.038	0.713	0.713	0.713


In [33]:
y_true = np.empty_like(testY, dtype=np.bool)
y_pred = np.empty_like(y_true)

In [34]:
for i, cats in enumerate(testY):
    y_true[i] = cats['REMOVED']

In [35]:
for i, doc in enumerate(cat_nlp.pipe(testX)):
    y_pred[i] = doc.cats['REMOVED'] > doc.cats['NOTREMOVED']

In [36]:
accuracy_score(y_true, y_pred)

0.7141518275538894

In [37]:
roc_auc_score(y_true, y_pred)

0.6478313306380169

In [40]:
cat_nlp.to_disk('models/spacy_textcat.ml')