In [1]:
%config Completer.use_jedi = False

In [2]:
import html
import os
import re
from datetime import datetime
from typing import List, Optional, Union

import numpy as np
import pandas as pd
import spacy
from sklearn.base import TransformerMixin
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.model_selection import train_test_split
from spacy.util import compounding, minibatch

In [3]:
spacy.__version__

'2.3.5'

In [4]:
spacy.require_gpu()

True

## Utils

In [5]:
class SpacyTokenTransformer(TransformerMixin):
    __symbols = set("!$%^&*()_+|~-=`{}[]:\";'<>?,./-")

    def transform(self, X: np.ndarray, **kwargs) -> np.ndarray:
        f = np.vectorize(
            SpacyTokenTransformer.transform_to_tokens, otypes=[np.object]
        )
        X_tokenized = f(X)

        return X_tokenized

    def fit(self, X, y=None, **fit_params):
        return self

    @staticmethod
    def transform_to_tokens(text: np.str) -> List[str]:
        str_text = str(text)
        doc: Doc = nlp(str_text)
        tokens: List[str] = []
        tok: Token
        for tok in doc:
            clean_token: str
            if tok.like_url:
                clean_token = "URL"
            else:
                clean_token = (
                    tok.lemma_.strip().lower()
                )  # if tok.lemma_ != '-PRON-' else tok.lower_
                if (
                    len(clean_token) < 1
                    or clean_token in SpacyTokenTransformer.__symbols
                    or clean_token in STOP_WORDS
                ):
                    continue

            tokens.append(clean_token)

        return tokens


class CleanTextTransformer(TransformerMixin):
    __uplus_pattern = re.compile("\<[uU]\+(?P<digit>[a-zA-Z0-9]+)\>")
    __markup_link_pattern = re.compile("\[(.*)\]\((.*)\)")

    def transform(self, X: np.ndarray, **kwargs) -> np.ndarray:
        f = np.vectorize(CleanTextTransformer.transform_clean_text)
        X_clean = f(X)

        return X_clean

    def fit(self, X, y=None, **fit_params):
        return self

    @staticmethod
    def transform_clean_text(raw_text: str):
        try:
            decoded = raw_text.encode("ISO-8859-1").decode("utf-8")
        except:
            decoded = raw_text.encode("ISO-8859-1").decode("cp1252")

        html_unescaped = html.escape(decoded)
        html_unescaped = re.sub(r"\r\n", " ", html_unescaped)
        html_unescaped = re.sub(r"\r\r\n", " ", html_unescaped)
        html_unescaped = re.sub(r"\r", " ", html_unescaped)
        html_unescaped = html_unescaped.replace("&gt;", " > ")
        html_unescaped = html_unescaped.replace("&lt;", " < ")
        html_unescaped = html_unescaped.replace("--", " - ")
        html_unescaped = CleanTextTransformer.__uplus_pattern.sub(
            " U\g<digit>", html_unescaped
        )
        html_unescaped = CleanTextTransformer.__markup_link_pattern.sub(
            " \1 \2", html_unescaped
        )
        html_unescaped = html_unescaped.replace("\\", "")

        return html_unescaped


class TextPreprocessor:
    __slots__ = "_html_cleaner", "_tokenizer"

    def __init__(self):
        self._html_cleaner = CleanTextTransformer()
        self._tokenizer = SpacyTokenTransformer()

    def clean_and_tokenize(
        self, txt: Union[str, List[str], np.ndarray]
    ) -> np.ndarray:
        if isinstance(txt, str):
            txt = [txt]

        if isinstance(txt, list):
            txt = np.array(txt, dtype=np.object)

        if not (isinstance(txt, np.ndarray) and txt.dtype == np.object):
            raise ValueError(
                "Input `txt` must be a string, list of strings, or numpy array of type object"
            )

        txt_processed = self._html_cleaner.transform(txt)
        txt_processed = self._tokenizer.transform(txt_processed)

        return txt_processed

    def clean_single(self, txt: str) -> str:
        return self._html_cleaner.transform_clean_text(txt)

    def tokenize_single(self, clean_text: str) -> List[str]:
        return self._tokenizer.transform_to_tokens(clean_text)

    def clean_and_tokenize_single(self, txt: str) -> List[str]:
        txt_proc = self.clean_single(txt)
        txt_proc = self.tokenize_single(txt)

        return txt_proc

# Data

In [6]:
dataPth = "../data/reddit_200k_train.csv"

In [7]:
!head -n 4 {dataPth}

"","body","score.x","parent_id.x","id","created_utc.x","retrieved_on","REMOVED"
"1","I've always been taught it emerged from the earth after an impace. That is why it has similar elemental distribution to earth",2,"t3_81u15i","dv551g6",1520121101,1524782256,FALSE
"2","As an ECE, my first feeling as ""HEY THAT'S NOT-"" and then I thought about all the times my co-workers couldn't even write a simple message in our communication book without making mistakes. 



In [8]:
df_cols = [
    "body",
    "score_x",
    "parent_idx",
    "id",
    "created_utc.x",
    "retrieved_on",
    "removed",
]

TEXT_COL = "body"
LABEL_COL = "removed"

to_remove = list(set(df_cols) - {TEXT_COL, LABEL_COL})

to_remove

['retrieved_on', 'parent_idx', 'created_utc.x', 'id', 'score_x']

In [9]:
# df = pd.read_csv(dataPth, names=df_cols, skiprows=1, encoding="ISO-8859-1")

# df.drop(columns=to_remove, inplace=True)
# df.reset_index(drop=True, inplace=True)

df = pd.read_feather("reddit_200k_train.feather")

In [10]:
df.head()

Unnamed: 0,body,removed
0,I've always been taught it emerged from the ea...,False
1,"As an ECE, my first feeling as ""HEY THAT'S NOT...",True
2,Monday: Drug companies stock dives on good new...,True
3,i learned that all hybrids are unfertile i won...,False
4,Well i was wanting to get wasted tonight. Not...,False


In [11]:
X = df["body"].values
y = df["removed"].values.astype(np.bool)

In [12]:
X_clean = X.copy()

In [13]:
for i, val in enumerate(X):
    X_clean[i] = CleanTextTransformer.transform_clean_text(val)

In [14]:
cats = [{"REMOVED": l, "NOTREMOVED": not l} for l in y]

In [15]:
trainX, testX, trainY, testY = train_test_split(
    X_clean, cats, stratify=y, test_size=0.1, shuffle=True
)

## Spacy TextCat

In [16]:
hyperparams = {
    "arch": "simple_cnn",
    "epochs": 7,
    "batch_args": (64, 512, 1.001),
}

In [17]:
import random

In [18]:
def evaluate(tokenizer, textcat, texts, cats):
    docs = (tokenizer(text) for text in texts)
    tp = 0.0  # True positives
    fp = 1e-8  # False positives
    fn = 1e-8  # False negatives
    tn = 0.0  # True negatives
    for i, doc in enumerate(textcat.pipe(docs)):
        gold = cats[i]
        for label, score in doc.cats.items():
            if label not in gold:
                continue
            if label == "NEGATIVE":
                continue
            if score >= 0.5 and gold[label] >= 0.5:
                tp += 1.0
            elif score >= 0.5 and gold[label] < 0.5:
                fp += 1.0
            elif score < 0.5 and gold[label] < 0.5:
                tn += 1
            elif score < 0.5 and gold[label] >= 0.5:
                fn += 1
    precision = tp / (tp + fp)
    recall = tp / (tp + fn)
    if (precision + recall) == 0:
        f_score = 0.0
    else:
        f_score = 2 * (precision * recall) / (precision + recall)
    return {"textcat_p": precision, "textcat_r": recall, "textcat_f": f_score}

In [19]:
cat_nlp = spacy.blank("en")

cat_nlp.pipe_names

[]

In [20]:
textcat = cat_nlp.create_pipe(
    "textcat",
    config={"exclusive_classes": True, "architecture": hyperparams["arch"]},
)

In [21]:
cat_nlp.add_pipe(textcat, last=True)

In [22]:
cat_nlp.pipe_names

['textcat']

In [23]:
textcat.add_label("REMOVED")
textcat.add_label("NOTREMOVED")

1

In [24]:
train_data = list(zip(trainX, [{"cats": cats} for cats in trainY]))

In [25]:
train_data[0]

('I thought Asian people were more likely to be lactose intolerant?',
 {'cats': {'REMOVED': False, 'NOTREMOVED': True}})

## Neptune

In [26]:
import neptune

In [27]:
proj = neptune.init("volfy/testing", api_token=os.getenv("NEPTUNE_API_TOKEN"))

proj

Project(volfy/testing)

## Training

In [28]:
batch_sizes = compounding(*hyperparams["batch_args"])

In [29]:
opt = cat_nlp.begin_training()

In [30]:
exp = proj.create_experiment(
    name=f"spacy_textcat_{hyperparams['arch']}_ep{hyperparams['epochs']}",
    params=hyperparams,
    tags=[
        "spacy",
        "textcat",
        "v2.3",
        str(hyperparams["arch"]),
        str(hyperparams["epochs"]),
    ],
    notebook_id="0c3b24b0-cc21-4eb9-a5d8-ce2c73c6e8c7",
)

https://ui.neptune.ai/volfy/testing/e/TES-20


In [31]:
heading = "{:^5}\t{:^5}\t{:^5}\t{:^5}\t{:^5}\t{:^5}".format(
    "EPOCH", "LOSS", "P", "R", "F", "TIME"
)
print(heading)
exp.log_text("train_log", heading)
for i in range(hyperparams["epochs"]):
    losses = {}

    start = datetime.now()
    random.shuffle(train_data)
    batches = minibatch(train_data, size=batch_sizes)

    for batch in batches:
        txts, lbls = zip(*batch)
        cat_nlp.update(txts, lbls, sgd=opt, drop=0.2, losses=losses)

    with textcat.model.use_params(opt.averages):
        scores = evaluate(cat_nlp.tokenizer, textcat, testX, testY)

    taken = (datetime.now() - start).total_seconds()

    exp.log_metric("loss", losses["textcat"])
    exp.log_metric("precision", scores["textcat_p"])
    exp.log_metric("recall", scores["textcat_r"])
    exp.log_metric("f-1_score", scores["textcat_f"])
    exp.log_metric("time", taken)

    to_print = "{0}\t{1:.3f}\t{2:.3f}\t{3:.3f}\t{4:.3f}\t{5:.3f} s".format(  # print a simple table
        i + 1,
        losses["textcat"],
        scores["textcat_p"],
        scores["textcat_r"],
        scores["textcat_f"],
        taken,
    )
    exp.log_text("train_log", to_print)
    print(to_print)

EPOCH	LOSS 	  P  	  R  	  F  	TIME 
1	0.048	0.685	0.685	0.685	241.325 s
2	0.003	0.691	0.691	0.691	232.485 s
3	0.001	0.692	0.692	0.692	227.624 s
4	0.000	0.691	0.691	0.691	218.884 s
5	0.000	0.691	0.691	0.691	227.564 s
6	0.000	0.689	0.689	0.689	225.327 s
7	0.000	0.687	0.687	0.687	223.214 s


In [32]:
exp.stop()

In [33]:
exp.set_property("arch", hyperparams["arch"].upper());

In [34]:
y_true = np.empty_like(testY, dtype=np.bool)
y_pred = np.empty_like(y_true)

In [35]:
for i, cats in enumerate(testY):
    y_true[i] = cats["REMOVED"]

In [36]:
for i, doc in enumerate(cat_nlp.pipe(testX)):
    y_pred[i] = doc.cats["REMOVED"] > doc.cats["NOTREMOVED"]

In [37]:
acc = accuracy_score(y_true, y_pred)

acc

0.6871008177639826

In [38]:
auc = roc_auc_score(y_true, y_pred)

auc

0.6575313354264942

In [39]:
exp.log_text("auc", f"{auc:.5f}")
exp.log_text("acc", f"{acc*100:.5f}")

In [40]:
exp.set_property("auc", f"{auc:.5f}")
exp.set_property("acc", f"{acc*100:.5f}");

In [41]:
nm = f"spacy_{hyperparams['arch']}_ep{hyperparams['epochs']}.ml"

nm

'spacy_simple_cnn_ep7.ml'

In [42]:
cat_nlp.to_disk(nm)

In [43]:
!zip -9 {nm}.zip {nm}/*

  adding: spacy_simple_cnn_ep7.ml/meta.json (deflated 50%)
  adding: spacy_simple_cnn_ep7.ml/textcat/ (stored 0%)
  adding: spacy_simple_cnn_ep7.ml/tokenizer (deflated 84%)
  adding: spacy_simple_cnn_ep7.ml/vocab/ (stored 0%)


In [44]:
exp.log_artifact(nm + ".zip")

## Jovian

In [45]:
import jovian

<IPython.core.display.Javascript object>

In [46]:
jovian.reset()

In [47]:
jovian.log_hyperparams(hyperparams)

[jovian] Hyperparams logged.[0m


In [48]:
jovian.log_metrics({"auc": auc, "accuracy": acc})

[jovian] Metrics logged.[0m


In [49]:
jovian.commit(
    filename="spacy_model.ipynb",
    environment=None,
    outputs=[nm + ".zip"],
    message=f"{hyperparams['arch']} Ep{hyperparams['epochs']} training",
)

<IPython.core.display.Javascript object>

[jovian] Attempting to save notebook..[0m
[jovian] Updating notebook "volf52/spacy-text-classification" on https://jovian.ai/[0m
[jovian] Uploading notebook..[0m
[jovian] Uploading additional outputs...[0m
[jovian] Attaching records (metrics, hyperparameters, dataset etc.)[0m
[jovian] Committed successfully! https://jovian.ai/volf52/spacy-text-classification[0m


'https://jovian.ai/volf52/spacy-text-classification'