In [56]:
from pathlib import Path

import spacy
nlp = spacy.load("en_core_web_md")

# Retrieve data and create make_doc function to format examples for spacy pipeline

In [100]:
from google.cloud import bigquery
from google.oauth2 import service_account

key_path = "/Users/yco/.dbt/dbt-user-creds.json"
credentials = service_account.Credentials.from_service_account_file(
    key_path#, scopes=["https://www.googleapis.com/auth/cloud-platform"],
)

client = bigquery.Client(credentials=credentials, project=credentials.project_id,)
query = "SELECT * from `reddit_texts.posts_clean`"
query_job = client.query(query)

texts = []
labels = []
ids = []
for row in query_job:
    ids.append(row["id"])
    texts.append(row['text'])
    labels.append(row['subreddit'])

In [12]:
len(texts)

399

In [13]:
cats = set(labels)
cats

{'LanguageTechnology', 'dataengineering', 'datasets'}

In [35]:
from sklearn.model_selection import train_test_split
from spacy.tokens import DocBin

In [117]:
X_train, X_valid, y_train, y_valid = train_test_split(list(zip(texts, ids)), labels, test_size=0.2)
text_train, ids_train = list(zip(*X_train))
train_data = list(zip(text_train, zip(ids_train, y_train)))
text_valid, ids_valid = list(zip(*X_valid))
valid_data = list(zip(text_valid, zip(ids_valid, y_valid)))

In [121]:
type(train_data[0])

tuple

In [119]:
def make_docs(data, tgt_file):
    docs = DocBin()
    for doc, (_, label) in nlp.pipe(data, as_tuples=True):
        for cat in cats:
            doc.cats[cat] = 1 if cat == label else 0
        docs.add(doc)
    docs.to_disk(tgt_file)
    return docs

In [120]:
train_docs = make_docs(train_data, "tmp/train.spacy")

# Train

In [55]:
from spacy.cli.train import train
import datetime

train(
    "../spacy_configs/subreddit_classif/default_textcat.cfg",
    output_path=f"../models/default_textcat/{datetime.date.today().strftime('%Y/%m/%d')}",
    overrides={
        "paths.train": "tmp/train.spacy", 
        "paths.dev": "tmp/valid.spacy", 
    }
)

[38;5;2m✔ Created output directory: ../models/default_textcat/2022/01/21[0m
[38;5;4mℹ Saving to output directory: ../models/default_textcat/2022/01/21[0m
[38;5;4mℹ Using CPU[0m
[1m
[38;5;2m✔ Initialized pipeline[0m
[1m
[38;5;4mℹ Pipeline: ['textcat'][0m
[38;5;4mℹ Initial learn rate: 0.001[0m
E    #       LOSS TEXTCAT  CATS_SCORE  SCORE 
---  ------  ------------  ----------  ------
  0       0          0.67        1.67    0.02
  0     200         89.10       55.92    0.56
  1     400         36.52       74.81    0.75
  2     600         10.67       77.68    0.78
  3     800          3.71       75.40    0.75
  4    1000          2.76       75.77    0.76
  5    1200          4.43       75.77    0.76
  6    1400          0.29       74.74    0.75
  8    1600          4.11       74.92    0.75
 10    1800          4.11       75.90    0.76
 12    2000          0.37       77.49    0.77
 15    2200          0.46       77.11    0.77
[38;5;2m✔ Saved pipeline to output directory[0m

# Predict

In [60]:
models_folder = Path("../models/")
def get_latest_model(model_name: str) -> Path:
    model_folder = models_folder / model_name
    for level in ["year", "month", "day"]:
        model_folder = model_folder / max(x.name for x in model_folder.iterdir() if x.is_dir())
    return model_folder

nlp_textcat = spacy.load(str(get_latest_model("default_textcat") / "model-best"))

In [68]:
for text, cat in valid_data[:3]:
    doc = nlp_textcat(text)
    print("---", doc, cat, doc.cats, sep="\n")

---
Are there any datasets out there of antlered big game? Specifically, I am looking for images of deer/elk/moose/sheep after they've been shot.
datasets
{'dataengineering': 0.22329650819301605, 'datasets': 0.5606533885002136, 'LanguageTechnology': 0.21605007350444794}
---
e.g. Top Medium authors, blogs, newsletters, etc.
dataengineering
{'dataengineering': 0.2881203591823578, 'datasets': 0.4115089774131775, 'LanguageTechnology': 0.3003706634044647}
---
Hello all,
I'm trying to get smarter on mimicing writing style based on sample input text. The hope is a system like this:
**Inputs:**
* Input tagged sample writing/letters/emails/dialogue from desired author.
* Basic sentence to be rewrite .
**Output:**
* Translated sentence written in sample author's writing style.
I'm assuming this is a bit of an ambitious lift and may require some training on my own. Curious if anyone has any insights on stylometry papers written. Even something on generative text that is just meant to replicate an

# Score

In [96]:
from spacy.training import Example
from spacy.scorer import Scorer

def score_text_cat(model_name, data):
    nlp_textcat = spacy.load(str(get_latest_model(model_name) / "model-best"))
    examples = []
    for doc, label in nlp_textcat.pipe(data, as_tuples=True):
        examples.append(Example.from_dict(doc, {"cats": {cat: 1 if cat == label else 0 for cat in cats}}))

    scorer = Scorer(nlp_textcat)
    scores = scorer.score_cats(examples, "cats", labels=cats, multi_label=False)
    return scores

score_text_cat("default_textcat", valid_data)

{'cats_score': 0.8161111111111111,
 'cats_score_desc': 'macro F',
 'cats_micro_p': 0.8375,
 'cats_micro_r': 0.8375,
 'cats_micro_f': 0.8375,
 'cats_macro_p': 0.8398290598290599,
 'cats_macro_r': 0.8027038183694529,
 'cats_macro_f': 0.8161111111111111,
 'cats_macro_auc': 0.9410453912559177,
 'cats_f_per_type': {'dataengineering': {'p': 0.8333333333333334,
   'r': 0.9210526315789473,
   'f': 0.875},
  'datasets': {'p': 0.84, 'r': 0.84, 'f': 0.8399999999999999},
  'LanguageTechnology': {'p': 0.8461538461538461,
   'r': 0.6470588235294118,
   'f': 0.7333333333333334}},
 'cats_auc_per_type': {'dataengineering': 0.9505012531328322,
  'datasets': 0.952,
  'LanguageTechnology': 0.9206349206349207}}

# Batch predict

In [98]:
def batch_predict(model_name, data):
    date = datetime.date.today().strftime("%Y-%m-%d")
    nlp_textcat = spacy.load(str(get_latest_model(model_name) / "model-best"))
    predictions = []
    for doc, _id in nlp_textcat.pipe(data, as_tuples=True):
        predicted_cat = max(doc.cats, key=lambda x: doc.cats[x])
        predictions.append({"id": _id, "date": date, "pred": predicted_cat, "confidence": doc.cats[predicted_cat]})
    return predictions

batch_predict("default_textcat", valid_data[:4])

[{'id': 'datasets',
  'date': '2022-01-21',
  'pred': 'datasets',
  'confidence': 0.5606533885002136},
 {'id': 'dataengineering',
  'date': '2022-01-21',
  'pred': 'datasets',
  'confidence': 0.4115089774131775},
 {'id': 'LanguageTechnology',
  'date': '2022-01-21',
  'pred': 'LanguageTechnology',
  'confidence': 0.8508118987083435},
 {'id': 'dataengineering',
  'date': '2022-01-21',
  'pred': 'dataengineering',
  'confidence': 0.760303258895874}]