In [1]:
%run -i "../util/lang_utils.ipynb"

In [2]:
import pandas as pd
from spacy.cli.train import train
from spacy.cli.evaluate import evaluate
from spacy.cli.debug_data import debug_data
from spacy.tokens import DocBin

In [5]:
def preprocessing_data_entry(input_text, label, label_list):
    """Turn one labelled text into a spaCy Doc whose ``.cats`` holds a one-hot category dict.

    Args:
        input_text: str, the raw text to process.
        label: int, position of the correct class inside ``label_list``.
        label_list: list of str, all class names in a fixed order.

    Returns:
        The Doc produced by ``small_model`` with ``doc.cats`` set to
        ``{class_name: 0 or 1}``, where only the class at index ``label`` is 1.

    Raises:
        IndexError: if ``label`` is outside the bounds of ``label_list``
            (negative values follow normal Python list indexing).
    """
    # `small_model` is not defined in this cell — presumably provided by the
    # lang_utils notebook that is %run at the top; confirm there.
    parsed_doc = small_model(input_text)
    one_hot = [0 for _ in label_list]  # one slot per known class, all "off"
    one_hot[label] = 1  # switch on the slot of the correct class
    # Pair each class name with its 0/1 flag; .cats is where the category labels live
    parsed_doc.cats = dict(zip(label_list, one_hot))
    return parsed_doc

In [6]:
label_list = ["tech", "business", "sport", "entertainment", "politics"]
SHUFFLE_SEED = 42  # fixed seed so the shuffled training order is the same on every run


def build_doc_bin(df, label_list):
    """Convert every row of ``df`` into a labelled spaCy Doc and collect them in a DocBin.

    Args:
        df: DataFrame with a "text" column and a "label" column; each label is
            used as an index into ``label_list``.
        label_list: list of str, all class names in a fixed order.

    Returns:
        A DocBin containing one Doc per row, in row order.
    """
    doc_bin = DocBin()
    # Iterate the two columns directly: cheaper than .iterrows(), which builds a Series per row
    for text, label in zip(df["text"], df["label"]):
        doc_bin.add(preprocessing_data_entry(text, label, label_list))
    return doc_bin


train_df = pd.read_json("../data/bbc_train.json")
test_df = pd.read_json("../data/bbc_test.json")
# Shuffle the training rows; seeded so a Restart & Run All reproduces the same order
train_df = train_df.sample(frac=1, random_state=SHUFFLE_SEED)

train_db = build_doc_bin(train_df, label_list)
test_db = build_doc_bin(test_df, label_list)

# Persist both sets in spaCy's binary format
train_db.to_disk("../data/bbc_train.spacy")
test_db.to_disk("../data/bbc_test.spacy")

In [9]:
# Train the spaCy text classifier.
# Runs spaCy's `train` CLI with the settings in ../data/spacy_config.cfg and saves
# its output under ../models/spacy_textcat_bbc (a later cell loads `model-best`
# from that directory).
# NOTE(review): no data paths are passed on the command line, so the config file
# presumably points at the .spacy files written earlier — confirm in spacy_config.cfg.
!python -m spacy train ../data/spacy_config.cfg --output ../models/spacy_textcat_bbc

[38;5;2m✔ Created output directory: ../models/spacy_textcat_bbc[0m
[38;5;4mℹ Saving to output directory: ../models/spacy_textcat_bbc[0m
[38;5;4mℹ Using CPU[0m
[1m
[38;5;2m✔ Initialized pipeline[0m
[1m
[38;5;4mℹ Pipeline: ['tok2vec', 'textcat'][0m
[38;5;4mℹ Initial learn rate: 0.001[0m
E    #       LOSS TOK2VEC  LOSS TEXTCAT  CATS_SCORE  SCORE 
---  ------  ------------  ------------  ----------  ------
  0       0          0.00          0.16        5.90    0.06
  0     200         17.72         36.48       28.42    0.28
  0     400         21.18         34.65       28.67    0.29
  0     600         22.03         27.06       47.01    0.47
  0     800         38.55         30.37       58.03    0.58
  0    1000         82.89         26.67       66.74    0.67
  0    1200        160.11         24.87       44.75    0.45
  0    1400         34.52         20.25       67.36    0.67
  0    1600         79.07         20.75       77.56    0.78
  1    1800        199.05         20.91 

In [10]:
# Sanity-check the trained pipeline on a single article from the test set
import spacy

nlp = spacy.load("../models/spacy_textcat_bbc/model-best")
input_text = test_df["text"].iloc[1]  # text of the second test row, selected by position
print(input_text)
doc = nlp(input_text)
print(doc.cats)  # scores the loaded model assigned to each category

lib dems  new election pr chief the lib dems have appointed a senior figure from bt to be the party s new communications chief for their next general election effort.  sandy walkington will now work with senior figures such as matthew taylor on completing the party manifesto. party chief executive lord rennard said the appointment was a  significant strengthening of the lib dem team . mr walkington said he wanted the party to be ready for any  mischief  rivals or the media tried to throw at it.   my role will be to ensure this new public profile is effectively communicated at all levels   he said.  i also know the party will be put under scrutiny in the media and from the other parties as never before - and we will need to show ourselves ready and prepared to counter the mischief and misrepresentation that all too often comes from the party s opponents.  the party is already demonstrating on every issue that it is the effective opposition.  mr walkington s new job title is director of 

In [11]:
# Map a text to the index of its most likely class according to a spaCy model
def get_prediction(input_text, nlp_model, target_names):
    """Return the position in ``target_names`` of the top-scoring category for ``input_text``.

    Args:
        input_text: the text to classify.
        nlp_model: callable that returns an object exposing a ``.cats`` mapping
            of category name -> score when given the text.
        target_names: list of category names; the result is an index into it.

    Returns:
        int, index of the highest-scoring category within ``target_names``.
        On equal scores the category that appears first in ``.cats`` wins.

    Raises:
        ValueError: if ``.cats`` is empty or the winning category is not in
            ``target_names``.
    """
    scores = nlp_model(input_text).cats
    # Pick the (name, score) pair with the largest score and keep only the name
    best_category, _ = max(scores.items(), key=lambda pair: pair[1])
    return target_names.index(best_category)

In [12]:
# Measure model performance on the test set
from sklearn.metrics import classification_report

# Predicted class index for every test article; get_prediction returns an index into
# label_list, the same integer encoding the "label" column uses.
test_df["prediction"] = test_df["text"].apply(lambda text: get_prediction(text, nlp, label_list))

# Pass `labels` explicitly so report row i always corresponds to label_list[i].
# Without it, classification_report derives the classes from the data and fails
# when a class is missing from both the true labels and the predictions.
class_ids = list(range(len(label_list)))
print(
    classification_report(
        test_df["label"],
        test_df["prediction"],
        labels=class_ids,
        target_names=label_list,
    )
)

               precision    recall  f1-score   support

         tech       0.92      0.97      0.95        80
     business       0.99      0.94      0.96       102
        sport       0.94      0.96      0.95       102
entertainment       0.89      1.00      0.94        77
     politics       0.96      0.82      0.88        84

     accuracy                           0.94       445
    macro avg       0.94      0.94      0.94       445
 weighted avg       0.94      0.94      0.94       445

