In [1]:
import zipfile
import os

In [2]:
# Source archive and the directory that receives its contents.
path = "bbcsport-fulltext.zip"
extract = "bbcsport"

# The context manager closes the archive handle as soon as extraction finishes.
with zipfile.ZipFile(path, mode='r') as archive:
    archive.extractall(path=extract)

print('Unzipping Complete')

Unzipping Complete


In [3]:
import os
from pathlib import Path

newpath = 'bbcsport/bbcsport'

data_dir = Path(newpath)

texts = []
labels = []

# Every sub-directory of data_dir is one category and its name becomes the label.
# Directory listings and glob results come back in arbitrary, filesystem-dependent
# order, so both levels are sorted to keep positional lookups such as texts[1]
# reproducible from one machine (or run) to the next.
for label in sorted(os.listdir(data_dir)):
  category_dir = data_dir / label
  if category_dir.is_dir():
    for file_path in sorted(category_dir.glob("*.txt")):
      # latin-1 maps every byte to a character, so reading cannot raise a decode error.
      with open(file_path, encoding='latin-1') as f:
        text = f.read().strip()
      texts.append(text)
      labels.append(label)

print(f"Loaded {len(texts)} documents.")
print("Sample label:", set(labels))

Loaded 737 documents.
Sample label: {'cricket', 'athletics', 'football', 'tennis', 'rugby'}


In [4]:
print(texts[1])

O'Sullivan could run in Worlds

Sonia O'Sullivan has indicated that she would like to participate in next month's World Cross Country Championships in St Etienne.

Athletics Ireland have hinted that the 35-year-old Cobh runner may be included in the official line-up for the event in France on 19-20 March. Provincial teams were selected after last Saturday's Nationals in Santry and will be officially announced this week. O'Sullivan is at present preparing for the London marathon on 17 April. The participation of O'Sullivan, currentily training at her base in Australia, would boost the Ireland team who won the bronze three years agio. The first three at Santry last Saturday, Jolene Byrne, Maria McCambridge and Fionnualla Britton, are automatic selections and will most likely form part of the long-course team. O'Sullivan will also take part in the Bupa Great Ireland Run on 9 April in Dublin.


# Topic Modelling

In [5]:
import numpy as np
import scipy
import sklearn

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF

In [7]:
# Build the TF-IDF document-term matrix for the whole corpus.
# max_df=0.95 drops terms that occur in more than 95% of the documents, min_df=2
# drops terms that occur in fewer than two documents, and stop_words='english'
# removes scikit-learn's built-in English stop-word list.
tfidf = TfidfVectorizer(max_df=0.95, min_df=2, stop_words='english')

# tm is the fitted sparse matrix: one row per entry of `texts`, one column per kept term.
tm = tfidf.fit_transform(texts)

In [8]:
tm

<737x7585 sparse matrix of type '<class 'numpy.float64'>'
	with 89578 stored elements in Compressed Sparse Row format>

### Non-Negative Matrix Factorization (NMF)

In [9]:
# Factorise the TF-IDF matrix into 7 non-negative components (topics);
# random_state=42 fixes the seed so the run is reproducible.
nmf = NMF(n_components=7, random_state=42)
# fit_transform returns the transformed data: one row per document, one column per topic.
nmf_topics = nmf.fit_transform(tm)

In [10]:
len(tfidf.get_feature_names_out())

7585

In [11]:
len(nmf.components_)

7

In [12]:
nmf.components_

array([[1.04868538e-04, 1.28853388e-02, 0.00000000e+00, ...,
        1.09820449e-04, 3.76257703e-02, 6.45892754e-04],
       [0.00000000e+00, 3.82895295e-02, 0.00000000e+00, ...,
        1.38505567e-02, 0.00000000e+00, 0.00000000e+00],
       [0.00000000e+00, 1.71976168e-02, 0.00000000e+00, ...,
        5.67458530e-03, 0.00000000e+00, 1.88810995e-02],
       ...,
       [8.24654126e-04, 4.28306227e-03, 0.00000000e+00, ...,
        3.03647513e-05, 5.39445631e-05, 0.00000000e+00],
       [6.48444276e-03, 1.42114494e-02, 1.71706133e-01, ...,
        1.63401485e-03, 2.94297556e-05, 0.00000000e+00],
       [2.81794437e-05, 1.16159926e-03, 0.00000000e+00, ...,
        9.00240489e-05, 3.15600205e-03, 0.00000000e+00]])

In [13]:
# get_feature_names_out() rebuilds the full vocabulary array on every call, so it is
# fetched once here instead of once per printed word.
feature_names = tfidf.get_feature_names_out()

for index,topic in enumerate(nmf.components_):
  print(f'The top 10 words for topic #{index}')
  # argsort is ascending, so the last 10 indices are the 10 heaviest terms,
  # listed weakest-first (the strongest word is printed last).
  print([feature_names[i] for i in topic.argsort()[-10:]])
  print('*\n')

The top 10 words for topic #0
['game', 'scotland', 'half', 'rugby', 'france', 'nations', 'robinson', 'ireland', 'wales', 'england']
*

The top 10 words for topic #1
['khan', 'zealand', 'tour', 'day', 'series', 'test', 'australia', 'cricket', 'india', 'pakistan']
*

The top 10 words for topic #2
['beat', 'agassi', 'final', 'hewitt', 'set', 'australian', 'federer', 'roddick', 'seed', 'open']
*

The top 10 words for topic #3
['champions', 'cup', 'said', 'mourinho', 'club', 'liverpool', 'united', 'league', 'arsenal', 'chelsea']
*

The top 10 words for topic #4
['tests', 'charges', 'olympics', 'doping', 'athens', 'drugs', 'iaaf', 'greek', 'thanou', 'kenteris']
*

The top 10 words for topic #5
['championships', '60m', 'record', 'champion', 'european', 'world', 'holmes', 'olympic', 'race', 'indoor']
*

The top 10 words for topic #6
['andrew', 'boje', 'flintoff', 'trescothick', 'jones', 'strauss', 'vaughan', 'africa', 'england', 'south']
*



In [14]:
from collections import defaultdict

# Bucket every document under its category label so each sport can be analysed separately.
category_docs = defaultdict(list)
for doc_label, doc_text in zip(labels, texts):
    bucket = category_docs[doc_label]
    bucket.append(doc_text)

In [15]:
def extract_topics(texts, n_topics=5, top_n_words=10, random_state=42):
    """Return the highest-weighted TF-IDF terms of each NMF topic fitted on `texts`.

    Args:
        texts: Iterable of raw document strings to vectorise.
        n_topics: Number of NMF components (topics) to fit.
        top_n_words: How many terms to keep per topic.
        random_state: Seed handed to NMF. The default of 42 matches the value that
            was previously hard-coded, so existing calls give the same result.

    Returns:
        A list with one entry per topic; each entry is a list of up to
        `top_n_words` terms ordered from highest to lowest weight.
    """
    vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, stop_words='english')
    doc_term_matrix = vectorizer.fit_transform(texts)

    # Only the fitted components_ are used below, so the per-document topic
    # weights are not kept around.
    model = NMF(n_components=n_topics, random_state=random_state)
    model.fit(doc_term_matrix)

    feature_names = vectorizer.get_feature_names_out()
    topics = []
    for topic in model.components_:
        # argsort is ascending; the reversed slice walks back from the heaviest
        # term and stops after top_n_words entries.
        top_words = [feature_names[i] for i in topic.argsort()[:-top_n_words - 1:-1]]
        topics.append(top_words)
    return topics

In [16]:
list(category_docs)

['athletics', 'cricket', 'football', 'rugby', 'tennis']

In [17]:
# Split the tennis articles into two sub-topics and list the ten strongest terms of each.
tennis_topics = extract_topics(category_docs["tennis"], n_topics=2, top_n_words=10)
for i, topic in enumerate(tennis_topics):
    # i is zero-based, so sub-categories are numbered from 1 in the printed label.
    print(f"Tennis Sub-category {i+1}: {', '.join(topic)}")

Tennis Sub-category 1: seed, set, federer, match, agassi, final, hewitt, second, win, williams
Tennis Sub-category 2: cup, davis, moya, year, said, open, roddick, tennis, murray, injury


### Named Entity Recognition

In [18]:
from transformers import pipeline

ner = pipeline("ner", model="dbmdz/bert-large-cased-finetuned-conll03-english", aggregation_strategy="simple")

def extract_person_and_job(text):
    """Pair every person mention in `text` with a ", <job>" phrase that follows it.

    The module-level `ner` pipeline is run on `text`; each entity whose
    'entity_group' is 'PER' yields one result, in the order the pipeline
    returned them.

    Args:
        text: The document to scan.

    Returns:
        A list of {"name": <entity 'word'>, "job": <str or None>} dicts. "job" is
        the stripped text captured between ", " and the next '.' or ',' inside the
        40 characters after the entity, or None when no such pattern is found.
    """
    # Imported once per call (rather than once per person) because the notebook
    # has no module-level `re` import.
    import re

    # Comma + space, then a run of word/whitespace characters closed by '.' or ','.
    job_pattern = re.compile(r', ([\w\s]+)[\.,]')

    results = []
    for entity in ner(text):
        if entity['entity_group'] != 'PER':
            continue
        end = entity['end']
        # Only the 40 characters right after the mention are searched.
        tail = text[end:end+40]
        match = job_pattern.search(tail)
        job = match.group(1).strip() if match else None
        results.append({"name": entity['word'], "job": job})
    return results

Some weights of the model checkpoint at dbmdz/bert-large-cased-finetuned-conll03-english were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cpu


In [19]:
results = [extract_person_and_job(text) for text in category_docs["tennis"]]
print(results[:20])

[[{'name': 'Henman', 'job': None}, {'name': 'Rusedski Tim Henman', 'job': None}, {'name': 'Greg Rusedski', 'job': None}, {'name': 'Rusedski', 'job': None}, {'name': 'Rusedski', 'job': None}, {'name': 'Henman', 'job': None}, {'name': 'Henman', 'job': None}, {'name': 'Rusedski', 'job': None}, {'name': 'Rusedski', 'job': None}, {'name': 'Henman', 'job': None}, {'name': 'Rusedski', 'job': None}, {'name': 'Henman', 'job': 'seeded three'}, {'name': 'Rusedski', 'job': None}, {'name': 'Henman', 'job': None}, {'name': 'Henman', 'job': None}, {'name': 'Rusedski', 'job': None}, {'name': 'Henman', 'job': None}, {'name': 'Rusedski', 'job': None}, {'name': 'Henman', 'job': None}, {'name': 'Igor Andreev', 'job': None}], [{'name': 'Safin', 'job': None}, {'name': 'Marat Safin', 'job': None}, {'name': 'Nicolas Kiefer', 'job': None}, {'name': 'Safin', 'job': None}, {'name': 'Kiefer', 'job': None}, {'name': 'Feliciano Lopez', 'job': None}, {'name': 'Andre Agassi', 'job': None}, {'name': 'Paradorn Srichaph

In [20]:
import spacy
nlp = spacy.load("en_core_web_sm")
def spacy_job_title_extract(text):
    """Collect name/job pairs from appositive modifiers of PERSON entities.

    `text` is parsed with the module-level `nlp` pipeline. For every entity
    labelled "PERSON", each child of the entity's root token whose dependency
    label is "appos" produces one {"name": <entity text>, "job": <child token text>}
    dict. Only that single child token's text is used as the job.

    Returns:
        The list of dicts in document order; empty when nothing matches.
    """
    people = (ent for ent in nlp(text).ents if ent.label_ == "PERSON")
    return [
        {"name": person.text, "job": child.text}
        for person in people
        for child in person.root.children
        if child.dep_ == "appos"
    ]

In [21]:
results2 = [spacy_job_title_extract(text) for text in category_docs["tennis"]]
print(results2[:20])

[[], [], [], [], [], [], [{'name': 'Thomas Enqvist', 'job': '7'}], [], [{'name': 'Felix Mantilla', 'job': '6'}], [{'name': 'Jie Zheng', 'job': '7'}], [], [], [], [], [], [{'name': 'Jie Zheng', 'job': '7'}], [], [], [{'name': 'Conchita Martinez', 'job': 'players'}, {'name': 'Martinez', 'job': '32'}, {'name': 'Shinobu Asagoe', 'job': '6'}], [{'name': 'Na Li', 'job': '7'}]]


In [22]:
results3 = [spacy_job_title_extract(text) for text in texts]
print(results3[:20])

[[], [], [{'name': 'Ato', 'job': 'Boldon'}], [{'name': 'Frankie Fredericks', 'job': 'member'}], [], [{'name': 'Scott', 'job': 'indoors'}], [{'name': 'Mark Carroll', 'job': 'holder'}, {'name': 'Craig Mottram', 'job': 'winner'}], [{'name': 'Hansen', 'job': 'champion'}], [], [{'name': 'Sydney Olympic', 'job': 'm'}, {'name': 'Sydney Olympic', 'job': 'champion'}, {'name': 'Sydney Olympic', 'job': 'Greene'}], [], [{'name': 'Edwards', 'job': 'himself'}], [{'name': 'Athletics Kenya', 'job': 'AK'}], [], [], [{'name': 'Kenteris', 'job': '31'}], [{'name': 'Nick Davies', 'job': 'spokesman'}], [], [], [{'name': 'Edwards', 'job': 'himself'}]]


In [23]:
# Text Summarisation Model

# Load the facebook/bart-large-cnn summarisation pipeline and summarise one sample article.
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
# max_length / min_length bound the length of the generated summary and
# do_sample=False turns sampling off during generation.
# NOTE(review): the name `sum` shadows Python's built-in sum() for the rest of the
# kernel session. A later cell prints this variable by name, so a rename has to be
# applied to both cells together.
sum = summarizer(texts[1], max_length=130, min_length=30, do_sample=False)
print(sum)

Device set to use cpu


[{'summary_text': "Sonia O'Sullivan has indicated that she would like to participate in next month's World Cross Country Championships in St Etienne. The 35-year-old Cobh runner may be included in the official line-up for the event in France on 19-20 March."}]


In [24]:
print(texts[1])
print('\n')
print(sum)

O'Sullivan could run in Worlds

Sonia O'Sullivan has indicated that she would like to participate in next month's World Cross Country Championships in St Etienne.

Athletics Ireland have hinted that the 35-year-old Cobh runner may be included in the official line-up for the event in France on 19-20 March. Provincial teams were selected after last Saturday's Nationals in Santry and will be officially announced this week. O'Sullivan is at present preparing for the London marathon on 17 April. The participation of O'Sullivan, currentily training at her base in Australia, would boost the Ireland team who won the bronze three years agio. The first three at Santry last Saturday, Jolene Byrne, Maria McCambridge and Fionnualla Britton, are automatic selections and will most likely form part of the long-course team. O'Sullivan will also take part in the Bupa Great Ireland Run on 9 April in Dublin.


[{'summary_text': "Sonia O'Sullivan has indicated that she would like to participate in next mon