In [1]:
# Standard library imports
import pickle
import sys
import time

# Third-party imports
from pymongo import MongoClient
import torch # TODO: might remove, transformers sometimes throws warnings?
from transformers import (AutoTokenizer,
                          AutoModelForSequenceClassification,
                          AutoModelForTokenClassification,
                          pipeline)

# Local imports
import misc

In [2]:
# Open the MongoDB connection shared by every later cell.
# The host has to equal the Kubernetes service name; for a run outside the
# cluster, 'localhost' is the alternative host value.
MONGODB_PORT = 27017
MONGODB_HOST = 'mongodb-service'
print("Connect to MongoDB.")
client = MongoClient(host=MONGODB_HOST, port=MONGODB_PORT)

# Database 'news' -> collection 'articles'
db = client['news']
collection = db['articles']

Connect to MongoDB.


In [55]:
# Select the articles that still need NLP:
#   * no 'entities' field yet  -> NLP has not been performed on the document
#   * non-empty 'authors' list -> a "proper" article rather than e.g. a
#                                 weather report or a landing page
# (Alternative filter without the 'entities' condition:
#  {'authors': {'$ne': []}})
query = {
    'entities': {'$exists': False},
    'authors': {'$ne': []},
}
count = collection.count_documents(query)
print(f'Number of documents to process: {count}')

# Nothing pending: stop here instead of running the remaining cells
if not count:
    print("No documents to process. Exiting.")
    sys.exit(0)

# prev 9334 (presumably the pending count seen on an earlier run)

Number of documents to process: 6558


In [59]:
# Export the part of the collection that has already been worked, i.e. every
# document that carries an 'entities' field, to a local pickle file.
query_nlp = {'entities': {'$exists': True}}

# Fetch the matching documents once and derive the count from that same list.
# A separate count_documents() call costs an extra round trip and can report a
# number that differs from what is written to disk if the collection changes
# between the two queries.
collection_list = list(collection.find(query_nlp))
count_nlp = len(collection_list)
print(f'{count_nlp} documents have been worked.')

# `pickle` is imported in the notebook's import cell.
# NOTE: only load this file back from a trusted location; unpickling can
# execute arbitrary code.
with open("nlp_collection_list.pkl", "wb") as pickle_file:
    pickle.dump(collection_list, pickle_file)

2917 documents have been worked.


In [21]:
# Load the tokenizer and sequence-classification model that categorise the
# article text.
# Model card: https://huggingface.co/Softechlb/articles_classification
CLF_MODEL_NAME = "Softechlb/articles_classification"
print("Init classifier model.")
clf_tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path=CLF_MODEL_NAME,
)
clf_model = AutoModelForSequenceClassification.from_pretrained(
    pretrained_model_name_or_path=CLF_MODEL_NAME,
)

Init classifier model.


In [22]:
# Load the tokenizer and token-classification model for named-entity
# recognition on the article title, then wrap both in a "ner" pipeline.
# Model card: https://huggingface.co/dslim/bert-base-NER
NER_MODEL_NAME = "dslim/bert-base-NER"
print("Init NER model.")
ner_tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path=NER_MODEL_NAME,
)
ner_model = AutoModelForTokenClassification.from_pretrained(
    pretrained_model_name_or_path=NER_MODEL_NAME,
)
ner_pipeline = pipeline(
    task="ner",
    model=ner_model,
    tokenizer=ner_tokenizer,
    aggregation_strategy="simple",
)

Init NER model.


In [24]:
# Scratch check: pull the first document matching `query` (the pending-NLP
# filter) and display it as a sample input for the model calls that follow.
col = collection.find(query)
t_col = col[0]  # first match; the cell output shows a dict with '_id', 'title', 'text', ...
t_col

{'_id': ObjectId('64ebd5efb36280b9aa11e6e3'),
 'source': 'bbc',
 'url': 'https://www.bbc.com/future/article/20230825-the-mystery-of-why-some-people-develop-als',
 'title': 'The mystery of why some people develop ALS',
 'authors': ['Sarah Pitt'],
 'top_image': 'https://ychef.files.bbci.co.uk/live/624x351/p0g8x9qf.jpg',
 'text': 'Depending on the type of ALS, "the extent that disease can be explained by genetic factors is only about 8% to 60%," explains Eva Feldman, a professor of neurology at the University of Michigan.\n\nHowever, there\'s also emerging evidence that repeated and prolonged exposure to potential triggers in the external environment might increase the risk of someone developing ALS, particularly the sporadic form. This led Feldman and her colleagues to investigate.\n\n"We suspected the presence of what we term the ALS \'exposome\', which is the sum of toxic environmental exposures that increase risk," says Feldman. The team have found that prolonged exposure to organic c

In [25]:
# Keep the sample article's body text; it is the input to the NER and
# classification test calls in the following cells.
t_text = t_col['text']

In [26]:
# Scratch check: run the NER helper on the sample text. The helper returns a
# pair; the second element (`ef`) is inspected in the next cell.
# NOTE(review): the batch loop passes document['title'] to perform_ner, while
# this probe passes the article text.
(entities, ef) = misc.perform_ner(t_text, ner_pipeline)

In [27]:
# Show the 'score' of the first entry in `ef` together with its Python type
# (the recorded output is a plain float).
# Presumably a check that the value can be stored in MongoDB as-is; TODO confirm.
ef[0]['score'], type(ef[0]['score'])

(0.983871579170227, float)

In [28]:
# Scratch check: classify the sample text. The helper returns the winning
# label plus a dict mapping every category to its score (see recorded output).
tlabel, tdict = misc.perform_clf(t_text, clf_tokenizer, clf_model)
# Display label, score dict, and the Python type of one score value
tlabel, tdict, type(tdict['health'])

('health',
 {'business': 0.0025101653300225735,
  'entertainment': 0.0009574051364324987,
  'health': 0.9077606797218323,
  'news': 0.08217039704322815,
  'politics': 0.0034172546584159136,
  'sport': 0.003184074303135276},
 float)

In [16]:
# Show the string form of the per-category score dict
str(tdict)

"{'business': 0.0025101653300225735, 'entertainment': 0.0009574051364324987, 'health': 0.9077606797218323, 'news': 0.08217039704322815, 'politics': 0.0034172546584159136, 'sport': 0.003184074303135276}"

In [None]:
# Run NER and classification on every pending document (those matching
# `query`) and write the results back to MongoDB.

# Optional cap for trial runs; None processes every pending document.
# This replaces the former `if i > 2: break` debug guard, which never fired
# because `i` was never incremented.
MAX_DOCUMENTS = None

total = count if MAX_DOCUMENTS is None else min(count, MAX_DOCUMENTS)
progress = misc.ProgressLog(total)
error_count = 0
processed = 0
print("Start processing documents.")
start_time = time.time()

# Cursor.limit(0) means "no limit", so None maps to 0 here
for document in collection.find(query).limit(MAX_DOCUMENTS or 0):
    processed += 1
    try:
        # Perform NER on article title
        entities, entities_all = misc.perform_ner(document['title'],
                                                  ner_pipeline)

        # Perform classification on article text
        label, labels_all = misc.perform_clf(document['text'],
                                             clf_tokenizer, clf_model)

        # Write back only the newly computed fields. Sending the whole
        # fetched document through $set would re-upload the full article
        # text and '_id', and could overwrite fields changed in the meantime.
        collection.update_one(
            {'_id': document['_id']},
            {'$set': {
                'entities': entities,
                'entities_verbose': entities_all,
                'category': label,
                'category_verbose': labels_all,
            }},
        )
    except Exception as e:
        # One bad document must not abort the whole batch: log it, count it
        # and carry on. It keeps matching `query`, so a later run retries it.
        error_count += 1
        print(f"\nError no. {error_count} while "
              f"processing document {document['_id']}:\n{e}\n\n")

    # Print progress
    progress.increase(suffix=f"{document['_id']}: {document['title']}")

duration = time.time() - start_time
# Report on what was actually iterated, and avoid dividing by zero when the
# cursor yielded nothing
average = duration / processed if processed else 0.0
print(f"\n\nProcessing {processed} documents took {duration / 60:.0f} min.\n"
      f"Average time per document: {average:.2f} s.")
if error_count:
    print(f"{error_count} of {processed} documents failed and were not updated.")

# Close connection to mongo db
# (left disabled so the cells above can be re-run against the same client)
# client.close()