In [2]:
import numpy as np
import pandas as pd
import re
import glob
from   os import path
import os
from tqdm.notebook import tqdm
from dateutil.parser import parse
from dateutil.tz import gettz

import warnings
warnings.filterwarnings('ignore', category=UserWarning, module='transformers')

import torch

from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline

# use the first GPU if available, otherwise use CPU
device = torch.device("mps"if torch.backends.mps.is_available()else "cpu")
# device = 0 if torch.cuda.is_available() else -1
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

tokenizer = AutoTokenizer.from_pretrained("dslim/bert-base-NER")
model = AutoModelForTokenClassification.from_pretrained("dslim/bert-base-NER")

nlp = pipeline("ner", model=model, tokenizer=tokenizer, device=device)

In [3]:
def combineHeadlineText(row):
    if isinstance(row["Headline"], str):
        return row["Headline"] + ". " + row["Text"]
    else:
        return row["Text"]

In [5]:
lst_files = []

Path = "../Data/countries_integration/*.csv"
count = 0

for fname in glob.glob(Path):
    lst_files.append(fname)

lst_files = sorted(lst_files)

for file in lst_files:
    file_name = os.path.basename(file)  # Get the file name from the full file path
    country_name = os.path.splitext(file_name)[0]  # Remove the file extension

    if not os.path.isfile('../Data/MW_NER/{0}.csv'.format(country_name)):
        print(file)
        df = pd.read_csv(file)

        if df.shape[0] != 0:
            df = df.drop(['Unnamed: 0'], axis = 1)
            df = df.drop_duplicates(['Date','Headline'],keep= 'last')
            df['Text'] = df['Text'].astype(str)
            df['Text'] = df.apply(lambda row: combineHeadlineText(row), axis=1)
            df['Date'] = df['Date'].str.replace(r'Published: ', ' ')
            df['Date'] = df['Date'].str.replace(r'First', ' ')
            df['Date'] = df['Date'].apply(lambda date_str: parse(date_str, tzinfos={'ET': gettz('America/New_York')}))
            df['Date'] = df['Date'].dt.date   #change date format to YYYY-MM-DD
            df = df.sort_values(by = ['Date'], ascending = True)

            count = []
            for i in tqdm(range(len(df)), desc=f"Processing {country_name}"):
                ner_results = nlp(df["Text"].iloc[i])
                country_instances = [d for d in ner_results if (d['entity'] in "B-ORG") and (d['word'] in country_name) and (d['score'] > 0.98)]
                count.append(len(country_instances))

            df['Count'] = count
            df = df[df['Count'] > 0]
            df = df.drop(['Count'], axis = 1)
            df.to_csv(r'../Data/MW_NER/{0}.csv'.format(country_name))

../Data/countries_integration/Australia_articles.csv


Processing Australia_articles:   0%|          | 0/11105 [00:00<?, ?it/s]

OSError: Cannot save file into a non-existent directory: 'Data/MW_NER'

In [1]:
def process_single_article(text, country_name):
    ner_results = nlp(text)
    country_instances = [d for d in ner_results if (d['entity'] in "B-ORG") and (d['word'] in country_name) and (d['score'] > 0.98)]
    return len(country_instances)

In [6]:
# Find all csv files in the path
csv_files = glob.glob('../Data/countries_integration/*.csv')

# Sort the file names
csv_files = sorted(csv_files)

# Read the first csv file
df = pd.read_csv(csv_files[0])

# Get the first row of the DataFrame
first_row = df.iloc[43]

# Extract the 'Date', 'Headline' and 'Text' columns
date = first_row['Date']
headline = first_row['Headline']
text = first_row['Text']

In [9]:
ner_results = nlp(headline + text)

In [10]:
ner_results

[{'entity': 'B-ORG',
  'score': 0.9932053,
  'index': 1,
  'word': 'AN',
  'start': 0,
  'end': 2},
 {'entity': 'I-ORG',
  'score': 0.992149,
  'index': 2,
  'word': '##Z',
  'start': 2,
  'end': 3},
 {'entity': 'B-ORG',
  'score': 0.99892753,
  'index': 4,
  'word': 'RB',
  'start': 9,
  'end': 11},
 {'entity': 'I-ORG',
  'score': 0.9988914,
  'index': 5,
  'word': '##A',
  'start': 11,
  'end': 12},
 {'entity': 'B-ORG',
  'score': 0.9988292,
  'index': 20,
  'word': 'Australia',
  'start': 56,
  'end': 65},
 {'entity': 'I-ORG',
  'score': 0.99927264,
  'index': 21,
  'word': '&',
  'start': 66,
  'end': 67},
 {'entity': 'I-ORG',
  'score': 0.9993352,
  'index': 22,
  'word': 'New',
  'start': 68,
  'end': 71},
 {'entity': 'I-ORG',
  'score': 0.9992262,
  'index': 23,
  'word': 'Zealand',
  'start': 72,
  'end': 79},
 {'entity': 'I-ORG',
  'score': 0.999316,
  'index': 24,
  'word': 'Banking',
  'start': 80,
  'end': 87},
 {'entity': 'I-ORG',
  'score': 0.99929905,
  'index': 25,
  'w