In [1]:
import os
# Move the working directory two levels up so the os.getcwd()-based data paths
# used in later cells resolve (presumably to the project root -- TODO confirm).
# The path is relative, so re-running this cell moves up two more levels;
# restart the kernel before running it again.
os.chdir("../../")
import pandas as pd
import numpy as np
import spacy
from tqdm import tqdm

In [2]:
# Load the "en_core_web_sm" spaCy pipeline once at module level;
# extract_entities() calls it for every article.
nlp = spacy.load("en_core_web_sm")

def process_data(filename, folderpath):
    """Load one scraped-news CSV and prepare it for entity extraction.

    Parameters
    ----------
    filename : str
        Name of the CSV file inside ``folderpath``.
    folderpath : str
        Directory that holds the file; a trailing separator is optional.

    Returns
    -------
    pandas.DataFrame
        The rows of the file with the "Unnamed: 0" column removed (when
        present), newline characters stripped from "news", "date" parsed
        into datetimes, and a "ym" column of "<year>-<month>" strings
        (the month is not zero-padded).
    """
    df = pd.read_csv(os.path.join(folderpath, filename))
    # errors="ignore": a file saved with index=False has no such column.
    df = df.drop(columns="Unnamed: 0", errors="ignore")
    # regex=True makes Series.replace substitute *inside* each string; without
    # it only cells whose entire value equals "\n" would be replaced.
    df["news"] = df["news"].replace("\n", "", regex=True)
    df["date"] = pd.to_datetime(df["date"])
    df["ym"] = [str(d.year) + "-" + str(d.month) for d in df.date]
    return df

def extract_entities(corpus: str):
    """Collect the distinct LOC / ORG / GPE / WORK_OF_ART entities in a text.

    Parameters
    ----------
    corpus : str
        Text passed to the module-level ``nlp`` pipeline.

    Returns
    -------
    dict
        Maps each of the four labels to its unique entity texts, in order of
        first appearance, joined with ", ". A label with no entities maps to
        an empty string.
    """
    doc = nlp(corpus)
    found = {"LOC": [], "ORG": [], "GPE": [], "WORK_OF_ART": []}
    for ent in doc.ents:
        mentions = found.get(ent.label_)
        # Compare whole entity texts. A substring test against the joined
        # string would drop e.g. "UN" once "UNICEF" had been recorded.
        if mentions is not None and ent.text not in mentions:
            mentions.append(ent.text)
    return {label: ", ".join(mentions) for label, mentions in found.items()}

In [4]:
target_dir = os.getcwd() + "/data/text/solomon_islands/"
# Only the raw news CSVs. The "*_ner.csv" files written below also contain
# "news" in their names and would otherwise be fed back into process_data()
# the next time this cell runs.
files = [
    file for file in os.listdir(target_dir)
    if "news" in file and file.endswith(".csv") and not file.endswith("_ner.csv")
]
# Placeholder row for articles whose text is not a string (e.g. NaN).
missing_row = {
    "LOC": "Missing",
    "ORG": "Missing",
    "GPE": "Missing",
    "WORK_OF_ART": "Missing"
}
for file in files:
    df = process_data(file, target_dir)
    name = file.replace(".csv", "") + "_ner.csv"

    output = []
    # tqdm wraps the iterable directly, so no manual pbar.update() is needed.
    for news in tqdm(df["news"], total=len(df)):
        if isinstance(news, str):
            ner_dict = extract_entities(news)
        else:
            ner_dict = dict(missing_row)
        output.append(ner_dict)

    ner_df = pd.DataFrame(output)
    # Rows are in the same order as df, so the URLs line up positionally.
    ner_df["url"] = df["url"].tolist()
    # The index is written on purpose: the cell that reads these files back
    # drops the resulting "Unnamed: 0" column.
    ner_df.to_csv(target_dir + name, encoding="utf-8")

100%|██████████████████████████████████████████████████████████████████| 9013/9013 [04:45<00:00, 31.61it/s]
100%|████████████████████████████████████████████████████████████████| 14200/14200 [11:05<00:00, 21.34it/s]
100%|████████████████████████████████████████████████████████████████| 11049/11049 [07:54<00:00, 23.28it/s]
100%|██████████████████████████████████████████████████████████████████| 9017/9017 [06:42<00:00, 22.41it/s]


In [50]:
# Count, per NER output file and entity column, the rows that mention "risk".
# Relies on `target_dir` from the cell that wrote the "*_ner.csv" files.
ner_files = [file for file in os.listdir(target_dir) if file.endswith("_ner.csv")]
for nfile in ner_files:
    nfilepath = target_dir + nfile
    n_df = pd.read_csv(nfilepath).drop("Unnamed: 0", axis=1)
    n_df = n_df.fillna("Missing").apply(lambda x: x.str.lower())
    for col in ["LOC", "ORG", "GPE", "WORK_OF_ART"]:
        # Plain substring match via pandas: no dependency on the `re` module,
        # which this notebook never imports. na=False treats any value that
        # is not a string as "no match".
        d = n_df[col].str.contains("risk", regex=False, na=False)
        print(f"{nfile}'s {col} has {d.sum()}")

island_sun_news_ner.csv's LOC has 0
island_sun_news_ner.csv's ORG has 28
island_sun_news_ner.csv's GPE has 0
island_sun_news_ner.csv's WORK_OF_ART has 0
solomon_stars_news_ner.csv's LOC has 0
solomon_stars_news_ner.csv's ORG has 24
solomon_stars_news_ner.csv's GPE has 0
solomon_stars_news_ner.csv's WORK_OF_ART has 1
solomon_times_news_ner.csv's LOC has 0
solomon_times_news_ner.csv's ORG has 28
solomon_times_news_ner.csv's GPE has 0
solomon_times_news_ner.csv's WORK_OF_ART has 7
sibc_news_ner.csv's LOC has 0
sibc_news_ner.csv's ORG has 20
sibc_news_ner.csv's GPE has 0
sibc_news_ner.csv's WORK_OF_ART has 1


In [77]:
abc_dir = os.getcwd() + "/data/text/abc_au/"
# os.listdir() returns entries in arbitrary order, so the listing is sorted to
# make the positional pick below reproducible across machines.
# NOTE(review): index 3 may now point at a different file than in the original
# run -- replace it with the intended file name once that is confirmed.
abc_files = sorted(file for file in os.listdir(abc_dir) if "news" in file)
test = pd.read_csv(abc_dir + abc_files[3]).drop("Unnamed: 0", axis=1)
test["tags"] = test["tags"].fillna("missing").str.lower()

# Frequency of every tag across all rows.
tag_dict = {}
for tag in test.tags:
    # NOTE(review): the last comma-separated piece is discarded, which assumes
    # every tags value ends with a trailing comma -- confirm against the data.
    # A side effect is that the "missing" placeholder rows contribute nothing.
    tag_list = [piece.strip() for piece in tag.split(",")[:-1]]
    for i in tag_list:
        tag_dict[i] = tag_dict.get(i, 0) + 1

In [80]:
def get_news_count(data: pd.DataFrame, column: str):
    """Count the non-null values of ``column`` within each "ym" group.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a "ym" column and the column named by ``column``.
    column : str
        Name of the column whose non-null entries are counted.

    Returns
    -------
    pandas.DataFrame
        One row per distinct "ym" value (sorted by the groupby), with the
        columns "ym" and "<column>_count".
    """
    column = str(column)
    # Grouping does not need "date" as the index; leaving the frame untouched
    # also lets "date" itself be counted and drops the requirement that a
    # "date" column exists.
    counts = data.groupby("ym")[column].count()
    count_df = counts.rename(column + "_count").reset_index()
    return count_df