# Installations

In [1]:
# Install the third-party libraries used by this notebook.
# NOTE(review): the import cell also uses wordcloud, tensorflow, pandas and
# matplotlib, none of which is installed here - presumably they are already
# present in the environment; confirm. PyPDF2 is installed but is not imported
# in the visible import cell.
!pip install PyPDF2
!pip install nltk
!pip install pdfminer
!pip install textblob
!pip install spacy
!pip install svgutils

# Download the spaCy models that the import cell loads as Python packages
# (en_core_web_sm and fr_core_news_md).
!python -m spacy download en_core_web_sm
!python -m spacy download fr_core_news_md

Collecting en-core-web-sm==3.2.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.2.0/en_core_web_sm-3.2.0-py3-none-any.whl (13.9 MB)
[+] Download and installation successful
You can now load the package via spacy.load('en_core_web_sm')


2022-03-26 23:27:20.927114: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'cudart64_110.dll'; dlerror: cudart64_110.dll not found
2022-03-26 23:27:20.933522: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


Collecting fr-core-news-md==3.2.0
  Downloading https://github.com/explosion/spacy-models/releases/download/fr_core_news_md-3.2.0/fr_core_news_md-3.2.0-py3-none-any.whl (46.9 MB)

2022-03-26 23:28:25.023885: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'cudart64_110.dll'; dlerror: cudart64_110.dll not found
2022-03-26 23:28:25.023936: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.



[+] Download and installation successful
You can now load the package via spacy.load('fr_core_news_md')


# IMPORTS

In [360]:
# Imports
from textblob import TextBlob
from collections import Counter
from pathlib import Path

import spacy
from spacy import displacy
import fr_core_news_md

import pandas as pd

from wordcloud import *
import matplotlib.pyplot as plt

from tensorflow.keras.preprocessing.text import Tokenizer

import svgutils.transform as st

import os
import en_core_web_sm
import glob
import io
import operator

from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfpage import PDFPage
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams

# Convert Documents (PDF/DOCX/TXT) to Text Files

In [361]:
# Folder that holds the source documents (.pdf, .docx, .txt) to process
docu_path = "../res/"

# Path to Generated data
# Always the same
dataset_path = "../data/" 

# Dynamic due to filename
# One txt_<Docu_ID>.txt per article, one sentence per line
text_path = "../data/per_article_text_files/"
# One classification_<Docu_ID>.csv per article (per-token table)
classification_path = "../data/per_article_classification/"
# One ent_<Docu_ID>.csv per article (entity text and label)
entities_path = "../data/per_article_entities_label/"
# One <article name>.html per article (displaCy entity rendering)
render_path = "../data/sentence_render_html/"
# One <article name>.png per article (word cloud of major entities)
entities_visuals_path = "../data/per_article_top_entities_visuals/"

In [362]:
# Collect only the documents that end with .pdf, .docx or .txt.
# glob returns its matches in arbitrary order, so every group is sorted to keep
# the position-based Docu_ID of each file stable between runs.
pdf_files = sorted(glob.glob(os.path.join(docu_path, "*.pdf")))
doc_files = sorted(glob.glob(os.path.join(docu_path, "*.docx")))
txt_files = sorted(glob.glob(os.path.join(docu_path, "*.txt")))

accepted_files = [*pdf_files, *doc_files, *txt_files]

# Print Sample File (None when the folder holds no matching document)
accepted_files[0] if accepted_files else None

'../res\\3M Moderna vaccines from US Government arrive from COVAX Facility.pdf'

In [363]:
# Function to generate a dictionary with a unique ascending id
def generate_dataset_dictionary(files):
    """Build the column dictionary of a document dataset.

    Args:
        files: Iterable of file paths. Forward and backward slashes are both
            accepted as separators, at any nesting depth.

    Returns:
        A dictionary with two parallel lists: "Docu_ID" ("1" followed by the
        zero-padded position of the file, e.g. "10000") and "File_Name" (the
        last component of each path).
    """
    dataset = {"Docu_ID": [], "File_Name": []}

    for idx, file in enumerate(files):
        # Normalise the separators first so the base name is found whatever
        # platform produced the path and however many folders it contains.
        file_name = file.replace("\\", "/").rsplit("/", 1)[-1]

        dataset["Docu_ID"].append("1" + str(idx).zfill(4))
        dataset["File_Name"].append(file_name)

    return dataset
    

In [364]:
# Build the initial dataframe of accepted documents, keeping Docu_ID as text
document_dict = generate_dataset_dictionary(accepted_files)
initial_df = pd.DataFrame(document_dict).astype({"Docu_ID": "str"})

# Preview the first five rows
initial_df.head(5)

Unnamed: 0,Docu_ID,File_Name
0,10000,3M Moderna vaccines from US Government arrive ...
1,10001,Asia Pacific health and finance ministers stre...
2,10002,Breastfeeding must continue amidst COVID-19.pdf
3,10003,Community Innovation to Support Surveillance a...
4,10004,"DOH, RITM, WHO establish subnational laborator..."


In [365]:
# Save the dataset that lists the file names of the articles
docu_dataset_csv = dataset_path + "docu_dataset.csv"
initial_df.to_csv(docu_dataset_csv, header=True, index=False)
print("Text File created . Path: {}\n".format(docu_dataset_csv))

Text File created . Path: ../data/docu_dataset.csv



In [366]:
# Function to convert document file to txt file with per line sentences
def pdf_to_text_sentences(pdf_file, docu_id):
    """Extract the text of a PDF and save it with one sentence per line.

    The result is written to ``text_path + "txt_" + docu_id + ".txt"`` and the
    path is printed once the file has been written.

    Args:
        pdf_file: Path of the PDF document to read.
        docu_id: Document id (string) used to name the output text file.
    """
    resMgr = PDFResourceManager()
    retData = io.StringIO()
    TxtConverter = TextConverter(resMgr, retData, laparams=LAParams())

    # The input handle and the converter are released even when pdfminer
    # raises on a malformed document.
    try:
        with open(pdf_file, 'rb') as inFile:
            interpreter = PDFPageInterpreter(resMgr, TxtConverter)

            # Process each page
            for page in PDFPage.get_pages(inFile):
                interpreter.process_page(page)

        txt = retData.getvalue()
    finally:
        TxtConverter.close()

    # TextBlob splits the text into sentences; surrounding whitespace is
    # stripped and inner line breaks are flattened so that every sentence
    # occupies exactly one line of the output file.
    cleaned_sentences = [
        str(sentence).strip().replace("\n", " ")
        for sentence in TextBlob(txt).sentences
    ]

    # Write the cleaned sentences on a per line basis
    out_path = text_path + "txt_" + docu_id + ".txt"
    with open(out_path, 'w', encoding='utf-8') as f:
        f.write('\n'.join(cleaned_sentences))

    print("Text File created . Path: {}\n".format(out_path))

In [367]:
# Convert every listed document into a text file with one sentence per line
# May take some time
for file_name, docu_id in zip(initial_df["File_Name"], initial_df["Docu_ID"]):
    pdf_to_text_sentences(docu_path + str(file_name), str(docu_id))

Text File created . Path: ../data/per_article_text_files/txt_10000.txt

Text File created . Path: ../data/per_article_text_files/txt_10001.txt

Text File created . Path: ../data/per_article_text_files/txt_10002.txt

Text File created . Path: ../data/per_article_text_files/txt_10003.txt

Text File created . Path: ../data/per_article_text_files/txt_10004.txt

Text File created . Path: ../data/per_article_text_files/txt_10005.txt

Text File created . Path: ../data/per_article_text_files/txt_10006.txt

Text File created . Path: ../data/per_article_text_files/txt_10007.txt

Text File created . Path: ../data/per_article_text_files/txt_10008.txt

Text File created . Path: ../data/per_article_text_files/txt_10009.txt

Text File created . Path: ../data/per_article_text_files/txt_10010.txt

Text File created . Path: ../data/per_article_text_files/txt_10011.txt

Text File created . Path: ../data/per_article_text_files/txt_10012.txt

Text File created . Path: ../data/per_article_text_files/txt_100

In [368]:
# Index the generated text files (one file per article, one sentence per line).
# glob returns its matches in arbitrary order; sorting them means txt_10000.txt
# is listed first, txt_10001.txt second, and so on, which is the order the
# position-based Docu_ID of generate_dataset_dictionary relies on.
txt_files = sorted(glob.glob(os.path.join(text_path, "*.txt")))

text_dict = generate_dataset_dictionary(txt_files)

text_df = pd.DataFrame(text_dict)
text_df.head()

Unnamed: 0,Docu_ID,File_Name
0,10000,txt_10000.txt
1,10001,txt_10001.txt
2,10002,txt_10002.txt
3,10003,txt_10003.txt
4,10004,txt_10004.txt


In [369]:
# Save the text-file index in the data folder as "text_dataset.csv"
text_dataset_csv = dataset_path + "text_dataset.csv"
text_df.to_csv(text_dataset_csv, header=True, index=False)
print("CSV File created successfully. Path: {}\n".format(text_dataset_csv))

CSV File created successfully. Path: ../data/text_dataset.csv



# Classify Text File

In [370]:
# Open File
_OPEN_FILE_NLP = None  # spaCy pipeline shared by every open_file() call


def open_file(file):
    """Read a UTF-8 text file and run the English spaCy pipeline over it.

    Args:
        file: Path of the text file to analyse.

    Returns:
        The object returned by the ``en_core_web_sm`` pipeline for the whole
        content of the file.
    """
    global _OPEN_FILE_NLP

    # Context manager so the handle is closed as soon as the text is read.
    with open(file, encoding='utf-8') as handle:
        raw = handle.read()

    # Loading the model is expensive and identical for every file, so it is
    # done on the first call only and reused afterwards.
    if _OPEN_FILE_NLP is None:
        _OPEN_FILE_NLP = en_core_web_sm.load()
        # Raise the pipeline's text-length limit so long articles are accepted.
        _OPEN_FILE_NLP.max_length = 3000000

    return _OPEN_FILE_NLP(raw)

In [371]:
# Sentence index of every token of a document, used as a column when
# building another dataset (can be used for data visualizations)
def split_sentences(raw_nlp):
    """Return, for each token of the document, the index of its sentence.

    Args:
        raw_nlp: Object exposing ``sents``, an iterable of sentences where
            every sentence is itself an iterable of tokens.

    Returns:
        A flat list with one entry per token: the zero-based position of the
        sentence the token belongs to (e.g. ``[0, 0, 0, 1, 1, 2]``).
    """
    return [
        sentence_pos
        for sentence_pos, sentence in enumerate(raw_nlp.sents)
        for _token in sentence
    ]

In [372]:
# Generate labels dataset (.csv dataset with 2 columns ["Word", "Label"])
def generate_labels_dataset(raw_nlp, docu_id):
    """Save the named entities of a document as ``ent_<docu_id>.csv``.

    The file is written inside ``entities_path`` with the columns "Word"
    (entity text) and "Label" (entity label), and its path is printed.

    Args:
        raw_nlp: Processed document exposing ``ents``; every entity provides
            ``text`` and ``label_``.
        docu_id: Document id (string) used to name the csv file.
    """
    # Building the frame straight from the (text, label) pairs also works for
    # a document without entities, where unpacking zip(*pairs) into two names
    # raised ValueError.
    entity_rows = [(ent.text, ent.label_) for ent in raw_nlp.ents]
    labels_df = pd.DataFrame(entity_rows, columns=["Word", "Label"])

    out_path = entities_path + "ent_" + docu_id + ".csv"
    labels_df.to_csv(out_path, index=False, header=True)
    print("CSV File created successfully. Path: {}\n".format(out_path))

In [373]:
# Reload the text-file index from its csv file
text_dataset_csv = dataset_path + "text_dataset.csv"
text_df = pd.read_csv(text_dataset_csv)
# Preview the first two rows
text_df.head(2)

Unnamed: 0,Docu_ID,File_Name
0,10000,txt_10000.txt
1,10001,txt_10001.txt


In [374]:
# For every text file, run spaCy over its content, export the named entities
# and export a per-token classification table.
for idx in text_df.index:
    docu_id = str(text_df["Docu_ID"][idx])

    raw_nlp = open_file(text_path + str(text_df["File_Name"][idx]))
    sentence_index = split_sentences(raw_nlp)
    generate_labels_dataset(raw_nlp, docu_id)

    # One row per token: pos tagging, inside-outside-beginning tags, entity
    # types, etc. Rows are built directly (no zip(*...) unpacking) so an empty
    # document yields an empty table instead of an error, and the token text
    # is stored rather than the spaCy token object.
    token_rows = [
        (sent_idx, tok.text, tok.pos_, spacy.explain(tok.tag_), tok.tag_, tok.ent_iob_, tok.ent_type_)
        for sent_idx, tok in zip(sentence_index, raw_nlp)
    ]

    classification_df = pd.DataFrame(
        token_rows,
        columns=["Sentence_Index", "Token", "Pos", "Explained_Tag", "Tag", "iob_Tag", "Entity_Type"])

    # Output to csv, do note that the csv file is for analyzing purposes
    # .csv Dataset with 7 columns:
    # Sentence_Index (0,0,0,1,1,2, ...)
    # Token (Moderna, vaccines, from, US, Government, ...)
    # Pos (PROPN, ADP, SPACE, NUM, ...)
    # Explained_Tag (cardinal number, punctuation mark, noun, ...)
    # Tag (CD, ., NN, _SP, NNP, ...)
    # iob_Tag (B, O, B, I, O, O, ...)
    # Entity_Type (CARDINAL, TIME, ORG, LOC, ...)
    classification_csv = str(classification_path) + "classification_" + docu_id + ".csv"
    classification_df.to_csv(classification_csv, index=False, header=True)

    # Report the path of the file that was actually written (with ".csv").
    print("CSV File created successfully. Path: {}\n".format(classification_csv))

CSV File created successfully. Path: ../data/per_article_entities_label/ent_10000.csv

CSV File created successfully. Path: ../data/per_article_classification/classification_10000

CSV File created successfully. Path: ../data/per_article_entities_label/ent_10001.csv

CSV File created successfully. Path: ../data/per_article_classification/classification_10001

CSV File created successfully. Path: ../data/per_article_entities_label/ent_10002.csv

CSV File created successfully. Path: ../data/per_article_classification/classification_10002

CSV File created successfully. Path: ../data/per_article_entities_label/ent_10003.csv

CSV File created successfully. Path: ../data/per_article_classification/classification_10003

CSV File created successfully. Path: ../data/per_article_entities_label/ent_10004.csv

CSV File created successfully. Path: ../data/per_article_classification/classification_10004

CSV File created successfully. Path: ../data/per_article_entities_label/ent_10005.csv

CSV File

# Generate Visuals per Sentence

In [375]:
nlp = spacy.load("en_core_web_sm")

for idx in text_df.index:
    raw_nlp = open_file(text_path + str(text_df["File_Name"][idx]))

    # splitext drops the extension whatever its length (".pdf", ".docx", ...).
    # NOTE(review): the article name is taken from initial_df by position, so
    # text_df and initial_df are assumed to list the documents in the same order.
    article_stem = os.path.splitext(initial_df["File_Name"][idx])[0]
    output_path = Path(render_path + article_stem + ".html")

    # Only sentences that contain entities are rendered. The fragments are
    # collected per article, so an article never inherits markup from the
    # previous one and an article without entities gives an empty page.
    rendered = []
    for sentence in raw_nlp.sents:
        # Process single sentence
        sent = nlp(sentence.text)

        if sent.ents:
            rendered.append(displacy.render(sent, jupyter=False, style='ent'))

    svg = "".join(rendered)

    # Write to html once per article; the handle is closed by the context manager.
    with output_path.open("w", encoding='utf-8') as html_file:
        html_file.write(svg)

    print("HTML File generated successfully. Path: {}\n".format(output_path))

# The html files are written to render_path (data/sentence_render_html)

HTML File generated successfully. Path: ..\data\sentence_render_html\3M Moderna vaccines from US Government arrive from COVAX Facility.html

HTML File generated successfully. Path: ..\data\sentence_render_html\Asia Pacific health and finance ministers stress importance of universal health coverage in COVID-19 era and beyond.html

HTML File generated successfully. Path: ..\data\sentence_render_html\Breastfeeding must continue amidst COVID-19.html

HTML File generated successfully. Path: ..\data\sentence_render_html\Community Innovation to Support Surveillance and Contact Tracing.html

HTML File generated successfully. Path: ..\data\sentence_render_html\DOH, RITM, WHO establish subnational laboratories to expand the country's capacity in detecting vaccine-preventable diseases.html

HTML File generated successfully. Path: ..\data\sentence_render_html\DOH, WHO urge devotees to safely observe Traslacion at home.html

HTML File generated successfully. Path: ..\data\sentence_render_html\EU an

# Generate Top Tags Per Article

In [376]:
# Retrieve all entities csv file
# inside the per_article_entities_label folder.
# glob returns its matches in arbitrary order and the files are paired with
# initial_df by position below, so the list is sorted (ent_10000.csv first).
ent_csv_files = sorted(glob.glob(os.path.join(entities_path, "*.csv")))

# Required major tags list
major_tags_target_list = ["LOC", "PERSON", "ORG", "NORP", "GPE"]

# For each csv file generate a word cloud from the first ten entities whose
# label is one of the major tags
for idx, ent_csv_f in enumerate(ent_csv_files):
    ent_csv_df = pd.read_csv(ent_csv_f)

    # splitext drops the extension whatever its length (".pdf", ".docx", ...).
    article_stem = os.path.splitext(initial_df["File_Name"][idx])[0]
    output_path = Path(entities_visuals_path + article_stem + ".png")

    # Words whose label is in the target list. Missing values are dropped and
    # the rest is cast to str so that the join below cannot fail.
    is_major_tag = ent_csv_df["Label"].isin(major_tags_target_list)
    article_major_tags = ent_csv_df.loc[is_major_tag, "Word"].dropna().astype(str).tolist()

    if not article_major_tags:
        # WordCloud.generate() raises ValueError when it is given no words.
        print("No major entities found, PNG Image skipped. Path: {}\n".format(output_path))
        continue

    plt.figure(figsize=(20,14))
    word_cloud = WordCloud(background_color='black',
                           max_font_size = 80).generate(" ".join(article_major_tags[:10]))
    plt.imshow(word_cloud)
    plt.savefig(output_path)
    plt.close()

    print("PNG Image generated successfully. Path: {}\n".format(output_path))
    

PNG Image generated successfully. Path: ..\data\per_article_top_entities_visuals\3M Moderna vaccines from US Government arrive from COVAX Facility.png

PNG Image generated successfully. Path: ..\data\per_article_top_entities_visuals\Asia Pacific health and finance ministers stress importance of universal health coverage in COVID-19 era and beyond.png

PNG Image generated successfully. Path: ..\data\per_article_top_entities_visuals\Breastfeeding must continue amidst COVID-19.png

PNG Image generated successfully. Path: ..\data\per_article_top_entities_visuals\Community Innovation to Support Surveillance and Contact Tracing.png

PNG Image generated successfully. Path: ..\data\per_article_top_entities_visuals\DOH, RITM, WHO establish subnational laboratories to expand the country's capacity in detecting vaccine-preventable diseases.png

PNG Image generated successfully. Path: ..\data\per_article_top_entities_visuals\DOH, WHO urge devotees to safely observe Traslacion at home.png

PNG Imag

# Search Using Top Tags/Words (only PDF files are available in ../res)

In [377]:
import uuid

# NOTE(review): none of the three names below is referenced by the visible
# cells of this notebook - confirm they are still needed before keeping them.
contains = []
# Random UUID generated each time this cell runs
id_ = uuid.uuid4()
# Same three extensions that the document glob cell collects from docu_path
accepted_exts = [".pdf", ".docx", ".txt"]

In [378]:
# Function to search tags in entities csv
def search_tags_in_entities_csv(word, csv_files=None):
    """Count whole-word occurrences of ``word`` in every entities csv file.

    Args:
        word: Text to look for. It is matched literally (regex metacharacters
            such as "+" or "(" have no special meaning) and must be delimited
            by whitespace or by the start/end of the entity text.
        csv_files: Optional list of entities csv paths (``ent_<Docu_ID>.csv``)
            to search. Defaults to the notebook-level ``ent_csv_files``.

    Returns:
        Dictionary mapping the Docu_ID of every file with at least one hit to
        the number of hits, stored as a string. Empty for a blank query.
    """
    import re  # local import: `re` is not part of the notebook's import cell

    if csv_files is None:
        csv_files = ent_csv_files

    word = str(word)
    found_docu_id_dict = {}

    # A blank query would match at every whitespace boundary; treat as no hit.
    if not word.strip():
        return found_docu_id_dict

    # Escape the query so user input is never interpreted as a regex.
    pattern = r'(?<!\S){}(?!\S)'.format(re.escape(word))

    for ent_csv_f in csv_files:
        # Ready an entity csv in ent_csv_files / per_article_entities_label
        ent_csv_df = pd.read_csv(ent_csv_f)

        # Count exact word; missing values are dropped and the rest is cast to
        # str so the .str accessor works whatever dtype pandas inferred.
        words = ent_csv_df["Word"].dropna().astype(str)
        word_total_occurence = int(words.str.count(pattern).sum())

        if word_total_occurence > 0:
            # Retrieve only the document id: "ent_10000.csv" -> "10000"
            stem = os.path.splitext(os.path.basename(ent_csv_f))[0]
            found_docu_id_dict[stem.split("_")[-1]] = str(word_total_occurence)

    return found_docu_id_dict

In [379]:
# Look up the document name that belongs to a document id
# Ex. Docu_ID: 10000, File_Name: 3M Moderna vaccines...
def search_docu_name_through_docu_id(docu_id):
    """Return the file name registered for ``docu_id`` in docu_dataset.csv.

    Args:
        docu_id: Document id; it is compared as a string.

    Returns:
        The "File_Name" of the first row whose "Docu_ID" matches, or None when
        no row matches.
    """
    docu_df = pd.read_csv(dataset_path + "docu_dataset.csv")
    wanted_id = str(docu_id)

    same_id = docu_df["Docu_ID"].map(str) == wanted_id
    hits = docu_df.loc[same_id, "File_Name"]

    return None if hits.empty else hits.iloc[0]

In [380]:
# Locate a specific document file to recommend
def find_document_file(filename, search_path):
    """Collect every path below ``search_path`` whose file is named ``filename``.

    Args:
        filename: Exact file name to look for.
        search_path: Folder that is walked recursively.

    Returns:
        List of matching paths (folder joined with ``filename``) in walk order,
        without duplicates; empty when nothing matches.
    """
    matches = (
        os.path.join(folder, filename)
        for folder, _subfolders, names in os.walk(search_path)
        if filename in names
    )
    # Dictionary keys keep the first-seen order while dropping repeated paths
    return list(dict.fromkeys(matches))

In [381]:
# Function to make link clickable
# https://www.geeksforgeeks.org/how-to-create-a-table-with-clickable-hyperlink-to-a-local-file-in-pandas/
def make_clickable(url):
    """Turn a file path into an HTML anchor labelled with the file name.

    Args:
        url: Path or URL of the document.

    Returns:
        ``<a href="url">name</a>`` where ``name`` is the base name of ``url``.
        Both parts are HTML-escaped so that file names containing ``&``,
        quotes or angle brackets cannot break the generated markup.
    """
    from html import escape  # local import: not part of the notebook's import cell

    name = os.path.basename(url)
    return '<a href="{}">{}</a>'.format(escape(url, quote=True), escape(name, quote=False))

In [382]:
user_search_input = input("E-Library Search: ")

found_docu_id_dict = search_tags_in_entities_csv(str(user_search_input))

# Sort according to Occurence of word. The hit counts are stored as strings,
# so they are converted to numbers for the comparison; sorting the raw strings
# would rank "2" above "10".
sorted_found_docu_id_dict = sorted_d = dict(
    sorted(found_docu_id_dict.items(), key=lambda item: float(item[1]), reverse=True)
)

if not sorted_found_docu_id_dict:
    print("\nNo PDFs Can Be Recommended!")
else:
    for k, v in sorted_found_docu_id_dict.items():
        filename = search_docu_name_through_docu_id(k)
        results_list = find_document_file(filename, docu_path)

        # Localhost links
        local_links_df = pd.DataFrame({"Hits": v, "Link": results_list})

        # Align it to the left for readability
        left_aligned_df = local_links_df.style.set_properties(**{'text-align': 'left'})
        # Align column to the left for readability
        left_aligned_df = left_aligned_df.set_table_styles(
        [dict(selector = 'th', props=[('text-align', 'left')])])

        # Display without index
        display(left_aligned_df.hide().format({'Link' : make_clickable}))

E-Library Search:  Manila


Hits,Link
2,3M Moderna vaccines from US Government arrive from COVAX Facility.pdf


Hits,Link
2,Investing in mental health benefits people and the economy.pdf


Hits,Link
1,Breastfeeding must continue amidst COVID-19.pdf


Hits,Link
1,"DOH, RITM, WHO establish subnational laboratories to expand the country's capacity in detecting vaccine-preventable diseases.pdf"


Hits,Link
1,"DOH, WHO urge devotees to safely observe Traslacion at home.pdf"


Hits,Link
1,EU and WHO provide lifesaving medical-grade oxygen for preparedness against new COVID-19 variants.pdf


Hits,Link
1,Gender equality makes everyone healthier_ WHO.pdf


Hits,Link
1,"Germany supports the Philippines with first 844,800 COVID-19 vaccine doses via COVAX.pdf"


Hits,Link
1,Health leaders endorse action plan to end TB in the Region.pdf


Hits,Link
1,Minimizing the impact of the Delta variant in the Philippines.pdf


Hits,Link
1,"Region’s leaders gather virtually to tackle COVID-19 and other health issues, chart WHO’s work in the Western Pacific.pdf"
