# Installations and Imports

In [1]:
# Installations of libraries
!pip install PyPDF2
!pip install nltk
!pip install pdfminer
!pip install textblob
!pip install spacy

!python -m spacy download en_core_web_sm



In [9]:
# Imports
from PyPDF2 import PdfFileReader, PdfFileWriter
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfpage import PDFPage
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from textblob import TextBlob
from collections import Counter
from spacy import displacy

import spacy
import en_core_web_sm
import pandas as pd
from tensorflow.keras.preprocessing.text import Tokenizer

import io
import nltk

In [10]:
# Fetch the NLTK resources one after another; the status returned by the
# final download is left as the cell's displayed value.
download_ok = None
for resource in (
    'punkt',
    'averaged_perceptron_tagger',
    'tagsets',
    'maxent_ne_chunker',
    'words',
):
    download_ok = nltk.download(resource)
download_ok

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Jeremy\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Jeremy\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package tagsets to
[nltk_data]     C:\Users\Jeremy\AppData\Roaming\nltk_data...
[nltk_data]   Package tagsets is already up-to-date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     C:\Users\Jeremy\AppData\Roaming\nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to
[nltk_data]     C:\Users\Jeremy\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!


True

# PDF Article to Text

In [77]:
# Input: the PDF article
pdf_file = "../res/who_int_0.pdf"

# Outputs: text dump locations
df_to_text = "../dump/df_to_text.txt"
pdf_to_text = "../dump/pdf_to_text.txt"

# Text to DataFrame

In [81]:
# Open text file
# The `with` block closes the file handle as soon as the contents have been
# read, instead of leaving it open for the lifetime of the kernel.
with open(pdf_to_text) as text_file:
    raw = text_file.read()

In [82]:
# Load the small English pipeline by its package name
nlp = spacy.load("en_core_web_sm")

# Run the full pipeline over the raw article text
raw_nlp = nlp(raw)

In [83]:
# Materialise the sentence spans of the parsed document into a list
sentences = list(raw_nlp.sents)

In [84]:
# For every token, record the index of the sentence it belongs to
sentence_index = [
    sentence_id
    for sentence_id, sentence in enumerate(sentences)
    for _token in sentence
]

print(sentence_index[0:20])

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1]


In [85]:
# Every named entity found in the document, as (text, label) pairs
entity_pairs = [(ent.text, ent.label_) for ent in raw_nlp.ents]

# Parallel tuples of entity texts and labels. Built directly from the pairs:
# unpacking zip(*pairs) raises ValueError when the document has no entities.
word = tuple(ent_text for ent_text, _ent_label in entity_pairs)
label = tuple(ent_label for _ent_text, ent_label in entity_pairs)

# One row per entity
labels_df = pd.DataFrame(entity_pairs, columns=["Word", "Label"])
labels_df

Unnamed: 0,Word,Label
0,3/15/22,CARDINAL
1,10:32 AM,TIME
2,Western Pacific Philippines,ORG
3,March 2022,DATE
4,Rajendra Yadav,PERSON
5,Sangjun Moon,PERSON
6,Philippines,GPE
7,Manila,GPE
8,Philippines,GPE
9,11 March 2022,DATE


In [86]:
# One tuple per token: the token itself, its coarse POS, the human-readable
# explanation of its fine-grained tag, the tag, the IOB flag and entity type
ent_type = [
    (
        token,
        token.pos_,
        spacy.explain(token.tag_),
        token.tag_,
        token.ent_iob_,
        token.ent_type_,
    )
    for token in raw_nlp
]

# Transpose the rows into one tuple per attribute
word, pos, e_tags, tags, iob, e_type = zip(*ent_type)

In [87]:
# All columns are expected to have the same length; print each length and type
for column in (sentence_index, word, pos, e_tags, tags, iob, e_type):
    print(len(column), type(column))

1464 <class 'list'>
1464 <class 'tuple'>
1464 <class 'tuple'>
1464 <class 'tuple'>
1464 <class 'tuple'>
1464 <class 'tuple'>
1464 <class 'tuple'>


In [88]:
# Assemble the per-token table: one row per token, one column per attribute
token_columns = ["Sentence_Index", "Token", "Pos", "Explained_Tag", "Tag", "iob_Tag", "Entity_Type"]
token_rows = zip(sentence_index, word, pos, e_tags, tags, iob, e_type)

df = pd.DataFrame(token_rows, columns=token_columns)
df

Unnamed: 0,Sentence_Index,Token,Pos,Explained_Tag,Tag,iob_Tag,Entity_Type
0,0,3/15/22,NUM,cardinal number,CD,B,CARDINAL
1,0,",",PUNCT,"punctuation mark, comma",",",O,
2,0,10:32,NUM,cardinal number,CD,B,TIME
3,0,AM,NOUN,"noun, singular or mass",NN,I,TIME
4,0,,SPACE,whitespace,_SP,O,
...,...,...,...,...,...,...,...
1459,81,.,PUNCT,"punctuation mark, sentence closer",.,O,
1460,82,\n,SPACE,whitespace,_SP,O,
1461,82,https://www.who.int/philippines/news/detail/11...,PROPN,"noun, proper singular",NNP,O,
1462,82,,SPACE,whitespace,_SP,O,


In [89]:
# Write the token table to disk, including the row index and the header row
csv_path = '../data/csv_dataset/data.csv'
df.to_csv(csv_path, index=True, header=True)

# DataFrame to Text

In [90]:
# Rebind `sentences` to the plain text of each sentence span
sentences = [span.text for span in raw_nlp.sents]

# Preview the first sentence
print(sentences[0])

3/15/22, 10:32 AM  Is the pandemic ending soon?


In [91]:
# Frequency of each entity label. Counts from the "Label" column of labels_df
# because no cell in this notebook assigns a variable named `labels`.
Counter(labels_df["Label"])

Counter({'CARDINAL': 9,
         'TIME': 3,
         'ORG': 16,
         'DATE': 6,
         'PERSON': 4,
         'GPE': 10,
         'PERCENT': 1,
         'PRODUCT': 2,
         'LOC': 2,
         'NORP': 1,
         'WORK_OF_ART': 1})

In [107]:
# Render the entity visualisation only for sentences that contain entities
for sentence_text in sentences:
    sentence_doc = nlp(sentence_text)  # each sentence is parsed on its own
    if not sentence_doc.ents:
        continue
    displacy.render(sentence_doc, jupyter=True, style='ent')

In [67]:
# Total number of detected entities (labels_df holds one row per entity).
# Uses labels_df because no cell in this notebook assigns `labels`.
print(len(labels_df))

55
