# POS Tagging

In [1]:
import glob
import os

from bs4 import BeautifulSoup as bs

import nltk
from nltk import pos_tag, word_tokenize
from nltk.tokenize.punkt import PunktSentenceTokenizer

import matplotlib.pyplot as plt
import seaborn as sb

import pandas as pd

# Shared sentence splitter, reused by the chemistry, legal and sports sections.
# NOTE(review): it is constructed without training text or a pre-trained model,
# so presumably it runs on Punkt's default parameters — confirm that is intended.
sentence_tokenizer = PunktSentenceTokenizer()
def tokenize_word(s):
    """Tokenize ``s`` with ``word_tokenize`` and keep only "wordy" tokens.

    A token is kept when at least one of its characters is a letter or a
    digit, so tokens made purely of punctuation or symbols are dropped.

    Args:
        s: The text (typically one sentence) to tokenize.

    Returns:
        A list of the surviving tokens, in their original order.
    """
    def has_letter_or_digit(token):
        for ch in token:
            if ch.isalpha() or ch.isdigit():
                return True
        return False

    return list(filter(has_letter_or_digit, word_tokenize(s)))

# Fetch the NLTK resources this notebook relies on.
# NOTE(review): presumably 'punkt' backs word_tokenize and
# 'averaged_perceptron_tagger' backs pos_tag — confirm against the installed NLTK version.
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\zytan\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\zytan\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

## Chemistry

### Load Text

In [2]:
# Maps each chemistry XML path (as produced by glob) to that file's paragraph text
chemistry_texts = {}

def remove(text, tag):
    """Delete the literal opening and closing markers of ``tag`` from ``text``.

    Only the exact strings ``<tag>`` and ``</tag>`` are removed; the text
    between them is kept, and an opening tag that carries attributes is
    left untouched.

    Args:
        text: The raw markup to clean.
        tag: The tag name, without angle brackets (e.g. ``"sub"``).

    Returns:
        ``text`` with every ``<tag>`` and ``</tag>`` occurrence removed.
    """
    # Opening marker first, then the closing one
    for prefix in ("<", "</"):
        text = text.replace(prefix + tag + ">", "")
    return text

# Read every chemistry XML file and keep only the text of its <p> elements
for filename in glob.glob('Dataset/chemistry/*.xml'):
    # State the encoding explicitly, as the sports corpus load in this notebook
    # does, instead of depending on the platform's default text encoding.
    with open(filename, "r", encoding="utf-8") as f:
        content = f.read()

    # Drop inline formatting markers before parsing; presumably this keeps
    # get_text(' ') from inserting a separator inside formulas and symbols.
    for inline_tag in ("sup", "sub", "italic", "bold"):
        content = remove(content, inline_tag)

    soup = bs(content, 'html.parser')
    # Thin spaces (U+2009) are normalised to ordinary spaces
    all_paragraphs = [
        p.get_text(' ').replace('\u2009', ' ')
        for p in soup.find_all('p')
    ]
    # One paragraph per line, keyed by the path exactly as glob returned it
    chemistry_texts[filename] = "\n".join(all_paragraphs)

### Tokenize and POS Tagging

In [3]:
# Split every chemistry document into sentences, keeping the same file-path keys
chemistry_sentences = dict(
    (path, sentence_tokenizer.tokenize(text))
    for path, text in chemistry_texts.items()
)

In [4]:
# Choose Sentences
# The keys come from glob.glob, whose results carry the platform's path
# separator before the file name ('\\' on Windows, '/' elsewhere). Building the
# lookup keys with os.path.join reproduces that separator, so this cell no
# longer raises KeyError when the notebook runs on a non-Windows machine.
chemistry_choosen = [
    chemistry_sentences[os.path.join('Dataset/chemistry', '1234269.xml')][114],
    chemistry_sentences[os.path.join('Dataset/chemistry', '7617989.xml')][217],
    chemistry_sentences[os.path.join('Dataset/chemistry', '3928204.xml')][18]
]
chemistry_choosen

['High-speed countercurrent chromatography (HSCCC) is a new type of liquid–liquid partition chromatography technology.',
 'For the adsorption of CR and SF, the thermodynamic parameters (∆G°, ∆H°, ∆S°) were determined using the Van’t Hoff equation.',
 'The nitrogen atom of imine (-C=N-) Schiff base group is thought to involve in hydrogen bonding with several cellular constituents [ 15 ] which can modulate activities and processes.']

In [5]:
# POS-tag each chosen sentence after filtering its tokens with tokenize_word
chemistry_tag = []
for sentence in chemistry_choosen:
    chemistry_tag.append(pos_tag(tokenize_word(sentence)))

# Print one line per sentence in word/TAG form
for tagged_sentence in chemistry_tag:
    print(" ".join(word + "/" + pos for word, pos in tagged_sentence))

High-speed/JJ countercurrent/NN chromatography/NN HSCCC/NNP is/VBZ a/DT new/JJ type/NN of/IN liquid–liquid/JJ partition/NN chromatography/NN technology/NN
For/IN the/DT adsorption/NN of/IN CR/NNP and/CC SF/NNP the/DT thermodynamic/JJ parameters/NNS ∆G°/VBP ∆H°/JJ ∆S°/NNS were/VBD determined/VBN using/VBG the/DT Van/NNP t/NN Hoff/NNP equation/NN
The/DT nitrogen/NN atom/NN of/IN imine/JJ -C=N-/NNP Schiff/NNP base/NN group/NN is/VBZ thought/VBN to/TO involve/VB in/IN hydrogen/NN bonding/NN with/IN several/JJ cellular/JJ constituents/NNS 15/CD which/WDT can/MD modulate/VB activities/NNS and/CC processes/NNS


## Legal

### Load Text

In [6]:
# Plain text of every legal XML file, keyed by the path glob returned
legal_texts = {}

# Read the XML files
for filename in glob.glob('Dataset/legal/*.xml'):
    # NOTE(review): no encoding is passed, so the platform default is used —
    # confirm the files' actual encoding before pinning one here.
    with open(filename, 'r') as f:
        raw_xml = f.read()
    # get_text() flattens the whole document, not just selected elements
    legal_texts[filename] = bs(raw_xml, 'html.parser').get_text()

### Tokenize and POS Tagging

In [7]:
# Split every legal document into sentences, keeping the same file-path keys
legal_sentences = dict(
    (path, sentence_tokenizer.tokenize(text))
    for path, text in legal_texts.items()
)

In [8]:
# Choose Sentences
# The keys come from glob.glob, whose results carry the platform's path
# separator before the file name ('\\' on Windows, '/' elsewhere). Building the
# lookup keys with os.path.join reproduces that separator, so this cell no
# longer raises KeyError when the notebook runs on a non-Windows machine.
legal_choosen = [
    legal_sentences[os.path.join('Dataset/legal', '06_1.xml')][3],
    legal_sentences[os.path.join('Dataset/legal', '06_11.xml')][12],
    legal_sentences[os.path.join('Dataset/legal', '06_15.xml')][20]
]
legal_choosen

['Wilcox J made orders ancillary to the Mareva orders on 22 March 2005 requiring each of the Sharman applicants to disclose on affidavit the description and value of all of their assets, wherever situated, and to specify whether those assets were held by each applicant either beneficially or in trust for any other person or entity.',
 "The 1994 Award is an award made by the Australian Industrial Relations Commission ('the Commission'), pursuant to the Industrial Relations Act 1988 (Cth), which subsequently became the WR Act.",
 'The first respondent proposes to put a logo on the façade of the store above the entrance which displays the words "Oasis Foam & Rubber" against a light blue or aqua background to the word "Oasis".']

In [9]:
# POS-tag each chosen sentence after filtering its tokens with tokenize_word
legal_tag = []
for sentence in legal_choosen:
    legal_tag.append(pos_tag(tokenize_word(sentence)))

# Print one line per sentence in word/TAG form
for tagged_sentence in legal_tag:
    print(" ".join(word + "/" + pos for word, pos in tagged_sentence))

Wilcox/NNP J/NNP made/VBD orders/NNS ancillary/JJ to/TO the/DT Mareva/NNP orders/NNS on/IN 22/CD March/NNP 2005/CD requiring/VBG each/DT of/IN the/DT Sharman/NNP applicants/NNS to/TO disclose/VB on/IN affidavit/NN the/DT description/NN and/CC value/NN of/IN all/DT of/IN their/PRP$ assets/NNS wherever/RB situated/VBN and/CC to/TO specify/VB whether/IN those/DT assets/NNS were/VBD held/VBN by/IN each/DT applicant/NN either/CC beneficially/RB or/CC in/IN trust/NN for/IN any/DT other/JJ person/NN or/CC entity/NN
The/DT 1994/CD Award/NNP is/VBZ an/DT award/NN made/VBN by/IN the/DT Australian/JJ Industrial/NNP Relations/NNPS Commission/NNP 'the/POS Commission/NNP pursuant/NN to/TO the/DT Industrial/NNP Relations/NNP Act/NNP 1988/CD Cth/NNP which/WDT subsequently/RB became/VBD the/DT WR/NNP Act/NNP
The/DT first/JJ respondent/NN proposes/VBZ to/TO put/VB a/DT logo/NN on/IN the/DT façade/NN of/IN the/DT store/NN above/IN the/DT entrance/NN which/WDT displays/VBZ the/DT words/NNS Oasis/NNP Foam/

## Sports

### Load Text

In [10]:
# Load the whole sports corpus into memory as a single UTF-8 decoded string
with open('Dataset/sports.txt', encoding='utf-8') as sports_file:
    sports_texts = sports_file.read()

### Tokenize and POS Tagging

In [11]:
# Split the whole sports corpus into a list of sentences with the shared tokenizer
sports_sentences = sentence_tokenizer.tokenize(sports_texts)

In [12]:
# Choose Sentences
# Positions of the three example sentences within sports_sentences
sports_choosen = [sports_sentences[position] for position in (126, 1742, 141)]
sports_choosen

['Yet Howard still possesses one of the more powerful strokes in the league, evidenced by 14 homers in 71 games, and his 27.5 home-run-to-fly-ball percentage was his best showing since 2008.',
 "For her eighth birthday, 10 days before the league's first game, her uncle gave her a pair of regulation-size WNBA basketballs, one outdoor, one indoor.",
 'Suppose their remains suspicion from Dunn’s infamous 2011 output (.159/.292/.277 in 122 games), yet the Windy City slugger’s rushed return from an early-season emergency appendectomy was probably the catalyst for this lethargic showing.']

In [13]:
# POS-tag each chosen sentence after filtering its tokens with tokenize_word
sports_tag = []
for sentence in sports_choosen:
    sports_tag.append(pos_tag(tokenize_word(sentence)))

# Print one line per sentence in word/TAG form
for tagged_sentence in sports_tag:
    print(" ".join(word + "/" + pos for word, pos in tagged_sentence))

Yet/RB Howard/NNP still/RB possesses/VBZ one/CD of/IN the/DT more/RBR powerful/JJ strokes/NNS in/IN the/DT league/NN evidenced/VBN by/IN 14/CD homers/NNS in/IN 71/CD games/NNS and/CC his/PRP$ 27.5/CD home-run-to-fly-ball/JJ percentage/NN was/VBD his/PRP$ best/JJS showing/NN since/IN 2008/CD
For/IN her/PRP$ eighth/JJ birthday/JJ 10/CD days/NNS before/IN the/DT league/NN 's/POS first/JJ game/NN her/PRP$ uncle/NN gave/VBD her/PRP a/DT pair/NN of/IN regulation-size/JJ WNBA/NNP basketballs/VBZ one/CD outdoor/NN one/CD indoor/NN
Suppose/VB their/PRP$ remains/NNS suspicion/NN from/IN Dunn/NNP s/RB infamous/JJ 2011/CD output/NN .159/.292/.277/NNP in/IN 122/CD games/NNS yet/RB the/DT Windy/NNP City/NNP slugger/NN s/NN rushed/VBD return/NN from/IN an/DT early-season/JJ emergency/NN appendectomy/NN was/VBD probably/RB the/DT catalyst/NN for/IN this/DT lethargic/JJ showing/NN
