# 1. Installs, Imports and Settings

In [1]:
#!pip install spacy==3.1.1 #restart runtime after this
#!python -m spacy download en_core_web_sm

import csv
import pandas as pd
# NOTE(review): the next line only reads the option and discards the value, so it
# changes no setting — confirm whether an assignment was intended.
pd.options.display.max_rows
# Do not truncate long text when a DataFrame is displayed.
pd.set_option('display.max_colwidth', None)

import nltk
# Resources needed by nltk.word_tokenize and nltk.pos_tag, which the helper functions call.
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
from nltk.stem import WordNetLemmatizer
# Shared lemmatizer instance used by getNPartsOfSpeech.
wordnet_lemmatizer = WordNetLemmatizer()
# WordNet corpus backing the lemmatizer.
nltk.download('wordnet');

import string
import operator
from itertools import islice
from collections import Counter

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


# 2. Reading files

In [2]:
# Read the pipe-delimited alerts file and discard rows whose text is a single space.
df = pd.read_csv('data/alertas.csv', sep="|")
df = df.loc[df["Text"].ne(" ")]
# Same report as before: row count, a blank line, then the column names.
print("Size of Data:", df.shape[0], end="\n\n")
print("Columns : ", df.columns.tolist())

Size of Data: 666

Columns :  ['Filename', 'Text', 'Type', 'Year']


# 3. Functions

In [3]:
# Lemmas that getNPartsOfSpeech removes from its rankings (compared by exact match).
# The backslash in '\\-' is escaped explicitly: the unescaped spelling is an invalid
# escape sequence that newer Python versions warn about; the runtime value is unchanged.
stopw = ["'s", "s", "@", '*', '’', "t", "gt", "http", "https", "amp", "m", 'i', 'u', 'youtu.be/Sj9uLcw-yl4',
        "'m", '\\-', '[', ']', '·', 're', '“', '”']
# m is the number of top-ranked words to return.

# First letter of a Penn Treebank tag -> WordNet part-of-speech code, so each word is
# lemmatized as the part of speech that was actually requested.
_WORDNET_POS = {'V': 'v', 'J': 'a', 'N': 'n', 'R': 'r'}


def getNPartsOfSpeech(text, m, tag):
    """Return the `m` most frequent lemmas in `text` whose POS tag starts with `tag`.

    Parameters
    ----------
    text : str
        Raw text; it is tokenized and POS-tagged with NLTK.
    m : int
        Maximum number of entries to return.
    tag : str
        Prefix of the POS tags to keep (e.g. 'V', 'J', 'N').

    Returns
    -------
    list of (lemma, absolute_count, relative_percentage)
        Sorted by descending count; ties keep first-seen order. The percentage is
        relative to all kept lemmas (after stop-word removal), rounded to 2 decimals.
        An empty list is returned when nothing matches.
    """
    tokens = nltk.word_tokenize(text)
    pos_tags = nltk.pos_tag(tokens)

    # Lemmatize with the POS that matches the requested tag instead of always
    # treating words as verbs; unmapped tags keep the previous 'v' behaviour.
    lemma_pos = _WORDNET_POS.get(tag[:1], 'v')
    lemmas = [
        wordnet_lemmatizer.lemmatize(word, pos=lemma_pos)
        for word, token_tag in pos_tags
        if token_tag.startswith(tag)
    ]
    lemmas = [lemma for lemma in lemmas if lemma not in stopw]

    # Counter preserves first-seen order, and sorted() is stable, so ties are
    # ordered exactly as with a hand-built dict.
    counts = Counter(lemmas)
    ranked = sorted(counts.items(), key=operator.itemgetter(1), reverse=True)
    total = len(lemmas)
    # When `lemmas` is empty, `ranked` is empty too, so the division never runs.
    return [(word, freq, round(freq / total * 100, 2)) for word, freq in islice(ranked, m)]

def printNPOS(data, m):
    """Build a side-by-side table of the top-`m` verbs, adjectives and nouns in `data`.

    Parameters
    ----------
    data : str
        Raw text passed to getNPartsOfSpeech for each part of speech.
    m : int
        Number of ranks requested per part of speech.

    Returns
    -------
    pandas.DataFrame
        One row per rank with nine columns: word, absolute frequency and relative
        frequency for verbs, adjectives and nouns. A rank is included when at least
        one of the three categories has a word at that rank; categories that ran
        out of words are filled with None.
    """
    # The placeholder must be as wide as a real (word, count, percentage) entry,
    # otherwise a padded row would not have the nine cells the columns require.
    placeholder = (None, None, None)

    def ranked_and_padded(tag):
        ranked = list(getNPartsOfSpeech(data, m, tag))
        return ranked + [placeholder] * (m - len(ranked))

    verbs = ranked_and_padded('V')
    adjs = ranked_and_padded('J')
    nouns = ranked_and_padded('N')

    postags = []
    for i in range(m):
        entries = (verbs[i], adjs[i], nouns[i])
        # Test the word slot only: a real entry can legitimately carry a 0.0 percentage.
        if any(entry[0] is not None for entry in entries):
            postags.append(verbs[i] + adjs[i] + nouns[i])

    df = pd.DataFrame(postags, columns=['Verbs', 'Absolute Freq', 'Relative Freq',
                                       'Adjectives', 'Absolute Freq', 'Relative Freq',
                                        'Nouns', 'Absolute Freq', 'Relative Freq' ])
    return df

# 4. Top 20 Verbs, Adj, and Nouns in the whole corpus

In [None]:
# Rank verbs, adjectives and nouns over all documents joined into one string.
printNPOS(" ".join(df["Text"]), m=20)

## 4.1 Top 20 Verbs, Adj, and Nouns in Advertencias

In [None]:
# Keep only the rows typed "advertencia", then rank their parts of speech.
dfg = df.loc[df["Type"].eq("advertencia")]
printNPOS(" ".join(dfg["Text"]), m=20)

## 4.2 Top 20 Verbs, Adj, and Nouns in Seguimientos

In [None]:
# Keep only the rows typed "seguimiento", then rank their parts of speech.
dfg = df.loc[df["Type"].eq("seguimiento")]
printNPOS(" ".join(dfg["Text"]), m=20)

# 5. Exporting to HTML

In [5]:
# Export a notebook to HTML through the nbconvert command line.
# NOTE(review): the recorded output for this cell names 8_NLP_Word_Tagging.ipynb, which
# does not match the `4` argument below — confirm the intended notebook path.
!jupyter nbconvert --to html 4

[NbConvertApp] Converting notebook 8_NLP_Word_Tagging.ipynb to html
[NbConvertApp] Writing 586885 bytes to 8_NLP_Word_Tagging.html
