# 1. Installs, Imports and Settings

In [1]:
#!pip install spacy==3.1.1 
#restart runtime after this
#!python -m spacy download es_core_news_lg

import spacy
nlp = spacy.load("es_core_news_lg")

import csv
import pandas as pd
pd.options.display.max_rows
pd.set_option('display.max_colwidth', None)

import string
import operator
from itertools import islice
from collections import Counter

#  2. Reading files 

In [6]:
df = pd.read_csv('data/alertas.csv', sep="|")
df = df[df["Text"] != " "]
print("Size of Data:", len(df))
print()
print("Columns : ", list(df.columns))

Size of Data: 666

Columns :  ['Filename', 'Text', 'Type', 'Year']


# 3. Functions

In [4]:
stopw = ["'s", "s", "@", '*', '’', "t", "gt", "http", "https", "amp", "m", 'i', 'u', 'youtu.be/Sj9uLcw-yl4', 
        "'m", '\-', '[', ']', '·', 're', '“', '”']
#m is the number of top ngrams.
def getNPartsOfSpeech(text, m, tag):
    #lemmatization and filtering allowed tags
    filtered_tags = []
    
    nlp.max_length = len(text) + 100
    doc = nlp(text)

    for token in doc:
        if token.tag_.startswith(tag): 
            filtered_tags.append([token.lemma_, token.tag_])
    dt = pd.Dataframe(filtered_tags, columns=['word', 'tag'])
    dt = dt.groupby(['word', 'tag']).count().sort_values(ascending=False)
    return dt[:m]    
    

def printNPOS(data, m):
    postags = []
    
    verbs = getNPartsOfSpeech(data, m, 'V')
    verbs += [(None, None)] * (m - len(verbs))
    
    adjs = getNPartsOfSpeech(data, m , 'J')
    adjs +=[(None, None)] * (m - len(adjs))
    
    nouns = getNPartsOfSpeech(data, m , 'N')
    nouns +=[(None, None)] * (m - len(nouns))
    
    for i in range(0,m):
        if all(verbs[i]) or all(adj[i]) or all(nouns[i]):
            postags.append(verbs[i] + adjs[i] + nouns[i])
    df = pd.DataFrame(postags, columns=['Verbs', 'Absolute Freq', 'Relative Freq', 
                                       'Adjectives', 'Absolute Freq', 'Relative Freq',
                                        'Nouns', 'Absolute Freq', 'Relative Freq' ]) 
    return df

In [None]:
filtered_tags = []
text = ' '.join(df['Text'])  
nlp.max_length = len(text) + 100
doc = nlp(text)

In [None]:
for token in doc:
    if token.tag_.startswith(tag): 
        filtered_tags.append([token.lemma_, token.tag_])

In [None]:
dt = pd.Dataframe(filtered_tags, columns=['word', 'tag'])
dt = dt.groupby(['word', 'tag']).count().sort_values(ascending=False)
return dt[:m] 

In [7]:
getNPartsOfSpeech(' '.join(df['Text']), 10, 'V')

KeyboardInterrupt: 

# 4. Top 20 Verbs, Adj, and Nouns in the whole corpus

In [4]:
printNPOS(' '.join(df['Text']), 20)

Unnamed: 0,Verbs,Absolute Freq,Relative Freq,Adjectives,Absolute Freq.1,Relative Freq.1,Nouns,Absolute Freq.2,Relative Freq.2
0,y,14311,5.87,los,19216,4.59,y,82741,3.53
1,el,9997,4.1,un,16615,3.97,la,65256,2.78
2,que,5750,2.36,las,16007,3.83,que,50737,2.16
3,se,5519,2.27,el,13066,3.12,el,36618,1.56
4,en,4772,1.96,que,12648,3.02,con,32216,1.37
5,del,4740,1.95,una,12548,3.0,del,28036,1.2
6,para,4689,1.92,y,9785,2.34,por,24685,1.05
7,las,4307,1.77,civil,7812,1.87,para,24152,1.03
8,por,3277,1.35,se,7555,1.81,los,22271,0.95
9,con,2922,1.2,la,6729,1.61,las,20749,0.89


## 4.1 Top 20 Verbs, Adj, and Nouns in Advertencias

In [None]:
dfg = df[df["Type"] == "advertencia"]
printNPOS(' '.join(dfg['Text']), 20)

## 4.1 Top 20 Verbs, Adj, and Nouns in Seguimientos

In [None]:
dfg = df[df["Type"] == "seguimiento"]
printNPOS(' '.join(dfg['Text']), 20)

## 5. Exporting to html

In [5]:
!jupyter nbconvert --to html 4

[NbConvertApp] Converting notebook 8_NLP_Word_Tagging.ipynb to html
[NbConvertApp] Writing 586885 bytes to 8_NLP_Word_Tagging.html
