# 1. Installs, Imports and Settings

In [5]:
#!pip install spacy==3.1.1 #restart runtime after this
#!python -m spacy download es_core_news_sm

import csv
import pandas as pd
# NOTE(review): the next line only reads the option and discards the value; it
# does not change any setting. If the intent was to change the row limit, it
# needs an assignment or a pd.set_option call -- confirm the intended value.
pd.options.display.max_rows
# Show the full contents of each cell instead of truncating long text columns.
pd.set_option('display.max_colwidth', None)

import string
import operator
from itertools import islice
from collections import Counter

import spacy
nlp = spacy.load('es_core_news_sm')

from nltk import ngrams
import nltk as nltk
from nltk.corpus import stopwords

# 2. Reading files

In [6]:
# Load the pipe-delimited alerts file and keep only the rows whose "Text"
# value is not a lone space character.
df = pd.read_csv('data/alertas.csv', sep="|").loc[lambda frame: frame["Text"] != " "]

# Report the number of remaining rows, a blank line, then the column names.
print("Size of Data:", len(df), end="\n\n")
print("Columns : ", list(df.columns))

Size of Data: 666

Columns :  ['Filename', 'Text', 'Type', 'Year']


# 3. Functions

In [9]:
# Tokens dropped by clean_text: NLTK's Spanish stop-word list plus a few extra
# literals (short words, digit strings, "www", "¡").
# NOTE(review): the extra literals look corpus-specific noise -- confirm with the data owner.
stop_words = stopwords.words('spanish') + ["at", "ala", "026", "1", "27", "06", "18", "www", "01", "8000914814", "¡"]

# Strings that clean_text replaces with a space. Starts from the ASCII
# punctuation characters; the statement that follows appends further symbols.
punct_signs = list(string.punctuation)
punct_signs += ['…','¿','•','”','“','–','&','∑','[',']', '▪', "I ’m", "️❤", "⠀ ⠀ ⠀", "⠀ ⠀"]


def clean_text(text):
    """Turn raw text into a list of lemmas.

    Processing steps, in order:
      1. every entry of the module-level ``punct_signs`` is replaced by a space;
      2. the text is lower-cased and split on whitespace;
      3. words present in the module-level ``stop_words`` are discarded;
      4. the remaining words are re-joined with single spaces and run through
         the module-level spaCy pipeline ``nlp``;
      5. the lemma string of every token with a non-empty lemma is collected.

    Parameters
    ----------
    text : str
        Raw text to normalise.

    Returns
    -------
    list of str
        Lemmas in document order.

    Side effects
    ------------
    ``nlp.max_length`` is reassigned to the length of the filtered text plus
    100 so that the pipeline accepts the input whatever its size.
    """
    for sign in punct_signs:
        text = text.replace(sign, ' ')

    # Build the lookup set once: testing membership against the raw list costs
    # a linear scan per word, which adds up on a whole-corpus string.
    stop_set = set(stop_words)
    filtered = ' '.join(word for word in text.lower().split() if word not in stop_set)

    # Resize the shared pipeline's limit to fit this input (see Side effects).
    nlp.max_length = len(filtered) + 100
    doc = nlp(filtered)

    # Filter on the lemma string itself, which is what gets returned.
    return [token.lemma_ for token in doc if token.lemma_]

# m is the number of top n-grams to return.
def getNGrams(text, n, m):
    """Return the ``m`` most frequent ``n``-grams of a token sequence.

    Parameters
    ----------
    text : sequence of str
        Tokens to slide the n-gram window over.
    n : int
        Number of tokens per n-gram.
    m : int
        Maximum number of n-grams to return.

    Returns
    -------
    list of (str, int, float)
        ``(ngram, absolute_count, relative_percent)`` tuples sorted by
        descending count. The n-gram is its tokens joined by single spaces;
        the percentage is the count divided by the total number of n-grams,
        times 100, rounded to two decimals. Ties keep first-seen order.
    """
    all_grams = list(ngrams(text, n))
    total = len(all_grams)

    # Counter keeps keys in first-seen order, so the stable sort below breaks
    # ties by order of first appearance.
    counts = Counter(' '.join(gram).strip() for gram in all_grams)
    ranked = sorted(counts.items(), key=lambda pair: pair[1], reverse=True)

    return [
        (phrase, freq, round(freq / total * 100, 2))
        for phrase, freq in islice(ranked, m)
    ]

def printNgrams(data, m):
    """Build a side-by-side table of the top unigrams, bigrams and trigrams.

    Parameters
    ----------
    data : sequence of str
        Tokens to analyse; passed unchanged to ``getNGrams``.
    m : int
        Maximum number of rows, i.e. how many top n-grams to request per size.

    Returns
    -------
    pandas.DataFrame
        Nine columns: the n-gram, its absolute frequency and its relative
        frequency, repeated for n = 1, 2, 3. The frequency column labels are
        intentionally repeated, as before. The table has as many rows as the
        longest of the three rankings (never more than ``m``); a ranking that
        runs out earlier is padded with missing values instead of raising
        ``IndexError``.
    """
    columns = ['Unigrams', 'Absolute Freq', 'Relative Freq',
               'Bigrams', 'Absolute Freq', 'Relative Freq',
               'Trigrams', 'Absolute Freq', 'Relative Freq']

    rankings = [getNGrams(data, size, m) for size in (1, 2, 3)]

    # A short text can have fewer than m distinct n-grams (trigrams run out
    # first), so size the table from what was actually returned.
    n_rows = max(len(ranking) for ranking in rankings)
    blank = (None, None, None)

    rows = []
    for i in range(n_rows):
        row = ()
        for ranking in rankings:
            row += ranking[i] if i < len(ranking) else blank
        rows.append(row)

    return pd.DataFrame(rows, columns=columns)

# 4. Top 20 Ngrams in the whole corpus

#### Stopwords, emojis, and punctuation have been removed from the corpus.
#### N-grams are built from the lemma of each word. For example, "go" includes the frequencies of go, goes, went, gone, etc.

In [None]:
# Join every row's "Text" into one space-separated string, run it through
# clean_text, and tabulate the top 20 uni-/bi-/trigrams.
printNgrams(clean_text(' '.join(df['Text'])), 20)

## 4.1 Top 20 Ngrams in Advertencias

In [None]:
# Same report as for the whole corpus, restricted to rows whose "Type" is "advertencia".
dfg = df[df["Type"] == "advertencia"]
printNgrams(clean_text(' '.join(dfg['Text'])), 20)

## 4.2 Top 20 Ngrams in Seguimiento

In [10]:
# Same report as for the whole corpus, restricted to rows whose "Type" is "seguimiento".
dfg = df[df["Type"] == "seguimiento"]
printNgrams(clean_text(' '.join(dfg['Text'])), 20)

Unnamed: 0,Unigrams,Absolute Freq,Relative Freq,Bigrams,Absolute Freq.1,Relative Freq.1,Trigrams,Absolute Freq.2,Relative Freq.2
0,líder,366,0.81,derechos humanos,280,0.62,defensora derechos humanos,124,0.28
1,nacional,359,0.8,líder social,217,0.48,persona defensora derechos,108,0.24
2,derechos,352,0.78,persona defensora,130,0.29,derechos humanos líder,98,0.22
3,riesgo,329,0.73,defensora derechos,126,0.28,humanos líder social,92,0.2
4,social,302,0.67,humanos líder,99,0.22,defensorio gov co,90,0.2
5,humanos,295,0.66,defensorio gov,94,0.21,co plantilla vigente,88,0.2
6,acción,272,0.61,gov co,93,0.21,gov co plantilla,84,0.19
7,municipio,237,0.53,plantilla vigente,92,0.2,seguimiento persona defensora,76,0.17
8,pueblo,236,0.53,co plantilla,88,0.2,informe seguimiento persona,75,0.17
9,2018,223,0.5,informe seguimiento,85,0.19,defensor derechos humanos,53,0.12


# 5. Exporting to HTML

In [None]:
# Shell out to nbconvert to render the named notebook as HTML.
# NOTE(review): presumably the file name refers to this notebook -- verify it still matches.
!jupyter nbconvert --to html 5_NLP_Word_Tokenization.ipynb