# 1. Installs, Imports and Settings

In [1]:
#!pip install spacy==3.1.1 #restart runtime after this
#!python -m spacy download en_core_web_sm
# NOTE(review): the commented download above fetches en_core_web_sm, but the
# spacy.load call at the end of this cell asks for en_core_web_lg — confirm
# which model is intended and make the install line match.

import csv
import pandas as pd
# NOTE(review): the next line only reads the option and discards the value, so
# it changes nothing; presumably an assignment was intended — TODO confirm.
pd.options.display.max_rows
# Do not truncate long strings when a DataFrame is displayed.
pd.set_option('display.max_colwidth', None)

import string
import operator
from itertools import islice
from collections import Counter

#!pip install nltk
from nltk import ngrams
import nltk as nltk
from nltk.corpus import stopwords
# One-time NLTK resource downloads; uncomment on a fresh environment.
#nltk.download('stopwords')
#nltk.download('punkt')
#nltk.download('averaged_perceptron_tagger')
#nltk.download('wordnet');

import spacy
# Shared spaCy pipeline used by clean_text() for lemmatization.
nlp = spacy.load('en_core_web_lg')

#  2. Reading files 

In [2]:
# Load the pipe-separated alerts corpus.
df = pd.read_csv('data/alertas.csv', sep="|")
# Keep only rows that carry real text. Comparing against " " alone lets
# missing values (NaN) and other whitespace-only strings through; a NaN would
# make the later ' '.join(df['Text']) raise TypeError, and blank rows add
# nothing to the n-gram counts.
df = df[df["Text"].notna() & (df["Text"].astype(str).str.strip() != "")]
print("Size of Data:", len(df))
print()
print("Columns : ", list(df.columns))

Size of Data: 666

Columns :  ['Filename', 'Text', 'Type', 'Year']


# 3. Functions

In [3]:
# NOTE: clean_text() reassigns nlp.max_length on every call, so this value only
# matters for nlp(...) calls made outside clean_text().
nlp.max_length = 3920760000
# NLTK's English stop-word list; clean_text() filters both the raw words and
# the resulting lemmas against it.
stop_words = stopwords.words('english') 

# Strings that clean_text() replaces with a single space, applied in list
# order: every character of string.punctuation first, then the extras below.
punct_signs = list(string.punctuation)
# NOTE(review): '&', '[' and ']' are already in string.punctuation, and "I ’m"
# can never match because clean_text() rewrites ’ to ' (and then blanks ')
# before this entry is reached.
punct_signs += ['…','¿','•','”','“','–','&','∑','[',']', '▪', "I ’m", "️❤", "⠀ ⠀ ⠀", "⠀ ⠀"]

# Maps a canonical character to the look-alike variants that
# replace_simliars() rewrites to it.
sim_dict = {
    "'" : ['’'],
   # "️❤": ["️ ❤ ️", "❤ ️", "️ ❤"]
    
}

# NOTE(review): within the code shown here this list is referenced only by the
# commented-out loop in clean_text(), so it currently has no effect.
emojies = ["🤚","🙋","😳","🏻‍", "♀", "🏻‍", "📈", "️ ⬆", "❤", "🏼", "💜", "👏"]

def replace_simliars(text, sim_dict):
    """Rewrite look-alike substrings in ``text`` to their canonical form.

    Args:
        text: The string to normalise.
        sim_dict: Mapping of canonical string -> iterable of variant strings
            that should be replaced by that canonical string.

    Returns:
        A new string with every variant replaced. Replacements are applied in
        the mapping's iteration order, one variant at a time.
    """
    for canonical, variants in sim_dict.items():
        for variant in variants:
            text = text.replace(variant, canonical)
    return text

#clean stopwords and punctuation, lemmatization
def clean_text(text):
    """Turn raw text into a list of lemmas without stop words or punctuation.

    Processing order:
      1. normalise look-alike characters via ``replace_simliars``/``sim_dict``;
      2. replace every entry of ``punct_signs`` with a space;
      3. lower-case, split on whitespace and drop stop words;
      4. run the remaining words through the spaCy pipeline ``nlp`` and keep
         each non-empty lemma that is not itself a stop word.

    Args:
        text: The raw input string.

    Returns:
        list[str]: The lemmas, in document order.

    Side effects:
        Raises ``nlp.max_length`` when the cleaned text would not fit; it is
        never lowered.
    """
    # Set lookup instead of scanning the stop-word list for every token.
    stop_set = set(stop_words)

    text = replace_simliars(text, sim_dict)
    for sign in punct_signs:
        text = text.replace(sign, ' ')

    # Emoji stripping via the module-level `emojies` list is left disabled:
    #for e in emojies:
    #    text = text.replace(e, ' ')

    kept_words = [word for word in text.lower().split() if word not in stop_set]
    joined = ' '.join(kept_words)

    # Only grow the limit when needed so a larger, deliberately configured
    # value is not clobbered by a short input.
    if len(joined) >= nlp.max_length:
        nlp.max_length = len(joined) + 100

    # Test the lemma string itself (not its integer hash) for emptiness.
    return [
        token.lemma_
        for token in nlp(joined)
        if token.lemma_ and token.lemma_ not in stop_set
    ]

# m is the number of top n-grams to return.
def getNGrams(text, n, m):
    """Return the ``m`` most frequent n-grams of a token sequence.

    Args:
        text: Sequence of string tokens.
        n: Size of each n-gram (1 = unigram, 2 = bigram, ...).
        m: Maximum number of entries to return.

    Returns:
        list[tuple[str, int, float]]: ``(ngram, absolute_count, relative_pct)``
        sorted by descending count, where ``ngram`` is the tokens joined by a
        space and ``relative_pct`` is the share of all n-grams, in percent,
        rounded to two decimals. Ties keep first-appearance order.
    """
    all_grams = list(ngrams(text, n))
    total = len(all_grams)

    # Counter preserves first-seen order, and sorted() is stable, so equal
    # counts stay in order of first appearance.
    counts = Counter(' '.join(gram).strip() for gram in all_grams)
    ranked = sorted(counts.items(), key=operator.itemgetter(1), reverse=True)

    return [
        (phrase, freq, round(freq / total * 100, 2))
        for phrase, freq in islice(ranked, m)
    ]

def printNgrams(data, m):
    """Build a side-by-side table of the top unigrams, bigrams and trigrams.

    Args:
        data: Sequence of string tokens (e.g. the output of ``clean_text``).
        m: Maximum number of rows, i.e. how many top n-grams to request from
            ``getNGrams`` for each n-gram size.

    Returns:
        pandas.DataFrame: Nine columns — n-gram, absolute frequency and
        relative frequency for each of unigrams, bigrams and trigrams. When
        the input yields fewer than ``m`` distinct n-grams of some size, the
        shorter lists are padded with ``('', 0, 0.0)`` instead of raising
        ``IndexError``; the table then has as many rows as the longest list.
    """
    columns = ['Unigrams', 'Absolute Freq', 'Relative Freq',
               'Bigrams', 'Absolute Freq', 'Relative Freq',
               'Trigrams', 'Absolute Freq', 'Relative Freq']

    top_by_size = [getNGrams(data, size, m) for size in (1, 2, 3)]

    # Short inputs can produce fewer than m entries (trigrams run out first),
    # so pad every list to a common length before zipping them into rows.
    n_rows = max(len(top) for top in top_by_size)
    filler = ('', 0, 0.0)
    padded = [list(top) + [filler] * (n_rows - len(top)) for top in top_by_size]

    rows = [uni + bi + tri for uni, bi, tri in zip(*padded)]
    return pd.DataFrame(rows, columns=columns)

# 4. Top 20 Ngrams in the whole corpus

#### Stopwords and punctuation have been removed from the corpus. (Note: the emoji-removal step in `clean_text` is currently commented out, so most emojis are kept.)
#### N-grams are built from the lemma of each word. For example, "go" includes the frequencies of go, goes, went, gone, etc.

In [None]:
# All texts are concatenated into one string before cleaning, so n-grams can
# span the boundary between two consecutive documents.
printNgrams(clean_text(' '.join(df['Text'])), 20)

## 4.1 Top 20 Ngrams in Advertencias

In [None]:
# Restrict the corpus to rows whose Type is "advertencia", then build the same
# top-20 table; texts are joined first, so n-grams can cross document boundaries.
dfg = df[df["Type"] == "advertencia"]
printNgrams(clean_text(' '.join(dfg['Text'])), 20)

## 4.2 Top 20 Ngrams in Seguimiento

In [None]:
# Restrict the corpus to rows whose Type is "seguimiento" (this reuses and
# overwrites the name dfg), then build the same top-20 table.
dfg = df[df["Type"] == "seguimiento"]
printNgrams(clean_text(' '.join(dfg['Text'])), 20)

# 5. Exporting to HTML

In [None]:
# Shell escape: converts the notebook file named below to HTML. The filename is
# hard-coded, so it must match this notebook's actual file name.
!jupyter nbconvert --to html 9_NLP_Word_Tokenization.ipynb