# 1. Installs, Imports and Settings

In [10]:
#!pip install spacy==3.1.1 #restart runtime after this
#!python -m spacy download es_core_news_md

import timer
import csv
import pandas as pd
# NOTE: the bare expression `pd.options.display.max_rows` that used to sit here
# only read the option and discarded the value, so it has been removed. Use
# pd.set_option('display.max_rows', N) if a different row limit is wanted.
pd.set_option('display.max_colwidth', None)  # never truncate long text cells

import string
import operator
from itertools import islice
from collections import Counter

#!pip install nltk
from nltk import ngrams
import nltk as nltk
from nltk.corpus import stopwords
#nltk.download('stopwords')
#nltk.download('punkt')
#nltk.download('averaged_perceptron_tagger')
#nltk.download('wordnet');

import spacy
# Load the spaCy pipeline that clean_text uses for lemmas and part-of-speech
# tags. The "es_core_news_md" package must already be installed.
nlp = spacy.load("es_core_news_md")

#!pip install pyenchant
import enchant

import time

# 2. Reading files

In [11]:
# Load the pipe-delimited input CSV into the working DataFrame.
df = pd.read_csv("data/alertas_NNA_sentences.csv", sep="|")

# Report the row count (followed by a blank line) and the column names.
print("Size of Data:", len(df), end="\n\n")
print("Columns : ", list(df.columns))

Size of Data: 1753

Columns :  ['Filename', 'Text', 'Subtype', 'Type', 'Year', 'Path', 'Departamento', 'Text_Original', 'Recommendations', 'NNAJ']


# 3. Functions

In [12]:
# Substrings that remove_exceptions blanks out of a text (currently none).
exceptions = []
# Extra terms appended to the stop-word list (currently none).
red_flags = []
# Tokens that check_spelling keeps without consulting the dictionary.
green_flags = ['posdesmovilización', 'nnaj']

# Stop words: NLTK's Spanish list, every single letter, and the red flags.
stop_words = [
    *stopwords.words('spanish'),
    *'abcdefghijklmñnopqrstuvwxyz',
    *red_flags,
]

# Characters stripped from the text before spell checking.
punct_signs = string.punctuation + '…¿•”“·*'

# Canonical term -> variants that replace_similars rewrites to that term.
sim_dict = {
    'nnaj': [
        'niños', 'niñas', 'adolescentes', 'jovenes', 'jóvenes',
        'niño', 'niña', 'j0ve', 'adolescente', 'joven',
    ],
    'reclutamiento': ['eclutaniento'],
    'uso': ['utilización'],
    'situación': ['sit11 cién'],
}

# pyenchant dictionary used by check_spelling.
d = enchant.Dict("es_CO")
def replace_similars(text, sim_dict = sim_dict):
    """Rewrite known variants to their canonical term and collapse repeats.

    Parameters
    ----------
    text : list of str
        Tokens to normalise; they are joined with single spaces before the
        replacements are applied.
    sim_dict : dict, optional
        Maps each canonical term to the list of variants it replaces.
        Defaults to the module-level ``sim_dict``.

    Returns
    -------
    list of str
        The whitespace-split tokens of the rewritten text. Consecutive
        repetitions of a canonical term are reduced to a single occurrence.
    """
    import re  # local import: `re` is not among the notebook's imports

    text = ' '.join(text)
    for key, variants in sim_dict.items():
        escaped_key = re.escape(key)

        for variant in variants:
            # Whole-word match only: plain str.replace also rewrote variants
            # embedded in longer words (e.g. 'reutilización' -> 'reuso').
            pattern = r'(?<!\w)' + re.escape(variant) + r'(?!\w)'
            # A callable replacement keeps `key` literal (no backslash escapes).
            text = re.sub(pattern, lambda _match: key, text)

        # Collapse a run of ANY length into one term. The previous fixed
        # "3 then 2" replacement left runs of five or more partly collapsed
        # and could also match across word boundaries ('abuso uso').
        run = r'(?<!\w)' + escaped_key + r'(?:\s+' + escaped_key + r')+(?!\w)'
        text = re.sub(run, lambda _match: key, text)

    return text.split()

def replace_accents(w):
    """Return ``w`` lower-cased with á, é, í, ó and ú replaced by plain vowels.

    Other characters (including ñ and ü) are left as they are.
    """
    plain_vowels = str.maketrans('áéíóú', 'aeiou')
    return w.lower().translate(plain_vowels)

def check_spelling(text):
    """Keep correctly spelled tokens and repair accent-only misspellings.

    Each whitespace-separated token of ``text`` is handled as follows:

    * tokens listed in ``green_flags`` or accepted by the dictionary ``d``
      are kept unchanged;
    * otherwise, if the first dictionary suggestion equals the token once both
      are passed through ``replace_accents``, the lower-cased suggestion is
      kept in its place;
    * any other token is dropped.

    Parameters
    ----------
    text : str
        Text to check.

    Returns
    -------
    str
        The surviving tokens joined by single spaces.
    """
    kept = []
    for token in text.split():
        if token in green_flags or d.check(token):
            kept.append(token)
            continue

        # Ask for suggestions once: the call was previously repeated up to
        # three times for every token the dictionary rejected.
        suggestions = d.suggest(token)
        if suggestions and replace_accents(token) == replace_accents(suggestions[0]):
            kept.append(suggestions[0].lower())
    return ' '.join(kept)
            
def remove_exceptions(text):
    """Return ``text`` with every entry of ``exceptions`` replaced by a space."""
    cleaned = text
    for fragment in exceptions:
        cleaned = cleaned.replace(fragment, ' ')
    return cleaned

def clean_text(text, verbose=True):
    """Reduce a text to the lemmas of its nouns, adjectives and verbs.

    Pipeline: join the sentences, strip the characters in ``punct_signs``,
    run ``check_spelling``, lemmatise with the spaCy pipeline ``nlp``, drop
    stop words, keep only NOUN/ADJ/VERB tokens and finally merge variants with
    ``replace_similars``.

    Parameters
    ----------
    text : list of str, str, or None
        A list of sentences, or the string form of such a list (for example a
        list column read back from a CSV file). ``None`` and float values
        (the NaN pandas uses for empty cells) are treated as "no text".
    verbose : bool, optional
        When true (the default) print the elapsed time of the call.

    Returns
    -------
    list of str
        The cleaned lemmas; an empty list when there is nothing to clean.
    """
    # Missing values carry no text; previously they crashed on len().
    if text is None or isinstance(text, float):
        return []

    if isinstance(text, str):
        # Undo the "['a', 'b']" form a list takes once serialised to text.
        text = text.strip("']['").split("', '")

    if len(text) == 0:
        # Always return a list so callers see a single type
        # (this path used to return the empty string).
        return []

    tic = time.perf_counter()

    cleaned = ' '.join(text)
    cleaned = cleaned.translate(str.maketrans('', '', punct_signs))
    cleaned = check_spelling(cleaned)

    # cleaned = remove_exceptions(cleaned)  # currently disabled
    # Raise spaCy's length limit so long documents are not rejected.
    nlp.max_length = len(cleaned) + 100
    doc = nlp(cleaned.lower())

    lemmas = [
        token.lemma_
        for token in doc
        # Test the string lemma (`lemma_`), not its integer hash (`lemma`).
        if token.lemma_
        and token.lemma_ not in stop_words
        and token.pos_ in ('NOUN', 'ADJ', 'VERB')
    ]

    lemmas = replace_similars(lemmas)
    if verbose:
        toc = time.perf_counter()
        print(f"Performed in {toc - tic:0.4f} seconds")
    return lemmas

def get_list_nnaj_terms(text, n):
    """Return the n-grams of a token list as space-joined strings.

    Parameters
    ----------
    text : list of str, str, or None
        The tokens, or the string form of such a list (for example a list
        column read back from a CSV file). ``None`` and float values (the NaN
        pandas uses for empty cells) are treated as "no tokens".
    n : int
        Number of consecutive tokens per n-gram.

    Returns
    -------
    list of str
        One string per n-gram, in order; an empty list when there are no
        tokens.
    """
    # Missing values carry no tokens; previously they crashed on len().
    if text is None or isinstance(text, float):
        return []

    if isinstance(text, str):
        # Undo the "['a', 'b']" form a list takes once serialised to text.
        text = text.strip("']['").split("', '")

    # "[]" and "" both parse to [''], which used to produce a bogus '' term.
    tokens = [token for token in text if token]
    if not tokens:
        return []

    return [' '.join(gram) for gram in ngrams(tokens, n)]

In [13]:
# Smoke test of clean_text on three short sentences.
clean_text([
    'violencia sexual es uno de los crimenes de guerra',
    'drogas ilicitas y el mercado del dolar',
    'anterior al dia de los muertos',
])

Performed in 0.0690 seconds


['violencia',
 'sexual',
 'crimen',
 'guerra',
 'droga',
 'ilícito',
 'mercado',
 'dólar',
 'anterior',
 'día',
 'muerto']

## 4. Clean the NNAJ text

In [14]:
# Try clean_text on the NNAJ value of the row labelled 0.
x = clean_text(df.loc[0, 'NNAJ'])

Performed in 0.0852 seconds


In [15]:
# Input column to clean, and the list that collects the cleaned results.
list1_ = df['NNAJ']
list2_ = []

In [16]:
# Number of rows in df.
len(df)

1753

In [17]:
# Clean every entry of list1_. The list is rebuilt from scratch so that
# re-running this cell cannot append a second copy of the results to list2_
# (which would make it longer than df and break the column assignment).
list2_ = [clean_text(entry) for entry in list1_]

Performed in 0.0637 seconds
Performed in 1.2234 seconds
Performed in 0.8432 seconds
Performed in 0.8629 seconds
Performed in 9.0348 seconds
Performed in 5.3602 seconds
Performed in 0.8385 seconds
Performed in 0.0814 seconds
Performed in 3.8377 seconds
Performed in 1.0629 seconds
Performed in 0.0747 seconds
Performed in 0.6711 seconds
Performed in 0.0002 seconds
Performed in 1.5531 seconds
Performed in 0.0003 seconds
Performed in 0.0001 seconds
Performed in 0.0001 seconds
Performed in 4.8306 seconds
Performed in 0.5586 seconds
Performed in 3.6998 seconds
Performed in 0.6634 seconds
Performed in 1.3950 seconds
Performed in 0.7184 seconds
Performed in 1.2621 seconds
Performed in 0.0002 seconds
Performed in 0.6421 seconds
Performed in 1.3843 seconds
Performed in 5.1312 seconds
Performed in 0.6337 seconds
Performed in 0.8443 seconds
Performed in 1.1193 seconds
Performed in 0.4159 seconds
Performed in 1.1315 seconds
Performed in 0.5168 seconds
Performed in 0.4655 seconds
Performed in 0.3145 

Performed in 0.7184 seconds
Performed in 1.5671 seconds
Performed in 0.1519 seconds
Performed in 0.0001 seconds
Performed in 0.0001 seconds
Performed in 0.0001 seconds
Performed in 0.2971 seconds
Performed in 0.0002 seconds
Performed in 0.0001 seconds
Performed in 0.0001 seconds
Performed in 1.8192 seconds
Performed in 0.0002 seconds
Performed in 0.8190 seconds
Performed in 0.0002 seconds
Performed in 0.0001 seconds
Performed in 0.0001 seconds
Performed in 2.8290 seconds
Performed in 0.1342 seconds
Performed in 1.9019 seconds
Performed in 0.0006 seconds
Performed in 2.8614 seconds
Performed in 0.0002 seconds
Performed in 0.0001 seconds
Performed in 1.6938 seconds
Performed in 0.0002 seconds
Performed in 0.0001 seconds
Performed in 0.0001 seconds
Performed in 2.0628 seconds
Performed in 0.0002 seconds
Performed in 0.0001 seconds
Performed in 0.7189 seconds
Performed in 0.7430 seconds
Performed in 0.0003 seconds
Performed in 1.5925 seconds
Performed in 0.0001 seconds
Performed in 2.1875 

Performed in 0.7612 seconds
Performed in 4.8490 seconds
Performed in 0.0002 seconds
Performed in 0.0001 seconds
Performed in 0.0001 seconds
Performed in 0.7387 seconds
Performed in 0.2215 seconds
Performed in 0.0002 seconds
Performed in 0.7273 seconds
Performed in 0.0002 seconds
Performed in 0.8089 seconds
Performed in 0.2955 seconds
Performed in 0.0767 seconds
Performed in 0.4328 seconds
Performed in 0.0002 seconds
Performed in 1.4753 seconds
Performed in 0.0002 seconds
Performed in 0.0001 seconds
Performed in 0.0001 seconds
Performed in 1.5480 seconds
Performed in 0.0002 seconds
Performed in 0.0001 seconds
Performed in 0.0001 seconds
Performed in 1.0604 seconds
Performed in 1.3060 seconds
Performed in 0.0002 seconds
Performed in 0.0001 seconds
Performed in 1.4640 seconds
Performed in 0.0002 seconds
Performed in 0.0001 seconds
Performed in 0.5251 seconds
Performed in 0.0001 seconds
Performed in 0.0780 seconds
Performed in 0.0002 seconds
Performed in 0.5918 seconds
Performed in 1.5295 

Performed in 0.2432 seconds
Performed in 1.6350 seconds
Performed in 2.2173 seconds
Performed in 0.1664 seconds
Performed in 3.8487 seconds
Performed in 0.4175 seconds
Performed in 0.0002 seconds
Performed in 6.3234 seconds
Performed in 2.9807 seconds
Performed in 2.3176 seconds
Performed in 0.0002 seconds
Performed in 0.0580 seconds
Performed in 0.0002 seconds
Performed in 0.0001 seconds
Performed in 2.0834 seconds
Performed in 0.5344 seconds
Performed in 0.0002 seconds
Performed in 15.5303 seconds
Performed in 14.3151 seconds
Performed in 1.1798 seconds
Performed in 0.4845 seconds
Performed in 2.4279 seconds
Performed in 0.0002 seconds
Performed in 3.5431 seconds
Performed in 3.2355 seconds
Performed in 1.0034 seconds
Performed in 0.5396 seconds
Performed in 6.9557 seconds
Performed in 0.1636 seconds
Performed in 0.1341 seconds
Performed in 0.0002 seconds
Performed in 0.0001 seconds
Performed in 1.6070 seconds
Performed in 0.0003 seconds
Performed in 1.7247 seconds
Performed in 0.000

Performed in 0.6345 seconds
Performed in 0.3528 seconds
Performed in 0.0002 seconds
Performed in 0.3902 seconds
Performed in 0.0002 seconds
Performed in 0.0001 seconds
Performed in 0.0001 seconds
Performed in 0.0001 seconds
Performed in 0.0000 seconds
Performed in 0.6611 seconds
Performed in 0.0002 seconds
Performed in 0.0001 seconds
Performed in 0.0001 seconds
Performed in 0.0001 seconds
Performed in 0.0001 seconds
Performed in 0.0001 seconds
Performed in 0.0001 seconds
Performed in 0.2985 seconds
Performed in 0.0002 seconds
Performed in 0.0001 seconds
Performed in 0.0125 seconds
Performed in 0.0001 seconds
Performed in 0.0001 seconds
Performed in 0.0001 seconds
Performed in 0.0001 seconds
Performed in 0.0001 seconds
Performed in 0.0001 seconds
Performed in 0.0001 seconds
Performed in 0.0001 seconds
Performed in 0.2278 seconds
Performed in 0.0002 seconds
Performed in 0.7125 seconds
Performed in 0.0002 seconds
Performed in 1.7256 seconds
Performed in 0.0003 seconds
Performed in 1.8594 

Performed in 7.9742 seconds
Performed in 8.1126 seconds
Performed in 10.1602 seconds
Performed in 1.7754 seconds
Performed in 3.9583 seconds
Performed in 3.2220 seconds
Performed in 4.8943 seconds
Performed in 2.6905 seconds
Performed in 5.2235 seconds
Performed in 3.1200 seconds
Performed in 1.4241 seconds
Performed in 4.5930 seconds
Performed in 0.9853 seconds
Performed in 1.9155 seconds
Performed in 4.6828 seconds
Performed in 1.1003 seconds
Performed in 0.9446 seconds
Performed in 1.0702 seconds
Performed in 2.7627 seconds
Performed in 0.9802 seconds
Performed in 1.9821 seconds
Performed in 6.7662 seconds
Performed in 13.5952 seconds
Performed in 5.6333 seconds
Performed in 2.4956 seconds
Performed in 5.4225 seconds
Performed in 0.8757 seconds
Performed in 2.8200 seconds
Performed in 3.8271 seconds
Performed in 3.6862 seconds
Performed in 2.1375 seconds
Performed in 1.9634 seconds
Performed in 1.2175 seconds
Performed in 11.8570 seconds
Performed in 7.1000 seconds
Performed in 2.87

In [18]:
# Store the cleaned results as a new column; list2_ must hold exactly one
# entry per row of df.
df['NNAJ_clean_text'] = list2_

In [19]:
# Save the intermediate result as pipe-delimited text, without the index.
# NOTE(review): the file name has no ".csv" extension, and the next section
# reads the file back under this exact name -- rename both together, if at all.
df.to_csv("data/alertas_NNAJ_keywords", sep="|", index=False)

# 5. Extract N-grams

In [20]:
# Reload the intermediate file (this rebinds df). get_list_nnaj_terms accepts
# the string form in which the NNAJ_clean_text lists come back from the file.
df = pd.read_csv("data/alertas_NNAJ_keywords", sep="|")
df.columns

Index(['Filename', 'Text', 'Subtype', 'Type', 'Year', 'Path', 'Departamento',
       'Text_Original', 'Recommendations', 'NNAJ', 'NNAJ_clean_text'],
      dtype='object')

In [21]:
# One column per n-gram size, each derived from the cleaned token lists.
for ngram_size, column_name in (
    (1, 'NNAJ_keywords'),
    (2, 'NNAJ_keywords2'),
    (3, 'NNAJ_keywords3'),
):
    df[column_name] = df['NNAJ_clean_text'].apply(get_list_nnaj_terms, n=ngram_size)

# Save the result as pipe-delimited CSV, without the index.
df.to_csv("data/alertas_NNAJ_keywords.csv", sep="|", index=False)

In [22]:
# Show the single-token terms of the first row.
df['NNAJ_keywords'].head(1)

0    [anterior, sistema, alerta, temprano, advertir, inminencia, violación, masivo, derechos, humanos, infracción, amenaza, asesinato, selectivo, desplazamiento, individual, reclutamiento, forzado, uso, ilícito, nnaj, desplazamiento, masivo, restricción, movilidad, desaparición, forzado, confinamiento, combat, interposición, población, civil, ataque, indiscriminado]
Name: NNAJ_keywords, dtype: object

In [23]:
# Show the two-token terms of the first row.
df['NNAJ_keywords2'].head(1)

0    [anterior sistema, sistema alerta, alerta temprano, temprano advertir, advertir inminencia, inminencia violación, violación masivo, masivo derechos, derechos humanos, humanos infracción, infracción amenaza, amenaza asesinato, asesinato selectivo, selectivo desplazamiento, desplazamiento individual, individual reclutamiento, reclutamiento forzado, forzado uso, uso ilícito, ilícito nnaj, nnaj desplazamiento, desplazamiento masivo, masivo restricción, restricción movilidad, movilidad desaparición, desaparición forzado, forzado confinamiento, confinamiento combat, combat interposición, interposición población, población civil, civil ataque, ataque indiscriminado]
Name: NNAJ_keywords2, dtype: object

## 6. Exporting to HTML

In [24]:
# Convert the notebook to HTML with nbconvert (shell command; the notebook
# file name is hard-coded).
!jupyter nbconvert --to html 8_NNAJ_NGrams.ipynb

[NbConvertApp] Converting notebook 8_NNAJ_NGrams.ipynb to html
[NbConvertApp] Writing 665417 bytes to 8_NNAJ_NGrams.html
