# 1. Installs, Imports and Settings

In [2]:
# Optional one-time setup, kept commented out; run manually when the
# environment is missing these packages.
#!pip install spacy==3.1.1 #restart runtime after this
#!python -m spacy download en_core_web_sm
# NOTE(review): the pipeline loaded further down is "es_core_news_md", not
# en_core_web_sm -- the download command above presumably needs that model
# name instead; confirm before relying on it.

import timer  # NOTE(review): not referenced in the visible cells (timing uses `time`) -- confirm it is needed
import csv
import pandas as pd
pd.options.display.max_rows  # NOTE(review): bare attribute access; it reads the option and changes nothing
pd.set_option('display.max_colwidth', None)  # do not truncate long text cells when displaying frames

import string
import operator
from itertools import islice
from collections import Counter

# NLTK supplies the Spanish stop-word list and the n-gram helper.
# The downloads below are one-time corpus installs, kept commented out.
#!pip install nltk
from nltk import ngrams
import nltk as nltk
from nltk.corpus import stopwords
#nltk.download('stopwords')
#nltk.download('punkt')
#nltk.download('averaged_perceptron_tagger')
#nltk.download('wordnet');

# spaCy pipeline used by clean_text for lemmatisation and POS tagging.
import spacy
nlp = spacy.load("es_core_news_md")

# pyenchant backs the spell checker used by check_spelling.
#!pip install pyenchant
import enchant

# Used by clean_text to time each call.
import time

# 2. Reading files

In [3]:
# Load the pipe-delimited sentence table, then report its row count and columns.
df = pd.read_csv("data/alertas_NNA_sentences.csv", sep="|")

print(f"Size of Data: {len(df)}")
print()
print(f"Columns :  {list(df.columns)}")

Size of Data: 1753

Columns :  ['Filename', 'Text', 'Subtype', 'Type', 'Year', 'Path', 'Departamento', 'NNAJ']


# 3. Functions

In [4]:
# Substrings that remove_exceptions replaces with a space (currently none).
exceptions = []
# Extra words appended to the stop-word list below (currently none).
red_flags = []
# Tokens that check_spelling keeps as-is without consulting the dictionary.
green_flags = ['posdesmovilización', 'nnaj']
# NLTK Spanish stop words, plus every single letter, plus the red flags.
stop_words = stopwords.words('spanish') + list('abcdefghijklmñnopqrstuvwxyz') + red_flags
# Characters deleted by clean_text before spell checking.
punct_signs = string.punctuation + '…¿•”“·*'

# Canonical term -> variants that replace_similars rewrites to it.
# replace_similars applies str.replace in list order, so a longer variant must
# come before any shorter variant it contains (e.g. 'niños' before 'niño').
sim_dict = {'nnaj': ['niños', 'niñas', 'adolescentes', 'jovenes', 'jóvenes', 'niño', 'niña', 'j0ve', 'adolescente', 'joven'], 
            'reclutamiento' : ['eclutaniento'],
            'uso' : ['utilización'],
            'situación' : ['sit11 cién']
           }

# Enchant dictionary for the "es_CO" tag, used by check_spelling.
d = enchant.Dict("es_CO")
def replace_similars(text, sim_dict = sim_dict):
    """Rewrite known variants to their canonical term and collapse repeats.

    Parameters
    ----------
    text : list of str
        Tokens of one document.
    sim_dict : dict, optional
        Maps each canonical term to the list of variants that should be
        rewritten to it. Defaults to the module-level ``sim_dict``.

    Returns
    -------
    list of str
        The tokens after substitution, where every run of consecutive,
        identical canonical terms is reduced to a single occurrence
        (e.g. ``nnaj nnaj nnaj nnaj nnaj`` -> ``nnaj``).
    """
    joined = ' '.join(text)

    # Variants are replaced as substrings of the joined text rather than token
    # by token, so multi-word variants can match. Replacement follows list
    # order, so longer variants must be listed before the shorter ones they
    # contain.
    for canonical, variants in sim_dict.items():
        for variant in variants:
            joined = joined.replace(variant, canonical)

    # Collapse repeats on whole tokens. A substring replace of "key key" only
    # handles runs of a fixed length and can also fire inside longer words
    # (e.g. 'abuso usos'); comparing tokens handles runs of any length and
    # never touches neighbouring words.
    collapsed = []
    for token in joined.split():
        if token in sim_dict and collapsed and collapsed[-1] == token:
            continue
        collapsed.append(token)

    return collapsed

def replace_accents(w):
    """Return ``w`` lower-cased with á, é, í, ó, ú mapped to a, e, i, o, u.

    Every other character (including ñ and ü) is left unchanged.
    """
    plain_vowels = str.maketrans('áéíóú', 'aeiou')
    return w.lower().translate(plain_vowels)

def check_spelling(text):
    """Keep correctly spelled tokens and repair accent-only misspellings.

    A whitespace-separated token is kept when it is listed in ``green_flags``
    or accepted by the dictionary ``d``. Otherwise it is replaced by the
    dictionary's first suggestion, lower-cased, but only if that suggestion
    equals the token once case and acute accents are ignored (see
    ``replace_accents``). All remaining tokens are dropped.

    Parameters
    ----------
    text : str
        Text whose tokens are separated by whitespace.

    Returns
    -------
    str
        The surviving tokens joined by single spaces.
    """
    kept = []
    for token in text.split():
        if token in green_flags or d.check(token):
            kept.append(token)
            continue

        # Ask for suggestions once per unknown token and reuse the list,
        # instead of querying the dictionary again for every use.
        suggestions = d.suggest(token)
        if suggestions and replace_accents(token) == replace_accents(suggestions[0]):
            kept.append(suggestions[0].lower())

    return ' '.join(kept)
            
def remove_exceptions(text):
    """Return ``text`` with each entry of the module-level ``exceptions``
    list replaced by a single space."""
    stripped = text
    for fragment in exceptions:
        stripped = stripped.replace(fragment, ' ')
    return stripped

# Remove punctuation and stop words, fix spelling, and lemmatise.
def clean_text(text, verbose=True):
    """Turn raw sentences into a list of normalised content-word lemmas.

    Pipeline: join the sentences, delete the characters in ``punct_signs``,
    filter/repair tokens with ``check_spelling``, run the spaCy pipeline
    ``nlp`` on the lower-cased result, keep the lemmas of NOUN/ADJ/VERB tokens
    that are not in ``stop_words``, and finally unify variants with
    ``replace_similars``.

    Parameters
    ----------
    text : list of str or str
        A list of sentences, or the string form of such a list
        (``"['a', 'b']"``), which is split back into its items.
    verbose : bool, optional
        When true (the default) print the elapsed time of the call.

    Returns
    -------
    list of str
        The cleaned lemmas. Missing (``None``/NaN) or empty input yields an
        empty list, so the return type is always a list.

    Notes
    -----
    Sets ``nlp.max_length`` on the shared pipeline as a side effect.
    """
    # Missing values (None, or a float NaN, which is the only float that is
    # not equal to itself) have no length; treat them as empty input.
    if text is None or (isinstance(text, float) and text != text):
        return []

    if isinstance(text, str):
        # Undo the "['a', 'b']" string form of a list of sentences.
        text = text.strip("']['").split("', '")

    if len(text) == 0:
        return []

    tic = time.perf_counter()

    cleaned = ' '.join(text)
    cleaned = cleaned.translate(str.maketrans('', '', punct_signs))
    cleaned = check_spelling(cleaned)

    #cleaned = remove_exceptions (cleaned)
    # Make sure the pipeline's length limit is above this document's size.
    nlp.max_length = len(cleaned) + 100
    doc = nlp(cleaned.lower())

    # token.lemma_ is the lemma text (token.lemma is its integer id); skip
    # empty lemmas, stop words, and anything that is not a content word.
    lemmas = [
        token.lemma_
        for token in doc
        if token.lemma_
        and token.lemma_ not in stop_words
        and token.pos_ in ('NOUN', 'ADJ', 'VERB')
    ]

    lemmas = replace_similars(lemmas)

    if verbose:
        toc = time.perf_counter()
        print(f"Performedy in {toc - tic:0.4f} seconds")
    return lemmas

def get_list_nnaj_terms(text, n):
    """Return the n-grams of a token list as space-joined strings.

    Parameters
    ----------
    text : list of str or str
        Tokens, or the string form of a token list (``"['a', 'b']"``), which
        is split back into its items.
    n : int
        Number of consecutive tokens per n-gram (expected to be >= 1).

    Returns
    -------
    list of str
        Every window of ``n`` consecutive tokens, joined with a space, in
        order of appearance. Empty when the input is missing, empty, or has
        fewer than ``n`` tokens.
    """
    # Missing values (None, or a float NaN, which is the only float that is
    # not equal to itself) have no tokens.
    if text is None or (isinstance(text, float) and text != text):
        return []

    if isinstance(text, str):
        text = text.strip("']['").split("', '")

    # "[]" and "" both parse to [''], so drop empty tokens; otherwise an
    # empty document would produce a spurious '' term.
    tokens = [token for token in text if token]
    if not tokens:
        return []

    # Sliding window over the tokens (no padding): zipping the list against
    # itself shifted by 0..n-1 yields each run of n consecutive tokens.
    windows = zip(*(tokens[offset:] for offset in range(n)))
    return [' '.join(window) for window in windows]

In [5]:
# Quick check of the cleaning pipeline on three short sentences.
clean_text([
    'violencia sexual es uno de los crimenes de guerra',
    'drogas ilicitas y el mercado del dolar',
    'anterior al dia de los muertos',
])

Performedy in 0.1761 seconds


['violencia',
 'sexual',
 'crimen',
 'guerra',
 'droga',
 'ilícito',
 'mercado',
 'dólar',
 'anterior',
 'día',
 'muerto']

# 4. Clean the NNAJ text

In [6]:
# Clean the NNAJ text of the first row as a single-document trial run.
x = clean_text(df.loc[0, 'NNAJ'])

Performedy in 1.2769 seconds


In [17]:
# Source column with the raw NNAJ text of every document.
list1_ = df['NNAJ']
# Will receive one cleaned token list per document.
list2_ = list()

In [18]:
# Number of documents that the cleaning loop will process.
df.shape[0]

1753

In [19]:
# Clean every document. The list is rebuilt from scratch rather than appended
# to, so re-running this cell cannot leave duplicated entries behind (which
# would make the list longer than the DataFrame).
list2_ = [clean_text(sentences) for sentences in list1_]

Performedy in 1.2246 seconds
Performedy in 2.3825 seconds
Performedy in 1.2620 seconds
Performedy in 0.9154 seconds
Performedy in 12.6172 seconds
Performedy in 7.2407 seconds
Performedy in 1.7798 seconds
Performedy in 0.3567 seconds
Performedy in 5.1204 seconds
Performedy in 1.9409 seconds
Performedy in 0.0976 seconds
Performedy in 0.6883 seconds
Performedy in 0.0003 seconds
Performedy in 2.1252 seconds
Performedy in 0.0003 seconds
Performedy in 0.2761 seconds
Performedy in 0.2660 seconds
Performedy in 6.9550 seconds
Performedy in 0.7176 seconds
Performedy in 6.4837 seconds
Performedy in 0.7762 seconds
Performedy in 2.3261 seconds
Performedy in 1.3530 seconds
Performedy in 1.7468 seconds
Performedy in 0.2281 seconds
Performedy in 0.7699 seconds
Performedy in 2.0745 seconds
Performedy in 9.6844 seconds
Performedy in 0.8891 seconds
Performedy in 1.3752 seconds
Performedy in 2.2415 seconds
Performedy in 0.6845 seconds
Performedy in 5.4037 seconds
Performedy in 0.6654 seconds
Performedy in

Performedy in 0.9700 seconds
Performedy in 2.2198 seconds
Performedy in 0.1921 seconds
Performedy in 0.0001 seconds
Performedy in 0.0001 seconds
Performedy in 0.0001 seconds
Performedy in 2.5323 seconds
Performedy in 0.0002 seconds
Performedy in 0.0001 seconds
Performedy in 0.0001 seconds
Performedy in 2.2215 seconds
Performedy in 0.0001 seconds
Performedy in 1.3368 seconds
Performedy in 0.0003 seconds
Performedy in 0.0002 seconds
Performedy in 0.0002 seconds
Performedy in 3.7612 seconds
Performedy in 0.2463 seconds
Performedy in 2.3702 seconds
Performedy in 0.0004 seconds
Performedy in 3.1518 seconds
Performedy in 0.0002 seconds
Performedy in 0.0001 seconds
Performedy in 2.1549 seconds
Performedy in 0.0003 seconds
Performedy in 0.0002 seconds
Performedy in 0.0002 seconds
Performedy in 2.9180 seconds
Performedy in 0.0003 seconds
Performedy in 0.0001 seconds
Performedy in 0.9713 seconds
Performedy in 1.0761 seconds
Performedy in 0.0004 seconds
Performedy in 2.2555 seconds
Performedy in 

Performedy in 1.4928 seconds
Performedy in 0.0003 seconds
Performedy in 1.1188 seconds
Performedy in 1.6957 seconds
Performedy in 1.0363 seconds
Performedy in 0.3086 seconds
Performedy in 1.8754 seconds
Performedy in 0.0003 seconds
Performedy in 0.0003 seconds
Performedy in 0.0002 seconds
Performedy in 0.0002 seconds
Performedy in 0.3479 seconds
Performedy in 0.9952 seconds
Performedy in 7.3758 seconds
Performedy in 0.0004 seconds
Performedy in 0.0111 seconds
Performedy in 0.0002 seconds
Performedy in 1.1179 seconds
Performedy in 0.3062 seconds
Performedy in 0.0614 seconds
Performedy in 1.0471 seconds
Performedy in 0.0004 seconds
Performedy in 1.3626 seconds
Performedy in 0.3957 seconds
Performedy in 0.2359 seconds
Performedy in 0.7177 seconds
Performedy in 0.0003 seconds
Performedy in 2.6551 seconds
Performedy in 0.0004 seconds
Performedy in 0.0001 seconds
Performedy in 0.0002 seconds
Performedy in 2.5522 seconds
Performedy in 0.0004 seconds
Performedy in 0.1000 seconds
Performedy in 

Performedy in 3.3690 seconds
Performedy in 2.2422 seconds
Performedy in 0.0002 seconds
Performedy in 0.0283 seconds
Performedy in 2.7575 seconds
Performedy in 0.5101 seconds
Performedy in 0.0002 seconds
Performedy in 3.7105 seconds
Performedy in 17.0715 seconds
Performedy in 2.1579 seconds
Performedy in 10.7894 seconds
Performedy in 0.0003 seconds
Performedy in 0.2576 seconds
Performedy in 0.0003 seconds
Performedy in 6.4516 seconds
Performedy in 1.1715 seconds
Performedy in 0.1961 seconds
Performedy in 0.0002 seconds
Performedy in 1.1314 seconds
Performedy in 0.9230 seconds
Performedy in 4.8087 seconds
Performedy in 6.4216 seconds
Performedy in 6.5455 seconds
Performedy in 0.3632 seconds
Performedy in 2.0575 seconds
Performedy in 3.1462 seconds
Performedy in 0.1926 seconds
Performedy in 5.3632 seconds
Performedy in 0.5591 seconds
Performedy in 0.0002 seconds
Performedy in 9.6725 seconds
Performedy in 4.3761 seconds
Performedy in 4.6532 seconds
Performedy in 0.0003 seconds
Performedy i

Performedy in 0.8472 seconds
Performedy in 0.0002 seconds
Performedy in 0.0001 seconds
Performedy in 0.0001 seconds
Performedy in 0.0156 seconds
Performedy in 1.8329 seconds
Performedy in 0.0005 seconds
Performedy in 0.3054 seconds
Performedy in 0.0003 seconds
Performedy in 0.7317 seconds
Performedy in 0.0677 seconds
Performedy in 8.8727 seconds
Performedy in 1.9981 seconds
Performedy in 0.0882 seconds
Performedy in 0.0003 seconds
Performedy in 0.0002 seconds
Performedy in 0.0001 seconds
Performedy in 0.0001 seconds
Performedy in 0.0002 seconds
Performedy in 0.1199 seconds
Performedy in 0.0002 seconds
Performedy in 0.7168 seconds
Performedy in 0.0003 seconds
Performedy in 0.6953 seconds
Performedy in 0.0002 seconds
Performedy in 0.0002 seconds
Performedy in 0.1810 seconds
Performedy in 0.4248 seconds
Performedy in 0.2894 seconds
Performedy in 0.2777 seconds
Performedy in 0.5151 seconds
Performedy in 0.0003 seconds
Performedy in 0.0002 seconds
Performedy in 0.0002 seconds
Performedy in 

Performedy in 5.0021 seconds
Performedy in 11.8399 seconds
Performedy in 7.1293 seconds
Performedy in 3.9212 seconds
Performedy in 1.3189 seconds
Performedy in 2.1392 seconds
Performedy in 3.8612 seconds
Performedy in 7.5072 seconds
Performedy in 4.9976 seconds
Performedy in 7.1193 seconds
Performedy in 2.8342 seconds
Performedy in 1.8685 seconds
Performedy in 5.7812 seconds
Performedy in 0.9323 seconds
Performedy in 14.5010 seconds
Performedy in 2.7147 seconds
Performedy in 5.1540 seconds
Performedy in 1.3532 seconds
Performedy in 5.5182 seconds
Performedy in 11.2739 seconds
Performedy in 7.4472 seconds
Performedy in 5.3547 seconds
Performedy in 3.3096 seconds
Performedy in 3.6961 seconds
Performedy in 4.8207 seconds
Performedy in 4.1509 seconds
Performedy in 2.9182 seconds
Performedy in 3.2511 seconds
Performedy in 1.6117 seconds
Performedy in 1.5049 seconds
Performedy in 6.8174 seconds
Performedy in 1.2012 seconds
Performedy in 4.7693 seconds
Performedy in 6.1170 seconds
Performedy 

Performedy in 14.4030 seconds
Performedy in 4.8192 seconds
Performedy in 41.7206 seconds
Performedy in 0.1366 seconds
Performedy in 0.7065 seconds
Performedy in 0.9694 seconds
Performedy in 1.4009 seconds
Performedy in 2.0169 seconds
Performedy in 3.6457 seconds
Performedy in 9.2605 seconds
Performedy in 23.4276 seconds
Performedy in 14.0688 seconds


In [24]:
# Attach the cleaned token lists as a new column; list2_ must hold exactly one
# entry per row of df.
df['NNAJ_clean_text'] = list2_

In [26]:
# Persist the table with the cleaned text so the slow cleaning step does not
# have to be repeated.
# NOTE(review): this path has no ".csv" extension, unlike the later export to
# "data/alertas_NNAJ_keywords.csv". The reload at the start of the next
# section reads this exact extension-less path, so rename both together.
df.to_csv("data/alertas_NNAJ_keywords", sep="|", index=False)

# 5. Extract N-grams

In [7]:
# Reload the table saved at the end of the cleaning section. Token lists come
# back from CSV as text such as "['a', 'b']", which get_list_nnaj_terms splits
# back into items.
# NOTE(review): the recorded output already lists the NNAJ_keywords* columns,
# which the visible cells only create in the next cell -- the file on disk
# presumably came from an earlier run; confirm with a fresh kernel.
df = pd.read_csv("data/alertas_NNAJ_keywords", sep="|")
df.columns

Index(['Filename', 'Text', 'Subtype', 'Type', 'Year', 'Path', 'Departamento',
       'NNAJ', 'NNAJ_clean_text', 'NNAJ_keywords', 'NNAJ_keywords2',
       'NNAJ_keywords3'],
      dtype='object')

In [8]:
# Derive unigram, bigram and trigram columns from the cleaned text, then save
# the enriched table.
ngram_columns = {'NNAJ_keywords': 1, 'NNAJ_keywords2': 2, 'NNAJ_keywords3': 3}
for ngram_col, ngram_size in ngram_columns.items():
    df[ngram_col] = df['NNAJ_clean_text'].apply(get_list_nnaj_terms, n=ngram_size)

df.to_csv("data/alertas_NNAJ_keywords.csv", sep="|", index=False)

In [9]:
# Preview the unigram list of the first document.
df['NNAJ_keywords'].head(1)

0    [anterior, sistema, alerta, temprano, advertir, inminencia, violación, masivo, derechos, humanos, infracción, amenaza, asesinato, selectivo, desplazamiento, individual, reclutamiento, forzado, uso, ilícito, nnaj, desplazamiento, masivo, restricción, movilidad, desaparición, forzado, confinamiento, combat, interposición, población, civil, ataque, indiscriminado, recomendación, secretaria, técnico, inicié, seguimiento, impacto, medida, adoptado, continuidad, riesgo, coordinación, gobernación, nariño, alcaldía, municipal, policarpa, cumbitara, magüí, payán, roberto, payán, convocar, instancia, territorial, instancia, territorial, propósito, tener, cuenta, enfoque, territorial, diferencial, étnico, género, establecido, decreto, 2124, fin, promover, adoptar, medida, efectivo, protección, nuevo, riesgo, amenazar, población, civil, favor, líder, especial, protección, constitucional, nnaj, población, situación, desplazamiento, forzado, grupo, poblacional, riesgo, comisión, prevención, rec

In [10]:
# Preview the bigram list of the first document.
df['NNAJ_keywords2'].head(1)

0    [anterior sistema, sistema alerta, alerta temprano, temprano advertir, advertir inminencia, inminencia violación, violación masivo, masivo derechos, derechos humanos, humanos infracción, infracción amenaza, amenaza asesinato, asesinato selectivo, selectivo desplazamiento, desplazamiento individual, individual reclutamiento, reclutamiento forzado, forzado uso, uso ilícito, ilícito nnaj, nnaj desplazamiento, desplazamiento masivo, masivo restricción, restricción movilidad, movilidad desaparición, desaparición forzado, forzado confinamiento, confinamiento combat, combat interposición, interposición población, población civil, civil ataque, ataque indiscriminado, indiscriminado recomendación, recomendación secretaria, secretaria técnico, técnico inicié, inicié seguimiento, seguimiento impacto, impacto medida, medida adoptado, adoptado continuidad, continuidad riesgo, riesgo coordinación, coordinación gobernación, gobernación nariño, nariño alcaldía, alcaldía municipal, municipal polic

# 6. Exporting to HTML

In [11]:
# Render this notebook to HTML with nbconvert (shell command run from the notebook).
!jupyter nbconvert --to html 8_NNAJ_NGrams.ipynb

[NbConvertApp] Converting notebook 8_NNAJ_NGrams.ipynb to html
[NbConvertApp] Writing 667703 bytes to 8_NNAJ_NGrams.html
