# 1. Install, Imports, Settings

In [25]:
import ast
import math
import re
import string
from collections import Counter

import numpy as np
import pandas as pd
import seaborn as sns
import spacy
from matplotlib import pyplot as plt
from spacy.lang.es.examples import sentences

#!python -m spacy download es_core_news_md
nlp = spacy.load("es_core_news_md")

In [26]:
plt.rcParams["figure.figsize"] = [10, 6]
%config InlineBackend.figure_format = 'retina'

punct = string.punctuation +'”“'

# 2. Loading data

In [None]:
# Load the pipe-separated alerts table and print basic coverage counts.
df = pd.read_csv("data/alertas_NNAJ_keywords_qwe.csv", sep="|")

# Row masks, computed once and reused by the summary lines below.
has_text = df['Text'].notnull()
mentions_nnaj = df['NNAJ'] != "[]"

print("Total Number of documents:", len(df))
print("Number of documents with no accesible text (pass protected):", len(df[~has_text]))
print("Number of documents with accesible text:", len(df[has_text]))
print("Number of documents mentioning NNAJ:", len(df[mentions_nnaj]))

In [None]:
# Preview the first rows without the NNAJ_clean_text / NNAJ_keywords* columns.
hidden_columns = ['NNAJ_clean_text', 'NNAJ_keywords', 'NNAJ_keywords2', 'NNAJ_keywords3']
df.drop(columns=hidden_columns).head()

# 3. Basic Text Cleaning 

In [None]:
# Show the "Text" value of the row labelled 25.
df.loc[25, "Text"]

# 4. Extracting Actores del Conflicto

In [None]:
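# Sources:
# - Cartels (Colombia section): https://es.wikipedia.org/wiki/C%C3%A1rtel_(organizaci%C3%B3n_il%C3%ADcita)#Colombia
# - Emerging bands and groups in Colombia: https://es.wikipedia.org/wiki/Bandas_y_grupos_emergentes_en_Colombia
# - Departments and municipalities of Colombia: https://www.datos.gov.co/Mapas-Nacionales/Departamentos-y-municipios-de-Colombia/xdk5-pm3f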

In [None]:
# Armed-group lookup: canonical group name -> list of acronyms / alternative
# spellings. Insertion order is kept as-is because clean_actors resolves an
# alias shared by two groups ("AGC") to whichever group is listed first.
grupos = {
    "Grupo Armado Organizado": ["GAO"],
    "Grupo Armado Ilegal": ["GAI"],
    "Grupo Armado Organizado Residual": [
        "GAOR",
        "GAO-R",
        "grupo armado organizado - residual",
    ],
    "Ejército de Liberación Nacional": ["ELN"],
    "Autodefensas Gaitanistas de Colombia": ["AGC"],
    "Fuerzas Armadas Revolucionarias": ["FARC"],
    "Grupo delincuencial organizado": ["GDO"],
    "Ejército del Pueblo": ["EP"],
    "Jalisco Nueva Generación": [],
    "Bandas Criminales": ["BACRIM", "Bandas Emergentes"],
    "Autodefensas Unidas de Colombia": ["AUC"],
    "Grupos posdesmovilización": [],
    "Ejército de Liberación Popular": ["EPL"],
    "Clan del Golfo": ["AGC"],
}

# Terms that refer to children, adolescents and youth (NNAJ / NNA).
children = ["NNAJ", "niños", "niñas", "adolescentes", "jóvenes", "NNA"]

# Names of cartels, criminal bands and related structures. extract_actors looks
# each name up as a case/accent-insensitive substring and emits one hit per
# occurrence, so every name must appear here exactly once: the second
# "Oficina de Envigado" entry was removed because it doubled that group's count.
# NOTE(review): "Los Pachencha" looks like a misspelling of the name used in
# "GDO Los Pachenca" — confirm the intended spelling.
# NOTE(review): "Norte del Valle" is a substring of "Cartel del Norte del
# Valle", so a mention of the cartel is reported under both names.
narco_grupos = [
    "Cartel del Amazonas",
    "Cartel de Los Nevados",
    "Cartel de Bogotá",
    "Cartel de Buga",
    "Cartel de Medellín",
    "Oficina de Envigado",
    "Cartel de Cali",
    "Cartel del Caquetá",
    "Cartel del Norte del Valle",
    "Los Rastrojos",
    "Cartel de la Costa Atlántica",
    "Cartel de La Guajira",
    "Cartel del Cauca",
    "Los Caparrapos",
    "Los Paisas",
    "Águilas Negras",
    "Los Puntilleros",
    "Ejército Revolucionario Popular Antisubversivo de Colombia",
    "Los Pelusos",
    "GDO Los Pachenca",
    "Clan Gnecco Cerchar",
    "Clan Bustamante",
    "Activo Primer Frente",
    "Segunda Marquetalia",
    "Los Zetas",
    "Cartel de Sinaloa",
    "La Terraza",
    "Robledo",
    "Pachelly",
    "La Sierra",
    "Caicedo",
    "Los Triana",
    "Clan del Oriente",
    "La Unión",
    "Clan del Norte",
    "Los Costeños",
    "Los Pachencha",
    "La Cordillera",
    "La Gran Alianza",
    "Banda Local",
    "Norte del Valle",
    "Los Chuckys",
    "Los Chacales",
    "La Constru",
    "Los Caqueteños",
]

# Persons of interest: full name -> alias ("" when no alias is recorded).
# NOTE(review): a few aliases are padded with spaces (e.g. " Gabino ") —
# presumably so a substring search only hits whole words; confirm before
# stripping them.
POI = {
    "Evaristo Porras":                "Papa Doc",
    "Luis Agustín Caicedo Velandia":  "Don Lucho",
    "Víctor Manuel Mejía Múnera":     "Pablo Arauca",
    "Miguel Ángel Mejía Múnera":      "El Mellizo",
    "Ramón Quintero":                 "",
    "Pablo Escobar":                  "El patrón",
    "Gonzalo Rodríguez Gacha":        "",
    "Diego Fernando Murillo":         "",
    "Gilberto Rodríguez Orejuela":    "El Ajedrecista",
    "Miguel Rodríguez Orejuela":      "",
    "Leonidas Vargas":                "El Rey de Caquetá",
    "Luis Enrique Calle Serna":       "",
    "Wilber Varela":                  "",
    "Alberto Orlandez Gamboa":        "",
    "Hermágoras González":            "",
    "Nicolás Rodríguez Bautista":     " Gabino ",
    "Iván Márquez":                   "",
    "Gentil Duarte":                  "",
    "Jorge Gnecco Cerchar":           "",
    "Jesús María Aguirre":            "Chucho Mercancía",
    "Megateo":                        "",
    "Vicente Castaño":                "",
    "Emiliano Alcides Osorio":        " Caín ",
    "Edgar Bustamante":               "El Yuca",
    "Manuel Marulanda Vélez":         " Tirofijo ",
    "Jorge Briceño Suarez Mono":      " Jojoy ",
    "Dairo Antonio Úsuga David":      "",
    "Salvatore Mancuso":              "",
    "Rodrigo Tovar":                  "",
    "Carlos Castaño Gil":             "",
    "Justo Pastor Perafán":           "Don Pepe",
}

# Departments-and-municipalities reference table; show its first five rows.
loc = pd.read_csv("resources/Departamentos_y_municipios_de_Colombia.csv")
loc.head(5)

In [None]:
# Keep only the documents whose NNAJ column is not the empty-list string "[]".
# .copy() makes the result an independent frame, so the column assignments in
# the following cells do not raise SettingWithCopyWarning.
# NOTE(review): rows where NNAJ is missing (NaN) also pass this filter —
# confirm that is intended.
df = df[df['NNAJ'] != "[]"].copy()

# 4.2. Extract Actors

In [None]:
i = 0


def replace_accents(text):
    """Return ``text`` with lower-case acute-accented vowels made plain.

    Only ``á é í ó ú`` are mapped (to ``a e i o u``). Upper-case accented
    letters and other characters such as ``ñ`` or ``ü`` are left untouched,
    so callers lower-case the text first.
    """
    return text.translate(str.maketrans('áéíóú', 'aeiou'))

def extract_actors(text):
    """Return the conflict actors mentioned in ``text``, one item per mention.

    Three sources are combined, in this order:

    1. ``grupos``: every occurrence of a full group name (substring match)
       and of each of its aliases/acronyms (whole-token match).
    2. ``narco_grupos``: every occurrence of each name (substring match).
    3. spaCy entities labelled ``ORG`` whose text was not already produced by
       the two dictionary passes.

    Dictionary matching ignores case and the accents handled by
    ``replace_accents``.

    Parameters
    ----------
    text : str
        Text to scan. Any non-string value (e.g. NaN for a missing cell)
        yields an empty list.

    Returns
    -------
    list of str
        Group names / aliases as spelled in the dictionaries, followed by the
        raw text of the remaining ORG entities.
    """
    if not isinstance(text, str):
        return []

    entities = []
    temp_text = replace_accents(text.lower())

    # grupos: full names first, then their aliases.
    seen_aliases = set()
    for key, value in grupos.items():
        entities += [key] * temp_text.count(replace_accents(key.lower()))
        for v in value:
            temp_value = replace_accents(v.lower())
            # Skip blank aliases, and aliases already handled for another
            # group ("AGC" is listed under two keys), so each mention is
            # counted once.
            if not temp_value or temp_value in seen_aliases:
                continue
            seen_aliases.add(temp_value)
            # Acronyms are short ("EP", "GAO"), so they must match as whole
            # tokens; a plain substring test would fire inside ordinary words.
            # "-" is not a word character, so "GAO-R" still also counts as a
            # "GAO" mention.
            pattern = r"(?<!\w)" + re.escape(temp_value) + r"(?!\w)"
            entities += [v] * len(re.findall(pattern, temp_text))

    # narco_grupos
    for value in narco_grupos:
        entities += [value] * temp_text.count(replace_accents(value.lower()))

    # Organisations found by spaCy, skipping those the dictionaries already
    # reported (compared on the entity text, not on its label).
    already_found = set(entities)
    for ent in nlp(text).ents:
        if ent.label_ == "ORG" and ent.text not in already_found:
            entities.append(ent.text)
    return entities

def merge_text(text):
    """Join a list of text fragments into one space-separated string.

    Parameters
    ----------
    text : str, list, tuple, None or float
        Either a sequence of fragments or its string representation as
        written to CSV (e.g. ``"['a', 'b']"``). Missing values (``None`` /
        NaN) are accepted.

    Returns
    -------
    str
        The fragments joined by single spaces; ``""`` for a missing value.
    """
    # Missing cell: nothing to merge (the join below would raise TypeError).
    if text is None or (isinstance(text, float) and text != text):
        return ""

    if isinstance(text, str):
        # A stringified list is parsed as a Python literal, which copes with
        # elements that repr() wrote using double quotes or escapes.
        try:
            parsed = ast.literal_eval(text)
        except (ValueError, SyntaxError):
            parsed = None
        if isinstance(parsed, (list, tuple)):
            text = parsed
        else:
            # Not a list literal: fall back to the plain strip/split parsing.
            text = text.strip("']['").split("', '")

    return ' '.join(str(part) for part in text)

In [None]:
# Collapse each NNAJ value into a single string with merge_text.
df['NNAJ_Text'] = df['NNAJ'].map(merge_text)

In [None]:
# Run the actor extraction on every merged NNAJ text (one list per row).
df["Actors_NNAJ"] = df["NNAJ_Text"].map(extract_actors)

In [None]:
# List the columns currently present in df.
df.columns

In [None]:
# Narrow df to the metadata columns plus the extracted actors and merged text.
output_columns = [
    'Filename',
    'Subtype',
    'Type',
    'Year',
    'Path',
    'Departamento',
    'Actors_NNAJ',
    'NNAJ_Text',
]
df = df[output_columns]

In [None]:
# Write the extraction result (before clean_actors is applied) as a
# pipe-separated CSV without the index column.
df.to_csv(
    "data/alertas_actors_NNAJ.csv",
    sep="|",
    index=False,
)

In [None]:
# Show the first two rows of df.
df.head(2)

In [None]:
def clean_actors(entities):
    """Strip punctuation from actor names and expand known aliases.

    Every entity has the characters in ``punct`` removed. An entity that then
    equals one of the aliases in ``grupos`` (ignoring case, accents and
    punctuation) is replaced by that group's canonical name.

    Parameters
    ----------
    entities : list of str
        Actor mentions, e.g. the output of ``extract_actors``.

    Returns
    -------
    list of str
        Same length and order as ``entities``; ``[]`` for empty input.
    """
    if len(entities) == 0:
        return []

    strip_punct = str.maketrans('', '', punct)

    def _normalise(name):
        # Same normal form for entities and aliases, so aliases that contain
        # punctuation ("GAO-R") can still match a punctuation-stripped entity.
        return replace_accents(name.translate(strip_punct).lower())

    # alias -> canonical group name, built once. setdefault keeps the first
    # group listed when two groups share an alias ("AGC").
    canonical = {}
    for key, value in grupos.items():
        for v in value:
            alias = _normalise(v)
            if alias:
                canonical.setdefault(alias, key)

    cleaned = [e.translate(strip_punct) for e in entities]
    return [canonical.get(_normalise(e), e) for e in cleaned]

In [None]:
# Normalise every extracted actor list with clean_actors.
df["Actors_NNAJ"] = df["Actors_NNAJ"].map(clean_actors)

In [None]:
# Show the first row of df after cleaning the actor lists.
df.head(1)

# 5. Saving text in csv

In [None]:
# Write the final table as a pipe-separated CSV without the index column.
# This is the same path as the earlier export, so that file is overwritten.
df.to_csv(
    "data/alertas_actors_NNAJ.csv",
    sep="|",
    index=False,
)

# 6. Export HTML

In [None]:
# Convert 13_Actores_del_Conflicto.ipynb to HTML with nbconvert.
!jupyter nbconvert --to html 13_Actores_del_Conflicto.ipynb