# 1. Installs, Imports and Settings

In [1]:
#!pip install spacy==3.1.1 
#restart runtime after this
#!python -m spacy download es_core_news_lg

import spacy
nlp = spacy.load("es_core_news_lg")

import csv
import pandas as pd
pd.options.display.max_rows
pd.set_option('display.max_colwidth', None)

#  2. Reading files 

In [2]:
df = pd.read_csv('data/alertas.csv', sep="|")
df = df[df["Text"].notnull()]
print("Size of Data:", len(df))
print()
print("Columns : ", list(df.columns))

Size of Data: 666

Columns :  ['Filename', 'Text', 'Type', 'Year']


In [None]:
' '.join(df["Text"]).split()

In [3]:
len(' '.join(df["Text"]).split())

4602954

In [4]:
import re
exceptions = [
    "SAT \n \nSede Central Calle 55 Nº 10 32 Of 115 \nTels 3147300 Ext 2437 Telefax 6915300 \nEmail", 
    "DE LA", 
    "NI OTRO", 
    "Tels 3147300 Ext 2437 Telefax 6915300", 
    "Ext 2437 Telefax 6915300", 
    "Tels 3147300 Ext 2437 Telefax 6915300", 
    "Sede Central Calle 55 No", 
    "Tels 3147300 Ext 2437 Telefax 6915300", 
    "SAT Sede Central Calle 55 Nº 10 32 Of 115 Email", 
    "SAT Sede Central Calle 55 Nº 1032 Of 115 Email", 
    "SA Email",
    "SAT INFORME DE RIESGO No",
    "DEL CONFLICTO ARMADO Sistema",
    "LA VIDA LA LIBERTAD Y",
    "DELEGADA PARA LA EVALUACIÓN DEL RIESGO POBLACIÓN CIVIL",
    "SAT Sede Central Calle 55 Nº 1032 Of 115 Tels 3147300 Ext 2437 Telefax 6915300 Email", 
    "SAT Sede Central Calle 55 Nº 10 32 Of 115 Tels 3147300 Ext 2437 Telefax 6915300 Email", 
    "INFORME DE RIESGO No", 
    "LA INTEGRIDAD FISICA DE LA POBLACIÓN", 
    "Tels 3147300 Ext 2437 Telefax", 
    "DELEGADA PARA LA EVALUACIÓN", 
    "LA POBLACIÓN CIVIL", 
    "Tels 3147300 Ext 2437 Telefax 6915300 Correo Electrónico", 
    "VILLEGAS Defensor Delegado",
    "Tels 3147300 Ext 2452 Telefax",
    "2437 Telefax",
    "CHACÓN Defensor Delegado", 
    "Tels 3147300 ext 24372464 Fax ext 2452 Bogotá",
    "Tels 3147300 Ext 2437 Telefax ext", 
    "Tels 3147300 Ext 2452 Telefax", 
    "Telefax Ext 2452 Correo Electrónico", 
    "2437 Telefax", 
    "Tels 3147300 ext 24372464 Fax ext 2452 Bogotá", 
    "2464 Fax ext 2452 Bogotá",
    "X", 
    "Inminencia alta Urgente Grado 2 Urgencia intermedia Grado 3 No", 
    "SAT Sede Central Calle 55 Nº 1032 Bloque C Tercer piso Tels 3147300 Ext 2437 Telefax", 
    "O SITUACIÓN CRÓNICA",
    "AUTORIDADES VINCULADAS AL", 
    
    
]
def cleantext1(text):
    text = text.lower()
    text = text.replace("\n", " ")
    for s in exceptions:
        text = text.replace(s.lower(), " ")
    text = re.sub(' +', ' ', text)
    return text    

df["Text"] = df["Text"].apply(cleantext1)

# 3. Functions

In [5]:
def acrfull(x):
    return ': '.join([x, spacy.explain(x)])

def extract_named_entities(text):    
    nlp.max_length = len(text) + 100
    doc = nlp(text)

    dt = pd.DataFrame(columns=['entity', 'label'])
    i = 0
    for ent in doc.ents:
        if ent.text not in exceptions:
            dt.loc[i] = [ent.text, ent.label_] 
            i = i +1
        
    dt = dt.reset_index()
    dt = dt.groupby(['entity', 'label']).agg({'index' : 'count'}).sort_values('index', ascending=False).reset_index()
    dt.columns = ['Entity', 'Type', '#Ocurrences']
    dt = dt.set_index('Entity')
    dt['Type'] = dt['Type'].apply(acrfull)
    
    dt = dt.sort_values('#Ocurrences', ascending=False) #sort before deleting duplicates
    dt = dt[~dt.index.duplicated(keep='first')]                 
    
    #recognizing handlers
    dt.loc[dt.index.str.startswith('@') , 'Type'] = 'Social Media Handler'
    
    return dt.sort_values('#Ocurrences', ascending=False)

# 4. Top 20 named entities by Year - Advertencias

In [7]:
years = list(df[df["Type"] == "advertencia"]['Year'].unique())
years = sorted(years)
years

[2002,
 2003,
 2004,
 2005,
 2006,
 2007,
 2008,
 2009,
 2010,
 2011,
 2012,
 2013,
 2018,
 2019,
 2020,
 2021,
 2022]

In [8]:
for y in years:
    dfg = df[(df["Type"] == "advertencia") & (df["Year"] == y)]
    print(y,"--", len(dfg), "documents")
    dt = extract_named_entities(' '.join(dfg['Text'])).head(20)
    display(dt)

2002 -- 27 documents


Unnamed: 0_level_0,Type,#Ocurrences
Entity,Unnamed: 1_level_1,Unnamed: 2_level_1
sat,"ORG: Companies, agencies, institutions, etc.",82
eln,"MISC: Miscellaneous entities, e.g. events, nationalities, products or works of art",55
farc,"ORG: Companies, agencies, institutions, etc.",54
medellín,"LOC: Non-GPE locations, mountain ranges, bodies of water",22
farc eln auc,"MISC: Miscellaneous entities, e.g. events, nationalities, products or works of art",19
antioquia,"LOC: Non-GPE locations, mountain ranges, bodies of water",15
magdalena,"LOC: Non-GPE locations, mountain ranges, bodies of water",12
farc eln,"MISC: Miscellaneous entities, e.g. events, nationalities, products or works of art",8
valledupar,"LOC: Non-GPE locations, mountain ranges, bodies of water",7
sierra nevada de santa marta,"LOC: Non-GPE locations, mountain ranges, bodies of water",7


2003 -- 58 documents


Unnamed: 0_level_0,Type,#Ocurrences
Entity,Unnamed: 1_level_1,Unnamed: 2_level_1
farc,"ORG: Companies, agencies, institutions, etc.",169
sat,"ORG: Companies, agencies, institutions, etc.",120
eln,"MISC: Miscellaneous entities, e.g. events, nationalities, products or works of art",89
dc colombia,"LOC: Non-GPE locations, mountain ranges, bodies of water",51
farc eln auc,"MISC: Miscellaneous entities, e.g. events, nationalities, products or works of art",39
bogotá dc colombia,"LOC: Non-GPE locations, mountain ranges, bodies of water",26
antioquia,"LOC: Non-GPE locations, mountain ranges, bodies of water",23
san miguel,"LOC: Non-GPE locations, mountain ranges, bodies of water",22
bajo putumayo,"LOC: Non-GPE locations, mountain ranges, bodies of water",19
córdoba,"LOC: Non-GPE locations, mountain ranges, bodies of water",16


2004 -- 88 documents


Unnamed: 0_level_0,Type,#Ocurrences
Entity,Unnamed: 1_level_1,Unnamed: 2_level_1
farc,"ORG: Companies, agencies, institutions, etc.",472
dc colombia,"LOC: Non-GPE locations, mountain ranges, bodies of water",158
eln,"MISC: Miscellaneous entities, e.g. events, nationalities, products or works of art",139
bogotá dc colombia,"LOC: Non-GPE locations, mountain ranges, bodies of water",134
arauca,"LOC: Non-GPE locations, mountain ranges, bodies of water",113
ciat,"LOC: Non-GPE locations, mountain ranges, bodies of water",66
oea,"ORG: Companies, agencies, institutions, etc.",65
santander,"LOC: Non-GPE locations, mountain ranges, bodies of water",58
 a no ser,"MISC: Miscellaneous entities, e.g. events, nationalities, products or works of art",45
s farc,"ORG: Companies, agencies, institutions, etc.",39


2005 -- 65 documents


Unnamed: 0_level_0,Type,#Ocurrences
Entity,Unnamed: 1_level_1,Unnamed: 2_level_1
farc,"ORG: Companies, agencies, institutions, etc.",354
eln,"MISC: Miscellaneous entities, e.g. events, nationalities, products or works of art",117
bogotá dc colombia,"LOC: Non-GPE locations, mountain ranges, bodies of water",113
dc colombia,"LOC: Non-GPE locations, mountain ranges, bodies of water",97
sat,"ORG: Companies, agencies, institutions, etc.",79
arauca,"LOC: Non-GPE locations, mountain ranges, bodies of water",73
magdalena,"LOC: Non-GPE locations, mountain ranges, bodies of water",51
ciat,"LOC: Non-GPE locations, mountain ranges, bodies of water",49
argelia,"LOC: Non-GPE locations, mountain ranges, bodies of water",41
dario mejia villegas,PER: Named person or family.,41


2006 -- 52 documents


Unnamed: 0_level_0,Type,#Ocurrences
Entity,Unnamed: 1_level_1,Unnamed: 2_level_1
farc,"ORG: Companies, agencies, institutions, etc.",291
bogotá dc colombia,"LOC: Non-GPE locations, mountain ranges, bodies of water",177
eln,"MISC: Miscellaneous entities, e.g. events, nationalities, products or works of art",118
dc colombia,"LOC: Non-GPE locations, mountain ranges, bodies of water",99
antioquia,"LOC: Non-GPE locations, mountain ranges, bodies of water",64
santander,"LOC: Non-GPE locations, mountain ranges, bodies of water",54
arauca,"LOC: Non-GPE locations, mountain ranges, bodies of water",46
valle del cauca,"LOC: Non-GPE locations, mountain ranges, bodies of water",45
ciat,"LOC: Non-GPE locations, mountain ranges, bodies of water",43
magdalena,"LOC: Non-GPE locations, mountain ranges, bodies of water",39


2007 -- 36 documents


Unnamed: 0_level_0,Type,#Ocurrences
Entity,Unnamed: 1_level_1,Unnamed: 2_level_1
farc,"ORG: Companies, agencies, institutions, etc.",225
bogotá dc colombia,"LOC: Non-GPE locations, mountain ranges, bodies of water",180
dc colombia,"LOC: Non-GPE locations, mountain ranges, bodies of water",178
eln,"MISC: Miscellaneous entities, e.g. events, nationalities, products or works of art",111
santander,"LOC: Non-GPE locations, mountain ranges, bodies of water",81
tels 3147300,"MISC: Miscellaneous entities, e.g. events, nationalities, products or works of art",65
colombia,"LOC: Non-GPE locations, mountain ranges, bodies of water",48
jorge enrique calero,PER: Named person or family.,43
,PER: Named person or family.,39
bucaramanga,"LOC: Non-GPE locations, mountain ranges, bodies of water",39


2008 -- 31 documents


Unnamed: 0_level_0,Type,#Ocurrences
Entity,Unnamed: 1_level_1,Unnamed: 2_level_1
dc colombia,"LOC: Non-GPE locations, mountain ranges, bodies of water",280
farc,"ORG: Companies, agencies, institutions, etc.",193
eln,"MISC: Miscellaneous entities, e.g. events, nationalities, products or works of art",85
arauca,"LOC: Non-GPE locations, mountain ranges, bodies of water",68
tels 3147300,"MISC: Miscellaneous entities, e.g. events, nationalities, products or works of art",48
bogotá dc colombia,"LOC: Non-GPE locations, mountain ranges, bodies of water",47
santander,"LOC: Non-GPE locations, mountain ranges, bodies of water",43
san juan,"LOC: Non-GPE locations, mountain ranges, bodies of water",39
colombia,"LOC: Non-GPE locations, mountain ranges, bodies of water",38
barranquilla,"LOC: Non-GPE locations, mountain ranges, bodies of water",37


2009 -- 30 documents


Unnamed: 0_level_0,Type,#Ocurrences
Entity,Unnamed: 1_level_1,Unnamed: 2_level_1
bogotá dc colombia,"LOC: Non-GPE locations, mountain ranges, bodies of water",242
farc,"ORG: Companies, agencies, institutions, etc.",118
colombia,"LOC: Non-GPE locations, mountain ranges, bodies of water",114
antioquia,"LOC: Non-GPE locations, mountain ranges, bodies of water",96
eln,"MISC: Miscellaneous entities, e.g. events, nationalities, products or works of art",91
dc colombia,"LOC: Non-GPE locations, mountain ranges, bodies of water",88
córdoba,"LOC: Non-GPE locations, mountain ranges, bodies of water",66
valledupar,"LOC: Non-GPE locations, mountain ranges, bodies of water",60
bajo cauca,"LOC: Non-GPE locations, mountain ranges, bodies of water",50
arauca,"LOC: Non-GPE locations, mountain ranges, bodies of water",48


2010 -- 18 documents


Unnamed: 0_level_0,Type,#Ocurrences
Entity,Unnamed: 1_level_1,Unnamed: 2_level_1
dc colombia,"LOC: Non-GPE locations, mountain ranges, bodies of water",164
medellín,"LOC: Non-GPE locations, mountain ranges, bodies of water",151
bogotá dc colombia,"LOC: Non-GPE locations, mountain ranges, bodies of water",126
farc,"ORG: Companies, agencies, institutions, etc.",125
san juan,"LOC: Non-GPE locations, mountain ranges, bodies of water",51
villavicencio,"LOC: Non-GPE locations, mountain ranges, bodies of water",44
uribia,"LOC: Non-GPE locations, mountain ranges, bodies of water",41
cauca,"LOC: Non-GPE locations, mountain ranges, bodies of water",34
tels 3147300,"MISC: Miscellaneous entities, e.g. events, nationalities, products or works of art",33
pablo,PER: Named person or family.,32


2011 -- 17 documents


Unnamed: 0_level_0,Type,#Ocurrences
Entity,Unnamed: 1_level_1,Unnamed: 2_level_1
dc colombia,"LOC: Non-GPE locations, mountain ranges, bodies of water",199
farc,"ORG: Companies, agencies, institutions, etc.",128
eln,"MISC: Miscellaneous entities, e.g. events, nationalities, products or works of art",78
bogotá dc colombia,"LOC: Non-GPE locations, mountain ranges, bodies of water",45
acandí,"LOC: Non-GPE locations, mountain ranges, bodies of water",45
unguía,"LOC: Non-GPE locations, mountain ranges, bodies of water",42
cravo norte,"LOC: Non-GPE locations, mountain ranges, bodies of water",39
tiquisio,"LOC: Non-GPE locations, mountain ranges, bodies of water",30
cundinamarca,"LOC: Non-GPE locations, mountain ranges, bodies of water",30
algeciras,"LOC: Non-GPE locations, mountain ranges, bodies of water",30


2012 -- 27 documents


Unnamed: 0_level_0,Type,#Ocurrences
Entity,Unnamed: 1_level_1,Unnamed: 2_level_1
farc,"ORG: Companies, agencies, institutions, etc.",270
dc colombia,"LOC: Non-GPE locations, mountain ranges, bodies of water",266
eln,"MISC: Miscellaneous entities, e.g. events, nationalities, products or works of art",106
bogotá dc colombia,"LOC: Non-GPE locations, mountain ranges, bodies of water",101
florida,"LOC: Non-GPE locations, mountain ranges, bodies of water",79
san martín,"LOC: Non-GPE locations, mountain ranges, bodies of water",61
antioquia,"LOC: Non-GPE locations, mountain ranges, bodies of water",61
granada,"LOC: Non-GPE locations, mountain ranges, bodies of water",60
valle del cauca,"LOC: Non-GPE locations, mountain ranges, bodies of water",58
segovia,"LOC: Non-GPE locations, mountain ranges, bodies of water",52


2013 -- 5 documents


Unnamed: 0_level_0,Type,#Ocurrences
Entity,Unnamed: 1_level_1,Unnamed: 2_level_1
sincelejo,"LOC: Non-GPE locations, mountain ranges, bodies of water",31
san juan,"LOC: Non-GPE locations, mountain ranges, bodies of water",23
bogotá,"LOC: Non-GPE locations, mountain ranges, bodies of water",13
río san juan,"LOC: Non-GPE locations, mountain ranges, bodies of water",12
risaralda,"LOC: Non-GPE locations, mountain ranges, bodies of water",9
santa cecilia,"LOC: Non-GPE locations, mountain ranges, bodies of water",9
farc,"ORG: Companies, agencies, institutions, etc.",8
eln,"MISC: Miscellaneous entities, e.g. events, nationalities, products or works of art",7
sat,"ORG: Companies, agencies, institutions, etc.",6
aurelio rodríguez s farc,PER: Named person or family.,5


2018 -- 59 documents


Unnamed: 0_level_0,Type,#Ocurrences
Entity,Unnamed: 1_level_1,Unnamed: 2_level_1
eln,"MISC: Miscellaneous entities, e.g. events, nationalities, products or works of art",387
bogotá,"LOC: Non-GPE locations, mountain ranges, bodies of water",291
ciprat,PER: Named person or family.,215
colombia,"LOC: Non-GPE locations, mountain ranges, bodies of water",195
sat,"ORG: Companies, agencies, institutions, etc.",180
farc,"ORG: Companies, agencies, institutions, etc.",150
farc ep,"ORG: Companies, agencies, institutions, etc.",109
antioquia,"LOC: Non-GPE locations, mountain ranges, bodies of water",102
arauca,"LOC: Non-GPE locations, mountain ranges, bodies of water",91
córdoba,"LOC: Non-GPE locations, mountain ranges, bodies of water",77


2019 -- 51 documents


Unnamed: 0_level_0,Type,#Ocurrences
Entity,Unnamed: 1_level_1,Unnamed: 2_level_1
eln,"MISC: Miscellaneous entities, e.g. events, nationalities, products or works of art",734
bogotá,"LOC: Non-GPE locations, mountain ranges, bodies of water",366
antioquia,"LOC: Non-GPE locations, mountain ranges, bodies of water",234
farc,"ORG: Companies, agencies, institutions, etc.",213
ciprat,PER: Named person or family.,199
ppp02,"MISC: Miscellaneous entities, e.g. events, nationalities, products or works of art",160
ppp01,"MISC: Miscellaneous entities, e.g. events, nationalities, products or works of art",159
santander,"LOC: Non-GPE locations, mountain ranges, bodies of water",147
sat,"ORG: Companies, agencies, institutions, etc.",142
colombia,"LOC: Non-GPE locations, mountain ranges, bodies of water",135


2020 -- 54 documents


Unnamed: 0_level_0,Type,#Ocurrences
Entity,Unnamed: 1_level_1,Unnamed: 2_level_1
ppp01,"MISC: Miscellaneous entities, e.g. events, nationalities, products or works of art",765
eln,"MISC: Miscellaneous entities, e.g. events, nationalities, products or works of art",522
bogotá,"LOC: Non-GPE locations, mountain ranges, bodies of water",338
medellín,"LOC: Non-GPE locations, mountain ranges, bodies of water",301
antioquia,"LOC: Non-GPE locations, mountain ranges, bodies of water",236
bajo cauca,"LOC: Non-GPE locations, mountain ranges, bodies of water",206
cauca,"LOC: Non-GPE locations, mountain ranges, bodies of water",189
farc ep,"ORG: Companies, agencies, institutions, etc.",178
ciprat,PER: Named person or family.,169
uribe,"LOC: Non-GPE locations, mountain ranges, bodies of water",161


2021 -- 25 documents


Unnamed: 0_level_0,Type,#Ocurrences
Entity,Unnamed: 1_level_1,Unnamed: 2_level_1
eln,"MISC: Miscellaneous entities, e.g. events, nationalities, products or works of art",430
ppp01,"MISC: Miscellaneous entities, e.g. events, nationalities, products or works of art",417
farc ep,"ORG: Companies, agencies, institutions, etc.",140
puerto carreño,"LOC: Non-GPE locations, mountain ranges, bodies of water",133
ppp02,"MISC: Miscellaneous entities, e.g. events, nationalities, products or works of art",121
cravo norte,"LOC: Non-GPE locations, mountain ranges, bodies of water",115
arauca,"LOC: Non-GPE locations, mountain ranges, bodies of water",111
colombia,"LOC: Non-GPE locations, mountain ranges, bodies of water",107
antioquia,"LOC: Non-GPE locations, mountain ranges, bodies of water",100
santander,"LOC: Non-GPE locations, mountain ranges, bodies of water",99


2022 -- 14 documents


Unnamed: 0_level_0,Type,#Ocurrences
Entity,Unnamed: 1_level_1,Unnamed: 2_level_1
eln,"MISC: Miscellaneous entities, e.g. events, nationalities, products or works of art",345
ppp01,"MISC: Miscellaneous entities, e.g. events, nationalities, products or works of art",276
farc,"ORG: Companies, agencies, institutions, etc.",225
cali,"LOC: Non-GPE locations, mountain ranges, bodies of water",198
gdo,"MISC: Miscellaneous entities, e.g. events, nationalities, products or works of art",115
farc ep,"ORG: Companies, agencies, institutions, etc.",113
valle del cauca,"LOC: Non-GPE locations, mountain ranges, bodies of water",107
antioquia,"LOC: Non-GPE locations, mountain ranges, bodies of water",99
· bogotá,"LOC: Non-GPE locations, mountain ranges, bodies of water",90
colombia,"LOC: Non-GPE locations, mountain ranges, bodies of water",74


# 5 Top 20 named entities in Seguimiento

In [9]:
for y in [2018, 2019]:
    print (y)
    dfg = df[(df["Type"] == "seguimiento") & (df["Year"] == y)]
    dt = extract_named_entities(' '.join(dfg['Text'])).head(20)
    display(dt)

2018


Unnamed: 0_level_0,Type,#Ocurrences
Entity,Unnamed: 1_level_1,Unnamed: 2_level_1
suarez,"LOC: Non-GPE locations, mountain ranges, bodies of water",28
ntc gp1000,"MISC: Miscellaneous entities, e.g. events, nationalities, products or works of art",22
córdoba,"LOC: Non-GPE locations, mountain ranges, bodies of water",16
cauca,"LOC: Non-GPE locations, mountain ranges, bodies of water",14
jaime orlando,PER: Named person or family.,14
lazona,"LOC: Non-GPE locations, mountain ranges, bodies of water",14
caldas,"LOC: Non-GPE locations, mountain ranges, bodies of water",12
anserma,PER: Named person or family.,12
ciprat,PER: Named person or family.,12
bureau,"ORG: Companies, agencies, institutions, etc.",12


2019


Unnamed: 0_level_0,Type,#Ocurrences
Entity,Unnamed: 1_level_1,Unnamed: 2_level_1
colombia,"LOC: Non-GPE locations, mountain ranges, bodies of water",27
sat,"ORG: Companies, agencies, institutions, etc.",24
santander,"LOC: Non-GPE locations, mountain ranges, bodies of water",23
vulneratorias,PER: Named person or family.,22
antioquia,"LOC: Non-GPE locations, mountain ranges, bodies of water",22
eln,"MISC: Miscellaneous entities, e.g. events, nationalities, products or works of art",9
s naciones unidas,"ORG: Companies, agencies, institutions, etc.",8
valle del cauca,"LOC: Non-GPE locations, mountain ranges, bodies of water",7
farc,"ORG: Companies, agencies, institutions, etc.",7
at 026 18,"MISC: Miscellaneous entities, e.g. events, nationalities, products or works of art",7


# 5. Exporting to html

In [10]:
!jupyter nbconvert --to html 3_NLP_Named_Entities.ipynb

[NbConvertApp] Converting notebook 3_NLP_Named_Entities.ipynb to html
[NbConvertApp] Writing 652346 bytes to 3_NLP_Named_Entities.html
