# 1. Installs, Imports and Settings

In [2]:
#!pip install spacy==3.1.1 
#restart runtime after this
#!python -m spacy download es_core_news_sm

import spacy
# Load the spaCy pipeline package "es_core_news_sm" once; `nlp` is the shared pipeline
# object that extract_named_entities() calls. The package must already be installed
# (the commented-out `spacy download` command above fetches it).
nlp = spacy.load("es_core_news_sm")

import csv
import pandas as pd
# Never truncate cell contents when rendering DataFrames: the entity "Type" column
# holds long label descriptions that would otherwise be cut off.
# (A bare `pd.options.display.max_rows` expression used to sit here; it neither
# changed nor displayed anything, so it was dropped.)
pd.set_option('display.max_colwidth', None)

# 2. Reading files

In [3]:
# Load the pipe-delimited alerts file and drop rows whose text is exactly one space.
df = pd.read_csv('data/alertas.csv', sep="|").loc[lambda frame: frame["Text"].ne(" ")]

# Quick summary of what was loaded (row count, blank line, column names).
print("Size of Data:", len(df), end="\n\n")
print("Columns : ", list(df.columns))

Size of Data: 666

Columns :  ['Filename', 'Text', 'Type', 'Year']


# 3. Functions

In [4]:
# Entity texts that extract_named_entities() must ignore (none configured).
exceptions = []

# Hand-picked emojis to recognise; a short fixed list is used instead of a full
# emoji table. One entry appears twice, which is harmless for membership checks.
emojies  = ['😳','👏', '✨','🤡','😳','🕊']

def is_emoji(s):
    """Return True when `s` is one of the entries of `emojies`, otherwise False."""
    for candidate in emojies:
        # Same rule list membership uses: identical object, or equal value.
        if candidate is s or candidate == s:
            return True
    return False

def acrfull(x):
    """Expand a spaCy label into ``"<label>: <description>"``.

    Parameters
    ----------
    x : str
        Label to describe (e.g. an entity label such as ``"ORG"``).

    Returns
    -------
    str
        The label followed by the description from ``spacy.explain``. When
        ``spacy.explain`` has no entry for the label it returns ``None``; the
        label is then returned unchanged instead of failing inside ``str.join``.
    """
    description = spacy.explain(x)
    if description is None:
        return x
    return ': '.join([x, description])

def extract_named_entities(text):
    """Count the named entities spaCy finds in ``text`` and return them as a table.

    Parameters
    ----------
    text : str or iterable of str
        A single string is processed as one document; ``nlp.max_length`` is
        raised when necessary so spaCy accepts it. Any other iterable of strings
        (for example a DataFrame column) is streamed through ``nlp.pipe`` one
        document at a time, so the corpus does not have to be joined into a
        single huge document first.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``Entity`` with the columns ``Type`` and ``#Ocurrences``,
        one row per distinct entity text, ordered by descending count. When the
        same text was tagged with several labels, only the most frequent label
        (and that label's count) is kept. Entities whose text starts with ``@``
        get the type ``'Social Media Handler'``.
    """
    if isinstance(text, str):
        # Only ever raise the limit; never shrink it below its current value.
        nlp.max_length = max(nlp.max_length, len(text) + 100)
        docs = [nlp(text)]
    else:
        docs = nlp.pipe(text)

    # Gather every (text, label) pair first and build the frame once; appending
    # rows one at a time with .loc is quadratic in the number of entities.
    rows = [
        (ent.text, ent.label_)
        for doc in docs
        for ent in doc.ents
        if ent.text not in exceptions
    ]
    dt = pd.DataFrame(rows, columns=['Entity', 'Type'])

    # Count each (entity, label) pair. The stable sort keeps the tie order
    # deterministic, so the duplicate removal below always keeps the same row.
    dt = (
        dt.groupby(['Entity', 'Type'])
        .size()
        .reset_index(name='#Ocurrences')
        .sort_values('#Ocurrences', ascending=False, kind='stable')
        .set_index('Entity')
    )
    dt['Type'] = dt['Type'].apply(acrfull)

    # Rows are already ordered by count, so keep='first' retains the most
    # frequent label for each entity text.
    dt = dt[~dt.index.duplicated(keep='first')].copy()

    # Recognise social media handles.
    dt.loc[dt.index.str.startswith('@'), 'Type'] = 'Social Media Handler'

    # Emoji tagging is currently disabled; is_emoji() checks a short fixed list.
    #dt.loc[dt.index.map(is_emoji) , 'Type'] = 'Emojie'

    return dt

# 4. Top 30 named entities in the whole corpus

In [5]:
# Concatenate every document into one space-separated string and show its length.
text = " ".join(df["Text"].tolist())
len(text)

29208384

In [None]:
# Reuse the corpus string built in the previous cell (`text`) instead of joining
# all documents a second time.
extract_named_entities(text).head(30)

## 4.1 Top 30 named entities in Advertencias

In [None]:
# Keep only the rows whose Type is "advertencia", then rank their entities.
dfg = df.loc[df["Type"].eq("advertencia")]
extract_named_entities(" ".join(dfg["Text"])).head(30)

## 4.2 Top 30 named entities in Seguimiento

In [6]:
# Keep only the rows whose Type is "seguimiento", then rank their entities.
dfg = df.loc[df["Type"].eq("seguimiento")]
extract_named_entities(" ".join(dfg["Text"])).head(30)

Unnamed: 0_level_0,Type,#Ocurrences
Entity,Unnamed: 1_level_1,Unnamed: 2_level_1
DERECHOS HUMANOS,"MISC: Miscellaneous entities, e.g. events, nationalities, products or works of art",77
PERSONAS DEFENSORAS,"ORG: Companies, agencies, institutions, etc.",76
AT,"ORG: Companies, agencies, institutions, etc.",60
Nacional,"MISC: Miscellaneous entities, e.g. events, nationalities, products or works of art",50
LÍDERES,"MISC: Miscellaneous entities, e.g. events, nationalities, products or works of art",49
Plantilla Vigente,"MISC: Miscellaneous entities, e.g. events, nationalities, products or works of art",48
Defensoría del Pueblo,"MISC: Miscellaneous entities, e.g. events, nationalities, products or works of art",46
Alertas Tempranas,"ORG: Companies, agencies, institutions, etc.",38
Agrario,"LOC: Non-GPE locations, mountain ranges, bodies of water",34
PNIS,"ORG: Companies, agencies, institutions, etc.",33


# 5. Exporting to html

In [None]:
# Export a notebook to HTML with nbconvert (shell command run from the notebook).
# NOTE(review): the target argument is just `3`, which looks like a truncated or
# unquoted notebook filename — confirm the intended path and quote it if it contains spaces.
!jupyter nbconvert --to html 3