# 1. Installs, Imports and Settings

In [5]:
# One-time environment setup (run in its own cell, then restart the runtime):
# %pip install spacy==3.1.1
# !python -m spacy download en_core_web_lg
# %pip install emoji

import spacy
# The large English pipeline is what this notebook actually loads, so that is
# the model that has to be downloaded beforehand (see the setup notes above).
nlp = spacy.load("en_core_web_lg")

import csv
import pandas as pd
# Never truncate long cell contents when a DataFrame is displayed.
# (display.max_rows is left at the pandas default.)
pd.set_option('display.max_colwidth', None)

# NOTE(review): UNICODE_EMOJI is not referenced in the cells visible here, and
# newer emoji releases reportedly no longer export it — pin the emoji version
# or confirm this import still resolves.
from emoji import UNICODE_EMOJI

#  2. Reading files 

In [7]:
# Load the pipe-separated alerts file and drop rows whose Text is exactly a
# single space.
df = pd.read_csv('data/alertas.csv', sep="|").loc[lambda frame: frame["Text"] != " "]

# Quick sanity report: row count, a blank line, then the column names.
print("Size of Data:", len(df), end="\n\n")
print("Columns : ", list(df.columns))

Size of Data: 666

Columns :  ['Filename', 'Text', 'Type', 'Year']


# 3. Functions

In [8]:
# Entity texts that extract_named_entities() discards (compared against
# ent.text): blank-looking filler strings and the literal '#####'.
# NOTE(review): several entries render identically; presumably they differ only
# in invisible characters — confirm before de-duplicating this list.
exceptions = ['⠀ ⠀',  '⠀', '⠀ ⠀', '⠀⠀',  '⠀','#####']

# Short hand-picked emoji list consulted by is_emoji().
emojies  = ['😳','👏', '✨','🤡','😳','🕊']

def is_emoji(s):
    """Return True when ``s`` is one of the entries of the module-level ``emojies`` list."""
    listed = s in emojies
    return listed

def acrfull(x):
    """Expand a spaCy label into ``"<label>: <explanation>"``.

    Parameters
    ----------
    x : str
        A label such as an entity type code.

    Returns
    -------
    str
        The label followed by its ``spacy.explain`` description. When spaCy
        has no description for the label, the label is returned unchanged.
    """
    explanation = spacy.explain(x)
    # spacy.explain returns None for terms missing from its glossary; joining
    # None with a string would raise TypeError, so fall back to the bare label.
    if explanation is None:
        return x
    return ': '.join([x, explanation])

def extract_named_entities(text):
    """Count the named entities spaCy finds in ``text``.

    The module-level ``nlp`` pipeline is run over ``text``. Entities whose text
    appears in the module-level ``exceptions`` list are ignored. Occurrences are
    counted per (entity text, label) pair; when the same entity text was given
    several labels, only the most frequent pair is kept, so the reported count
    is the count of that pair, not of the entity text overall.

    Parameters
    ----------
    text : str
        The text to analyse.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``Entity`` with columns ``Type`` (label expanded through
        ``acrfull``, or ``'Social Media Handler'`` for entities starting with
        ``@``) and ``#Ocurrences``, sorted by ``#Ocurrences`` descending. Ties
        keep alphabetical (entity, label) order. Empty when no entity survives
        the filter.
    """
    # Make sure the pipeline accepts this text, but never lower the limit that
    # is already configured on the shared pipeline object.
    nlp.max_length = max(nlp.max_length, len(text) + 100)
    doc = nlp(text)

    # Collect every kept (text, label) pair once instead of growing a
    # DataFrame one row at a time.
    records = [
        (ent.text, ent.label_)
        for ent in doc.ents
        if ent.text not in exceptions
    ]
    if not records:
        return pd.DataFrame(
            columns=['Type', '#Ocurrences'],
            index=pd.Index([], name='Entity'),
        )

    dt = (
        pd.DataFrame(records, columns=['Entity', 'Type'])
        .groupby(['Entity', 'Type'])
        .size()
        .reset_index(name='#Ocurrences')
        # Stable sort: most frequent pair first, deterministic order on ties.
        .sort_values('#Ocurrences', ascending=False, kind='stable')
        .set_index('Entity')
    )
    dt['Type'] = dt['Type'].apply(acrfull)

    # Rows are already sorted, so keep='first' retains the most frequent label
    # of each entity. The explicit copy keeps the assignment below from
    # writing into a filtered view.
    dt = dt[~dt.index.duplicated(keep='first')].copy()

    # Recognize social media handles.
    dt.loc[dt.index.str.startswith('@'), 'Type'] = 'Social Media Handler'

    # Emoji tagging is currently disabled; it compares against the short
    # `emojies` list rather than the full emoji table for speed.
    #dt.loc[dt.index.map(is_emoji) , 'Type'] = 'Emojie'

    return dt

# 4. Top 30 named entities in the whole corpus

In [None]:
# Concatenate every alert text into one document and show the 30 most frequent entities.
corpus_text = ' '.join(df['Text'])
extract_named_entities(corpus_text).head(30)

## 4.1 Top 30 named entities in Advertencias

In [None]:
# Restrict the corpus to rows typed "advertencia", then rank their entities.
dfg = df.loc[df["Type"] == "advertencia"]
advertencia_text = ' '.join(dfg['Text'])
extract_named_entities(advertencia_text).head(30)

## 4.2 Top 30 named entities in Seguimiento

In [None]:
# Restrict the corpus to rows typed "seguimiento", then rank their entities.
dfg = df.loc[df["Type"] == "seguimiento"]
seguimiento_text = ' '.join(dfg['Text'])
extract_named_entities(seguimiento_text).head(30)

# 5. Exporting to html

In [None]:
# Export to HTML with nbconvert.
# NOTE(review): `3` is presumably this notebook's file name — confirm it
# resolves when the command runs from the notebook's directory.
!jupyter nbconvert --to html 3