In [3]:
# pandas is used to load the CSV dataset into a DataFrame
import pandas as pd

# Load the dataset into a DataFrame.
# Note: the file is decoded as ISO-8859-1 (Latin-1), not UTF-8.
df = pd.read_csv(
    'dataset_spain_sdgs.csv',
    sep=",",
    encoding='ISO-8859-1',
)
df.head()

Unnamed: 0,Title,Text,Source,Links
0,EL PUERTO DE CARTAGENA ACOGE EL EJERCICIO CLE...,Está mañana el puerto de Cartagena ha acogido ...,apc.es,https://www.apc.es/webapc/publicaciones/notici...
1,La Autoridad Portuaria de Cartagena y el Gobie...,La Autoridad Portuaria de Cartagena y el Gobie...,apc.es,https://www.apc.es/webapc/publicaciones/notici...
2,Éxito del IV Congreso Workshop Intervención O...,La Autoridad Portuaria de Cartagena ha inaugur...,apc.es,https://www.apc.es/webapc/publicaciones/notici...
3,FERMÍN ROL TOMA POSESIÓN COMO DIRECTOR GENERAL.,"El pasado 26 de septiembre, el Consejo de Admi...",apc.es,https://www.apc.es/webapc/publicaciones/notici...
4,EL PRESIDENTE DE LA APC RECIBE A REPRESENTANTE...,"Antonio Sevilla, presidente de la Autoridad Po...",apc.es,https://www.apc.es/webapc/publicaciones/notici...


In [4]:
import spacy

# Load the spaCy pipeline named "es_core_news_sm"; every later cell that
# processes text goes through this `nlp` object.
# NOTE(review): the model package presumably has to be installed in the
# kernel's environment beforehand — confirm setup instructions.
nlp = spacy.load("es_core_news_sm")

In [5]:
# Show the names of the components in the loaded pipeline.
nlp.pipe_names

['tok2vec', 'morphologizer', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']

In [6]:
# Run named-entity recognition over every article text and collect one
# (entity text, entity label) row per mention. Duplicates are kept here;
# de-duplication, counting and the CSV export happen in later cells.

entities = []
labels = []

# Rows without text are skipped because spaCy only accepts strings.
article_texts = df['Text'].dropna()

# nlp.pipe processes the texts as a stream in batches, which spaCy recommends
# over calling nlp() once per text. Distinct loop names avoid shadowing the
# article text with the entity span.
for doc in nlp.pipe(article_texts):
    for mention in doc.ents:
        entities.append(mention.text)
        labels.append(mention.label_)

# create a dataframe with the entities and labels
entities_df = pd.DataFrame({'entities': entities, 'labels': labels})
entities_df.head()


Unnamed: 0,entities,labels
0,Cartagena,LOC
1,Clean Port Cartagena,MISC
2,mar de fenol,LOC
3,Con,MISC
4,Plan Interior Marítimo del puerto de Cartagena,MISC


In [7]:
# Build a table with one row per distinct entity text and the number of
# mentions of that entity in entities_df.

# Distinct entity texts, in order of first appearance; the row index of
# unique_entities_df is therefore the first-appearance rank of each entity.
unique_entities = entities_df['entities'].unique()
unique_entities_df = pd.DataFrame({'entities': unique_entities})

# Count all mentions in a single pass instead of re-scanning the table with a
# boolean mask for every mention.
mention_counts = entities_df['entities'].value_counts()
unique_entities_df['count'] = (
    unique_entities_df['entities'].map(mention_counts).astype('int64')
)

# sort the dataframe by the count column (most frequent first); rebinding
# instead of inplace=True keeps the cell safe to re-run
unique_entities_df = unique_entities_df.sort_values(by=['count'], ascending=False)
unique_entities_df.head()

Unnamed: 0,entities,count
177,UPCT,1052
0,Cartagena,604
173,Universidad Politécnica de Cartagena,228
121,Región de Murcia,140
61,España,124


In [8]:
# Attach a label to every distinct entity and export the result.
# An entity that was tagged with different labels in different mentions
# keeps the label of its first mention in entities_df.

# First label seen for each entity text, indexed by the entity text.
first_label_by_entity = (
    entities_df
    .drop_duplicates(subset='entities')
    .set_index('entities')['labels']
)

# Look the label up once per entity instead of filtering both tables with a
# boolean mask for every entity.
unique_entities_df['label'] = unique_entities_df['entities'].map(first_label_by_entity)

# sort by label and then by count (both descending)
unique_entities_df = unique_entities_df.sort_values(by=['label', 'count'], ascending=False)

# save the dataframe to a csv (without the index column)
unique_entities_df.to_csv('entities.csv', index=False, encoding='ISO-8859-1')

In [9]:
from spacy import displacy

# Render the entities found in the text of the row labelled 0,
# highlighted inline in the notebook.
sample_doc = nlp(df.loc[0, 'Text'])
displacy.render(sample_doc, style='ent', jupyter=True)