In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from IPython.display import clear_output
from collections import Counter

In [2]:
!pip install pandarallel

clear_output(wait=True)

[0m

In [3]:
from pandarallel import pandarallel
import multiprocessing
# Size the worker pool from the machine: keep one core free for the notebook
# itself, but never go below one worker. On a single-CPU host
# `cpu_count() - 1` is 0, and that value is later passed as `n_process`.
num_processors = multiprocessing.cpu_count()
workers = max(1, num_processors - 1)
print(f'Available CPUs: {num_processors}')

Available CPUs: 16


In [4]:
!pip install spacy --user

!python3 -m spacy download en_core_web_sm

clear_output(wait=True)

print("Downloaded spacy curpus")

Downloaded spacy curpus


In [5]:
import spacy
# Load the small English model with only the `ner` component enabled; the
# other components of the packaged pipeline are left disabled.
nlp = spacy.load("en_core_web_sm",  enable=['ner'])
# Append `merge_entities` after NER so every recognised entity span is merged
# into a single token (the loops further down filter on `token.ent_type_`).
nlp.add_pipe('merge_entities')
clear_output(wait=True)
# Show the active components as a quick check of the pipeline order.
nlp.pipe_names

['ner', 'merge_entities']

In [6]:
# Sanity-check the pipeline wiring. With pretty=True the call prints the
# component table itself and also returns the analysis as a dict.
analysis = nlp.analyze_pipes(pretty=True)
# Printing the returned dict appends its raw repr after the table.
print(analysis)

[1m

#   Component        Assigns          Requires         Scores          Retokenizes
-   --------------   --------------   --------------   -------------   -----------
0   ner              doc.ents                          ents_f          False      
                     token.ent_iob                     ents_p                     
                     token.ent_type                    ents_r                     
                                                       ents_per_type              
                                                                                  
1   merge_entities                    doc.ents                         True       
                                      token.ent_iob                               
                                      token.ent_type                              

[38;5;2m✔ No problems found.[0m
{'summary': {'ner': {'assigns': ['doc.ents', 'token.ent_iob', 'token.ent_type'], 'requires': [], 'scores': ['ents_f', 'ents_p', '

In [7]:
# Load the news corpus; its `token_text` column is the text passed to the NER
# pipeline further down.
# NOTE(review): the preview of this frame shows an 'Unnamed: 0' column, which
# looks like a row index that was written into the CSV — consider
# `index_col=0` if nothing relies on that column; confirm.
df = pd.read_csv('df_news_recomb.csv')

CPU times: user 2.45 s, sys: 425 ms, total: 2.87 s
Wall time: 2.86 s


In [8]:
# Peek at the first two rows to confirm which columns were loaded.
df.head(2)

Unnamed: 0,token_text
0,artificial intelligence improves parking effic...
1,child autism saw learning social skill boosted...


In [9]:
org_entities_spacy = []

for doc in nlp.pipe(texts = df.loc[:,'token_text'], n_process = workers, batch_size = 300):
    for token in doc:
        if token.ent_type_ == 'ORG':
            org_entities_spacy.append(token.lower_)


CPU times: user 5min 24s, sys: 9.2 s, total: 5min 33s
Wall time: 5min 36s


In [10]:
person_entities_spacy = []

for doc in nlp.pipe(texts = df.loc[:,'token_text'], n_process = workers, batch_size = 300):
    for token in doc:
        if token.ent_type_ == 'PERSON':
            person_entities_spacy.append(token.lower_)


CPU times: user 5min 1s, sys: 9.16 s, total: 5min 10s
Wall time: 5min 13s


In [12]:
# Twenty most frequent ORG mentions (lower-cased entity text) with their counts.
Counter(org_entities_spacy).most_common(20)

[('microsoft', 29942),
 ('google', 22000),
 ('ibm', 8674),
 ('gpt', 5997),
 ('fcc', 4641),
 ('intel', 4379),
 ('cbs', 4121),
 ('abc', 3490),
 ('samsung', 3469),
 ('cnn', 3464),
 ('fda', 3269),
 ('quantum', 2676),
 ('npr', 2667),
 ('npr news', 2592),
 ('sony', 2542),
 ('congress', 2526),
 ('sec', 2498),
 ('white house', 2389),
 ('ford', 2245),
 ('nbc', 2199)]

In [13]:
# Twenty most frequent PERSON mentions (lower-cased entity text) with their counts.
# NOTE(review): the saved output is dominated by strings that are not people
# (e.g. 'android', 'dashboard logout', 'austria azerbaijan'); presumably the
# model mislabels the pre-processed `token_text` — verify before using these
# counts.
Counter(person_entities_spacy).most_common(20)

[('android', 1744),
 ('dashboard logout', 1742),
 ('bureaucircle', 1707),
 ('bulgaria burkina', 1642),
 ('austria azerbaijan', 1626),
 ('verde cayman island central african', 1618),
 ('jordan kansa', 1610),
 ('pichai', 1400),
 ('sam altman', 1380),
 ('apps newsplugin', 1204),
 ('jun cdt', 1149),
 ('mar cdt', 1006),
 ('mar pm', 977),
 ('donald trump', 967),
 ('joe biden', 946),
 ('apps', 918),
 ('bill gate', 874),
 ('eu', 837),
 ('mike', 816),
 ('mar', 811)]