### Creating an NLP pipeline

- Use case: controlling information, Social Dilemma/sentiment analysis



In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# Python support modules
import re
import string
import datetime
import pickle 
from collections import Counter

# Spacy
import spacy
from spacy.lemmatizer import Lemmatizer
from spacy.lang.en.stop_words import STOP_WORDS
#import en_core_web_sm
from spacy.pipeline import SentenceSegmenter

# NLTK
import nltk
nltk.download('stopwords')
from nltk.tokenize import word_tokenize
from nltk.corpus import words, stopwords, wordnet

# Sklearn
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

# Helper functions in py file
from preprocessing_headlines import cleaned_headline
import importlib
#importlib.reload(cleaned_headline)

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/Tara8082/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# Widen the column display so headline text is not cut off too early in rendered frames.
pd.set_option('max_colwidth', 100)

# Raw headline dump; data.info() reports the columns 'user', 'date' and 'content'.
# NOTE(review): absolute, machine-specific path — the notebook only runs where this file exists.
data = pd.read_csv('/Users/Tara8082/GIT/ProjectGIT/jumbo_headlines.csv')

In [3]:
data.head()

Unnamed: 0,user,date,content
0,"{'username': 'WSJ', 'displayname': 'The Wall Street Journal', 'id': 3108351, 'description': 'Sig...",2020-11-06T01:00:12+00:00,The U.S. Postal Service has been ordered by a federal judge to conduct rigorous sweeps for remai...
1,"{'username': 'WSJ', 'displayname': 'The Wall Street Journal', 'id': 3108351, 'description': 'Sig...",2020-11-06T00:45:06+00:00,“I never played music maliciously.” The “Gilligan’s Island” theme song erupts as a battle cry in...
2,"{'username': 'WSJ', 'displayname': 'The Wall Street Journal', 'id': 3108351, 'description': 'Sig...",2020-11-06T00:37:02+00:00,President Trump lashed out at pollsters and the news media in a speech from the White House Thur...
3,"{'username': 'WSJ', 'displayname': 'The Wall Street Journal', 'id': 3108351, 'description': 'Sig...",2020-11-06T00:30:08+00:00,From @WSJopinion: Covid-19 may push up to 150 million people into extreme poverty. The recipe fo...
4,"{'username': 'WSJ', 'displayname': 'The Wall Street Journal', 'id': 3108351, 'description': 'Sig...",2020-11-06T00:15:08+00:00,Denmark’s entire population of farmed mink has been ordered culled after researchers discovered ...


In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2448364 entries, 0 to 2448363
Data columns (total 3 columns):
 #   Column   Dtype 
---  ------   ----- 
 0   user     object
 1   date     object
 2   content  object
dtypes: object(3)
memory usage: 56.0+ MB


In [5]:
# Preprocessing date column, converting to date time, dropping old date column

def get_date(df):
    """Replace the string 'date' column of ``df`` with a parsed 'datetime' column.

    Only the first 10 characters of each 'date' value (the ``YYYY-MM-DD`` part of
    the timestamp) are used; the time of day and UTC offset are discarded.

    The frame is modified in place and also returned, so both ``get_date(df)``
    and ``df = get_date(df)`` work.

    Calling the function again on a frame that has already been converted is a
    no-op, so the notebook cell can be re-run without a KeyError/AttributeError
    on the missing 'date' column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a string column named 'date'.

    Returns
    -------
    pandas.DataFrame
        The same object, without 'date' and with a datetime64 'datetime' column.

    Raises
    ------
    KeyError
        If the frame has neither a 'date' nor a 'datetime' column.
    ValueError
        If a value does not match the year/month/day layout.
    """
    # Already converted by an earlier run of this cell: nothing left to do.
    if 'date' not in df.columns and 'datetime' in df.columns:
        return df

    # Parse into a temporary first so a failure leaves the caller's frame untouched.
    day_text = df['date'].str[:10].str.replace('-', '/', regex=False)
    parsed = pd.to_datetime(day_text, format="%Y/%m/%d")

    df['datetime'] = parsed
    df.drop('date', axis=1, inplace=True)
    return df

In [6]:
# Converts `data` in place (the 'date' column is replaced by 'datetime'); because the call is
# the cell's last expression, the returned frame is also what gets rendered as the cell output.
get_date(data)

Unnamed: 0,user,content,datetime
0,"{'username': 'WSJ', 'displayname': 'The Wall Street Journal', 'id': 3108351, 'description': 'Sig...",The U.S. Postal Service has been ordered by a federal judge to conduct rigorous sweeps for remai...,2020-11-06
1,"{'username': 'WSJ', 'displayname': 'The Wall Street Journal', 'id': 3108351, 'description': 'Sig...",“I never played music maliciously.” The “Gilligan’s Island” theme song erupts as a battle cry in...,2020-11-06
2,"{'username': 'WSJ', 'displayname': 'The Wall Street Journal', 'id': 3108351, 'description': 'Sig...",President Trump lashed out at pollsters and the news media in a speech from the White House Thur...,2020-11-06
3,"{'username': 'WSJ', 'displayname': 'The Wall Street Journal', 'id': 3108351, 'description': 'Sig...",From @WSJopinion: Covid-19 may push up to 150 million people into extreme poverty. The recipe fo...,2020-11-06
4,"{'username': 'WSJ', 'displayname': 'The Wall Street Journal', 'id': 3108351, 'description': 'Sig...",Denmark’s entire population of farmed mink has been ordered culled after researchers discovered ...,2020-11-06
...,...,...,...
2448359,"{'username': 'nypost', 'displayname': 'New York Post', 'id': 17469289, 'description': 'Breaking ...",Defending PGA champion let golden opportunity slip away https://t.co/vWknW2bsCR https://t.co/jIz...,2018-08-13
2448360,"{'username': 'nypost', 'displayname': 'New York Post', 'id': 17469289, 'description': 'Breaking ...",Non-compete deal forces wine maker to sit on product for nearly a decade https://t.co/xzQHXvrdzg...,2018-08-13
2448361,"{'username': 'nypost', 'displayname': 'New York Post', 'id': 17469289, 'description': 'Breaking ...",Diamondbacks will keep Bartolo Colon from adding to wins mark https://t.co/GDyKSNSB1R https://t....,2018-08-13
2448362,"{'username': 'nypost', 'displayname': 'New York Post', 'id': 17469289, 'description': 'Breaking ...",Woman goes missing while swimming in Jamaica Bay https://t.co/p3JtZPCUL1 https://t.co/oTA42CnjcO,2018-08-13


In [126]:
# def get_user(df):
#     df['username'] = df.user
#     for index, row in enumerate(df['username']):
#         row = row.split(',')[0]
#         row = row.replace("{'username': ", "").strip()
#         row = re.sub(r"\W", "", row, flags=re.I)
#         df['username'] = row
#     df.drop('user', axis = 1, inplace = True)
#     return df


# get_user(data)

In [10]:
data.head()

Unnamed: 0,user,content,datetime
0,"{'username': 'WSJ', 'displayname': 'The Wall Street Journal', 'id': 3108351, 'description': 'Sig...",The U.S. Postal Service has been ordered by a federal judge to conduct rigorous sweeps for remai...,2020-11-06
1,"{'username': 'WSJ', 'displayname': 'The Wall Street Journal', 'id': 3108351, 'description': 'Sig...",“I never played music maliciously.” The “Gilligan’s Island” theme song erupts as a battle cry in...,2020-11-06
2,"{'username': 'WSJ', 'displayname': 'The Wall Street Journal', 'id': 3108351, 'description': 'Sig...",President Trump lashed out at pollsters and the news media in a speech from the White House Thur...,2020-11-06
3,"{'username': 'WSJ', 'displayname': 'The Wall Street Journal', 'id': 3108351, 'description': 'Sig...",From @WSJopinion: Covid-19 may push up to 150 million people into extreme poverty. The recipe fo...,2020-11-06
4,"{'username': 'WSJ', 'displayname': 'The Wall Street Journal', 'id': 3108351, 'description': 'Sig...",Denmark’s entire population of farmed mink has been ordered culled after researchers discovered ...,2020-11-06


In [12]:
# Adding column for 'year'
def df_by_year_column(dt):
    """Add a 'year' column to ``dt`` and split the frame into one frame per year.

    Parameters
    ----------
    dt : pandas.DataFrame
        Frame with a datetime64 column named 'datetime'.

    Returns
    -------
    list of pandas.DataFrame
        One sub-frame per distinct year, ordered by the year's first appearance
        in ``dt``.

    Notes
    -----
    ``dt`` itself is modified: it gains the integer 'year' column. Later cells
    group on ``data.year``, so this side effect is part of the contract.
    """
    dt['year'] = dt['datetime'].dt.year
    distinct_years = dt['year'].unique()
    return [dt[dt['year'] == year] for year in distinct_years]


# The function was originally defined under this misspelled name while the notebook
# called `df_by_year_column`; the alias keeps the old spelling usable.
df_by_year_colmumn = df_by_year_column

# Besides returning the per-year frames, this call adds the 'year' column to `data`;
# the later groupby on data.year depends on that side effect.
list_of_dfs = df_by_year_column(data)

In [39]:
data.head()

Unnamed: 0,user,content,datetime,year
0,"{'username': 'WSJ', 'displayname': 'The Wall Street Journal', 'id': 3108351, 'description': 'Sig...",The U.S. Postal Service has been ordered by a federal judge to conduct rigorous sweeps for remai...,2020-11-06,2020
1,"{'username': 'WSJ', 'displayname': 'The Wall Street Journal', 'id': 3108351, 'description': 'Sig...",“I never played music maliciously.” The “Gilligan’s Island” theme song erupts as a battle cry in...,2020-11-06,2020
2,"{'username': 'WSJ', 'displayname': 'The Wall Street Journal', 'id': 3108351, 'description': 'Sig...",President Trump lashed out at pollsters and the news media in a speech from the White House Thur...,2020-11-06,2020
3,"{'username': 'WSJ', 'displayname': 'The Wall Street Journal', 'id': 3108351, 'description': 'Sig...",From @WSJopinion: Covid-19 may push up to 150 million people into extreme poverty. The recipe fo...,2020-11-06,2020
4,"{'username': 'WSJ', 'displayname': 'The Wall Street Journal', 'id': 3108351, 'description': 'Sig...",Denmark’s entire population of farmed mink has been ordered culled after researchers discovered ...,2020-11-06,2020


In [66]:
# Splitting dataframes by year

grouped = data.groupby(data.year)

# One frame per calendar year, unpacked into the names the rest of the notebook uses.
# get_group raises KeyError if a year is absent from the data.
df_2016, df_2017, df_2018, df_2019, df_2020 = (
    grouped.get_group(year) for year in (2016, 2017, 2018, 2019, 2020)
)

In [67]:
# Year labels and the matching per-year frames, kept in the same (chronological) order.
year_list = ['2016', '2017', '2018', '2019', '2020']
df_by_year = [df_2016, df_2017, df_2018, df_2019, df_2020]

## Headlines Cleaning

In [49]:
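# DOESN'T WORK: the body indexes the whole list (`df_by_year[['content']]`) instead of the
# loop variable `df`, and the nested `year` loop would append every frame once per year.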

# headline_by_year_list = []

# def clean_content(df_by_year, year_list):
#     for df in df_by_year:
#         for year in year_list:
#             new_df = 'headlines_' + 'year'
#             new_df = df_by_year[['content']].astype(str)
#             new_df['clean_content'] = new_df['content'].map(cleaned_headline)
#             headline_by_year_list.append(new_df)
#     return headline_by_year_list

In [None]:
# %%time

# headlines = clean_content(df_by_year, year_list)

In [68]:
# Changing content column to str type

# Each headlines_YYYY frame is a new single-column ('content') frame cast to str,
# so adding columns to it later does not touch the df_YYYY frames.
headlines_2016, headlines_2017, headlines_2018, headlines_2019, headlines_2020 = (
    yearly_frame[['content']].astype(str)
    for yearly_frame in (df_2016, df_2017, df_2018, df_2019, df_2020)
)

In [71]:
# Cleaning headlines

%%time

headlines_2016['clean_content'] = headlines_2016['content'].map(cleaned_headline)
headlines_2017['clean_content'] = headlines_2017['content'].map(cleaned_headline)
headlines_2018['clean_content'] = headlines_2018['content'].map(cleaned_headline)
headlines_2019['clean_content'] = headlines_2019['content'].map(cleaned_headline)
headlines_2020['clean_content'] = headlines_2020['content'].map(cleaned_headline)

CPU times: user 1min 36s, sys: 1.27 s, total: 1min 38s
Wall time: 1min 41s


In [72]:
headlines_2016.head()

Unnamed: 0,content,clean_content
140738,"'I prefer a more visceral, natural style of filmmaking.' An interview with 'Patriots Day' direct...",i prefer a more visceral natural style of filmmaking an interview with patriots day director pet...
140739,The best architecture of 2016 https://t.co/4Sxh6qJgfp,the best architecture of
140740,"What scientific concept should be more widely known? Answers from Steven Pinker, Alison Gopnik, ...",what scientific concept should be more widely known answers from steven pinker alison gopnik jar...
140741,"Year in review: @wsjnumbers columnist @mcjomcg revisits cicadas, Volkswagens and electoral votes...",year in review wsjnumbers columnist mcjomcg revisits cicadas volkswagens and electoral votes
140742,How markets moved in 2016 https://t.co/Vy04ZutyMr,how markets moved in


In [73]:
# Pickling headlines for EDA

# Output file (written to the current working directory) -> frame to serialise.
pickle_targets = {
    'headlines_cleaned_2016.pkl': headlines_2016,
    'headlines_cleaned_2017.pkl': headlines_2017,
    'headlines_cleaned_2018.pkl': headlines_2018,
    'headlines_cleaned_2019.pkl': headlines_2019,
    'headlines_cleaned_2020.pkl': headlines_2020,
}

for pickle_path, yearly_headlines in pickle_targets.items():
    with open(pickle_path, 'wb') as to_write:
        pickle.dump(yearly_headlines, to_write)


## Setting up a list of stop words and common words to remove
- Source for Twitter Stop Words: https://github.com/ravikiranj/twitter-sentiment-analyzer


In [74]:
# NLTK's English stop-word list; relies on the 'stopwords' corpus downloaded in the imports cell.
standard_stop_words = stopwords.words("english")

In [75]:
# Open list of Twitter stop words

with open('stopwords.txt') as fp:
    # One list of tokens per line of the file: surrounding whitespace (including the
    # newline) is stripped, then the line is split on single spaces.
    stopwords_data = [line.strip().split(' ') for line in fp]

In [76]:
# Combining standard list and imported list of stop words

flat_stopwords_data = [item for sublist in stopwords_data for item in sublist]

# Deliberately not named `stopwords`: that name is the nltk.corpus reader imported at the top
# of the notebook, and rebinding it to a list makes `stopwords.words("english")` fail with
# AttributeError as soon as the earlier cell is run again.
combined_stopwords = standard_stop_words + flat_stopwords_data
stopwords_set = set(combined_stopwords)

# Adding 'breaking' and other words that are common to media accounts but carry no topic
# information (outlet handles, weekday names, newsletter vocabulary, rare z-tokens).
stopwords_set.update([
    'breaking', 'report', 'seven', 'people', 'happening', 'now', 'wsjbooks', 'wsj', 'wsjopinion',
    'zyahna', 'zyairr', 'zydeco', 'zymere', 'zymergen', 'zynga', 'zz', 'wsj', 'nbc', 'zziya',
    'zzz', 'zzzs', 'zwaan', 'zwack', 'zwang', 'zweibel', 'zweli', 'zwetsloot', 'zwickau', 'zwift',
    'zwillinger', 'zwoolfe', 'zurfi', 'zuri', 'zury', 'zushaelinson', 'zutors', 'zuurbekom',
    'zuzana', 'zverev', 'zvyagintsev', 'morning', 'brief', 'briefing', 'page', 'story', 'rundown',
    'update', 'even', 'day', 'evening', 'publish', 'edition', 'monday', 'tuesday', 'wednesday',
    'thursday', 'friday', 'saturday', 'sunday', 'ft',
])

# stopwords_set includes standard stop words, twitter words, and words common to media accounts
print("There are {} stop words in this list.".format(len(stopwords_set)))

There are 542 stop words in this list.


In [25]:
# Pickling stopwords to use EDA/Wordcloud
# with open('stopwords_set.pkl', 'wb') as to_write:
    # pickle.dump(stopwords_set, to_write)

## SpaCy Pipeline

- Need to link entities

In [77]:
# Pipeline components that are switched off when the model is loaded.
disabled_components = ['tagger', 'parser']

# TODO (original author): need to fix english model, need smaller english model
nlp = spacy.load('en', disable=disabled_components)

# Append the sentencizer component to the end of the pipeline.
sentencizer = nlp.create_pipe('sentencizer')
nlp.add_pipe(sentencizer)

# Expected per the recorded output: ['ner', 'sentencizer']
print(nlp.pipe_names)


['ner', 'sentencizer']


In [78]:
def lemmatize_nlp_pipe(doc):
    """Return the space-joined lemmas of the tokens in ``doc`` that are worth keeping.

    A token is kept when it is purely alphabetic (``token.is_alpha``) and its
    surface text is not in the module-level ``stopwords_set``. The stop-word
    check uses ``token.text`` as-is (no case folding); the lemma is only used
    for the output.
    """
    kept_lemmas = []
    for token in doc:
        if not token.is_alpha:
            continue
        if token.text in stopwords_set:
            continue
        kept_lemmas.append(str(token.lemma_))
    return " ".join(kept_lemmas)

def entity_lower(doc):
    """Print every named entity in ``doc`` as ``text start_char end_char label``.

    Despite its name the function does not lower-case anything yet; it only
    prints the entities and returns None.
    """
    for entity in doc.ents:
        fields = (entity.text, entity.start_char, entity.end_char, entity.label_)
        print(*fields)
    # TODO (original author): if not entity, lowercase

def preprocessed_pipe(texts, batch_size=100):
    """Run ``texts`` through the module-level ``nlp`` pipeline and lemmatise each result.

    Parameters
    ----------
    texts : iterable of str
        Raw strings, passed to ``nlp.pipe``.
    batch_size : int, default 100
        Forwarded unchanged to ``nlp.pipe``.

    Returns
    -------
    list of str
        ``lemmatize_nlp_pipe(doc)`` for every document, in input order.
    """
    docs = nlp.pipe(texts, batch_size=batch_size)
    return [lemmatize_nlp_pipe(doc) for doc in docs]

In [79]:
%%time

# Run the cleaned 2016 headlines through preprocessed_pipe and store the returned strings in a
# new 'processed' column; batch_size is forwarded to nlp.pipe.
headlines_2016['processed'] = preprocessed_pipe(headlines_2016['clean_content'], batch_size=10000)

CPU times: user 1min 48s, sys: 30.3 s, total: 2min 18s
Wall time: 2min 21s


In [80]:
%%time

# Same processing step for the 2017 headlines: result goes into a new 'processed' column.
headlines_2017['processed'] = preprocessed_pipe(headlines_2017['clean_content'], batch_size=10000)

CPU times: user 3min 11s, sys: 52.9 s, total: 4min 3s
Wall time: 4min 9s


In [81]:
%%time

# Same processing step for the 2018 headlines: result goes into a new 'processed' column.
headlines_2018['processed'] = preprocessed_pipe(headlines_2018['clean_content'], batch_size=10000)

CPU times: user 5min 2s, sys: 1min 24s, total: 6min 26s
Wall time: 6min 39s


In [82]:
%%time

# Same processing step for the 2019 headlines: result goes into a new 'processed' column.
headlines_2019['processed'] = preprocessed_pipe(headlines_2019['clean_content'], batch_size=10000)

CPU times: user 7min 40s, sys: 2min 8s, total: 9min 49s
Wall time: 10min 28s


In [83]:
%%time

# Same processing step for the 2020 headlines: result goes into a new 'processed' column.
headlines_2020['processed'] = preprocessed_pipe(headlines_2020['clean_content'], batch_size=10000)

CPU times: user 7min 31s, sys: 2min 3s, total: 9min 35s
Wall time: 9min 54s


In [87]:
headlines_2020.tail()

Unnamed: 0,content,clean_content,processed
2338226,Cole Haan shoes and handbags 65% off for end-of-year sale https://t.co/fxVlwf4SSz https://t.co/1...,cole haan shoes and handbags off for end of year sale,cole haan shoe handbag sale
2338227,Cuomo signs 'design build' bill that could save NYC billions https://t.co/wILP0DKuzW https://t.c...,cuomo signs design build bill that could save nyc billions,cuomo sign design build bill save nyc billion
2338228,Kim Jong Un says North Korea no longer bound by nuke test ban https://t.co/0EzmkIo2WS https://t....,kim jong un says north korea no longer bound by nuke test ban,kim jong un north korea bind nuke test ban
2338229,Alabama cops apologize for their ‘homeless quilt’ made of cardboard signs https://t.co/fRUay7S3W...,alabama cops apologize for their homeless quilt made of cardboard signs,alabama cop apologize homeless quilt cardboard sign
2338230,"With drones overhead, Times Square is the safest place on Earth to spend New Year’s Eve: NYPD ht...",with drones overhead times square is the safest place on earth to spend new year s eve nypd,drone overhead time square safe earth spend eve nypd


In [30]:
# Pickling processed headlines

# with open('processed_headlines.pkl', 'wb') as to_write:
    # pickle.dump(headlines, to_write)


In [88]:
# Creating CSV to avoid recursion error in base environment for CorEx

# (frame, destination) pairs; each frame is written with its index as the first CSV column.
# NOTE(review): absolute, machine-specific output paths.
csv_exports = (
    (headlines_2016, r'/Users/Tara8082/GIT/ProjectGIT/Project_4/miscellaneous_support/final_processed_2016.csv'),
    (headlines_2017, r'/Users/Tara8082/GIT/ProjectGIT/Project_4/miscellaneous_support/final_processed_2017.csv'),
    (headlines_2018, r'/Users/Tara8082/GIT/ProjectGIT/Project_4/miscellaneous_support/final_processed_2018.csv'),
    (headlines_2019, r'/Users/Tara8082/GIT/ProjectGIT/Project_4/miscellaneous_support/final_processed_2019.csv'),
    (headlines_2020, r'/Users/Tara8082/GIT/ProjectGIT/Project_4/miscellaneous_support/final_processed_2020.csv'),
)

for yearly_headlines, csv_path in csv_exports:
    yearly_headlines.to_csv(csv_path)

## Top Words for Each Year (up to 50 for 2016, up to 200 for 2017–2020)

In [92]:
# function to find top words
def top_words_frequency(text, num_words=100, freq_threshold=500, stop_words=None):
    """Return the most frequent non-stop-words in a collection of strings.

    Parameters
    ----------
    text : iterable of str
        Documents (e.g. a pandas Series of processed headlines). Each one is
        split on whitespace.
    num_words : int, default 100
        Maximum number of words to consider, taken from the top of the
        frequency ranking.
    freq_threshold : int, default 500
        Only words that occur strictly more often than this are returned.
    stop_words : collection of str, optional
        Words to ignore. Defaults to the module-level ``stopwords_set``.

    Returns
    -------
    list of (str, int)
        ``(word, count)`` pairs in descending order of count. Words with equal
        counts keep the order in which they were first seen.
    """
    if stop_words is None:
        stop_words = stopwords_set

    counts = Counter()
    for tweet in text:
        for word in tweet.split():
            # Lower-case BEFORE the stop-word test: the stop words are lower-case, so
            # testing the raw token let capitalised stop words ("The") slip through.
            word = word.lower()
            if word not in stop_words:
                counts[word] += 1

    return [
        (word, count)
        for word, count in counts.most_common(num_words)
        if count > freq_threshold
    ]

In [94]:
top_words_frequency(headlines_2016['processed'], num_words=50, freq_threshold=500)


[('trump', 19229),
 ('donald', 6836),
 ('clinton', 6698),
 ('video', 6177),
 ('police', 5522),
 ('kill', 4696),
 ('china', 4455),
 ('woman', 4450),
 ('president', 4399),
 ('obama', 4201),
 ('live', 4078),
 ('world', 3972),
 ('market', 3827),
 ('attack', 3770),
 ('time', 3654),
 ('shoot', 3640),
 ('bank', 3435),
 ('watch', 3413),
 ('win', 3315),
 ('election', 3313),
 ('via', 3307),
 ('hillary', 3212),
 ('brexit', 3202),
 ('look', 3181),
 ('amp', 3119),
 ('call', 3094),
 ('vote', 3092),
 ('million', 3073),
 ('plan', 2950),
 ('uk', 2803),
 ('deal', 2780),
 ('death', 2670),
 ('hit', 2643),
 ('opinion', 2608),
 ('official', 2606),
 ('die', 2579),
 ('fastft', 2564),
 ('help', 2521),
 ('oil', 2518),
 ('city', 2511),
 ('share', 2456),
 ('pay', 2446),
 ('rise', 2436),
 ('top', 2428),
 ('house', 2347),
 ('home', 2344),
 ('stock', 2336),
 ('leave', 2300),
 ('debate', 2293),
 ('company', 2252)]

In [95]:
top_words_frequency(headlines_2017['processed'], num_words=200, freq_threshold=500)

[('trump', 48156),
 ('president', 15508),
 ('house', 9272),
 ('police', 7531),
 ('white', 6957),
 ('woman', 6532),
 ('north', 6526),
 ('opinion', 6454),
 ('kill', 6428),
 ('time', 6287),
 ('via', 6257),
 ('attack', 6153),
 ('china', 6116),
 ('korea', 6085),
 ('plan', 6051),
 ('call', 5983),
 ('analysis', 5688),
 ('bill', 5595),
 ('world', 5571),
 ('shoot', 5281),
 ('live', 5091),
 ('pres', 5012),
 ('russia', 5001),
 ('official', 4968),
 ('help', 4962),
 ('fire', 4919),
 ('health', 4866),
 ('tax', 4812),
 ('look', 4704),
 ('watch', 4490),
 ('hurricane', 4426),
 ('former', 4338),
 ('video', 4323),
 ('meet', 4317),
 ('senate', 4317),
 ('million', 4201),
 ('election', 4180),
 ('deal', 4174),
 ('donald', 4132),
 ('city', 4036),
 ('week', 4001),
 ('leave', 3961),
 ('gop', 3935),
 ('die', 3926),
 ('change', 3909),
 ('court', 3908),
 ('home', 3886),
 ('vote', 3842),
 ('market', 3811),
 ('leader', 3808),
 ('news', 3772),
 ('care', 3771),
 ('company', 3703),
 ('hit', 3651),
 ('tell', 3594),
 ('d

In [96]:
top_words_frequency(headlines_2018['processed'], num_words=200, freq_threshold=500)

[('trump', 52786),
 ('president', 27404),
 ('house', 13828),
 ('woman', 12511),
 ('time', 11690),
 ('world', 10436),
 ('former', 10382),
 ('police', 10023),
 ('kill', 9866),
 ('opinion', 9509),
 ('white', 9437),
 ('call', 9276),
 ('school', 9210),
 ('shoot', 9005),
 ('official', 8685),
 ('live', 8113),
 ('week', 8098),
 ('analysis', 8054),
 ('via', 7848),
 ('north', 7811),
 ('tell', 7783),
 ('country', 7618),
 ('company', 7575),
 ('court', 7490),
 ('help', 7304),
 ('election', 7265),
 ('look', 7163),
 ('child', 7146),
 ('china', 7098),
 ('family', 6860),
 ('plan', 6687),
 ('government', 6556),
 ('watch', 6517),
 ('fire', 6511),
 ('million', 6487),
 ('meet', 6412),
 ('pay', 6251),
 ('change', 6059),
 ('home', 6028),
 ('pres', 5944),
 ('leave', 5937),
 ('city', 5879),
 ('accord', 5871),
 ('news', 5776),
 ('american', 5761),
 ('florida', 5621),
 ('write', 5603),
 ('vote', 5601),
 ('die', 5583),
 ('deal', 5576),
 ('democrat', 5434),
 ('donald', 5428),
 ('leader', 5411),
 ('russia', 5405),


In [97]:
top_words_frequency(headlines_2019['processed'], num_words=200, freq_threshold=500)

[('trump', 58403),
 ('president', 33727),
 ('house', 19006),
 ('woman', 17458),
 ('world', 17032),
 ('time', 16911),
 ('china', 16768),
 ('police', 14287),
 ('call', 14189),
 ('company', 13072),
 ('week', 12919),
 ('plan', 12725),
 ('former', 12473),
 ('million', 12115),
 ('kill', 12001),
 ('trade', 11730),
 ('official', 11377),
 ('government', 11373),
 ('deal', 11339),
 ('country', 11087),
 ('help', 10790),
 ('city', 10667),
 ('look', 10426),
 ('opinion', 10309),
 ('pay', 10308),
 ('accord', 9957),
 ('month', 9902),
 ('white', 9775),
 ('shoot', 9766),
 ('tell', 9575),
 ('change', 9506),
 ('live', 9461),
 ('fire', 9334),
 ('leave', 9315),
 ('home', 9093),
 ('school', 9028),
 ('election', 8942),
 ('market', 8900),
 ('billion', 8836),
 ('attack', 8813),
 ('family', 8675),
 ('impeachment', 8436),
 ('watch', 8404),
 ('charge', 8404),
 ('court', 8394),
 ('die', 8371),
 ('bank', 8365),
 ('democratic', 8289),
 ('leader', 8279),
 ('child', 8254),
 ('american', 8204),
 ('democrat', 8078),
 ('de

In [98]:
top_words_frequency(headlines_2020['processed'], num_words=200, freq_threshold=500)

[('coronavirus', 90768),
 ('trump', 49872),
 ('president', 32270),
 ('covid', 30452),
 ('pandemic', 26221),
 ('china', 20877),
 ('time', 17142),
 ('world', 16782),
 ('death', 16024),
 ('week', 15921),
 ('test', 15593),
 ('country', 15572),
 ('million', 15003),
 ('biden', 14954),
 ('home', 14899),
 ('police', 14525),
 ('plan', 14164),
 ('live', 14000),
 ('house', 13611),
 ('city', 13502),
 ('election', 13147),
 ('company', 13122),
 ('health', 13010),
 ('official', 12832),
 ('call', 12665),
 ('help', 12642),
 ('government', 12532),
 ('month', 11919),
 ('virus', 11199),
 ('york', 10817),
 ('billion', 10791),
 ('vote', 10666),
 ('accord', 10461),
 ('woman', 10446),
 ('joe', 10417),
 ('market', 10257),
 ('court', 10115),
 ('business', 10094),
 ('former', 10072),
 ('outbreak', 9988),
 ('watch', 9977),
 ('amid', 9930),
 ('black', 9837),
 ('white', 9719),
 ('look', 9580),
 ('record', 9450),
 ('spread', 9057),
 ('kill', 8991),
 ('tell', 8910),
 ('economy', 8825),
 ('close', 8790),
 ('hit', 8716

In [None]:
# NOTE(review): `he` is not defined in any visible cell, so this raises NameError on a fresh
# kernel. It looks like an unfinished expression (possibly len(headlines_...)) — complete it
# or delete the cell.
len(he)