In [1]:
from yanytapi import SearchAPI
import pandas as pd
from datetime import datetime
import time

In [2]:
keyword_dict = {
    'S&P': ['S. P.', 'Wall Street', 'Fed', 'Federal Reserve'],
    'AAPL': ['Apple'],
    'MSFT': ['Microsoft'],
    'AMZN': ['Amazon'],
    'TSLA': ['Tesla'],
    'GOOGL': ['Alphabet', 'Google', 'GOOG'],
    'BRK.B': ['Berkshire Hathaway', 'BRK'],
    'UNH': ['UnitedHealth'],
    'JNJ': ['Johnson', 'JnJ'],
    'XOM': ['ExxonMobil', 'Exxon'],
    'JPM': ['JP Morgan', 'JPMorgan'],
    'META': ['Meta', 'Facebook'],
    'NVDA': ['NVIDIA'],
    'PG': ['Procter'], 
    'V': ['Visa'],
    'HD': ['Home Depot'],
    'CVX': ['Chevron Corporation'],
    'MA': ['Mastercard'],
    'PFE': ['Pfizer'],
    'ABBV': ['AbbVie'],
    'BAC': ['Bank of America'],
    'LLY': ['Eli Lilly and Company'],
    'KO': ['Coca-Cola'],
    'PEP': ['PepsiCo', 'Pepsi'],
    'COST': ['Costco'],
    'MRK': ['Merck'], 
    'TMO': ['Thermo Fisher', 'Thermo Fisher Scientific'],
    'AVGO': ['Broadcom'],
    'DIS': ['Walt Disney'],
    'WMT': ['Walmart'],
    'MCD': ['McDonald\'s'],
}

In [3]:
api = SearchAPI("RXkqQPmyy2JObRlC0qFYQj2vNxtX4oxP")

In [4]:
dates = []
keywords = []
headlines = []
abstracts = []
lead_paragraph = []
sections = []
hits = []
word_counts = []

for company, keyword_list in keyword_dict.items():
    print(f'### {company} ###')
    for keyword in keyword_list:
        count = 0
        articles = api.search(keyword, fq={"body": keyword, "source": ["Reuters","AP","The New York Times"]},
                        begin_date="20160101",
                        end_date = "20220901",
                        facet_field=["source", "day_of_week"],
                        facet_filter=True)
        
        for item in articles:
            relevant = False
            if (count % 100 == 0):
                time.sleep(2)

            # Filtering for S&P
            if keyword == 'S. P.':
                sp_keywords = ["s&p", "s.&p.", "standard & poor", "stock", "index", "market"]
                for kw in sp_keywords:
                    if kw in item.headline["main"].lower() or kw in item.abstract.lower():
                        relevant = True
                        break
            else:
                relevant = True

            if relevant:
                count += 1
                dates.append(datetime.strptime(item.pub_date[:10], '%Y-%m-%d'))
                keywords.append(keyword)
                headlines.append(item.headline["main"])
                abstracts.append(item.abstract)
                lead_paragraph.append(item.lead_paragraph)
                sections.append(item.section_name)
                hits.append(item.meta.hits)
                word_counts.append(item.word_count)
        print(f'{keyword}: {count}')
    
    df = pd.DataFrame({'date': dates, 'keyword': keywords, 'headline': headlines, 'abstract': abstracts, 'lead_paragraph':lead_paragraph, 'section': sections, 'hits': hits, 'word_count': word_counts})
    df.set_index('date', inplace=True)
    df = df.sort_index()

    df.to_excel('nyt_2016_2022_raw_data.xlsx')
    time.sleep(5)

### MCD ###
McDonald's: 1000


In [None]:
# filter articles belonging to selected sections
section_list = ['Blogs', 'Business Day', 'Opinion', 'Technology', 'The Upshot', 'U.S.', 'New York' 'World']
filtered_df = df[df['section'].isin(section_list)]

In [None]:
# track keywords of each article
headline = filtered_df["headline"]
df_single_kw = filtered_df.drop(filtered_df[headline.isin(headline[headline.duplicated()])].index)
df_multiple_kw = filtered_df[headline.isin(headline[headline.duplicated()])]

headline = filtered_df["headline"]
df_single_kw = filtered_df.drop(filtered_df[headline.isin(headline[headline.duplicated()])].index)
df_multiple_kw = filtered_df[headline.isin(headline[headline.duplicated()])]

tracker = {}
for i in range(len(df_multiple_kw)):
    headline = df_multiple_kw.iloc[i]['headline']
    if headline in tracker:
        tracker[headline].append(df_multiple_kw.iloc[i]['keyword'])
    else:
        tracker[headline] = [df_multiple_kw.iloc[i]['keyword']]

df_multiple_kw_no_dup = df_multiple_kw.drop_duplicates(subset=['headline'])
kw = []
for i in range(len(df_multiple_kw_no_dup)):
    headline = df_multiple_kw_no_dup.iloc[i]['headline']
    kw.append(list(set(tracker[headline])))
    
df_multiple_kw_no_dup['keyword'] = kw
df_single_kw['keyword'] = [[kw] for kw in df_single_kw['keyword']]
final_df = pd.concat([df_single_kw, df_multiple_kw_no_dup], axis=0).sort_index()

In [None]:
final_df.to_excel("nyt_2016_2022_final.xlsx")

In [None]:
df_no_duplicate = final_df.drop_duplicates(subset=['headline'])
temp = pd.DataFrame(df_no_duplicate.groupby('date').count()['headline'])
temp = temp.rename(columns={'headline': 'article_count'})
temp = temp.reindex(pd.date_range(start="2019-01-01", end="2022-09-01"))
temp = temp.fillna(0)

In [3]:
filtered_df = df[df['section'].isin(['Blogs', 'Business Day', 'Opinion', 'Technology', 'The Upshot', 'U.S.', 'New York', 'World'])]
print(f'rows filtered: {len(df) - len(filtered_df)}')
print(f'rows left: {len(filtered_df)}')

rows filtered: 9797
rows left: 21851


In [6]:
temp = pd.DataFrame(final_df.groupby('date').count()['headline'])
temp = temp.rename(columns={'headline': 'article_count'})
temp = temp.reindex(pd.date_range(start="2016-01-01", end="2022-09-01"))
temp = temp.fillna(0)

In [7]:
temp.mean()

article_count    7.692529
dtype: float64

In [8]:
zero = len(temp[temp['article_count']==0])
print(f'0 articles: {zero}')

less_five = len(temp[temp['article_count']<5])
print(f'<5 articles: {less_five}')

0 articles: 27
<5 articles: 647
