In [1]:
#!pip install newsapi-python

Collecting newsapi-python
  Downloading newsapi_python-0.2.7-py2.py3-none-any.whl (7.9 kB)
Installing collected packages: newsapi-python
Successfully installed newsapi-python-0.2.7


In [1]:
from newsapi import NewsApiClient
import configparser
import pandas as pd
from datetime import datetime, timedelta

import requests
import json

In [2]:
# read the NewsAPI key from the local config file
config = configparser.ConfigParser()
# ConfigParser.read() silently skips files it cannot open and returns the list of
# files it actually parsed, so an empty result means config.ini was not found;
# fail here with a clear message instead of a KeyError on the lookup below
if not config.read('config.ini'):
    raise FileNotFoundError("config.ini not found; it must define api_key in a [newsapi] section")
api_key = config['newsapi']['api_key']

In [3]:
# create the NewsAPI client, authenticated with the key read from config.ini
newsapi = NewsApiClient(api_key=api_key)

In [4]:
# fetch the list of available news sources (NewsAPI endpoint /v2/top-headlines/sources)
sources = newsapi.get_sources()

In [5]:
# inspect the top-level keys of the sources response
sources.keys()

dict_keys(['status', 'sources'])

In [6]:
# collect the ids of every source whose language is English
sources_en = [src['id'] for src in sources['sources'] if src['language'] == 'en']

In [7]:
# report how many English-language sources were found
print('There are {} English news sources.'.format(len(sources_en)))

There are 81 English news sources.


In [8]:
# because of the API request limit, the source list is cut
# into two batches at position 40
sources_en1, sources_en2 = sources_en[:40], sources_en[40:]
print(
    'The first list has {} news sources and the second has {}.'.format(
        len(sources_en1), len(sources_en2)
    )
)

The first list has 40 news sources and the second has 41.


In [9]:
# define a function to convert the API output (a list of article dicts) into a list of row dicts ready for a pandas DataFrame
def json_articles_torows(json_articles):
    """Turn article records into row dicts with DataFrame-friendly column names.

    Parameters
    ----------
    json_articles : iterable of dict
        Article records. Each must contain the keys 'source', 'author',
        'title', 'description', 'url', 'urlToImage', 'publishedAt' and
        'content'; any other keys are ignored.

    Returns
    -------
    list of dict
        One dict per article, in input order, with the keys 'source',
        'author', 'title', 'description', 'url', 'photo_url', 'pub_date'
        and 'content'. Values are copied through unchanged.

    Raises
    ------
    KeyError
        If an article record lacks one of the required keys.
    """
    # (output column, key in the article record), in output column order
    column_to_key = (
        ('source', 'source'),
        ('author', 'author'),
        ('title', 'title'),
        ('description', 'description'),
        ('url', 'url'),
        ('photo_url', 'urlToImage'),
        ('pub_date', 'publishedAt'),
        ('content', 'content'),
    )
    return [
        {column: article[key] for column, key in column_to_key}
        for article in json_articles
    ]

In [10]:
# define the past period of interest, in days
daylimit = 30

today = datetime.now().date()
# derive the start date from daylimit so the window length is set in one place
# (previously a second hard-coded 30 was used and daylimit was ignored)
fromdate = today - timedelta(days = daylimit)
print(str(fromdate))

2023-09-12


In [28]:
# define a function to extract Tesla news from all English sources published since a user-defined date 

def request_tesla(sources, api_key, fromdate, sortby = 'popularity', samplesize = 100, language = 'en',
                  query = 'Tesla', timeout = 30):
    """Fetch articles matching a query from each given source and return them as one DataFrame.

    One GET request is sent to https://newsapi.org/v2/everything per source.
    The status code and the reported total number of results are printed for
    every source.

    Parameters
    ----------
    sources : iterable of str
        Source ids; each is sent as the 'sources' request parameter.
    api_key : str
        Sent as the 'apiKey' request parameter.
    fromdate : date or str
        Earliest publication date; converted with str() and sent as 'from'.
    sortby : str, default 'popularity'
        Sent as the 'sortBy' request parameter.
    samplesize : int, default 100
        Sent as the 'pageSize' request parameter.
    language : str, default 'en'
        Sent as the 'language' request parameter.
    query : str, default 'Tesla'
        Search term sent as the 'q' request parameter.
    timeout : float, default 30
        Seconds passed to requests.get as its timeout, so that a stalled
        connection cannot hang the notebook indefinitely.

    Returns
    -------
    pandas.DataFrame
        One row per returned article, in request order, with a 0..n-1 index.
        Empty when `sources` is empty or no source returned any article.

    Raises
    ------
    Exception
        If any request returns a status code other than 200. Rows collected
        from earlier sources are not returned in that case.
    """
    url = 'https://newsapi.org/v2/everything?'
    # rows from every source are gathered here and turned into a DataFrame once
    # at the end, instead of re-concatenating a growing frame on each iteration
    all_rows = []
    for source in sources:
        params_dict = {
            'q': query,
            'sources': source,
            'sortBy': sortby,
            'pageSize': samplesize,
            'apiKey': api_key,
            'language': language,
            'from' : str(fromdate)
        }

        # make a request
        response = requests.get(url, params = params_dict, timeout = timeout)

        print('Request status code for {} is {}.'.format(source, response.status_code))
        # check the request status
        if response.status_code != 200:
            raise Exception("Request for {} returned an error: {} {}".format(source, response.status_code, response.text))

        # change response to json format
        response_json = response.json()
        print('There are total of {} articals from {} about {}.'.format(response_json['totalResults'], source, query))

        all_rows.extend(json_articles_torows(response_json['articles']))

    # building the frame after the loop also covers an empty `sources`, which
    # previously failed because the result variable was never assigned
    return pd.DataFrame(all_rows)

In [12]:
# fetch Tesla articles for the first batch of English sources (one API request per source)
articles_df1 = request_tesla(sources_en1, api_key, fromdate)

Request status code for abc-news is 200.
There are total of 33 articals from abc-news about Tesla.
Request status code for abc-news-au is 200.
There are total of 12 articals from abc-news-au about Tesla.
Request status code for al-jazeera-english is 200.
There are total of 7 articals from al-jazeera-english about Tesla.
Request status code for ars-technica is 200.
There are total of 14 articals from ars-technica about Tesla.
Request status code for associated-press is 200.
There are total of 1 articals from associated-press about Tesla.
Request status code for australian-financial-review is 200.
There are total of 0 articals from australian-financial-review about Tesla.
Request status code for axios is 200.
There are total of 0 articals from axios about Tesla.
Request status code for bbc-news is 200.
There are total of 12 articals from bbc-news about Tesla.
Request status code for bbc-sport is 200.
There are total of 12 articals from bbc-sport about Tesla.
Request status code for bleac

In [26]:
# display the full list of English source ids
sources_en

['abc-news',
 'abc-news-au',
 'al-jazeera-english',
 'ars-technica',
 'associated-press',
 'australian-financial-review',
 'axios',
 'bbc-news',
 'bbc-sport',
 'bleacher-report',
 'bloomberg',
 'breitbart-news',
 'business-insider',
 'business-insider-uk',
 'buzzfeed',
 'cbc-news',
 'cbs-news',
 'cnn',
 'crypto-coins-news',
 'engadget',
 'entertainment-weekly',
 'espn',
 'espn-cric-info',
 'financial-post',
 'football-italia',
 'fortune',
 'four-four-two',
 'fox-news',
 'fox-sports',
 'google-news',
 'google-news-au',
 'google-news-ca',
 'google-news-in',
 'google-news-uk',
 'hacker-news',
 'ign',
 'independent',
 'mashable',
 'medical-news-today',
 'msnbc',
 'mtv-news',
 'mtv-news-uk',
 'national-geographic',
 'national-review',
 'nbc-news',
 'news24',
 'new-scientist',
 'news-com-au',
 'newsweek',
 'new-york-magazine',
 'next-big-future',
 'nfl-news',
 'nhl-news',
 'politico',
 'polygon',
 'recode',
 'reddit-r-all',
 'reuters',
 'rte',
 'talksport',
 'techcrunch',
 'techradar',
 'the

In [13]:
# (rows, columns) of the first batch of articles
articles_df1.shape

(898, 8)

In [14]:
# preview the first rows of the first batch
articles_df1.head()

Unnamed: 0,source,author,title,description,url,photo_url,pub_date,content
0,"{'id': 'abc-news', 'name': 'ABC News'}","Brittany Gaddy, Deena Zaru","EEOC sues Tesla, alleging race discrimination ...","Tesla, the electric car company, is facing acc...",https://abcnews.go.com/US/eeoc-files-federal-l...,https://i.abcnewsfe.com/a/63a26308-5b81-4b2f-b...,2023-09-28T23:53:26Z,The U.S. Equal Employment Opportunity Commissi...
1,"{'id': 'abc-news', 'name': 'ABC News'}",Max Zahn,Soaring CEO pay commands spotlight in UAW stri...,Autoworkers have demanded wage increases that ...,https://abcnews.go.com/Business/soaring-ceo-pa...,https://i.abcnewsfe.com/a/be523951-d505-41ac-b...,2023-09-16T10:04:31Z,A strike launched by thousands of autoworkers ...
2,"{'id': 'abc-news', 'name': 'ABC News'}",The Associated Press,Turkey's President Erdogan and Elon Musk discu...,Turkish President Recep Tayyip Erdogan has cal...,https://abcnews.go.com/Technology/wireStory/tu...,https://s.abcnews.com/images/US/abc_news_defau...,2023-09-18T07:10:02Z,ISTANBUL -- Turkish President Recep Tayyip Erd...
3,"{'id': 'abc-news', 'name': 'ABC News'}",The Associated Press,Netanyahu visits Elon Musk in California with ...,Prime Minister Benjamin Netanyahu is starting ...,https://abcnews.go.com/Technology/wireStory/ne...,https://s.abcnews.com/images/US/abc_news_defau...,2023-09-18T16:22:23Z,"SAN JOSE, Calif. -- Prime Minister Benjamin Ne..."
4,"{'id': 'abc-news', 'name': 'ABC News'}",The Associated Press,Elon Musk facing defamation lawsuit in Texas o...,A California man who says he was harassed afte...,https://abcnews.go.com/Technology/wireStory/el...,https://i.abcnewsfe.com/a/10421d31-8c0d-47ff-8...,2023-10-02T23:02:32Z,"AUSTIN, Texas -- A California man who says he ..."


In [16]:
# number of articles per source in the first batch
# NOTE(review): json_articles_torows stores `source` as a dict; value_counts() needs
# hashable values, so this may raise TypeError on a fresh run — confirm
articles_df1.source.value_counts()

{'id': 'business-insider', 'name': 'Business Insider'}                  200
{'id': 'the-times-of-india', 'name': 'The Times of India'}              100
{'id': 'fortune', 'name': 'Fortune'}                                     68
{'id': None, 'name': '[Removed]'}                                        49
{'id': 'abc-news', 'name': 'ABC News'}                                   33
{'id': 'the-jerusalem-post', 'name': 'The Jerusalem Post'}               30
{'id': 'next-big-future', 'name': 'Next Big Future'}                     30
{'id': 'cnn', 'name': 'CNN'}                                             29
{'id': 'breitbart-news', 'name': 'Breitbart News'}                       28
{'id': 'bbc-news', 'name': 'BBC News'}                                   24
{'id': 'reuters', 'name': 'Reuters'}                                     24
{'id': 'newsweek', 'name': 'Newsweek'}                                   23
{'id': 'the-irish-times', 'name': 'The Irish Times'}                     23
{'id': 'tech

In [24]:
# number of distinct `source` values in the first batch
articles_df1.source.value_counts().shape

(42,)

In [29]:
# fetch Tesla articles for every English source in a single run (one API request per source)
# NOTE: in the recorded output this call hit the developer-plan request quota and raised on
# HTTP 429; when the call raises, `articles_df` is not assigned by this cell
articles_df = request_tesla(sources_en, api_key, fromdate)

Request status code for abc-news is 200.
There are total of 33 articals from abc-news about Tesla.
Request status code for abc-news-au is 200.
There are total of 12 articals from abc-news-au about Tesla.
Request status code for al-jazeera-english is 200.
There are total of 7 articals from al-jazeera-english about Tesla.
Request status code for ars-technica is 200.
There are total of 14 articals from ars-technica about Tesla.
Request status code for associated-press is 200.
There are total of 1 articals from associated-press about Tesla.
Request status code for australian-financial-review is 200.
There are total of 0 articals from australian-financial-review about Tesla.
Request status code for axios is 200.
There are total of 0 articals from axios about Tesla.
Request status code for bbc-news is 200.
There are total of 12 articals from bbc-news about Tesla.
Request status code for bbc-sport is 200.
There are total of 12 articals from bbc-sport about Tesla.
Request status code for bleac

Exception: Request for crypto-coins-news returned an error: 429 {"status":"error","code":"rateLimited","message":"You have made too many requests recently. Developer accounts are limited to 100 requests over a 24 hour period (50 requests available every 12 hours). Please upgrade to a paid plan if you need more requests."}

In [18]:
# NOTE(review): `articles_df2` is not assigned in any of the cells shown here, so this fails
# on a fresh kernel — presumably left over from a removed cell; confirm the intended frame
articles_df2.shape

(898, 8)

In [50]:
# preview the first rows of the collected articles
articles_df.head()

Unnamed: 0,source,author,title,description,url,photo_url,pub_date,content
0,"{'id': 'abc-news', 'name': 'ABC News'}","Brittany Gaddy, Deena Zaru","EEOC sues Tesla, alleging race discrimination ...","Tesla, the electric car company, is facing acc...",https://abcnews.go.com/US/eeoc-files-federal-l...,https://i.abcnewsfe.com/a/63a26308-5b81-4b2f-b...,2023-09-28T23:53:26Z,The U.S. Equal Employment Opportunity Commissi...
1,"{'id': 'abc-news', 'name': 'ABC News'}",Max Zahn,Soaring CEO pay commands spotlight in UAW stri...,Autoworkers have demanded wage increases that ...,https://abcnews.go.com/Business/soaring-ceo-pa...,https://i.abcnewsfe.com/a/be523951-d505-41ac-b...,2023-09-16T10:04:31Z,A strike launched by thousands of autoworkers ...
2,"{'id': 'abc-news', 'name': 'ABC News'}",The Associated Press,Turkey's President Erdogan and Elon Musk discu...,Turkish President Recep Tayyip Erdogan has cal...,https://abcnews.go.com/Technology/wireStory/tu...,https://s.abcnews.com/images/US/abc_news_defau...,2023-09-18T07:10:02Z,ISTANBUL -- Turkish President Recep Tayyip Erd...
3,"{'id': 'abc-news', 'name': 'ABC News'}",The Associated Press,Netanyahu visits Elon Musk in California with ...,Prime Minister Benjamin Netanyahu is starting ...,https://abcnews.go.com/Technology/wireStory/ne...,https://s.abcnews.com/images/US/abc_news_defau...,2023-09-18T16:22:23Z,"SAN JOSE, Calif. -- Prime Minister Benjamin Ne..."
4,"{'id': 'abc-news', 'name': 'ABC News'}",The Associated Press,Elon Musk facing defamation lawsuit in Texas o...,A California man who says he was harassed afte...,https://abcnews.go.com/Technology/wireStory/el...,https://i.abcnewsfe.com/a/10421d31-8c0d-47ff-8...,2023-10-02T23:02:32Z,"AUSTIN, Texas -- A California man who says he ..."


In [51]:
# number of articles per source
# NOTE(review): json_articles_torows stores `source` as a dict; value_counts() needs
# hashable values, so this may raise TypeError on a fresh run — confirm
articles_df.source.value_counts()

{'id': 'business-insider', 'name': 'Business Insider'}        200
{'id': 'abc-news', 'name': 'ABC News'}                         33
{'id': 'breitbart-news', 'name': 'Breitbart News'}             29
{'id': 'bbc-news', 'name': 'BBC News'}                         24
{'id': 'ars-technica', 'name': 'Ars Technica'}                 14
{'id': 'abc-news-au', 'name': 'ABC News (AU)'}                 12
{'id': 'al-jazeera-english', 'name': 'Al Jazeera English'}      7
{'id': 'associated-press', 'name': 'Associated Press'}          1
Name: source, dtype: int64

In [52]:
# extract date
# parse the publication timestamps into datetime values
articles_df['pub_date'] = pd.to_datetime(articles_df['pub_date'])
# calendar date as a timezone-naive midnight timestamp, computed directly on the
# datetime values instead of formatting to 'YYYY-MM-DD' strings and re-parsing them
articles_df['date'] = articles_df['pub_date'].dt.tz_localize(None).dt.normalize()
articles_df.head()

Unnamed: 0,source,author,title,description,url,photo_url,pub_date,content,date
0,"{'id': 'abc-news', 'name': 'ABC News'}","Brittany Gaddy, Deena Zaru","EEOC sues Tesla, alleging race discrimination ...","Tesla, the electric car company, is facing acc...",https://abcnews.go.com/US/eeoc-files-federal-l...,https://i.abcnewsfe.com/a/63a26308-5b81-4b2f-b...,2023-09-28 23:53:26+00:00,The U.S. Equal Employment Opportunity Commissi...,2023-09-28
1,"{'id': 'abc-news', 'name': 'ABC News'}",Max Zahn,Soaring CEO pay commands spotlight in UAW stri...,Autoworkers have demanded wage increases that ...,https://abcnews.go.com/Business/soaring-ceo-pa...,https://i.abcnewsfe.com/a/be523951-d505-41ac-b...,2023-09-16 10:04:31+00:00,A strike launched by thousands of autoworkers ...,2023-09-16
2,"{'id': 'abc-news', 'name': 'ABC News'}",The Associated Press,Turkey's President Erdogan and Elon Musk discu...,Turkish President Recep Tayyip Erdogan has cal...,https://abcnews.go.com/Technology/wireStory/tu...,https://s.abcnews.com/images/US/abc_news_defau...,2023-09-18 07:10:02+00:00,ISTANBUL -- Turkish President Recep Tayyip Erd...,2023-09-18
3,"{'id': 'abc-news', 'name': 'ABC News'}",The Associated Press,Netanyahu visits Elon Musk in California with ...,Prime Minister Benjamin Netanyahu is starting ...,https://abcnews.go.com/Technology/wireStory/ne...,https://s.abcnews.com/images/US/abc_news_defau...,2023-09-18 16:22:23+00:00,"SAN JOSE, Calif. -- Prime Minister Benjamin Ne...",2023-09-18
4,"{'id': 'abc-news', 'name': 'ABC News'}",The Associated Press,Elon Musk facing defamation lawsuit in Texas o...,A California man who says he was harassed afte...,https://abcnews.go.com/Technology/wireStory/el...,https://i.abcnewsfe.com/a/10421d31-8c0d-47ff-8...,2023-10-02 23:02:32+00:00,"AUSTIN, Texas -- A California man who says he ...",2023-10-02


In [60]:
# pull the display name and the id out of each article's `source` dict
articles_df['source_name'] = [src['name'] for src in articles_df['source']]
articles_df['source_id'] = [src['id'] for src in articles_df['source']]
articles_df.head()

Unnamed: 0,source,author,title,description,url,photo_url,pub_date,content,date,source_name,source_id
0,"{'id': 'abc-news', 'name': 'ABC News'}","Brittany Gaddy, Deena Zaru","EEOC sues Tesla, alleging race discrimination ...","Tesla, the electric car company, is facing acc...",https://abcnews.go.com/US/eeoc-files-federal-l...,https://i.abcnewsfe.com/a/63a26308-5b81-4b2f-b...,2023-09-28 23:53:26+00:00,The U.S. Equal Employment Opportunity Commissi...,2023-09-28,ABC News,abc-news
1,"{'id': 'abc-news', 'name': 'ABC News'}",Max Zahn,Soaring CEO pay commands spotlight in UAW stri...,Autoworkers have demanded wage increases that ...,https://abcnews.go.com/Business/soaring-ceo-pa...,https://i.abcnewsfe.com/a/be523951-d505-41ac-b...,2023-09-16 10:04:31+00:00,A strike launched by thousands of autoworkers ...,2023-09-16,ABC News,abc-news
2,"{'id': 'abc-news', 'name': 'ABC News'}",The Associated Press,Turkey's President Erdogan and Elon Musk discu...,Turkish President Recep Tayyip Erdogan has cal...,https://abcnews.go.com/Technology/wireStory/tu...,https://s.abcnews.com/images/US/abc_news_defau...,2023-09-18 07:10:02+00:00,ISTANBUL -- Turkish President Recep Tayyip Erd...,2023-09-18,ABC News,abc-news
3,"{'id': 'abc-news', 'name': 'ABC News'}",The Associated Press,Netanyahu visits Elon Musk in California with ...,Prime Minister Benjamin Netanyahu is starting ...,https://abcnews.go.com/Technology/wireStory/ne...,https://s.abcnews.com/images/US/abc_news_defau...,2023-09-18 16:22:23+00:00,"SAN JOSE, Calif. -- Prime Minister Benjamin Ne...",2023-09-18,ABC News,abc-news
4,"{'id': 'abc-news', 'name': 'ABC News'}",The Associated Press,Elon Musk facing defamation lawsuit in Texas o...,A California man who says he was harassed afte...,https://abcnews.go.com/Technology/wireStory/el...,https://i.abcnewsfe.com/a/10421d31-8c0d-47ff-8...,2023-10-02 23:02:32+00:00,"AUSTIN, Texas -- A California man who says he ...",2023-10-02,ABC News,abc-news


In [62]:
# keep only the useful columns
cols_keep = [
    'source_name',
    'author',
    'date',
    'title',
    'description',
    'content',
]
news_df = articles_df.loc[:, cols_keep]
news_df.head()

Unnamed: 0,source_name,author,date,title,description,content
0,ABC News,"Brittany Gaddy, Deena Zaru",2023-09-28,"EEOC sues Tesla, alleging race discrimination ...","Tesla, the electric car company, is facing acc...",The U.S. Equal Employment Opportunity Commissi...
1,ABC News,Max Zahn,2023-09-16,Soaring CEO pay commands spotlight in UAW stri...,Autoworkers have demanded wage increases that ...,A strike launched by thousands of autoworkers ...
2,ABC News,The Associated Press,2023-09-18,Turkey's President Erdogan and Elon Musk discu...,Turkish President Recep Tayyip Erdogan has cal...,ISTANBUL -- Turkish President Recep Tayyip Erd...
3,ABC News,The Associated Press,2023-09-18,Netanyahu visits Elon Musk in California with ...,Prime Minister Benjamin Netanyahu is starting ...,"SAN JOSE, Calif. -- Prime Minister Benjamin Ne..."
4,ABC News,The Associated Press,2023-10-02,Elon Musk facing defamation lawsuit in Texas o...,A California man who says he was harassed afte...,"AUSTIN, Texas -- A California man who says he ..."


In [63]:
# check non-null counts (missing values) and the data type of each column
news_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 320 entries, 0 to 319
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   source_name  320 non-null    object        
 1   author       317 non-null    object        
 2   date         320 non-null    datetime64[ns]
 3   title        320 non-null    object        
 4   description  320 non-null    object        
 5   content      320 non-null    object        
dtypes: datetime64[ns](1), object(5)
memory usage: 15.1+ KB


**Note:**
Only the `author` column has missing values, which is not a big issue for this task.

In [64]:
# save out the news data as CSV without the index column
# (the path is relative to the notebook's working directory)
news_df.to_csv('../data/Tesla_news.csv', index = False)