In [16]:
import json
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import datetime
import re

In [2]:
# URLs for the JSON files to be cleaned

url_historic = r'https://projects.fivethirtyeight.com/trump-approval-ratings/historical-approval.json'

In [3]:
#import the 'Graph values' of historical presidents from JSON to a DataFrame
historical_graph_df = pd.read_json(url_historic)
historical_graph_df.head()


Unnamed: 0,president,date,days,subgroup,approve_estimate,disapprove_estimate
0,Harry S. Truman,1945-06-06,55,All polls,87.0,3.0
1,Harry S. Truman,1945-06-07,56,All polls,87.0,3.0
2,Harry S. Truman,1945-06-08,57,All polls,87.0,3.0
3,Harry S. Truman,1945-06-09,58,All polls,87.0,3.0
4,Harry S. Truman,1945-06-10,59,All polls,87.0,3.0


In [4]:
#import the 'Graph values' of Trump from csv to a DataFrame
trump_graph_df = pd.read_csv('../data/graph_estimates.csv', sep = ';', decimal = ',', index_col = 0)
trump_graph_df.head()


Unnamed: 0,date,future,subgroup,approve_estimate,approve_hi,approve_lo,disapprove_estimate,disapprove_hi,disapprove_lo
0,2017-01-23,False,Adults,45.0,51.1347,38.8653,45.0,51.1347,38.8653
1,2017-01-23,False,All polls,45.46693,50.88971,40.04416,41.26452,46.68729,35.84175
2,2017-01-23,False,Voters,46.0,52.29238,39.70762,37.0,43.29238,30.70762
3,2017-01-24,False,Adults,45.0,50.98562,39.01438,45.74659,51.73221,39.76097
4,2017-01-24,False,All polls,45.44264,50.82922,40.05606,41.87849,47.26508,36.49191


In [5]:
# clean data to match "all polls" and keep columns interested.

trump_graph_clean = trump_graph_df.loc[trump_graph_df['subgroup'] == 'All polls', ['date', 'approve_estimate', 'disapprove_estimate']].reset_index(drop=True)
trump_graph_clean.head()

Unnamed: 0,date,approve_estimate,disapprove_estimate
0,2017-01-23,45.46693,41.26452
1,2017-01-24,45.44264,41.87849
2,2017-01-25,47.76497,42.52911
3,2017-01-26,44.37598,41.06081
4,2017-01-27,44.13586,41.67268


In [22]:
# add the name of the President and reorder columns

trump_graph_clean['president'] = 'Donald J. Trump'
trump_graph_clean['days'] = range(1,len(trump_graph_clean)+1)
trump_graph_clean = trump_graph_clean[['president','date','days','approve_estimate', 'disapprove_estimate']]
trump_graph_clean.head()

Unnamed: 0,president,date,days,approve_estimate,disapprove_estimate
0,Donald J. Trump,2017-01-23,1,45.46693,41.26452
1,Donald J. Trump,2017-01-24,2,45.44264,41.87849
2,Donald J. Trump,2017-01-25,3,47.76497,42.52911
3,Donald J. Trump,2017-01-26,4,44.37598,41.06081
4,Donald J. Trump,2017-01-27,5,44.13586,41.67268


In [23]:
# drop columns from historical presidents to keep the ones that we want

historical_graph_clean = historical_graph_df[['president','date','days', 'approve_estimate', 'disapprove_estimate']]
historical_graph_clean

Unnamed: 0,president,date,days,approve_estimate,disapprove_estimate
0,Harry S. Truman,1945-06-06,55,87.00000,3.00000
1,Harry S. Truman,1945-06-07,56,87.00000,3.00000
2,Harry S. Truman,1945-06-08,57,87.00000,3.00000
3,Harry S. Truman,1945-06-09,58,87.00000,3.00000
4,Harry S. Truman,1945-06-10,59,87.00000,3.00000
...,...,...,...,...,...
25996,Barack Obama,2017-01-16,2918,56.59080,39.03924
25997,Barack Obama,2017-01-17,2919,57.81492,38.76143
25998,Barack Obama,2017-01-18,2920,57.76558,38.39201
25999,Barack Obama,2017-01-19,2921,57.80378,38.34211


In [24]:
# change datetime type to str of historical data df
historical_graph_clean.date = historical_graph_clean.date.astype(str)
historical_graph_clean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26001 entries, 0 to 26000
Data columns (total 5 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   president            26001 non-null  object 
 1   date                 26001 non-null  object 
 2   days                 26001 non-null  int64  
 3   approve_estimate     26001 non-null  float64
 4   disapprove_estimate  26001 non-null  float64
dtypes: float64(2), int64(1), object(2)
memory usage: 1015.8+ KB


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value


In [25]:
#concat the two dataframes

presidents_popularity = pd.concat([historical_graph_clean,trump_graph_clean],ignore_index=True)
presidents_popularity

Unnamed: 0,president,date,days,approve_estimate,disapprove_estimate
0,Harry S. Truman,1945-06-06,55,87.000000,3.000000
1,Harry S. Truman,1945-06-07,56,87.000000,3.000000
2,Harry S. Truman,1945-06-08,57,87.000000,3.000000
3,Harry S. Truman,1945-06-09,58,87.000000,3.000000
4,Harry S. Truman,1945-06-10,59,87.000000,3.000000
...,...,...,...,...,...
27266,Donald J. Trump,2020-07-11,1266,40.121716,55.814000
27267,Donald J. Trump,2020-07-12,1267,40.356987,55.592104
27268,Donald J. Trump,2020-07-13,1268,40.442940,55.169294
27269,Donald J. Trump,2020-07-14,1269,40.311076,55.205477


In [26]:
# # change the Middle names to initials

# for i in range(len(presidents_popularity['president'])):
#     presidents_popularity.loc[i, 'president'] = re.sub(r'\S([A-Z])\w+\S', r'\1', presidents_popularity.loc[i, 'president'])
               
# presidents_popularity['president'].value_counts()

In [27]:
#export to csv
presidents_popularity.to_csv('../data/presidents_popularity.csv', sep=';', decimal=',')