In [1]:
#import libraries
import requests
import json
import pandas as pd
pd.set_option('display.max_columns', 40)
pd.set_option('display.max_rows', 400)
import os

In [2]:
#import apikey
from api_key import api_key as key

In [3]:
#maps each chart name to its request url (filled in by get_chart)
chart_dict = dict()

In [4]:
#each chart name paired with the series id that is requested for it
_chart_series = [
    ('retail_gas_price', 'TOTAL.MGUCUUS.M'),
    ('regular_gas_price', 'PET.EMM_EPMR_PTE_NUS_DPG.M'),
    ('premium_gas_price', 'PET.EMM_EPMP_PTE_NUS_DPG.M'),
    ('europe_brent', 'PET.RBRTE.M'),
    ('wti', 'PET.RWTC.M'),
    ('oil_supply', 'PET.MTTUA_NUS_1.M'),
    ('crude_oil_production', 'TOTAL.PAPRP48.M'),
]
#list of charts for the url requests
names = [chart for chart, _ in _chart_series]
#list of ids for the url requests (same order as names)
series_ids = [series for _, series in _chart_series]

In [5]:
def get_chart(names, series_id):
    """
    Builds the request url for every chart and stores it under the
    chart's name.

    INPUT: names - list of chart names
           series_id - list of series ids, in the same order as names
    OUTPUT: the notebook-level chart_dict, updated in place so that each
            chart name maps to its request url
    RAISES: ValueError if the two lists do not have the same length
    """
    if len(names) != len(series_id):
        raise ValueError(
            'names and series_id must have the same length '
            '(got {} and {})'.format(len(names), len(series_id))
        )

    #the ids are taken from the series_id argument itself, so the function
    #no longer depends on a notebook-level list being defined
    for name, sid in zip(names, series_id):
        chart_dict[name] = 'http://api.eia.gov/series/?api_key={}&series_id={}'.format(key, sid)
    return chart_dict

In [6]:
def make_call(url_value, timeout=30):
    """
    makes an API call
    INPUT: url_value - the url to request
           timeout - seconds to wait for the server before giving up
                     (passed straight to requests.get)
    OUTPUT: the 'data' entry of the first item under 'series' in the
            JSON response
    RAISES: requests.HTTPError when the server answers with an error status
    """
    #without a timeout a stalled connection would hang the notebook forever
    response = requests.get(url_value, timeout=timeout)

    #fail on HTTP errors here rather than with an unrelated KeyError below
    response.raise_for_status()

    return response.json()['series'][0]['data']

In [7]:
#keeping a reference to the chart name -> url dictionary built by get_chart
chart_dict = get_chart(names=names, series_id=series_ids)

In [8]:
def data_to_df(chart_dict, name=None):
    """
    Downloads one chart and adds it as a column to ../Data/GasPrices.csv.

    From here it: Makes an API call, Sorts the data to be put in a
    DataFrame, & Creates the DataFrame (or merges into the existing csv)

    INPUT: chart_dict - dictionary of chart names and urls
           name - key of the chart to download; when omitted the chart is
                  names[count], read from the notebook-level variables
    OUTPUT: None (the result is written to ../Data/GasPrices.csv)
    """
    #fall back to the notebook-level loop counter when no chart is named
    if name is None:
        name = names[count]

    data = make_call(chart_dict[name])

    #each record is read as [date, value]; sorts the data by date
    sorted_data = sorted(data, key=lambda record: record[0])

    #adding a slash after the first four characters of the date
    #(presumably the year - TODO confirm) to allow it to be parsed;
    #new lists are built so the API response itself is left untouched
    dates = [record[0][:4] + '/' + record[0][4:] for record in sorted_data]
    values = [record[1] for record in sorted_data]

    #the chart's data as a two-column frame
    new_chart = pd.DataFrame({'date': dates, name: values})

    csv_path = '../Data/GasPrices.csv'

    if not os.path.exists(csv_path):
        #creates the initial csv if there isn't one
        new_chart.to_csv(csv_path, index = False)
    else:
        #merges the new chart into the existing csv; with no `on` given
        #pandas joins on every shared column name, and how='left' keeps
        #only the rows already present in the csv
        existing = pd.read_csv(csv_path)
        gas_data = pd.merge(existing, new_chart, how = 'left')
        gas_data.to_csv(csv_path, index = False)
        

In [9]:
#iterating through the charts to have 1 combined dataframe;
#data_to_df is called once per chart while the notebook-level count
#holds that chart's position in names
count = 0
for count in range(len(names)):
    data_to_df(chart_dict)

#leave the counter one past the last position once every chart is done
count = len(names)

In [10]:
#show up to 40 columns and 400 rows when a frame is displayed
pd.options.display.max_columns = 40
pd.options.display.max_rows = 400

#reloading the combined dataset from disk
gas = pd.read_csv('../Data/GasPrices.csv')

In [11]:
#index the combined dataset by its date column
gas = gas.set_index('date')

In [12]:
def format_import_date(x):
    '''
    Formats a date value from the imports csv so it is
    compatible with the other data: the value is turned
    into text and '-01-01' is appended to it.
    '''
    return '{!s}-01-01'.format(x)

In [13]:
#getting the imports data: the first 4 rows of the file are skipped, the
#year column is renamed to date, only date and imports are kept, and the
#date is formatted with format_import_date.
#Built as one chain so the date column is never assigned on a column
#selection of another frame (which raises SettingWithCopyWarning).
imports = (
    pd.read_csv('../Data/additional_data/petroleum-consumption.csv', skiprows=4)
    .rename(columns={'year': 'date'})
    .loc[:, ['date', 'imports']]
    .assign(date=lambda frame: frame['date'].apply(format_import_date))
)
imports.to_csv('../Data/additional_data/imports.csv', index = False)

In [14]:
#reloading the formatted imports from the same path they were saved to
imports = pd.read_csv('../Data/additional_data/imports.csv')

In [15]:
#names of the csv files holding the exogenous variables
csv_list = [
    'employees_oil_extraction',
    'imports',
    'federal_gas_tax',
    'state_gas_tax',
    'inflation',
]

In [16]:
def add_new_data(csv_list, count):
    """
    Adds one csv (collected from https://fred.stlouisfed.org/)
    to the main dataset.

    INPUT: csv_list - list of csv file names (without extension)
           count - position in csv_list of the file to add
    OUTPUT: None (../Data/GasPrices.csv is rewritten with the merge result)
    """
    variable = csv_list[count]
    extra = pd.read_csv('../Data/additional_data/{}.csv'.format(variable))

    #the first two columns are renamed to match the main dataset
    first_col, second_col = extra.columns[0], extra.columns[1]
    extra = extra.rename(columns={first_col: 'date', second_col: variable})

    #dates are parsed and then stored as generic objects
    extra['date'] = pd.to_datetime(extra['date']).astype('object')

    #the main dataset gets the same date treatment before merging
    main = pd.read_csv('../Data/GasPrices.csv')
    main['date'] = pd.to_datetime(main['date']).astype('object')

    #left merge: every row of the main dataset is kept
    merged = main.merge(extra, how = 'left')
    merged.to_csv('../Data/GasPrices.csv', index = False)
       

In [17]:
#iterating through the csvs of exogenous variables to merge them with the dataframe
count = 0
for count in range(len(csv_list)):
    add_new_data(csv_list, count)

#leave the counter one past the last position once every csv is merged
count = len(csv_list)

In [18]:
def format_date(x):
    '''
    This function is for the GPR data. It rewrites the date
    text so it can be parsed into a datetime object:
    dashes become slashes and a century is inserted in front
    of the last two characters ('19' when the second-to-last
    digit is above 3, otherwise '20').
    '''
    slashed = x.replace('-','/')
    century = '19' if int(slashed[-2]) > 3 else '20'
    return slashed[:4] + century + slashed[-2:]

In [19]:
#opening the main dataset, with its dates parsed and stored as objects
gas = pd.read_csv('../Data/GasPrices.csv')
gas['date'] = pd.to_datetime(gas['date']).astype('object')

#opening the GPR dataset and giving its date column the same name
gpr = pd.read_csv('../Data/additional_data/gpr_countries.csv').rename(columns={'Date':'date'})

#GPR dates are reformatted with format_date before being parsed
gpr_dates = gpr['date'].apply(format_date)
gpr['date'] = pd.to_datetime(gpr_dates).astype('object')

#merging the two to create the final dataset (every row of gas is kept)
gas_data = gas.merge(gpr, how = 'left')

In [20]:
#writing the final dataset back to the main csv, without the index column
gas_data.to_csv(path_or_buf='../Data/GasPrices.csv', index=False)