# Data Collection

## Importing Libraries

In [1]:
#import libraries
import requests
import json
import pandas as pd
pd.set_option('display.max_columns', 40)
pd.set_option('display.max_rows', 400)
import os

#I will use this to put the data into a SQL database to be queried
from sqlalchemy import create_engine

## API Key

In [2]:
#import apikey
from api_key import api_key as key

# Data From EIA API

In [3]:
#dictionary that will hold chart names mapped to their request urls
chart_dict = dict()

In [4]:
#chart names, in the same order as the series ids below
names = [
    'retail_gas_price',
    'regular_gas_price',
    'premium_gas_price',
    'europe_brent',
    'wti',
    'oil_supply',
    'crude_oil_production',
]
#EIA series ids; position i belongs to names[i]
series_ids = [
    'TOTAL.MGUCUUS.M',
    'PET.EMM_EPMR_PTE_NUS_DPG.M',
    'PET.EMM_EPMP_PTE_NUS_DPG.M',
    'PET.RBRTE.M',
    'PET.RWTC.M',
    'PET.MTTUA_NUS_1.M',
    'TOTAL.PAPRP48.M',
]

In [5]:
def get_chart(names, series_id, api_key=None):
    """
    Builds a dictionary of chart names and EIA series request urls.

    INPUT:
        names: list of chart names, used as the dictionary keys
        series_id: list of EIA series ids, paired with `names` by position
        api_key: optional API key; when omitted, the module-level `key`
                 (imported from api_key) is used
    OUTPUT: a new dictionary of {chart name: request url}
    RAISES: ValueError if `names` and `series_id` differ in length
    """
    if len(names) != len(series_id):
        raise ValueError(
            'names and series_id must have the same length, got {} and {}'.format(
                len(names), len(series_id)
            )
        )

    if api_key is None:
        #fall back to the key imported at the top of the notebook
        api_key = key

    #https so the API key in the query string is not sent in clear text
    url_template = 'https://api.eia.gov/series/?api_key={}&series_id={}'

    #pair the arguments that were actually passed in, rather than reading the
    #notebook-level series_ids list, and return a fresh dictionary
    return {
        chart_name: url_template.format(api_key, chart_id)
        for chart_name, chart_id in zip(names, series_id)
    }

In [6]:
def make_call(url_value, timeout=30):
    """
    makes an API call
    INPUT: the url; optionally a timeout in seconds for the request (default 30)
    OUTPUT: the 'data' entry of the first series in the JSON response
    RAISES:
        requests.HTTPError if the server answers with an error status
        KeyError / IndexError if the response holds no series data
    """

    #without a timeout a stalled connection would block the notebook indefinitely
    response = requests.get(url_value, timeout=timeout)

    if not response.ok:
        #the url is deliberately left out of the message: it embeds the API key
        raise requests.HTTPError(
            'EIA request failed with HTTP status {}'.format(response.status_code),
            response=response,
        )

    payload = response.json()

    try:
        return payload['series'][0]['data']
    except (KeyError, IndexError) as err:
        #same exception type as before, but with a message that explains what was missing
        raise type(err)(
            'EIA response did not contain series data; top-level keys: {}'.format(
                sorted(payload)
            )
        ) from err

In [7]:
#build the chart name -> request url mapping and bind it to chart_dict for the cells below
chart_dict = get_chart(names, series_ids)

In [8]:
def data_to_df(chart_dict, name=None):

    """
    Fetches one chart's series and adds it as a column to ../Data/Prices.csv.

    INPUT:
        chart_dict: dictionary of chart names and request urls
        name: chart to fetch; when omitted, falls back to names[count] using
              the notebook-level `names` list and `count` counter, which is
              how the driver loop calls this function
    OUTPUT: None. ../Data/Prices.csv is created if missing, otherwise it is
            re-written with the new series left-merged onto the existing rows.
    """

    if name is None:
        #backward-compatible path: the calling loop advances the global `count`
        name = names[count]

    data = make_call(chart_dict[name])

    #sorts the rows by their first element (the period string)
    sorted_data = sorted(data, key = lambda row: row[0])

    #inserting a slash after the first four characters of the period so it can
    #be parsed (presumably 'YYYYMM' -> 'YYYY/MM'; confirm against the API).
    #New strings are built instead of overwriting the rows returned by the API.
    dates = [row[0][:4] + '/' + row[0][4:] for row in sorted_data]
    values = [row[1] for row in sorted_data]

    new_series = pd.DataFrame({'date': dates, name: values})

    prices_path = '../Data/Prices.csv'

    if not os.path.exists(prices_path):

        #first series: it becomes the initial file
        new_series.to_csv(prices_path, index = False)

    else:
        #later series: left-merge onto what is already on disk, joining on the
        #columns the two frames share
        existing = pd.read_csv(prices_path)

        gas_data = pd.merge(existing, new_series, how = 'left')

        gas_data.to_csv(prices_path, index = False)
        

In [9]:
#module-level counter; data_to_df reads it as a global to select names[count]
count = 0

#iterating through the charts to have 1 combined dataframe
#(one API call and one rewrite of ../Data/Prices.csv per chart name)
while count != len(names):

    data_to_df(chart_dict)
    count +=1

In [10]:
#load the combined price data written by data_to_df
#(the pandas display options are already set in the imports cell, so they are not repeated here)
gas = pd.read_csv('../Data/Prices.csv')

In [11]:
#make 'date' the index; guarded and non-inplace so re-running this cell is a no-op
#instead of raising KeyError once 'date' has already been moved out of the columns
if 'date' in gas.columns:
    gas = gas.set_index('date')

In [12]:
gas.tail()

Unnamed: 0_level_0,retail_gas_price,regular_gas_price,premium_gas_price,europe_brent,wti,oil_supply,crude_oil_production
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2021/02,2.559,2.501,3.14,62.28,59.04,13830.0,9316.459
2021/03,2.856,2.81,3.454,65.41,62.33,17592.0,10734.535
2021/04,2.907,2.858,3.519,64.81,61.72,33230.0,10722.489
2021/05,3.041,2.985,3.652,68.53,65.17,,10506.953
2021/06,3.245,3.064,3.744,73.16,71.38,,10709.293


# Data From Federal Reserve Economic Data

In [13]:
def format_import_date(x):
    '''
    Turns a year value from the imports csv into a
    'YEAR-01-01' string so it lines up with the dates
    used by the other data.
    '''
    return '{!s}-01-01'.format(x)

In [14]:
#getting the imports data: skip the 4 leading rows of the raw file, keep only the
#year and imports columns, and turn each year into a 'YEAR-01-01' date string.
#Built as one chain so no column is assigned on a slice of another frame
#(which is what triggers pandas' SettingWithCopyWarning).
imports = (
    pd.read_csv('../Data/additional_data/petroleum-consumption.csv', skiprows=4)
    .rename(columns={'year': 'date'})
    .loc[:, ['date', 'imports']]
    .assign(date=lambda frame: frame['date'].apply(format_import_date))
)
imports.to_csv('../Data/additional_data/imports.csv', index = False)

In [15]:
#reload from the same location the previous cell saved to
imports = pd.read_csv('../Data/additional_data/imports.csv')

In [16]:
imports.head()

Unnamed: 0,date,imports
0,1950-01-01,0.85
1,1951-01-01,0.844
2,1952-01-01,0.952
3,1953-01-01,1.034
4,1954-01-01,1.052


In [17]:
#names (without the .csv extension) of the exogenous-variable files
csv_list = [
    'employees_oil_extraction',
    'imports',
    'inflation',
]

In [18]:
def add_new_data(csv_list, count):
    """
    Normalises one csv collected from https://fred.stlouisfed.org/
    and folds it into the main outside-data file.

    csv_list: names (without extension) of csvs in ../Data/additional_data
    count: position in csv_list of the csv to process

    The source csv is re-written in place with its first two columns renamed
    to 'date' and the csv's name. ../Data/OutsideData.csv is created from it
    if missing, otherwise the csv is left-merged onto the existing file.
    """
    series_name = csv_list[count]
    source_csv = '../Data/additional_data/{}.csv'.format(series_name)
    combined_csv = '../Data/OutsideData.csv'

    frame = pd.read_csv(source_csv)

    #first column becomes 'date', second takes the name of the csv
    first_col, second_col = frame.columns[0], frame.columns[1]
    frame = frame.rename(columns={first_col: 'date', second_col: series_name})

    #parse the dates, then hold them as plain objects before saving
    frame['date'] = pd.to_datetime(frame['date']).astype('object')
    frame.to_csv(source_csv, index = False)

    #everything below works from the file as it now exists on disk
    reloaded = pd.read_csv(source_csv)

    if os.path.exists(combined_csv):
        #combined file already there: left-merge the new series onto it
        current = pd.read_csv(combined_csv)
        pd.merge(current, reloaded, how = 'left').to_csv(combined_csv, index = False)
    else:
        #no combined file yet: this series becomes the starting point
        reloaded.to_csv(combined_csv, index = False)

In [19]:
#reset the module-level counter; it is passed to add_new_data as the index into csv_list
count = 0

#iterating through the csvs of exogenous variables to merge them with the dataframe
#(each call re-writes one source csv and ../Data/OutsideData.csv)
while count != len(csv_list):
    
    add_new_data(csv_list, count)
    count +=1

In [20]:
#load the combined exogenous data written by add_new_data
outside=pd.read_csv('../Data/OutsideData.csv')

#preview the first rows
outside.head()

Unnamed: 0,date,employees_oil_extraction,imports,inflation
0,1972-01-01 00:00:00,141.2,4.741,3.272278
1,1972-02-01 00:00:00,140.9,,
2,1972-03-01 00:00:00,140.8,,
3,1972-04-01 00:00:00,140.2,,
4,1972-05-01 00:00:00,139.6,,


# Data From Matteo Iacoviello's GPR Dataset

In [21]:
def format_date(x):
    '''
    Prepares a GPR date string for datetime parsing.
    Dashes become slashes and the trailing two-digit year is
    expanded to four digits: a tens digit above 3 is read as
    19xx, anything else as 20xx.
    '''
    slashed = x.replace('-','/')
    century = '19' if int(slashed[-2]) > 3 else '20'
    return slashed[:4] + century + slashed[-2:]

In [22]:
#load the GPR dataset with its 'Date' column renamed to 'date'
gpr = pd.read_csv('../Data/additional_data/gpr_countries.csv').rename(columns={'Date':'date'})
#expand the two-digit years, parse to datetimes, then store them with object dtype
gpr['date'] = pd.to_datetime(gpr['date'].apply(format_date)).astype('object')

#reload the combined exogenous data from disk
outside = pd.read_csv('../Data/OutsideData.csv')

In [23]:
gpr.head()

Unnamed: 0,date,GPR_TURKEY,GPR_MEXICO,GPR_KOREA,GPR_RUSSIA,GPR_INDIA,GPR_BRAZIL,GPR_CHINA,GPR_INDONESIA,GPR_SAUDI_ARABIA,GPR_SOUTH_AFRICA,GPR_ARGENTINA,GPR_COLOMBIA,GPR_VENEZUELA,GPR_THAILAND,GPR_UKRAINE,GPR_ISRAEL,GPR_MALAYSIA,GPR_PHILIPPINES,GPR_HONG_KONG
0,1985-01-01 00:00:00,74.398603,71.834645,50.010939,107.273026,64.703527,136.391765,75.861629,44.390573,44.932456,106.480231,116.967718,75.838925,52.717917,112.433888,95.716378,74.056598,24.552243,77.71687,63.651006
1,1985-02-01 00:00:00,64.273115,66.904664,105.825328,68.845354,59.322351,114.677505,67.389634,69.089087,65.595838,124.38528,170.897506,64.985289,89.321653,99.368654,104.417867,71.946724,79.002782,111.065937,54.693568
2,1985-03-01 00:00:00,111.375154,87.125647,98.235773,97.093057,90.753782,165.492686,81.764022,55.324211,50.966551,140.265543,211.254196,78.473388,80.637604,97.096086,143.574568,65.432104,62.897881,108.276866,79.800972
3,1985-04-01 00:00:00,123.198545,86.839838,77.02932,91.826265,60.033003,136.572356,71.426214,59.253488,63.286368,209.352245,187.033352,65.917646,70.695981,87.736872,53.840463,73.311768,99.962704,68.075857,60.581997
4,1985-05-01 00:00:00,52.541386,91.348379,88.949804,114.140725,131.693643,132.046243,56.53056,60.16508,48.395458,146.59356,139.151251,75.774491,82.42955,90.642066,99.87796,60.871206,77.748769,100.429197,41.708665


In [24]:
outside.head()

Unnamed: 0,date,employees_oil_extraction,imports,inflation
0,1972-01-01 00:00:00,141.2,4.741,3.272278
1,1972-02-01 00:00:00,140.9,,
2,1972-03-01 00:00:00,140.8,,
3,1972-04-01 00:00:00,140.2,,
4,1972-05-01 00:00:00,139.6,,


In [25]:
#merging the two to create the final dataset.
#`outside` comes from read_csv, so its dates are strings, while gpr['date'] holds
#Timestamp objects; joined as-is, no key ever matches and every GPR column is left
#empty. Both keys are therefore converted to datetimes and the join key is explicit.
#Note: the 'date' column of the result is a datetime column, not strings.
outside_data = pd.merge(
    outside.assign(date=pd.to_datetime(outside['date'])),
    gpr.assign(date=pd.to_datetime(gpr['date'])),
    how = 'left',
    on = 'date',
)

In [26]:
outside_data.head()

Unnamed: 0,date,employees_oil_extraction,imports,inflation,GPR_TURKEY,GPR_MEXICO,GPR_KOREA,GPR_RUSSIA,GPR_INDIA,GPR_BRAZIL,GPR_CHINA,GPR_INDONESIA,GPR_SAUDI_ARABIA,GPR_SOUTH_AFRICA,GPR_ARGENTINA,GPR_COLOMBIA,GPR_VENEZUELA,GPR_THAILAND,GPR_UKRAINE,GPR_ISRAEL,GPR_MALAYSIA,GPR_PHILIPPINES,GPR_HONG_KONG
0,1972-01-01 00:00:00,141.2,4.741,3.272278,,,,,,,,,,,,,,,,,,,
1,1972-02-01 00:00:00,140.9,,,,,,,,,,,,,,,,,,,,,
2,1972-03-01 00:00:00,140.8,,,,,,,,,,,,,,,,,,,,,
3,1972-04-01 00:00:00,140.2,,,,,,,,,,,,,,,,,,,,,
4,1972-05-01 00:00:00,139.6,,,,,,,,,,,,,,,,,,,,,


In [27]:
#saving the final data (index = False, matching every other to_csv call in this
#notebook, so the row index is not written out as an extra unnamed column that
#add_new_data would read back in on a later run)
outside_data.to_csv('../Data/OutsideData.csv', index = False)