In [164]:
import pandas as pd
import numpy as np
import requests
import json
import os
import time
import fred_msa
from datetime import date

In [6]:
csvs_to_ignore = ['msa_series.csv', 'msas-to-use.csv']

In [10]:
filepath = '..\\fred-data'
data_files = set(os.listdir(filepath)).difference(set(csvs_to_ignore))

In [12]:
all_data = {}
for file in data_files:
    all_data[file.replace('.csv', '')] = pd.read_csv(os.path.join(filepath, file))


In [47]:
series = pd.read_csv(os.path.join(filepath, 'msa_series.csv'))

In [50]:
series['clean_title'] = series.title.str.replace(':', ' -')
series.clean_title = [t.split(' in')[0] for t in series.clean_title]

In [54]:
ny = series.query("city == 'New York'")

frequencies = ny[['clean_title', 'frequency', 'seasonal_adjustment']]
used_series = frequencies[frequencies['clean_title'].isin(list(all_data.keys()))].reset_index(drop=True)

In [66]:
series_counts = pd.DataFrame(used_series.groupby(['clean_title'])['frequency'].count()).rename(columns={'frequency' : 'number'}).reset_index()
duplicates = series_counts[series_counts.number > 1].clean_title

In [117]:
list(duplicates)

['All Employees - Accommodation and Food Services',
 'All Employees - Administrative and Support and Waste Management and Remediation Services',
 'All Employees - Education and Health Services',
 'All Employees - Education and Health Services - Health Care and Social Assistance',
 'All Employees - Financial Activities',
 'All Employees - Financial Activities - Finance and Insurance',
 'All Employees - Financial Activities - Insurance Carriers and Related Activities',
 'All Employees - Goods Producing',
 'All Employees - Government',
 'All Employees - Government - Federal Government',
 'All Employees - Government - Local Government',
 'All Employees - Government - State Government',
 'All Employees - Health Care - Ambulatory Health Care Services',
 'All Employees - Health Care - Hospitals',
 'All Employees - Information',
 'All Employees - Information - Telecommunications',
 'Housing Inventory - Median Home Size',
 'Unemployment Rate']

In [121]:
series[(series.city == 'Chicago') & (series.title.str.contains('All Employees: Accomodation'))]

Unnamed: 0,id,realtime_start,realtime_end,title,observation_start,observation_end,frequency,frequency_short,units,units_short,seasonal_adjustment,seasonal_adjustment_short,last_updated,popularity,group_popularity,notes,msa,city,state,clean_title


In [114]:
sorted_freq = frequencies.sort_values(['clean_title', 'frequency', 'seasonal_adjustment'], ascending=[True, False, False])

In [115]:
sorted_freq.groupby(['clean_title']).head(1)

Unnamed: 0,clean_title,frequency,seasonal_adjustment
206,All Employees - Accommodation and Food Services,Monthly,Seasonally Adjusted
192,All Employees - Administrative and Support and...,Monthly,Seasonally Adjusted
67,All Employees - Education and Health Services,Monthly,Seasonally Adjusted
196,All Employees - Education and Health Services ...,Monthly,Seasonally Adjusted
210,All Employees - Federal Government,Annual,Not Seasonally Adjusted
...,...,...,...
110,Total Quantity Indexes for Real GDP for New Yo...,Annual,Not Seasonally Adjusted
9,Total Quarterly Wages,Quarterly,Seasonally Adjusted
111,Total Real Gross Domestic Product for New York...,Annual,Not Seasonally Adjusted
28,Unemployed Persons,Monthly,Not Seasonally Adjusted


In [83]:
dup_selections = dup_frequencies[(dup_frequencies.frequency == 'Monthly') & (dup_frequencies.seasonal_adjustment.str.contains('Not') == False)]

In [85]:
set(duplicates).difference(set(dup_selections.clean_title))

{'Housing Inventory - Median Home Size'}

In [107]:
string = ny[ny.clean_title=='Housing Inventory - Median Home Size'].title[55]

In [110]:
string.split(' in ')[0] + ' in ' + string.split(' in ')[1]

'Housing Inventory: Median Home Size in Square Feet Year-Over-Year'

In [38]:
keyword_list = ['Per Capita Personal Income', 'Resident Population', 'Unemployment Rate',
                'New Private Housing Units Authorized by Building Permits',
                'Regional Price Parities']

ny = series.query("city == 'New York'")
ny_housing = ny.title[ny.title.str.contains('Housing Inventory')]
housing_series = [s.split(' in')[0] for s in ny_housing]

ny_employees = ny.title[ny.title.str.contains('All Employees')]
employees_series = list(np.unique([s.split(' in')[0] for s in ny_employees]))

keyword_list = housing_series + employees_series

In [40]:
keyword_list

['Housing Inventory: Active Listing Count',
 'Housing Inventory: Active Listing Count Month-Over-Month',
 'Housing Inventory: Active Listing Count Year-Over-Year',
 'Housing Inventory: Average Listing Price',
 'Housing Inventory: Average Listing Price Month-Over-Month',
 'Housing Inventory: Average Listing Price Year-Over-Year',
 'Housing Inventory: Median Days on Market',
 'Housing Inventory: Median Days on Market Month-Over-Month',
 'Housing Inventory: Median Days on Market Year-Over-Year',
 'Housing Inventory: Median Listing Price',
 'Housing Inventory: Median Listing Price Month-Over-Month',
 'Housing Inventory: Median Listing Price per Square Feet',
 'Housing Inventory: Median Listing Price per Square Feet Month-Over-Month',
 'Housing Inventory: Median Listing Price per Square Feet Year-Over-Year',
 'Housing Inventory: Median Listing Price Year-Over-Year',
 'Housing Inventory: Median Home Size',
 'Housing Inventory: Median Home Size',
 'Housing Inventory: Median Home Size',
 'Hous

In [44]:
ny[ny.clean_title.isin(pd.Series(keyword_list).str.replace(':', ' -'))]

Unnamed: 0,id,realtime_start,realtime_end,title,observation_start,observation_end,frequency,frequency_short,units,units_short,seasonal_adjustment,seasonal_adjustment_short,last_updated,popularity,group_popularity,notes,msa,city,state,clean_title


In [45]:
pd.Series(keyword_list).str.replace(':', ' -')

0              Housing Inventory - Active Listing Count
1     Housing Inventory - Active Listing Count Month...
2     Housing Inventory - Active Listing Count Year-...
3             Housing Inventory - Average Listing Price
4     Housing Inventory - Average Listing Price Mont...
                            ...                        
69                      All Employees - Wholesale Trade
70    All Employees - Wholesale Trade - Merchant Who...
71    Average Hourly Earnings of All Employees - Tot...
72    Average Weekly Earnings of All Employees - Tot...
73    Average Weekly Hours of All Employees - Total ...
Length: 74, dtype: object

In [46]:
ny.clean_title

0                Housing Inventory: Active Listing Count
1      Housing Inventory: Active Listing Count Month-...
2      Housing Inventory: Active Listing Count Year-O...
3               Housing Inventory: Average Listing Price
4      Housing Inventory: Average Listing Price Month...
                             ...                        
216                      All Employees: Local Government
217          All Employees: Government: Local Government
218                         Market Hotness: Supply Score
219    Quarterly Average of Total Assets for Commerci...
220    Quarterly Average of Total Loans for Commercia...
Name: clean_title, Length: 221, dtype: object

In [15]:
list(all_data.keys())

['Housing Inventory - Median Days on Market',
 'Housing Inventory - New Listing Count Month-Over-Month',
 'Housing Inventory - Median Listing Price',
 'Housing Inventory - New Listing Count',
 'All Employees - Information',
 'Housing Inventory - Price Reduced Count Year-Over-Year',
 'All Employees - Financial Activities - Insurance Carriers and Related Activities',
 'Housing Inventory - Active Listing Count Month-Over-Month',
 'Housing Inventory - Average Listing Price Year-Over-Year',
 'Housing Inventory - Median Listing Price per Square Feet',
 'All Employees - Information - Telecommunications',
 'Housing Inventory - Price Increased Count Month-Over-Month',
 'Housing Inventory - Price Increased Count Year-Over-Year',
 'All Employees - Financial Activities',
 'Housing Inventory - Median Listing Price Month-Over-Month',
 'Housing Inventory - Median Listing Price per Square Feet Month-Over-Month',
 'Housing Inventory - Pending Listing Count Month-Over-Month',
 'All Employees - Financial

In [37]:
all_data['Unemployment Rate']

Unnamed: 0,date,Unemployment Rate,state,msa,city
0,1990-01-01,5.4,NY,"New York-Newark-Jersey City, NY-NJ-PA",New York
1,1991-01-01,7.3,NY,"New York-Newark-Jersey City, NY-NJ-PA",New York
2,1992-01-01,9.1,NY,"New York-Newark-Jersey City, NY-NJ-PA",New York
3,1993-01-01,8.4,NY,"New York-Newark-Jersey City, NY-NJ-PA",New York
4,1994-01-01,7.2,NY,"New York-Newark-Jersey City, NY-NJ-PA",New York
...,...,...,...,...,...
19488,2021-06-01,2.9,ID,"Lewiston, ID-WA",Lewiston
19489,2021-07-01,2.7,ID,"Lewiston, ID-WA",Lewiston
19490,2021-08-01,2.7,ID,"Lewiston, ID-WA",Lewiston
19491,2021-09-01,2.3,ID,"Lewiston, ID-WA",Lewiston


In [122]:
pop = pd.read_csv('..\\fred-data\Resident Population.csv')

In [124]:
income = pd.read_csv('..\\fred-data\Per Capita Personal Income.csv')

In [150]:
prices = pd.read_csv('..\\fred-data\Housing Inventory - Median Listing Price.csv')
listings = pd.read_csv('..\\fred-data\Housing Inventory - New Listing Count.csv')

In [151]:
drop_cols = ['state', 'city', 'title', 'id', 'frequency', 'seasonal_adjustment']

In [156]:
listings

Unnamed: 0,date,Housing Inventory: New Listing Count,state,msa,city,title,id,frequency,seasonal_adjustment
0,2016-07-01,1400.0,SC,"Charleston-North Charleston, SC",Charleston,Housing Inventory: New Listing Count in Charle...,NEWLISCOU16700,Monthly,Not Seasonally Adjusted
1,2016-08-01,1364.0,SC,"Charleston-North Charleston, SC",Charleston,Housing Inventory: New Listing Count in Charle...,NEWLISCOU16700,Monthly,Not Seasonally Adjusted
2,2016-09-01,1296.0,SC,"Charleston-North Charleston, SC",Charleston,Housing Inventory: New Listing Count in Charle...,NEWLISCOU16700,Monthly,Not Seasonally Adjusted
3,2016-10-01,1140.0,SC,"Charleston-North Charleston, SC",Charleston,Housing Inventory: New Listing Count in Charle...,NEWLISCOU16700,Monthly,Not Seasonally Adjusted
4,2016-11-01,1136.0,SC,"Charleston-North Charleston, SC",Charleston,Housing Inventory: New Listing Count in Charle...,NEWLISCOU16700,Monthly,Not Seasonally Adjusted
...,...,...,...,...,...,...,...,...,...
1360,2021-07-01,716.0,KS,"Wichita, KS",Wichita,Housing Inventory: New Listing Count in Wichit...,NEWLISCOU48620,Monthly,Not Seasonally Adjusted
1361,2021-08-01,680.0,KS,"Wichita, KS",Wichita,Housing Inventory: New Listing Count in Wichit...,NEWLISCOU48620,Monthly,Not Seasonally Adjusted
1362,2021-09-01,624.0,KS,"Wichita, KS",Wichita,Housing Inventory: New Listing Count in Wichit...,NEWLISCOU48620,Monthly,Not Seasonally Adjusted
1363,2021-10-01,596.0,KS,"Wichita, KS",Wichita,Housing Inventory: New Listing Count in Wichit...,NEWLISCOU48620,Monthly,Not Seasonally Adjusted


In [159]:
dataset = pd.merge(prices, listings.drop(columns=drop_cols), how='left', on=['date','msa'])

In [172]:
dataset.date = pd.to_datetime(dataset.date)
dataset['year'] = [d.year for d in dataset.date]

In [177]:
income.date = pd.to_datetime(income.date)
income['year'] = [d.year for d in income.date]

pop.date = pd.to_datetime(pop.date)
pop['year'] = [d.year for d in pop.date]

In [180]:
dataset = pd.merge(dataset, income.drop(columns=['city', 'state', 'date']), how='left', on=['msa', 'year'])

In [182]:
dataset = pd.merge(dataset, pop.drop(columns=['city', 'state', 'date']), how='left', on=['msa', 'year'])

In [185]:
dataset['month'] = [d.month for d in dataset.date]

In [189]:
dataset = dataset[['msa', 'city', 'state', 'date', 'year', 'month',
        'Housing Inventory: Median Listing Price',
        'Housing Inventory: New Listing Count',
        'Per Capita Personal Income',
        'Resident Population']]

In [190]:
dataset.to_csv('..\\cleaned-data\\sample_data_set.csv', index=False)

In [147]:
listings.query("city=='Lewiston'").sort_values('date')

Unnamed: 0,date,Housing Inventory: New Listing Count,state,msa,city,title,id,frequency,seasonal_adjustment
585,2016-07-01,96.0,ID,"Lewiston, ID-WA",Lewiston,Housing Inventory: New Listing Count in Lewist...,NEWLISCOU30300,Monthly,Not Seasonally Adjusted
586,2016-08-01,84.0,ID,"Lewiston, ID-WA",Lewiston,Housing Inventory: New Listing Count in Lewist...,NEWLISCOU30300,Monthly,Not Seasonally Adjusted
587,2016-09-01,68.0,ID,"Lewiston, ID-WA",Lewiston,Housing Inventory: New Listing Count in Lewist...,NEWLISCOU30300,Monthly,Not Seasonally Adjusted
588,2016-10-01,56.0,ID,"Lewiston, ID-WA",Lewiston,Housing Inventory: New Listing Count in Lewist...,NEWLISCOU30300,Monthly,Not Seasonally Adjusted
589,2016-11-01,48.0,ID,"Lewiston, ID-WA",Lewiston,Housing Inventory: New Listing Count in Lewist...,NEWLISCOU30300,Monthly,Not Seasonally Adjusted
...,...,...,...,...,...,...,...,...,...
645,2021-07-01,56.0,ID,"Lewiston, ID-WA",Lewiston,Housing Inventory: New Listing Count in Lewist...,NEWLISCOU30300,Monthly,Not Seasonally Adjusted
646,2021-08-01,84.0,ID,"Lewiston, ID-WA",Lewiston,Housing Inventory: New Listing Count in Lewist...,NEWLISCOU30300,Monthly,Not Seasonally Adjusted
647,2021-09-01,64.0,ID,"Lewiston, ID-WA",Lewiston,Housing Inventory: New Listing Count in Lewist...,NEWLISCOU30300,Monthly,Not Seasonally Adjusted
648,2021-10-01,72.0,ID,"Lewiston, ID-WA",Lewiston,Housing Inventory: New Listing Count in Lewist...,NEWLISCOU30300,Monthly,Not Seasonally Adjusted


In [155]:
prices

Unnamed: 0,date,Housing Inventory: Median Listing Price,state,msa,city
0,2016-07-01,428000.00,NY,"New York-Newark-Jersey City, NY-NJ-PA",New York
1,2016-08-01,424950.00,NY,"New York-Newark-Jersey City, NY-NJ-PA",New York
2,2016-09-01,427000.00,NY,"New York-Newark-Jersey City, NY-NJ-PA",New York
3,2016-10-01,434900.00,NY,"New York-Newark-Jersey City, NY-NJ-PA",New York
4,2016-11-01,439450.00,NY,"New York-Newark-Jersey City, NY-NJ-PA",New York
...,...,...,...,...,...
7177,2021-07-01,2.64,ID,"Lewiston, ID-WA",Lewiston
7178,2021-08-01,21.01,ID,"Lewiston, ID-WA",Lewiston
7179,2021-09-01,9.94,ID,"Lewiston, ID-WA",Lewiston
7180,2021-10-01,21.37,ID,"Lewiston, ID-WA",Lewiston
