In [214]:
import os
from io import StringIO
import tabula

# analysis
import numpy as np
import pandas as pd

# api
import json
import requests
import prettytable

# Get data

**What data do we need?**
- GDP by states by industry: https://www.bea.gov/system/files/2020-01/qgdpstate0120_2.pdf
- Employmnet by states by industry:
    - In order to get the data, we probably need to get the employment data by industry from each states' website.
        - NY: https://labor.ny.gov/stats/2016-2026-Statewide-and-Regional-Long-Term-Industry-Projections.xlsx
        - NJ: https://www.nj.gov/labor/lpa/employ/indoccpj/20162026%20Industry%20Employment%20Projections%20Reports.xlsx
        - ...so the 2016-2026 employment projection is avaliable on each states websites
        
- Labor Productivity by states by industry:
    - Private nonfarm productivity and costs by state and region: https://www.bls.gov/lpc/lpc-by-state-and-region.xlsx
    - https://www.bls.gov/opub/mlr/2019/article/bls-publishes-experimental-state-level-labor-productivity-measures.htm
        

## 1.0 GDP by states by industry
**API:** https://www.bls.gov/developers/api_signature_v2.htm

#### Try API

In [151]:
# my_bls_api = "ccdcbc9910a744a5be046d1aa70ac25c"

In [152]:
# headers = {'Content-type': 'application/json'}
# data = json.dumps({"seriesid": ['IPUAN1111__L010','IPUAN1111__L011'],"startyear":"2000", "endyear":"2019"})
# p = requests.post('https://api.bls.gov/ccdcbc9910a744a5be046d1aa70ac25c/v2/timeseries/data/', data=data, headers=headers)
# json_data = json.loads(p.text)
# for series in json_data['Results']['series']:
#     x=prettytable.PrettyTable(["series_id","year","period","value","footnotes"])
#     seriesId = series['seriesID']
#     for item in series['data']:
#         year = item['year']
#         period = item['period']
#         value = item['value']
#         footnotes=""
#         for footnote in item['footnotes']:
#             if footnote:
#                 footnotes = footnotes + footnote['text'] + ','
# #         if 'M01' <= period <= 'M12':
#         x.add_row([seriesId,year,period,value,footnotes[0:-1]])
#     output = open(seriesId + '.txt','w')
#     output.write (x.get_string())
#     output.close()

#### Reading from PDF


#### 1. Current-Dollar Gross Domestic Product (GDP) by State and Region, 2018:Q1-2019:Q3

In [157]:
gdp_df = tabula.read_pdf(os.path.join(os.getcwd(),"data/qgdpstate0120_2.pdf"), 
                         pages = 7)

In [158]:
gdp_df = gdp_df[0]

In [159]:
gdp_df = gdp_df.iloc[3:]

In [160]:
gdp_df = gdp_df.iloc[:,[0,1,2,3,4]]

In [161]:
gdp_df = pd.concat([gdp_df,gdp_df["Millions of dollars"].str.split(" ",expand = True)],axis=1)

In [162]:
gdp_df = gdp_df.drop(columns=["Millions of dollars","Unnamed: 3",5,6,7,8])

In [163]:
gdp_df.columns = ["areas","2018-1","2018-2","2018-3","2018-4","2019-1","2019-2","2019-3"]

In [164]:
gdp_df = gdp_df.reset_index(drop=True)

In [165]:
gdp_df.head()

Unnamed: 0,areas,2018-1,2018-2,2018-3,2018-4,2019-1,2019-2,2019-3
0,United States,20163159,20510177,20749752,20897804,21098827,21340267,21542540
1,New England,1070863,1084947,1096174,1101531,1118703,1130084,1142586
2,Connecticut,272854,273884,278295,277874,281659,284357,287560
3,Maine,63698,64746,65434,65545,66590,67138,67905
4,Massachusetts,558472,568220,573506,577754,586347,592588,599092


#### 2. Contributions to Percent Change in Real Gross Domestic Product (GDP) by State and Region, 2019:Q2-2019:Q3--Table Ends

In [166]:
gdp_by_state_by_inds = tabula.read_pdf(os.path.join(os.getcwd(),"data/qgdpstate0120_2.pdf"), 
                         pages = 6)

In [167]:
gdp_by_state_by_inds = gdp_by_state_by_inds[0]

In [172]:
gdp_by_state_by_inds.iloc[:5,:5]

Unnamed: 0.1,Unnamed: 0,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,,,,,
1,,Other services (except Real estate and governm...,Management of companies and enterprises,,
2,United States,0.11 0.42,0.16,,
3,New England,0.09 0.56,0.20,,
4,Connecticut,0.03 0.37,0.19,,


In [173]:
gdp_by_state_by_inds = gdp_by_state_by_inds[2:]

In [174]:
gdp_by_state_by_inds[["Real estate and rental and leasing",
                      "Professional, scientific, and technical services"]] = gdp_by_state_by_inds["Unnamed: 1"].str.split(" ",expand = True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]


In [175]:
gdp_by_state_by_inds[["Educational services",
                      "Health care and social assistance"]] = gdp_by_state_by_inds[
                                                            "Seasonally adjusted at annual rates"
                                                                                  ].str.split(" ", expand = True)

In [176]:
gdp_by_state_by_inds[["Other services (except government and government enterprises)",
                      "Government and government enterprises"]] = gdp_by_state_by_inds["Unnamed: 14"].str.split(" ", expand = True)

In [177]:
gdp_by_state_by_inds = gdp_by_state_by_inds.drop(columns= ["Unnamed: 1","Unnamed: 14","Seasonally adjusted at annual rates"])

In [178]:
gdp_by_state_by_inds = gdp_by_state_by_inds.dropna(axis=1)

In [180]:
gdp_by_state_by_inds.iloc[:5,:5]

Unnamed: 0.1,Unnamed: 0,Unnamed: 2,Unnamed: 5,Unnamed: 9,Unnamed: 11
2,United States,0.16,0.1,0.02,0.17
3,New England,0.2,0.12,0.04,0.17
4,Connecticut,0.19,0.11,0.04,0.11
5,Maine,0.27,0.15,0.05,0.33
6,Massachusetts,0.2,0.1,0.03,0.16


In [181]:
name_dic = {"Unnamed: 0":"areas",
    "Unnamed: 2":"Management of companies and enterprises",
"Unnamed: 5":"Administrative and support and waste management and remediation services",
"Unnamed: 9":"Arts, entertainment, and recreation", 
"Unnamed: 11":"Accomodation and food services"}

In [182]:
gdp_by_state_by_inds = gdp_by_state_by_inds.rename(columns=name_dic)

In [184]:
gdp_by_state_by_inds.head()

Unnamed: 0,areas,Management of companies and enterprises,Administrative and support and waste management and remediation services,"Arts, entertainment, and recreation",Accomodation and food services,Real estate and rental and leasing,"Professional, scientific, and technical services",Educational services,Health care and social assistance,Other services (except government and government enterprises),Government and government enterprises
2,United States,0.16,0.1,0.02,0.17,0.11,0.42,0.08,0.19,0.05,0.01
3,New England,0.2,0.12,0.04,0.17,0.09,0.56,0.23,0.21,0.05,-0.01
4,Connecticut,0.19,0.11,0.04,0.11,0.03,0.37,0.27,0.17,0.07,-0.04
5,Maine,0.27,0.15,0.05,0.33,0.07,0.26,0.06,0.15,0.05,-0.1
6,Massachusetts,0.2,0.1,0.03,0.16,0.13,0.72,0.24,0.22,0.05,0.0


In [201]:
gdp_by_state_by_inds.columns = gdp_by_state_by_inds.columns.str.replace(
                                                "\)|\(|,| ","_").str.replace("__","_").str.lower()

In [206]:
gdp_by_state_by_inds.dtypes

areas                                                                        object
management_of_companies_and_enterprises                                      object
administrative_and_support_and_waste_management_and_remediation_services     object
arts_entertainment_and_recreation                                           float64
accomodation_and_food_services                                               object
real_estate_and_rental_and_leasing                                           object
professional_scientific_and_technical_services                               object
educational_services                                                         object
health_care_and_social_assistance                                            object
other_services_except_government_and_government_enterprises_                 object
government_and_government_enterprises                                        object
dtype: object

## 2.0 Employment by states
**States Data**

- NY: https://www.labor.ny.gov/stats/2019-2021-Statewide-Short-Term-Industry-Projections.xlsx


In [225]:
# headers = {'User-Agent': "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.132 Safari/537.36"}

# url="https://labor.ny.gov/stats/2016-2026-Statewide-and-Regional-Long-Term-Industry-Projections.xlsx"
# s=requests.get(url, headers= headers).text

### New York

In [265]:
employment_ny = pd.read_excel(os.path.join(os.getcwd(), "data/2019-2021-Statewide-Short-Term-Industry-Projections.xlsx"))

In [266]:
employment_ny.head(10)

Unnamed: 0.1,Unnamed: 0,New York State Department of Labor,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6
0,,"Short-Term Industry Employment Projections, 20...",,,,,
1,,New York State,,,,,
2,,,,,,,
3,,NAICS\nCode,Industry Title,Employment,,Net\nChange,Percent\nChange
4,,,,2019,2021.0,,
5,,,,,,,
6,,000000,Total All Industries,10234640,10435820.0,201180,0.0196568
7,,,,,,,
8,,00601,"Total Self Employed, All Jobs",570590,577460.0,6870,0.0120402
9,,,,,,,


In [267]:
employment_ny = employment_ny.dropna(axis=1,how='all')

In [272]:
employment_ny = employment_ny.dropna(axis=0, how="any").reset_index(drop = True)

In [274]:
employment_ny.shape

(109, 6)

In [275]:
employment_ny.columns = ["naics_code","industry","2016","2026","net_change","pct_change"]

- `00601`: Self Employed Workers, All Jobs
- `11`: Agriculture, Forestry, Fishing and Hunting
- `21`: Mining
- `22`: Utilities
- `23`: Construction
- `31`: Manufacturing
- `42`: Wholesale Trade
- `44`: Retail Trade
- `48`: Transportation and Warehousing
- `51`: Information
- `1023`: Financial Activities
- `1024`: Professional and Business Services
- `61`: Educational Services
- `62`: Health Care and Social Assistance
- `71`: Arts, Entertainment, and Recreation
- `72`: Accommodation and Food Services
- `81`: Other Services (except Government)
- `9`: Government

In [276]:
employment_ny.naics_code = employment_ny.naics_code.astype(int)
employment_ny['2016'] = employment_ny['2016'].astype(int)
employment_ny['2026'] = employment_ny['2026'].astype(int)

In [277]:
industry_codes = [601,11,21,22,23,31,42,44,48,51,1023,1024,61,62,71,72,81,9,]

In [278]:
employment_ny_inds = employment_ny[employment_ny.naics_code.isin(industry_codes)]

In [279]:
employment_ny_inds

Unnamed: 0,naics_code,industry,2016,2026,net_change,pct_change
1,601,"Total Self Employed, All Jobs",570590,577460,6870,0.0120402
2,11,"Agriculture, Forestry, Fishing and Hunting",31180,31870,690,0.0221296
8,21,Mining,4230,4550,320,0.0756501
12,22,Utilities,36700,36430,-270,-0.00735695
13,23,Construction,380940,390570,9630,0.0252796
17,31,Manufacturing,441060,439460,-1600,-0.00362762
39,42,Wholesale Trade,322100,318890,-3210,-0.00996585
43,44,Retail Trade,930340,913410,-16930,-0.0181976
56,48,Transportation and Warehousing,279600,289240,9640,0.0344778
67,51,Information,273110,276340,3230,0.0118267


### New Jersey