##Call the API with my variables for my research

> List of some states "46 SD", "47 TN","10 DE", "2 AK","5 AR","24 MD","31 NE"

CBSA:

"46 SD",
* "43620 Sioux Falls, SD",

"47 TN"
* "16860 Chattanooga, TN-GA",
* "28940 Knoxville, TN",
* "34980": "Nashville-Davidson--Murfreesboro--Franklin, TN",
* "28700": "Kingsport-Bristol-Bristol, TN-VA",
* "32820": "Memphis, TN-MS-AR",
* "17300": "Clarksville, TN-KY",
* "27740": "Johnson City, TN",
* "17420": "Cleveland, TN",

"10 DE"
* "20100 Dover, DE",
* "428": "Philadelphia-Reading-Camden, PA-NJ-DE-MD",
* "41540": "Salisbury, MD-DE",
*
* 0 city for "2 AK"

"5 AR",
* "22900 Fort Smith, AR-OK",
* "30780 Little Rock-North Little Rock-Conway, AR",
* "22220 Fayetteville-Springdale-Rogers, AR-MO",
* "38220 Pine Bluff, AR",
* "32820 Memphis, TN-MS-AR",
*
"24 MD",
* "25180": "Hagerstown-Martinsburg, MD-WV",
* "428": "Philadelphia-Reading-Camden, PA-NJ-DE-MD",
* "12580": "Baltimore-Columbia-Towson, MD",
* "37980": "Philadelphia-Camden-Wilmington, PA-NJ-DE-MD",
* "15680": "California-Lexington Park, MD",
* "41540": "Salisbury, MD-DE",

##CONSTANTES

In [3]:
# (0) IMPORT LIBRARY
import requests
import pandas as pd
import regex as re
import time
from datetime import datetime as dt
import json

In [4]:
# (1) CONSTANTES
month_dict = {    1: "jan",    2: "feb",    3: "mar",    4: "apr",    5: "may",    6: "jun",
    7: "jul",    8: "aug",    9: "sep",    10: "oct",    11: "nov",    12: "dec"}
# Find the key for 'A MONTH'
def monthTLA(m):
  """
  Will return the key for the 3 letters acronym
  """
  global month_dict
  return next(key for key, value in month_dict.items() if value == m)

#Dict of all states
all_states = requests.get('https://api.census.gov/data/2024/cps/basic/jan/variables.json')
states = json.loads(all_states.text)
states_dict = states['variables']['STATE']['values']['item']
states = [state for state in states_dict.keys()]

#All year of the study

##Variables
stateID = states[1] #47 #range(1-56), without 3, 7, 14, 43
year = 2015 # range(2010-2023)
month = month_dict[10] #range(1-12)

##Calling the API (1st step)

In [5]:
# (2) CLASS API
class API:
  """This generic class API will handle at once all request to the CPS api
  varsUrl (variable): given a year, it will return the url for all variables
  allVars_dict (variable): a dict of all variables for each 'year' (key)
  examples:
   - API(year=2010).varsUrl * url for all variables in 2010
   - API().allVars_dict[2023] * to se all the variables (dict) in 2023
   - 'GTCBSA' in API().allVars_dict[2024].keys() * is <GTCBSA> in 2024?
   - API().allState_dict[2021] * all stateID:description in year 2021"""

  def __init__(self) -> None:
    pass

  def get(self, year_search=2010):
    self.varsUrl = 'https://api.census.gov/data/'+ str(year_search) +'/cps/basic/jan/variables.json'

    #all_years = range(2022, 2024)
    all_years = range(2010, 1+dt.now().year)
    self.allVars_dict = {year_: year_vars for year_, year_vars in \
                      zip(all_years, [json.loads(requests.get('https://api.census.gov/data/'+ str(year_) +'/cps/basic/jan/variables.json').text)['variables'] for year_ in all_years])}

#    self.allStates_dict = {_year: state_vars for _year, state_vars in \
#                  zip(all_years, [ self.allVars_dict[_year]['GTCBSA']['values']['item'] for _year in all_years])}

  def store(self):
    pass

  def getVarsNameFromUrl(self, url):
    return re.findall(r'([A-Z]+)', url)

#api=API()
#api.get()
#api.allVars_dict.keys()

In [6]:
# (3) CLASS MIXCOMP
#Decorator to track API response time
def track_response_time(func):
    def wrapper(self, *args, **kwargs):
        start_time = time.time()
        result = func(self, *args, **kwargs)
        end_time = time.time()
        response_time = end_time - start_time
        self.response_time = response_time  # Store in the class
        print(f"Function: '{func.__name__}()'. Census API query took: {response_time} seconds")
        return result
    return wrapper


#Class for unit-request (specific State, specific Year, specific Month)
class MixComp:
  """
This class will pull the Mix composition about the US population from one state.
Code	    Label
tabulate  Counts of instances
PEMNTVTY	Demographics-native country of mother
PEMARITL	Demographics-marital status
PESEX	    Demographics-sex
PRCITSHP	Demographics-United States citizenship group
GTCBSA	  Demographics-city level
STATE	    FIPS STATE Code
GTCBSA	  Demographics-city level

  """
  def __init__(self, stateID, year, month):
      key = '&key='+'804d0a1a18d1de70764950c78e2a3a42d3d45e48'
      self.url = 'https://api.census.gov/data/'+ str(year) +'/cps/basic/'+ month + '?tabulate=weight(PWSSWGT)&row+for&row+PEMNTVTY&row+PEMARITL&row+PESEX&row+PRCITSHP&row+GTCBSA&for=state:' + str(stateID) + key
      self.stateID = stateID
      self.year = year
      self.month = month
      self.response_time = 0 # how long the API took to run on Census CPS
      self.dataAPI = None
      self.dataText = None
      self.data4Df = []
      self.df = pd.DataFrame(None) #Will contain the final df(dataframe)

  def __str__(self):
      return f"""
      Mix Composition of the US population: stateID:={self.stateID}, month={self.month}, year={self.year}
      Dataframe head:
         {self.df.head(3)}"""

  def __repr__(self):
      return f"MixComp({self.stateID}, {self.year}, {self.month})"

  def __len__(self):
      return self.df.shape[0]

  def __add__(self, other):
      return pd.concat([self.df, other.df], axis=0)

  # to convert in class
  def checkVariable(self, year, url):
      """
      this function will check if a set of variables are declared on the Census API. Using the an url as input. Vars are in ALL_CAPS
      """
      api=API()
      api.get()
      matches = api.getVarsNameFromUrl(url)
      cps_vars = api.allVars_dict[year].keys()
      res = dict()
      for var in matches:
        res[var] = True if var in cps_vars else False
      res['status'] = all([status for status in res.values()])
      return res

  @track_response_time
  def get(self):

      #This section will validate all variables and replace the city's with the right one if year>=2024
#      check = self.checkVariable(self.year, self.url)
#      print (f"checkVariable() = {check}")
#      if not check['status']:
#        print(f"BEFORE - Variables = {check}")
#        if (self.year>=2024) and ('CBSA' not in check.keys()): #before 2024, City variable in 'GTCBSA', starting on 2024, City variable was 'CBSA'
#          self.url = re.sub('GTCBSA', 'CBSA', self.url)
#          print(f"AFTER -  Variables = {self.checkVariable(self.year, self.url)}\nNew URL: {self.url}")
#        else:
#          raise(f"Please check all your Census variables. Unable to find some in the API call. Variables = {check}")

      try:
        # download the Demographics_Native_Country_Of_Mother for Mix-Compostion
        self.dataAPI = requests.get(self.url)
        return dict({'response time in secs': round(self.response_time,0), 'status': True if 200==self.dataAPI.status_code else False, 'code':self.dataAPI.status_code, 'comment': 'url:' + self.url})
        None
      except requests.exceptions.RequestException as e:
          # Handle connection errors or HTTP errors
          raise("STOP ERROR: Tools for Data Science - Semester Project: An error occurred:\n [[[[", e, "]]]]")

      except AttributeError:
          raise("STOP ERROR: Please check your variable name!")

  def cleaning(self):
      self.dataText = self.dataAPI.text
      pattern_cleaning = r'\[0,.*?\],?\n?' #remove the data with no tabulate information (tabulate=0)
      self.dataText = re.sub(pattern_cleaning, '', self.dataText)
      pattern_cleaning = r'\n' #remove retunr line
      self.dataText = re.sub(pattern_cleaning, '', self.dataText)

  def preping(self):
      ##Separe cols from data (numbers)
      pattern = r'.*?(\["tabulate".*?"GTCBSA"\]),([\s\S]*)'
      matches = re.search(pattern, self.dataText)
      data_cols = matches.group(1)
      data_vals = matches.group(2)[:-1] #removing the last char "]" at the end.

      self.data_cols = re.findall(r'"(.*?)"', data_cols) #Transform string_column in List_column

      #Transform string_row in List of Lists then Matrix
      rows = re.findall(r'\[(.*?)\]', data_vals)
      for row in rows:
        row_list = re.findall(r'-?\d+(?:\.\d+)?', row)
        self.data4Df.append([int(elt) for elt in row_list])

      print(f'len(self.data4Df): {len(self.data4Df)}')

  def buildingDf(self):
      """
      ##Definition of variables:
      * tabulate: represents the information related to population counts.
      * state: column provides information about the state
      1. PEMNTVTY Demographics-native country of mother
      2. PEMARITL Demographics-marital status
      3. PESEX Demographics-sex
      4. PRCITSHP Demographics-United States citizenship group
      5. GTCBSA Demographics-city level
      We will understand the population composition by state about how mixed it is based on the native country of mothers,
      marital status distribution, gender demographics, citizenship status, at the city-level characteristics.
      This data can be used to uncover trends, patterns, and insights into the social and cultural fabric of the US population.
      """
      ##Convert the result to the Df **mix_composition**
      self.df = pd.DataFrame(self.data4Df, columns=self.data_cols)

      ##Building the YYYYMM column
      self.df['YYYYMM'] = pd.Series([dt.strptime(str(self.year)+ '-' +str(self.month), "%Y-%b") for _ in range(self.df.shape[0])])

  def save(self, path=''):
      try:
        if path == '':
          path=str(self.year)+ '-' +str(self.month) +'-MixComp.csv'
        r = self.df.to_csv( path, index=False)
        return r
      except FileNotFoundError:
        raise ('Check the path {path} of the file')

  def main(self):
      res = self.get(); print(res)
      self.cleaning()
      self.preping()
      self.buildingDf()
      self.save()


In [7]:
# (4) TEST
stateID = states[1] #47 #range(1-56), without 3, 7, 14, 43
year = 2023 # range(2010-2023)
month = month_dict[10] #range(1-12)
dataA = MixComp(stateID, year, month)
dataA.main()

stateID = states[2] #range(1-56), without 3, 7, 14, 43
year = 2015 # range(2010-2023)
month = month_dict[2] #range(1-12)
dataB = MixComp(stateID, year, month)
dataB.main()

dataR = dataA + dataB
dataR

Function: 'get()'. Census API query took: 255.9217848777771 seconds
{'response time in secs': 0, 'status': True, 'code': 200, 'comment': 'url:https://api.census.gov/data/2023/cps/basic/oct?tabulate=weight(PWSSWGT)&row+for&row+PEMNTVTY&row+PEMARITL&row+PESEX&row+PRCITSHP&row+GTCBSA&for=state:47&key=804d0a1a18d1de70764950c78e2a3a42d3d45e48'}
len(self.data4Df): 247
Function: 'get()'. Census API query took: 294.7905013561249 seconds
{'response time in secs': 0, 'status': True, 'code': 200, 'comment': 'url:https://api.census.gov/data/2015/cps/basic/feb?tabulate=weight(PWSSWGT)&row+for&row+PEMNTVTY&row+PEMARITL&row+PESEX&row+PRCITSHP&row+GTCBSA&for=state:10&key=804d0a1a18d1de70764950c78e2a3a42d3d45e48'}
len(self.data4Df): 206


Unnamed: 0,tabulate,state,PEMNTVTY,PEMARITL,PESEX,PRCITSHP,GTCBSA,YYYYMM
0,2667,47,303,-1,2,3,17420,2023-10-01
1,3293,47,303,6,2,3,17420,2023-10-01
2,4566,47,233,1,2,4,17420,2023-10-01
3,3271,47,218,1,2,4,17420,2023-10-01
4,3142,47,163,1,1,4,17420,2023-10-01
...,...,...,...,...,...,...,...,...
201,1146,10,207,6,1,1,37980,2015-02-01
202,591,10,329,6,1,1,37980,2015-02-01
203,1976,10,303,6,1,1,37980,2015-02-01
204,67797,10,57,6,1,1,37980,2015-02-01


##Prep'ing the _columns_ and _data values_ for the _df_ called **mix_composition**

##Convert fto df

#class Creation
Now we have all we need for our analysis, let's try to reorganize what we build so far into a class called _MixComp_

##Let's test the Class MixComp