Related page: https://model.earth/data-commons/docs/data

TO DO: Reduce the state identifier to just the state number (e.g. 01) instead of geoId/01

# Installations and Imports

In [26]:
pip install datacommons_pandas

Note: you may need to restart the kernel to use updated packages.


In [27]:
import numpy as np
import pandas as pd
pd.set_option('display.float_format', '{:.2f}'.format)
import datacommons_pandas as dc
import geopandas as gpd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import requests

# Data Pull for all the states in the USA

In [28]:
# Two-letter abbreviation -> full name, one entry per line.
stateDict = {
    "AL": "Alabama",
    "AK": "Alaska",
    "AZ": "Arizona",
    "AR": "Arkansas",
    "CA": "California",
    "CO": "Colorado",
    "CT": "Connecticut",
    "DE": "Delaware",
    "FL": "Florida",
    "GA": "Georgia",
    "HI": "Hawaii",
    "ID": "Idaho",
    "IL": "Illinois",
    "IN": "Indiana",
    "IA": "Iowa",
    "KS": "Kansas",
    "KY": "Kentucky",
    "LA": "Louisiana",
    "ME": "Maine",
    "MD": "Maryland",
    "MA": "Massachusetts",
    "MI": "Michigan",
    "MN": "Minnesota",
    "MS": "Mississippi",
    "MO": "Missouri",
    "MT": "Montana",
    "NE": "Nebraska",
    "NV": "Nevada",
    "NH": "New Hampshire",
    "NJ": "New Jersey",
    "NM": "New Mexico",
    "NY": "New York",
    "NC": "North Carolina",
    "ND": "North Dakota",
    "OH": "Ohio",
    "OK": "Oklahoma",
    "OR": "Oregon",
    "PA": "Pennsylvania",
    "RI": "Rhode Island",
    "SC": "South Carolina",
    "SD": "South Dakota",
    "TN": "Tennessee",
    "TX": "Texas",
    "UT": "Utah",
    "VT": "Vermont",
    "VA": "Virginia",
    "WA": "Washington",
    "WV": "West Virginia",
    "WI": "Wisconsin",
    "WY": "Wyoming",
    "DC": "District of Columbia",
    # US Territories
    "AS": "American Samoa",
    "GU": "Guam",
    "MP": "Northern Mariana Islands",
    "PR": "Puerto Rico",
    "VI": "Virgin Islands of the U.S.",
}

# Lookup table with one row per entry: abbreviation in 'State', full name in 'StateName'.
stateData = pd.DataFrame({
    'State': list(stateDict.keys()),
    'StateName': list(stateDict.values()),
})
stateData.head()

Unnamed: 0,State,StateName
0,AL,Alabama
1,AK,Alaska
2,AZ,Arizona
3,AR,Arkansas
4,CA,California


In [29]:
def get_state_ids():
  """Return a dict mapping Data Commons DCIDs to place names for US states.

  The DCIDs are the places of type 'State' contained in 'country/USA', plus
  four territory DCIDs that are added explicitly. Names are fetched with a
  single batched `get_property_values` call rather than one request per place.

  Returns:
    dict of DCID (e.g. 'geoId/01') -> first 'name' value for that DCID, in the
    order states were returned followed by the territories. A DCID for which
    no name comes back is left out instead of raising IndexError.
  """
  # Getting the states dcids
  state_dcids = dc.get_places_in(['country/USA'], 'State').get('country/USA', [])

  # Adding Rest 4 US Territories
  # (presumably American Samoa, Guam, Northern Mariana Islands and the US Virgin Islands -- confirm)
  territory_dcids = ['geoId/60', 'geoId/66', 'geoId/69', 'geoId/78']

  # De-duplicate while keeping order, in case a territory is already in the state list
  all_dcids = list(dict.fromkeys([*state_dcids, *territory_dcids]))

  # Getting the names of all places in one request
  names_by_dcid = dc.get_property_values(all_dcids, 'name')

  data = {}
  for dcid in all_dcids:
    names = names_by_dcid.get(dcid)
    if names:
      data[dcid] = names[0]
  return data

In [30]:
# DCID -> place name mapping for the US states and territories (e.g. 'geoId/01': 'Alabama')
state_ids_data = get_state_ids()
state_ids_data

{'geoId/01': 'Alabama',
 'geoId/02': 'Alaska',
 'geoId/04': 'Arizona',
 'geoId/05': 'Arkansas',
 'geoId/06': 'California',
 'geoId/08': 'Colorado',
 'geoId/09': 'Connecticut',
 'geoId/10': 'Delaware',
 'geoId/11': 'District of Columbia',
 'geoId/12': 'Florida',
 'geoId/13': 'Georgia',
 'geoId/15': 'Hawaii',
 'geoId/16': 'Idaho',
 'geoId/17': 'Illinois',
 'geoId/18': 'Indiana',
 'geoId/19': 'Iowa',
 'geoId/20': 'Kansas',
 'geoId/21': 'Kentucky',
 'geoId/22': 'Louisiana',
 'geoId/23': 'Maine',
 'geoId/24': 'Maryland',
 'geoId/25': 'Massachusetts',
 'geoId/26': 'Michigan',
 'geoId/27': 'Minnesota',
 'geoId/28': 'Mississippi',
 'geoId/29': 'Missouri',
 'geoId/30': 'Montana',
 'geoId/31': 'Nebraska',
 'geoId/32': 'Nevada',
 'geoId/33': 'New Hampshire',
 'geoId/34': 'New Jersey',
 'geoId/35': 'New Mexico',
 'geoId/36': 'New York',
 'geoId/37': 'North Carolina',
 'geoId/38': 'North Dakota',
 'geoId/39': 'Ohio',
 'geoId/40': 'Oklahoma',
 'geoId/41': 'Oregon',
 'geoId/42': 'Pennsylvania',
 'geo

In [19]:
# CO2 = Annual_Emissions_CarbonDioxide_NonBiogenic -- State
# Methane = Annual_Emissions_Methane_NonBiogenic -- State
# Population = Count_Person -- State

def fetch_timelines_data():
  """Build a per-state table of population, CO2 and methane emissions.

  For every place returned by get_state_ids(), reads one year from the FIRST
  source series of each statistical variable (Count_Person for 2022, the two
  emissions variables for 2021) and rounds it to 2 decimals.

  A value that cannot be read (request failed, variable / series / year
  missing) is stored as 0.00, so 0.00 in the result can mean "no data" as well
  as a true zero. Rows are keyed by place name, so two DCIDs sharing a name
  would collapse into one row.

  Returns:
    DataFrame with columns State, StateName, Population, CO2, Methane. Only
    places whose name matches a StateName in the module-level `stateData`
    frame are kept (inner merge).
  """
  # Output column -> (Data Commons statistical variable, year to read)
  stat_specs = {
      'Population': ('Count_Person', '2022'),
      'CO2': ('Annual_Emissions_CarbonDioxide_NonBiogenic', '2021'),
      'Methane': ('Annual_Emissions_Methane_NonBiogenic', '2021'),
  }
  stat_vars = [stat_var for stat_var, _ in stat_specs.values()]

  # Getting DCIDs for all the states
  state_ids_data = get_state_ids()

  # Getting the timelines data
  timelines_data = {}
  for key, value in state_ids_data.items():
    # One request per place covering every variable (previously one request per variable).
    # Best effort: a failed request leaves this place with the 0.00 defaults.
    try:
      place_stats = dc.get_stat_all([key], stat_vars)[key]
    except Exception:
      place_stats = {}

    row = {}
    for column, (stat_var, year) in stat_specs.items():
      try:
        row[column] = round(place_stats[stat_var]['sourceSeries'][0]['val'][year], 2)
      except (KeyError, IndexError, TypeError):
        # Variable, series or year not present for this place
        row[column] = 0.00
    timelines_data[value] = row

  # Creating the dataframe; explicit columns keep the merge below valid even with zero rows
  normalized_data = [{'StateName': state, **values} for state, values in timelines_data.items()]
  dataframe = pd.DataFrame(normalized_data, columns=['StateName', *stat_specs])

  # Adding state abbreviations to the dataframe
  dataframe = stateData.merge(dataframe, how='inner', on='StateName')

  return dataframe

In [22]:
# Build the per-state Population / CO2 / Methane table and preview its first rows
timelines_df = fetch_timelines_data()
timelines_df.head()

Unnamed: 0,State,StateName,Population,CO2,Methane
0,AL,Alabama,5074296.0,68215710.1,7657440.42
1,AK,Alaska,733583.0,16939106.6,871681.75
2,AZ,Arizona,7359197.0,38392400.3,1361141.25
3,AR,Arkansas,3045637.0,36628789.1,2123889.0
4,CA,California,39029342.0,91754567.1,8737629.75


In [23]:
# Load the Census 2023 state boundary file and derive each state's land area in square miles.
# 2589988.110336 is the number of square metres in one square mile (1609.344 m squared);
# assumes ALAND is reported in square metres -- confirm against the Census file documentation.
SQ_METERS_PER_SQ_MILE = 2589988.110336
area_data = gpd.read_file('https://www2.census.gov/geo/tiger/GENZ2023/shp/cb_2023_us_state_500k.zip')
area_data["SqMiles"] = (area_data['ALAND'] / SQ_METERS_PER_SQ_MILE).round(2)
area_data.head()

Unnamed: 0,STATEFP,STATENS,GEOIDFQ,GEOID,STUSPS,NAME,LSAD,ALAND,AWATER,geometry,SqMiles
0,35,897535,0400000US35,35,NM,New Mexico,0,314198587197,726463919,"POLYGON ((-109.05017 31.48, -109.04984 31.4995...",121312.75
1,46,1785534,0400000US46,46,SD,South Dakota,0,196341525171,3387709166,"POLYGON ((-104.05788 44.9976, -104.05078 44.99...",75807.89
2,6,1779778,0400000US06,6,CA,California,0,403673296401,20291770234,"MULTIPOLYGON (((-118.60442 33.47855, -118.5987...",155859.13
3,21,1779786,0400000US21,21,KY,Kentucky,0,102266598312,2384223544,"MULTIPOLYGON (((-89.40565 36.52816, -89.39868 ...",39485.35
4,1,1779775,0400000US01,1,AL,Alabama,0,131185049346,4582326383,"MULTIPOLYGON (((-88.05338 30.50699, -88.05109 ...",50650.83


In [24]:
# Attach land area (SqMiles) to each state, matching State against the STUSPS column of area_data.
# A SqMiles column left over from an earlier run of this cell is dropped first, so re-running the
# cell does not produce SqMiles_x / SqMiles_y. The inner merge keeps only states found in area_data.
timelines_df = (
    timelines_df
    .drop(columns=['SqMiles'], errors='ignore')
    .merge(area_data[['STUSPS', 'SqMiles']], how='inner', left_on='State', right_on='STUSPS')
    .drop(columns=['STUSPS'])
)
timelines_df

Unnamed: 0,State,StateName,Population,CO2,Methane,SqMiles
0,AL,Alabama,5074296.0,68215710.1,7657440.42,50650.83
1,AK,Alaska,733583.0,16939106.6,871681.75,571051.62
2,AZ,Arizona,7359197.0,38392400.3,1361141.25,113655.39
3,AR,Arkansas,3045637.0,36628789.1,2123889.0,51992.7
4,CA,California,39029342.0,91754567.1,8737629.75,155859.13
5,CO,Colorado,5839926.0,50334501.47,8793630.94,103637.06
6,CT,Connecticut,3626205.0,10812606.9,309106.88,4842.4
7,DE,Delaware,1018396.0,5142434.0,401666.25,1948.54
8,FL,Florida,22244823.0,102634265.8,6603958.08,53654.21
9,GA,Georgia,10912876.0,48611013.4,5377826.75,57716.6


In [None]:
# Converting to a CSV file
# Written to the current working directory without the index column; a later cell moves it into the repo.
timelines_df.to_csv('UN_Timelines_Data.csv', index=False)

## Push CSV to GitHub


In [None]:
!apt-get install git

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
git is already the newest version (1:2.34.1-1ubuntu1.11).
0 upgraded, 0 newly installed, 0 to remove and 45 not upgraded.


In [None]:
from google.colab import userdata

In [None]:
# User information
# The GitHub token comes from the Colab secret named 'Githubtoken'; it is not stored in the notebook.
token = userdata.get('Githubtoken')
# Owner name as GitHub reports it: pushing with the lowercase form answered
# "This repository moved. Please use the new location: https://github.com/Ultracatx/data-commons-me.git"
USERNAME = "Ultracatx"
REPO_NAME = "data-commons-me"
EMAIL = "ivyzhou752@gmail.com"

# NOTE: the token is embedded in this URL. Do not print or display repo_url, and clear cell
# outputs that might echo it before sharing the notebook. git also records the clone URL in
# the cloned repository's .git/config.
repo_url = f"https://{token}@github.com/{USERNAME}/{REPO_NAME}.git"

In [None]:
#config git global user information
!git config --global user.email {EMAIL}
!git config --global user.name {USERNAME}

In [None]:
# Clone the repo into a folder named after the repository inside the current working directory.
# repo_url contains the access token, so avoid echoing the command itself.
!git clone {repo_url}

Cloning into 'data-commons-me'...
remote: Enumerating objects: 645, done.[K
remote: Counting objects: 100% (257/257), done.[K
remote: Compressing objects: 100% (154/154), done.[K
remote: Total 645 (delta 152), reused 169 (delta 85), pack-reused 388 (from 1)[K
Receiving objects: 100% (645/645), 11.26 MiB | 12.45 MiB/s, done.
Resolving deltas: 100% (228/228), done.


In [None]:
# Move the CSV from /content into the cloned repo's docs/data folder.
# Both paths are relative, so this must run from the directory that holds the CSV and the clone.
!mv UN_Timelines_Data.csv data-commons-me/docs/data

In [None]:
# Switch into the repo's data folder so the git commands below run inside the clone
%cd /content/data-commons-me/docs/data

/content/data-commons-me/docs/data


In [None]:
# Stage the CSV (path is relative to the current directory, docs/data)
!git add UN_Timelines_Data.csv

In [None]:
!git commit -m "tpush csv to repo"

[main d738a91] test
 1 file changed, 57 deletions(-)
 delete mode 100644 docs/data/UN_Timelines_Data.csv


In [None]:
# Push the commit using the token-bearing URL built earlier in repo_url
!git push {repo_url}

Enumerating objects: 7, done.
Counting objects:  14% (1/7)Counting objects:  28% (2/7)Counting objects:  42% (3/7)Counting objects:  57% (4/7)Counting objects:  71% (5/7)Counting objects:  85% (6/7)Counting objects: 100% (7/7)Counting objects: 100% (7/7), done.
Delta compression using up to 2 threads
Compressing objects:  25% (1/4)Compressing objects:  50% (2/4)Compressing objects:  75% (3/4)Compressing objects: 100% (4/4)Compressing objects: 100% (4/4), done.
Writing objects:  25% (1/4)Writing objects:  50% (2/4)Writing objects:  75% (3/4)Writing objects: 100% (4/4)Writing objects: 100% (4/4), 383 bytes | 383.00 KiB/s, done.
Total 4 (delta 2), reused 1 (delta 0), pack-reused 0
remote: Resolving deltas: 100% (2/2), completed with 2 local objects.[K
remote: This repository moved. Please use the new location:[K
remote:   https://github.com/Ultracatx/data-commons-me.git[K
To https://github.com/ultracatx/data-commons-me.git
   a66375a..d738a91  main -> main


# Data Pull for all the countries

In [None]:
def get_country_ids():
    """Return a dict mapping Data Commons country DCIDs to country names.

    The DCIDs are the places of type 'Country' contained in 'Earth'. Names are
    fetched with a single batched `get_property_values` call instead of one
    request per country.

    Returns:
        dict of DCID (e.g. 'country/ABW') -> first 'name' value for that DCID.
        A DCID for which no name comes back is left out instead of raising
        IndexError. Empty dict when no countries are returned.
    """
    country_dcids = dc.get_places_in(['Earth'], 'Country').get('Earth', [])
    if not country_dcids:
        return {}

    names_by_dcid = dc.get_property_values(country_dcids, 'name')

    country_names = {}
    for dcid in country_dcids:
        names = names_by_dcid.get(dcid)
        if names:
            country_names[dcid] = names[0]
    return country_names

# DCID -> country name mapping (e.g. 'country/ABW': 'Aruba')
country_ids_data = get_country_ids()
country_ids_data

{'country/ABW': 'Aruba',
 'country/AFG': 'Afghanistan',
 'country/AGO': 'Angola',
 'country/AIA': 'Anguilla',
 'country/ALA': 'Åland Islands',
 'country/ALB': 'Albania',
 'country/AND': 'Andorra',
 'country/ANT': 'Netherlands Antilles',
 'country/ARE': 'United Arab Emirates',
 'country/ARG': 'Argentina',
 'country/ARM': 'Armenia',
 'country/ASM': 'American Samoa',
 'country/ATA': 'Antarctica',
 'country/ATB': 'British Antarctic Territory',
 'country/ATF': 'French Southern Territories',
 'country/ATG': 'Antigua and Barbuda',
 'country/ATN': 'Dronning Maud Land',
 'country/AUS': 'Australia',
 'country/AUT': 'Austria',
 'country/AZE': 'Azerbaijan',
 'country/BDI': 'Burundi',
 'country/BEL': 'Belgium',
 'country/BEN': 'Benin',
 'country/BES': 'Bonaire, Sint Eustatius and Saba',
 'country/BFA': 'Burkina Faso',
 'country/BGD': 'Bangladesh',
 'country/BGR': 'Bulgaria',
 'country/BHR': 'Bahrain',
 'country/BHS': 'Bahamas',
 'country/BIH': 'Bosnia and Herzegovina',
 'country/BLM': 'Saint Barthé

In [3]:
!pip install pycountry
import pycountry
def create_country_dataframe():
    """Build a lookup table of country codes and names from pycountry.

    Returns:
        DataFrame with one row per distinct `alpha_2` code found in
        `pycountry.countries`: the code in 'CountryCode' and the matching
        `name` in 'CountryName'.
    """
    code_to_name = {}
    for record in pycountry.countries:
        code_to_name[record.alpha_2] = record.name

    return pd.DataFrame({
        'CountryCode': list(code_to_name.keys()),
        'CountryName': list(code_to_name.values()),
    })

# Country code / country name lookup table built from pycountry
country_data = create_country_dataframe()
country_data

Collecting pycountry
  Downloading pycountry-24.6.1-py3-none-any.whl.metadata (12 kB)
Downloading pycountry-24.6.1-py3-none-any.whl (6.3 MB)
   ---------------------------------------- 0.0/6.3 MB ? eta -:--:--
   -------------- ------------------------- 2.4/6.3 MB 13.4 MB/s eta 0:00:01
   ---------------------------------------- 6.3/6.3 MB 16.2 MB/s eta 0:00:00
Installing collected packages: pycountry
Successfully installed pycountry-24.6.1


Unnamed: 0,CountryCode,CountryName
0,AW,Aruba
1,AF,Afghanistan
2,AO,Angola
3,AI,Anguilla
4,AX,Åland Islands
...,...,...
244,WS,Samoa
245,YE,Yemen
246,ZA,South Africa
247,ZM,Zambia


In [None]:
# CO2 = Annual_Emissions_CarbonDioxide_NonBiogenic
# Methane = Annual_Emissions_Methane_NonBiogenic
# Population = Count_Person
#### I checked https://datacommons.org/tools/download#pt=Country&place=Earth&sv=Count_Person&dtType=ALL&facets=%7B%7D
#### and it seems countries don't have the same variables as states. Should we switch to a different variable?


def fetch_timelines_data_country():
    """Build a per-country table of population, CO2 and methane emissions.

    For every place returned by get_country_ids(), reads one year from the
    FIRST source series of each statistical variable (Count_Person for 2022,
    the two emissions variables for 2021) and rounds it to 2 decimals.

    A value that cannot be read (request failed, variable / series / year
    missing) is stored as 0.00, so 0.00 in the result can mean "no data" as
    well as a true zero. Rows are keyed by country name, so two DCIDs sharing
    a name collapse into one row (the later one wins).

    Returns:
        DataFrame with columns CountryName, Population, CO2, Methane.
    """
    # Output column -> (Data Commons statistical variable, year to read)
    stat_specs = {
        'Population': ('Count_Person', '2022'),
        'CO2': ('Annual_Emissions_CarbonDioxide_NonBiogenic', '2021'),
        'Methane': ('Annual_Emissions_Methane_NonBiogenic', '2021'),
    }
    stat_vars = [stat_var for stat_var, _ in stat_specs.values()]

    # Getting DCIDs for all the countries
    country_ids_data = get_country_ids()

    # Getting the timelines data
    timelines_data = {}
    for key, value in country_ids_data.items():
        # One request per place covering every variable (previously one request per variable).
        # Best effort: a failed request leaves this place with the 0.00 defaults.
        try:
            place_stats = dc.get_stat_all([key], stat_vars)[key]
        except Exception:
            place_stats = {}

        row = {}
        for column, (stat_var, year) in stat_specs.items():
            try:
                row[column] = round(place_stats[stat_var]['sourceSeries'][0]['val'][year], 2)
            except (KeyError, IndexError, TypeError):
                # Variable, series or year not present for this place
                row[column] = 0.00
        timelines_data[value] = row

    # Creating the dataframe; explicit columns keep the schema stable even with zero rows
    normalized_data = [{'CountryName': country, **values} for country, values in timelines_data.items()]
    dataframe = pd.DataFrame(normalized_data, columns=['CountryName', *stat_specs])

    return dataframe

# Build the per-country Population / CO2 / Methane table and preview its first rows
timelines_country = fetch_timelines_data_country()
timelines_country.head()

Unnamed: 0,CountryName,Population,CO2,Methane
0,Aruba,106445.0,0.0,0.0
1,Afghanistan,41128771.0,0.0,0.0
2,Angola,35588987.0,0.0,0.0
3,Anguilla,0.0,0.0,0.0
4,Åland Islands,0.0,0.0,0.0


In [62]:
# Inspect the raw response for one place (geoId/01) to see the sourceSeries / val-by-year structure
dc.get_stat_all(['geoId/01'],['Annual_Emissions_GreenhouseGas_NonBiogenic'])


{'geoId/01': {'Annual_Emissions_GreenhouseGas_NonBiogenic': {'sourceSeries': [{'val': {'2012': 101338411.829275,
      '2020': 72194584.9043,
      '2019': 79738704.6402,
      '2016': 88362942.285,
      '2021': 76949404.9401,
      '2010': 100269563.310365,
      '2011': 109514323.492,
      '2015': 96619315.0688,
      '2014': 102535115.322,
      '2013': 100367776.173,
      '2017': 83764978.9227,
      '2018': 86172746.6091},
     'measurementMethod': 'EPA_GHGRP',
     'observationPeriod': 'P1Y',
     'importName': 'EPA_GHGRP_AggCounty_AggState',
     'provenanceDomain': 'epa.gov',
     'unit': 'MetricTonCO2e',
     'isDcAggregate': True,
     'provenanceUrl': 'https://www.epa.gov/ghgreporting'}]}}}

In [72]:
# Emission data for GreenhouseGas and Lead
# GreenhouseGas = Annual_Emissions_GreenhouseGas_NonBiogenic -- State
# Lead = Annual_Emissions_GreenhouseGas_LeadProduction_NonBiogenic -- State (greenhouse gas from lead production, unit MetricTonCO2e)
# Population = Count_Person -- State

def fetch_timelines_data_ggas_lead():
  """Build a per-state table of population, greenhouse-gas and lead-production emissions.

  For every place returned by get_state_ids(), reads one year from the FIRST
  source series of each statistical variable (Count_Person for 2022, the two
  emissions variables for 2021) and rounds it to 2 decimals.

  A value that cannot be read (request failed, variable / series / year
  missing) is stored as 0.00, so 0.00 in the result can mean "no data" as well
  as a true zero. Rows are keyed by place name, so two DCIDs sharing a name
  would collapse into one row.

  Returns:
    DataFrame with columns State, StateName, Population, GreenhouseGas, Lead.
    Only places whose name matches a StateName in the module-level `stateData`
    frame are kept (inner merge).
  """
  # Output column -> (Data Commons statistical variable, year to read)
  stat_specs = {
      'Population': ('Count_Person', '2022'),
      'GreenhouseGas': ('Annual_Emissions_GreenhouseGas_NonBiogenic', '2021'),
      'Lead': ('Annual_Emissions_GreenhouseGas_LeadProduction_NonBiogenic', '2021'),
  }
  stat_vars = [stat_var for stat_var, _ in stat_specs.values()]

  # Getting DCIDs for all the states
  state_ids_data = get_state_ids()

  # Getting the timelines data
  timelines_data = {}
  for key, value in state_ids_data.items():
    # One request per place covering every variable (previously one request per variable).
    # Best effort: a failed request leaves this place with the 0.00 defaults.
    try:
      place_stats = dc.get_stat_all([key], stat_vars)[key]
    except Exception:
      place_stats = {}

    row = {}
    for column, (stat_var, year) in stat_specs.items():
      try:
        row[column] = round(place_stats[stat_var]['sourceSeries'][0]['val'][year], 2)
      except (KeyError, IndexError, TypeError):
        # Variable, series or year not present for this place
        row[column] = 0.00
    timelines_data[value] = row

  # Creating the dataframe; explicit columns keep the merge below valid even with zero rows
  normalized_data = [{'StateName': state, **values} for state, values in timelines_data.items()]
  dataframe = pd.DataFrame(normalized_data, columns=['StateName', *stat_specs])

  # Adding state abbreviations to the dataframe
  dataframe = stateData.merge(dataframe, how='inner', on='StateName')

  return dataframe

In [73]:
# dc.get_stat_all(['geoId/01'],['Annual_Emissions_GreenhouseGas_LeadProduction_NonBiogenic'])

{'geoId/01': {'Annual_Emissions_GreenhouseGas_LeadProduction_NonBiogenic': {'sourceSeries': [{'val': {'2021': 174470.1,
      '2020': 182356.4,
      '2015': 184587.8,
      '2010': 159438,
      '2014': 192641.2,
      '2017': 197232.3,
      '2011': 179675,
      '2016': 211004.6,
      '2012': 194566.3,
      '2019': 182455.7,
      '2013': 196275,
      '2018': 190698.8},
     'measurementMethod': 'EPA_GHGRP',
     'observationPeriod': 'P1Y',
     'importName': 'EPA_GHGRP_AggCounty_AggState',
     'provenanceDomain': 'epa.gov',
     'unit': 'MetricTonCO2e',
     'isDcAggregate': True,
     'provenanceUrl': 'https://www.epa.gov/ghgreporting'}]}}}

In [74]:
# Build the per-state Population / GreenhouseGas / Lead table.
# NOTE: this rebinds timelines_df, replacing the CO2 / Methane table built earlier in the notebook.
timelines_df = fetch_timelines_data_ggas_lead()
timelines_df

Unnamed: 0,State,StateName,Population,GreenhouseGas,Lead
0,AL,Alabama,5074296.0,76949404.94,174470.1
1,AK,Alaska,733583.0,17829879.72,0.0
2,AZ,Arizona,7359197.0,40878981.54,0.0
3,AR,Arkansas,3045637.0,39045544.29,0.0
4,CA,California,39029342.0,100790742.86,0.0
5,CO,Colorado,5839926.0,59523857.2,0.0
6,CT,Connecticut,3626205.0,11151525.7,0.0
7,DE,Delaware,1018396.0,5552608.15,0.0
8,FL,Florida,22244823.0,116939550.83,80113.2
9,GA,Georgia,10912876.0,54701039.32,0.0


In [77]:
# Write the greenhouse-gas / lead table to CSV (no index column), then reload it from disk
# so timelines_df holds exactly what the file contains.
timelines_df.to_csv('ggas_lead_emission.csv',index=False)
timelines_df = pd.read_csv('ggas_lead_emission.csv')
timelines_df

Unnamed: 0,State,StateName,Population,GreenhouseGas,Lead
0,AL,Alabama,5074296.0,76949404.94,174470.1
1,AK,Alaska,733583.0,17829879.72,0.0
2,AZ,Arizona,7359197.0,40878981.54,0.0
3,AR,Arkansas,3045637.0,39045544.29,0.0
4,CA,California,39029342.0,100790742.86,0.0
5,CO,Colorado,5839926.0,59523857.2,0.0
6,CT,Connecticut,3626205.0,11151525.7,0.0
7,DE,Delaware,1018396.0,5552608.15,0.0
8,FL,Florida,22244823.0,116939550.83,80113.2
9,GA,Georgia,10912876.0,54701039.32,0.0
