The MapFilters Colab generates state and country lists used in the top filters.

Related page: https://model.earth/data-commons/docs/data

DONE: Reduce state_ids_data object to just state number 01 instead of geoId/01 - Alexandra

You can send output to Github, or copy the resulting file by clicking the folder icon in the left side.

# Installations and Imports

In [None]:
pip install datacommons_pandas

Collecting datacommons_pandas
  Downloading datacommons_pandas-0.0.3-py3-none-any.whl.metadata (2.3 kB)
Downloading datacommons_pandas-0.0.3-py3-none-any.whl (45 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m45.8/45.8 kB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: datacommons_pandas
Successfully installed datacommons_pandas-0.0.3


In [None]:
import numpy as np
import pandas as pd
pd.set_option('display.float_format', '{:.2f}'.format)
import datacommons_pandas as dc
import geopandas as gpd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import requests

# Data Pull for all the states in the USA

In [None]:
stateDict = {
    "AL": "Alabama", "AK": "Alaska", "AZ": "Arizona", "AR": "Arkansas", "CA": "California", "CO": "Colorado",
    "CT": "Connecticut", "DE": "Delaware", "FL": "Florida", "GA": "Georgia", "HI": "Hawaii", "ID": "Idaho",
    "IL": "Illinois", "IN": "Indiana", "IA": "Iowa", "KS": "Kansas", "KY": "Kentucky", "LA": "Louisiana",
    "ME": "Maine", "MD": "Maryland", "MA": "Massachusetts", "MI": "Michigan", "MN": "Minnesota", "MS": "Mississippi",
    "MO": "Missouri", "MT": "Montana", "NE": "Nebraska", "NV": "Nevada", "NH": "New Hampshire", "NJ": "New Jersey",
    "NM": "New Mexico", "NY": "New York", "NC": "North Carolina", "ND": "North Dakota", "OH": "Ohio", "OK": "Oklahoma",
    "OR": "Oregon", "PA": "Pennsylvania", "RI": "Rhode Island", "SC": "South Carolina", "SD": "South Dakota",
    "TN": "Tennessee", "TX": "Texas", "UT": "Utah", "VT": "Vermont", "VA": "Virginia", "WA": "Washington",
    "WV": "West Virginia", "WI": "Wisconsin", "WY": "Wyoming",
    "DC": "District of Columbia",
    # US Territories
    "AS": "American Samoa", "GU": "Guam", "MP": "Northern Mariana Islands", "PR": "Puerto Rico", "VI": "Virgin Islands of the U.S."
}

stateData = pd.DataFrame(list(stateDict.items()),columns = ['State','StateName'])
stateData.head()

Unnamed: 0,State,StateName
0,AL,Alabama
1,AK,Alaska
2,AZ,Arizona
3,AR,Arkansas
4,CA,California


In [None]:
def get_state_ids():
  # Getting the states  dcids
  state_dcids = dc.get_places_in(['country/USA'], 'State')
  # Getting the names of the states
  state_names = [dc.get_property_values([name], 'name') for name in state_dcids['country/USA']]

  data = {}
  for entry in state_names:
      for k,v in entry.items():
          data[k] = v[0]

  # Adding Rest 4 US Territories
  for k,v in dc.get_property_values(['geoId/60','geoId/66','geoId/69','geoId/78'], 'name').items():
      data[k] = v[0]
  return data

In [None]:
def get_state_ids():
  # Getting the states  dcids
  state_dcids = dc.get_places_in(['country/USA'], 'State')
  # Getting the names of the states
  state_names = [dc.get_property_values([name], 'name') for name in state_dcids['country/USA']]

  data = {}
  for entry in state_names:
      for k,v in entry.items():
          state_number = k.split('/')[-1]
          data[state_number] = v[0]

  # Adding Rest 4 US Territories
  for k,v in dc.get_property_values(['geoId/60','geoId/66','geoId/69','geoId/78'], 'name').items():
      state_number = k.split('/')[-1]
      data[state_number] = v[0]
  return data

In [None]:
state_ids_data = get_state_ids()
state_ids_data

{'01': 'Alabama',
 '02': 'Alaska',
 '04': 'Arizona',
 '05': 'Arkansas',
 '06': 'California',
 '08': 'Colorado',
 '09': 'Connecticut',
 '10': 'Delaware',
 '11': 'District of Columbia',
 '12': 'Florida',
 '13': 'Georgia',
 '15': 'Hawaii',
 '16': 'Idaho',
 '17': 'Illinois',
 '18': 'Indiana',
 '19': 'Iowa',
 '20': 'Kansas',
 '21': 'Kentucky',
 '22': 'Louisiana',
 '23': 'Maine',
 '24': 'Maryland',
 '25': 'Massachusetts',
 '26': 'Michigan',
 '27': 'Minnesota',
 '28': 'Mississippi',
 '29': 'Missouri',
 '30': 'Montana',
 '31': 'Nebraska',
 '32': 'Nevada',
 '33': 'New Hampshire',
 '34': 'New Jersey',
 '35': 'New Mexico',
 '36': 'New York',
 '37': 'North Carolina',
 '38': 'North Dakota',
 '39': 'Ohio',
 '40': 'Oklahoma',
 '41': 'Oregon',
 '42': 'Pennsylvania',
 '44': 'Rhode Island',
 '45': 'South Carolina',
 '46': 'South Dakota',
 '47': 'Tennessee',
 '48': 'Texas',
 '49': 'Utah',
 '50': 'Vermont',
 '51': 'Virginia',
 '53': 'Washington',
 '54': 'West Virginia',
 '55': 'Wisconsin',
 '56': 'Wyoming

In [None]:
# CO2 = Annual_Emissions_CarbonDioxide_NonBiogenic -- State
# Methane = Annual_Emissions_Methane_NonBiogenic -- State
# Population = Count_Person -- State

def fetch_timelines_data():
  # Getting DCIDs for all the states
  state_ids_data = get_state_ids()
  # Getting the timelines data
  timelines_data = {}
  for key,value in state_ids_data.items():
    try:
      timelines_data[value] = {'Population':round(dc.get_stat_all([key], ['Count_Person'])[key]['Count_Person']['sourceSeries'][0]['val']['2022'],2)}
    except:
      timelines_data[value] = {'Population':0.00}
    try:
      timelines_data[value].update({'CO2':round(dc.get_stat_all([key], ['Annual_Emissions_CarbonDioxide_NonBiogenic'])[key]['Annual_Emissions_CarbonDioxide_NonBiogenic']['sourceSeries'][0]['val']['2021'],2)})
    except:
      timelines_data[value].update({'CO2':0.00})
    try:
      timelines_data[value].update({'Methane':round(dc.get_stat_all([key], ['Annual_Emissions_Methane_NonBiogenic'])[key]['Annual_Emissions_Methane_NonBiogenic']['sourceSeries'][0]['val']['2021'],2)})
    except:
      timelines_data[value].update({'Methane':0.00})

  # Creating the dataframe
  normalized_data = []
  for state, values in timelines_data.items():
      entry = {'StateName': state}
      entry.update(values)
      normalized_data.append(entry)
  dataframe = pd.DataFrame(normalized_data)

  # Adding state abbreviations to the dataframe
  dataframe = stateData.merge(dataframe,how = 'inner',on='StateName')

  # Dividing the data by 1000 and rounding to 2 decimals
  # for col in dataframe.iloc[:,2:]:
  #     dataframe[col] = round(dataframe[col]/1000,2)

  return dataframe

In [None]:
timelines_df = fetch_timelines_data()
timelines_df.head()

Unnamed: 0,State,StateName,Population,CO2,Methane
0,AL,Alabama,0.0,0.0,0.0
1,AK,Alaska,0.0,0.0,0.0
2,AZ,Arizona,0.0,0.0,0.0
3,AR,Arkansas,0.0,0.0,0.0
4,CA,California,0.0,0.0,0.0


In [None]:
# Fetching the data for land area of the states
area_data = gpd.read_file(f'https://www2.census.gov/geo/tiger/GENZ2023/shp/cb_2023_us_state_500k.zip')
area_data["SqMiles"] = round(area_data['ALAND']/ 2589988.110336,2)
area_data.head()

Unnamed: 0,STATEFP,STATENS,GEOIDFQ,GEOID,STUSPS,NAME,LSAD,ALAND,AWATER,geometry,SqMiles
0,35,897535,0400000US35,35,NM,New Mexico,0,314198587197,726463919,"POLYGON ((-109.05017 31.48000, -109.04984 31.4...",121312.75
1,46,1785534,0400000US46,46,SD,South Dakota,0,196341525171,3387709166,"POLYGON ((-104.05788 44.99761, -104.05078 44.9...",75807.89
2,6,1779778,0400000US06,6,CA,California,0,403673296401,20291770234,"MULTIPOLYGON (((-118.60442 33.47855, -118.5987...",155859.13
3,21,1779786,0400000US21,21,KY,Kentucky,0,102266598312,2384223544,"MULTIPOLYGON (((-89.40565 36.52817, -89.39869 ...",39485.35
4,1,1779775,0400000US01,1,AL,Alabama,0,131185049346,4582326383,"MULTIPOLYGON (((-88.05338 30.50699, -88.05109 ...",50650.83


In [None]:
timelines_df = timelines_df.merge(area_data[['STUSPS','SqMiles']], how='inner', left_on='State', right_on='STUSPS')
timelines_df.drop(columns = ['STUSPS'],inplace=True)
timelines_df

Unnamed: 0,State,StateName,Population,CO2,Methane,SqMiles
0,AL,Alabama,0.0,0.0,0.0,50650.83
1,AK,Alaska,0.0,0.0,0.0,571051.62
2,AZ,Arizona,0.0,0.0,0.0,113655.39
3,AR,Arkansas,0.0,0.0,0.0,51992.7
4,CA,California,0.0,0.0,0.0,155859.13
5,CO,Colorado,0.0,0.0,0.0,103637.06
6,CT,Connecticut,0.0,0.0,0.0,4842.4
7,DE,Delaware,0.0,0.0,0.0,1948.54
8,FL,Florida,0.0,0.0,0.0,53654.21
9,GA,Georgia,0.0,0.0,0.0,57716.6


In [None]:
# Converting to a CSV file
timelines_df.to_csv('UN_Timelines_Data.csv', index=False)

## Push CSV to Github


In [None]:
!apt-get install git

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
git is already the newest version (1:2.34.1-1ubuntu1.11).
0 upgraded, 0 newly installed, 0 to remove and 45 not upgraded.


In [None]:
from google.colab import userdata

In [None]:
#user information
token = userdata.get('Githubtoken')
USERNAME = "ultracatx"
REPO_NAME = "data-commons-me"
EMAIL = "ivyzhou752@gmail.com"

repo_url = f"https://{token}@github.com/{USERNAME}/{REPO_NAME}.git"

In [None]:
#config git global user information
!git config --global user.email {EMAIL}
!git config --global user.name {USERNAME}

In [None]:
#clone repo to local
!git clone {repo_url}

Cloning into 'data-commons-me'...
remote: Enumerating objects: 645, done.[K
remote: Counting objects: 100% (257/257), done.[K
remote: Compressing objects: 100% (154/154), done.[K
remote: Total 645 (delta 152), reused 169 (delta 85), pack-reused 388 (from 1)[K
Receiving objects: 100% (645/645), 11.26 MiB | 12.45 MiB/s, done.
Resolving deltas: 100% (228/228), done.


In [None]:
#move csv from /content to repo
!mv UN_Timelines_Data.csv data-commons-me/docs/data

In [None]:
%cd /content/data-commons-me/docs/data

/content/data-commons-me/docs/data


In [None]:
!git add UN_Timelines_Data.csv

In [None]:
!git commit -m "push csv to repo"

[main d738a91] test
 1 file changed, 57 deletions(-)
 delete mode 100644 docs/data/UN_Timelines_Data.csv


In [None]:
!git push {repo_url}

Enumerating objects: 7, done.
Counting objects:  14% (1/7)Counting objects:  28% (2/7)Counting objects:  42% (3/7)Counting objects:  57% (4/7)Counting objects:  71% (5/7)Counting objects:  85% (6/7)Counting objects: 100% (7/7)Counting objects: 100% (7/7), done.
Delta compression using up to 2 threads
Compressing objects:  25% (1/4)Compressing objects:  50% (2/4)Compressing objects:  75% (3/4)Compressing objects: 100% (4/4)Compressing objects: 100% (4/4), done.
Writing objects:  25% (1/4)Writing objects:  50% (2/4)Writing objects:  75% (3/4)Writing objects: 100% (4/4)Writing objects: 100% (4/4), 383 bytes | 383.00 KiB/s, done.
Total 4 (delta 2), reused 1 (delta 0), pack-reused 0
remote: Resolving deltas: 100% (2/2), completed with 2 local objects.[K
remote: This repository moved. Please use the new location:[K
remote:   https://github.com/Ultracatx/data-commons-me.git[K
To https://github.com/ultracatx/data-commons-me.git
   a66375a..d738a91  main -> main


# Data Pull for all the countries

In [None]:
!pip install pycountry
import pycountry

Collecting pycountry
  Downloading pycountry-24.6.1-py3-none-any.whl.metadata (12 kB)
Downloading pycountry-24.6.1-py3-none-any.whl (6.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.3/6.3 MB[0m [31m37.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pycountry
Successfully installed pycountry-24.6.1


In [None]:
def get_country_ids():
    country_dcids = dc.get_places_in(['Earth'], 'Country')
    country_names = {dcid: dc.get_property_values([dcid], 'name')[dcid][0] for dcid in country_dcids['Earth']}
    return country_names

country_ids_data = get_country_ids()
country_ids_data

{'country/ABW': 'Aruba',
 'country/AFG': 'Afghanistan',
 'country/AGO': 'Angola',
 'country/AIA': 'Anguilla',
 'country/ALA': 'Åland Islands',
 'country/ALB': 'Albania',
 'country/AND': 'Andorra',
 'country/ANT': 'Netherlands Antilles',
 'country/ARE': 'United Arab Emirates',
 'country/ARG': 'Argentina',
 'country/ARM': 'Armenia',
 'country/ASM': 'American Samoa',
 'country/ATA': 'Antarctica',
 'country/ATB': 'British Antarctic Territory',
 'country/ATF': 'French Southern Territories',
 'country/ATG': 'Antigua and Barbuda',
 'country/ATN': 'Dronning Maud Land',
 'country/AUS': 'Australia',
 'country/AUT': 'Austria',
 'country/AZE': 'Azerbaijan',
 'country/BDI': 'Burundi',
 'country/BEL': 'Belgium',
 'country/BEN': 'Benin',
 'country/BES': 'Bonaire, Sint Eustatius and Saba',
 'country/BFA': 'Burkina Faso',
 'country/BGD': 'Bangladesh',
 'country/BGR': 'Bulgaria',
 'country/BHR': 'Bahrain',
 'country/BHS': 'Bahamas',
 'country/BIH': 'Bosnia and Herzegovina',
 'country/BLM': 'Saint Barthé

In [None]:
def get_country_code(country_name, existing_codes):
    try:
        country = pycountry.countries.lookup(country_name)
        return country.alpha_2
    except LookupError:
      #note that there are some counties only exist in datacommon but not in pycountry
        print(country_name)
        code = country_name[:2].upper()
        if code in existing_codes:
            code = (country_name[0] + country_name[-1]).upper()
        return code

def create_country_dataframe(country_ids_data):
    countries = []
    existing_codes = {country.alpha_2 for country in pycountry.countries}

    for dcid, name in country_ids_data.items():
        country_code = get_country_code(name, existing_codes)
        countries.append((country_code, name))
        existing_codes.add(country_code)

    country_df = pd.DataFrame(countries, columns=['CountryCode', 'CountryName'])
    return country_df

country_data = create_country_dataframe(country_ids_data)
country_data

Netherlands Antilles
British Antarctic Territory
Dronning Maud Land
Brunei
Congo [DRC]
Congo [Republic]
Cape Verde
Canton and Enderbury Islands
Falkland Islands
Metropolitan France
Macau
Saint Martin
Macedonia [FYROM]
Pitcairn Islands
Palestinian Territories
Russia
Saint Helena
São Tomé and Príncipe
Sint Maarten
East Timor
Turkey
U.S. Minor Outlying Islands
Vatican City
U.S. Virgin Islands
Kosovo
Yugoslavia


Unnamed: 0,CountryCode,CountryName
0,AW,Aruba
1,AF,Afghanistan
2,AO,Angola
3,AI,Anguilla
4,AX,Åland Islands
...,...,...
251,YE,Yemen
252,YU,Yugoslavia
253,ZA,South Africa
254,ZM,Zambia


In [None]:
# CO2 = Annual_Emissions_CarbonDioxide_NonBiogenic
# Methane = Annual_Emissions_Methane_NonBiogenic
# Population = Count_Person


def fetch_timelines_data_country():
    # Getting DCIDs for all the countries
    country_ids_data = get_country_ids()

    # Getting the timelines data
    timelines_data = {}
    for key, value in country_ids_data.items():
        try:
            timelines_data[value] = {'Population': round(dc.get_stat_all([key], ['Count_Person'])[key]['Count_Person']['sourceSeries'][0]['val']['2022'], 2)}
        except:
            timelines_data[value] = {'Population': 0.00}
        try:
            timelines_data[value].update({'CO2': round(dc.get_stat_all([key], ['Annual_Emissions_CarbonDioxide_NonBiogenic'])[key]['Annual_Emissions_CarbonDioxide_NonBiogenic']['sourceSeries'][0]['val']['2021'], 2)})
        except:
            timelines_data[value].update({'CO2': 0.00})
        try:
            timelines_data[value].update({'Methane': round(dc.get_stat_all([key], ['Annual_Emissions_Methane_NonBiogenic'])[key]['Annual_Emissions_Methane_NonBiogenic']['sourceSeries'][0]['val']['2021'], 2)})
        except:
            timelines_data[value]. update({'Methane': 0.00})

    # Creating the dataframe
    normalized_data = []
    for country, values in timelines_data.items():
        entry = {'CountryName': country}
        entry.update(values)
        normalized_data.append(entry)
    dataframe = pd.DataFrame(normalized_data)

    return dataframe

timelines_country = fetch_timelines_data_country()
timelines_country

Unnamed: 0,CountryName,Population,CO2,Methane
0,Aruba,106445.00,0.00,0.00
1,Afghanistan,41128771.00,0.00,0.00
2,Angola,35588987.00,0.00,0.00
3,Anguilla,0.00,0.00,0.00
4,Åland Islands,0.00,0.00,0.00
...,...,...,...,...
251,Yemen,33696614.00,0.00,0.00
252,Yugoslavia,0.00,0.00,0.00
253,South Africa,59893885.00,0.00,0.00
254,Zambia,20017675.00,0.00,0.00


combined

In [None]:
# Country
dc.get_stat_all(['country/USA'], ['Annual_Amount_Emissions_CarbonDioxide'])['country/USA']['Annual_Amount_Emissions_CarbonDioxide']

{'sourceSeries': [{'val': {'2011': 5128105384,
    '1978': 4771142682,
    '2020': 4257738096,
    '1972': 4527984118,
    '2017': 4761301727,
    '1999': 5563819024,
    '1989': 4904547626,
    '1975': 4355839181,
    '1993': 4952699132,
    '1994': 5024064992,
    '2001': 5702105773,
    '1966': 3547095488,
    '1968': 3898241195,
    '2010': 5352049740,
    '1970': 4231974784,
    '1963': 3085190502,
    '1961': 2832097916,
    '1973': 4691090844,
    '1976': 4616828588,
    '1986': 4478048562,
    '1997': 5498830887,
    '2003': 5610687172,
    '1984': 4485577869,
    '1988': 4842013205,
    '1992': 4837695301,
    '2014': 5046564428,
    '2007': 5686667534,
    '2006': 5602396200,
    '1964': 3196050810,
    '2004': 5688704740,
    '1971': 4289035482,
    '2002': 5545413879,
    '1960': 2813809267,
    '2018': 4909996024,
    '1985': 4514313221,
    '2005': 5703154482,
    '2000': 5729820336,
    '1987': 4633016119,
    '1991': 4765689535,
    '2016': 4838476243,
    '2008': 55124

In [None]:
sum(timelines_df['Methane'])

179803997.54000002

# Code for Priyanka

In [None]:
def get_county_ids():
  # Getting the county  dcids
  county_dcids = {}
  county_names = []
  for key,value in state_ids_data.items():
    county_dcids.update(dc.get_places_in([key], 'County'))
  # Getting the names of the county
    county_names = {}
  for key,value in county_dcids.items():
    for county_id in value:
      county_names[county_id] = dc.get_property_values([county_id], 'name')[county_id][0]
  return county_names

In [None]:
county_ids_data = get_county_ids()
county_ids_data

NameError: name 'state_ids_data' is not defined

Code updated by Priyanka for Population data of all counties

In [None]:
population_data_all_county = {}
for key,value in county_ids_data.items():
    try:
        #population_data_all_county[value] = {str(1970): dc.get_stat_all([key], ['Count_Person'])[key]['Count_Person']['sourceSeries'][0]['val']['1970']}
        stats = dc.get_stat_all([key], ['Count_Person'])[key]['Count_Person']['sourceSeries'][0]['val']
        population_data_all_county[value] = {str(year): stats.get(str(year), 0) for year in range(1970, 2023)}
    except:
        population_data_all_county[value] = {str(1970): 0}
        population_data_all_county[value] = {str(year): 0 for year in range(1970, 2023)}
    for year in range(1971,2023):
        try:
            population_data_all_county[value] = {str(year): stats.get(str(year), 0) for year in range(1970, 2023)}
            #population_data_all_county[value].update({str(year): dc.get_stat_all([key], ['Count_Person'])[key]['Count_Person']['sourceSeries'][0]['val'][str(year)]})
        except:
            #population_data_all_county[value].update({str(year): 0})
            population_data_all_county[value] = {str(year): 0 for year in range(1970, 2023)}

# for key, value in county_ids_data.items():
#     try:
#         stats = dc.get_stat_all([key], ['Count_Person'])[key]['Count_Person']['sourceSeries'][0]['val']
#         population_data_all_county[value] = {str(year): stats.get(str(year), 0) for year in range(1970, 2023)}
#     except:
#         population_data_all_county[value] = {str(year): 0 for year in range(1970, 2023)}

In [None]:
population_data_all_county

In [None]:
def County_Data_to_Dataframe(data):
  # Adding County as a Column and Converting the data to dataframe
  normalized_data = []
  for county, values in data.items():
      entry = {'CountyName': county}
      entry.update(values)
      normalized_data.append(entry)
  dataframe = pd.DataFrame(normalized_data)


  return dataframe

In [None]:
population_dataframe_all_county = County_Data_to_Dataframe(population_data_all_county)
population_dataframe_all_county.head()

In [None]:
population_dataframe_all_county

In [None]:
population_dataframe_all_county.to_csv('County_data_all_years.csv',index=False)

In [None]:
pd.read_csv('County_data_all_years.csv')