In [7]:
import pandas as pd
import numpy as np
import scipy as sp
import seaborn as sns
import reverse_geocoder as rg

# Project EDA

## EDA of `daily_global_weather_2020.csv`

`daily_global_weather_2020.csv` contains data on daily temperature and precipitation measurements. To learn how to use the data from this file, please read the following section on the first report. 

The data in `daily_global_weather_2020.csv`  is derived from the source file at https://www1.ncdc.noaa.gov/pub/data/ghcn/daily/by_year/2020.csv.gz.

To help you get started with a dataset of manageable size, we have preprocessed the GHCN dataset to include only the average temperature and precipitation measurements from stations that have both measurements. Each row in the preprocessed dataset contains both the average temperature and precipitation measurements for a given station on a given date.

The data contains only the (latitude, longitude) coordinates for the weather stations. To map the coordinates to geographical locations, the reverse-geocoder package mentioned in the References section might be helpful.

In [12]:
glb = pd.read_csv("Dataset A/daily_global_weather_2020.csv")

In [17]:
print(glb.shape)
glb.head()

(1064283, 8)


Unnamed: 0.1,Unnamed: 0,Station,Date,TAVG,Latitude,Longitude,Elevation,PRCP
0,0,AE000041196,2020-01-01,211.0,25.333,55.517,34.0,0.0
1,1,AEM00041194,2020-01-01,217.0,25.255,55.364,10.4,0.0
2,2,AFM00040938,2020-01-01,54.0,34.21,62.228,977.2,23.0
3,3,AG000060611,2020-01-01,71.0,28.05,9.6331,561.0,10.0
4,4,AGE00147708,2020-01-01,99.0,36.72,4.05,222.0,0.0


In [42]:
geo_reversed[0]

{'lat': '25.33737',
 'lon': '55.41206',
 'name': 'Sharjah',
 'admin1': 'Ash Shariqah',
 'admin2': '',
 'cc': 'AE'}

In [43]:
# reverse-geocoder
geo_reversed = rg.search(list(zip(glb.Latitude, glb.Longitude)))
# cc for country code
cc = [k['cc'] for k in geo_reversed]
name = [k['name'] for k in geo_reversed]
# copy cc to a glb column
glb['Country'] = cc
glb['City'] = name
glb.head()

Unnamed: 0.1,Unnamed: 0,Station,Date,TAVG,Latitude,Longitude,Elevation,PRCP,Country,City
0,0,AE000041196,2020-01-01,211.0,25.333,55.517,34.0,0.0,AE,Sharjah
1,1,AEM00041194,2020-01-01,217.0,25.255,55.364,10.4,0.0,AE,Sharjah
2,2,AFM00040938,2020-01-01,54.0,34.21,62.228,977.2,23.0,AF,Guzarah
3,3,AG000060611,2020-01-01,71.0,28.05,9.6331,561.0,10.0,DZ,Illizi
4,4,AGE00147708,2020-01-01,99.0,36.72,4.05,222.0,0.0,DZ,Tizi Ouzou


In [47]:
# extract records of US only
us = glb.loc[glb['Country']=='US']
print(us.shape)
us.head()

(307827, 10)


Unnamed: 0.1,Unnamed: 0,Station,Date,TAVG,Latitude,Longitude,Elevation,PRCP,Country,City
222,222,CA001017099,2020-01-01,73.0,48.7833,-123.1333,178.0,0.0,US,Point Roberts
223,223,CA001017101,2020-01-01,98.0,48.7833,-123.05,24.0,0.0,US,Point Roberts
226,226,CA001018611,2020-01-01,88.0,48.0333,-123.3333,70.0,2.0,US,Port Angeles East
266,266,CA001054503,2020-01-01,39.0,54.25,-133.0667,44.0,37.0,US,Craig
267,267,CA001054500,2020-01-01,58.0,54.25,-133.05,43.0,22.0,US,Craig


In [51]:
# examine how many days are included
print(len(us['Date'].unique()))
print(min(us['Date'].unique()))
print(max(us['Date'].unique()))

296
2020-01-01
2020-10-22


In [52]:
# check out temperature avg
us['TAVG'].describe()

count    307827.000000
mean         84.067574
std         101.308504
min        -483.000000
25%          12.000000
50%          90.000000
75%         154.000000
max         408.000000
Name: TAVG, dtype: float64

In [54]:
# check out precipitation avg
us['PRCP'].describe()

count    307827.000000
mean         22.982211
std          67.684176
min           0.000000
25%           0.000000
50%           0.000000
75%          25.000000
max        4191.000000
Name: PRCP, dtype: float64

## EDA of `us_air_quality_measures.csv`

In [61]:
aq = pd.read_csv("Dataset A/us_air_quality_measures.csv")
aq.head()

Unnamed: 0,MeasureId,MeasureName,MeasureType,StratificationLevel,StateFips,StateName,CountyFips,CountyName,ReportYear,Value,Unit,UnitName,DataOrigin,MonitorOnly
0,83,Number of days with maximum 8-hour average ozo...,Counts,State x County,1,Alabama,1027,Clay,1999,33.0,No Units,No Units,Monitor Only,1
1,83,Number of days with maximum 8-hour average ozo...,Counts,State x County,1,Alabama,1051,Elmore,1999,5.0,No Units,No Units,Monitor Only,1
2,83,Number of days with maximum 8-hour average ozo...,Counts,State x County,1,Alabama,1073,Jefferson,1999,39.0,No Units,No Units,Monitor Only,1
3,83,Number of days with maximum 8-hour average ozo...,Counts,State x County,1,Alabama,1079,Lawrence,1999,28.0,No Units,No Units,Monitor Only,1
4,83,Number of days with maximum 8-hour average ozo...,Counts,State x County,1,Alabama,1089,Madison,1999,31.0,No Units,No Units,Monitor Only,1


In [60]:
aq['MeasureId'].unique()

array([ 83,  85,  86,  84,  87, 292, 293, 294, 295, 296])

In [62]:
aq['MeasureName'].unique()

array(['Number of days with maximum 8-hour average ozone concentration over the National Ambient Air Quality Standard',
       'Percent of days with PM2.5 levels over the National Ambient Air Quality Standard (NAAQS)',
       'Person-days with PM2.5 over the National Ambient Air Quality Standard',
       'Number of person-days with maximum 8-hour average ozone concentration over the National Ambient Air Quality Standard',
       'Annual average ambient concentrations of PM2.5 in micrograms per cubic meter (based on seasonal averages and daily measurement)',
       'Number of days with maximum 8-hour average ozone concentration over the National Ambient Air Quality Standard (monitor and modeled data)',
       'Number of person-days with maximum 8-hour average ozone concentration over the National Ambient Air Quality Standard (monitor and modeled data)',
       'Percent of days with PM2.5 levels over the National Ambient Air Quality Standard (monitor and modeled data)',
       'Number of

## EDA of `us_greenhouse_gas_emission_direct_emitter_gas_type.csv`

In [63]:
emis_gas = pd.read_csv("Dataset A/us_greenhouse_gas_emission_direct_emitter_gas_type.csv")
emis_gas.head()

Unnamed: 0,V_GHG_EMITTER_GAS.ADDRESS1,V_GHG_EMITTER_GAS.ADDRESS2,V_GHG_EMITTER_GAS.CITY,V_GHG_EMITTER_GAS.CO2E_EMISSION,V_GHG_EMITTER_GAS.COUNTY,V_GHG_EMITTER_GAS.FACILITY_ID,V_GHG_EMITTER_GAS.GAS_CODE,V_GHG_EMITTER_GAS.GAS_NAME,V_GHG_EMITTER_GAS.LATITUDE,V_GHG_EMITTER_GAS.LONGITUDE,V_GHG_EMITTER_GAS.STATE,V_GHG_EMITTER_GAS.STATE_NAME,V_GHG_EMITTER_GAS.YEAR,V_GHG_EMITTER_GAS.ZIP,V_GHG_EMITTER_GAS.FACILITY_NAME,V_GHG_EMITTER_GAS.COUNTY_FIPS
0,1919 S. BROADWAY,,GREEN BAY,58024.0,BROWN COUNTY,1000589,BIOCO2,Biogenic CO2,44.4925,-88.0323,WI,WISCONSIN,2017,54304,GEORGIA-PACIFIC CONSUMER OPERATIONS LLC,55009.0
1,850 12TH AVE,,NEW YORK,134.5,New York,1000766,CH4,Methane,40.7711,-73.9911,NY,NEW YORK,2018,10019,59th Street,36061.0
2,3379 HWY 482,,Noble,6.854,SABINE,1009343,N2O,Nitrous Oxide,31.621528,-93.724774,LA,LOUISIANA,2012,71462,Baker Road Treater,22085.0
3,1012 BEAUCHAMP ST,,GREENVILLE,53562.0,WASHINGTON COUNTY,1003557,CH4,Methane,33.392476,-91.017584,MS,MISSISSIPPI,2017,38701,TEXAS GAS TRANSMISSION - GREENVILLE STATION,28151.0
4,487 CORN CREEK ROAD,,BEDFORD,7635064.7,Trimble,1006542,CO2,Carbon Dioxide,38.5847,-85.4117,KY,KENTUCKY,2018,40006,Trimble County,21223.0


In [65]:
em_faci = pd.read_csv("Dataset A/us_greenhouse_gas_emissions_direct_emitter_facilities.csv")
em_faci.head()

  em_faci = pd.read_csv("Dataset A/us_greenhouse_gas_emissions_direct_emitter_facilities.csv")


Unnamed: 0,V_GHG_EMITTER_FACILITIES.ADDRESS1,V_GHG_EMITTER_FACILITIES.ADDRESS2,V_GHG_EMITTER_FACILITIES.CEMS_USED,V_GHG_EMITTER_FACILITIES.CITY,V_GHG_EMITTER_FACILITIES.COUNTY,V_GHG_EMITTER_FACILITIES.COUNTY_FIPS,V_GHG_EMITTER_FACILITIES.FACILITY_ID,V_GHG_EMITTER_FACILITIES.LATITUDE,V_GHG_EMITTER_FACILITIES.LONGITUDE,V_GHG_EMITTER_FACILITIES.PRIMARY_NAICS_CODE,...,V_GHG_EMITTER_FACILITIES.STATE_NAME,V_GHG_EMITTER_FACILITIES.YEAR,V_GHG_EMITTER_FACILITIES.ZIP,V_GHG_EMITTER_FACILITIES.FACILITY_NAME,V_GHG_EMITTER_FACILITIES.SECONDARY_NAICS_CODE,V_GHG_EMITTER_FACILITIES.ADDITIONAL_NAICS_CODES,V_GHG_EMITTER_FACILITIES.COGENERATION_UNIT_EMISS_IND,V_GHG_EMITTER_FACILITIES.EPA_VERIFIED,V_GHG_EMITTER_FACILITIES.PARENT_COMPANY,V_GHG_EMITTER_FACILITIES.PLANT_CODE_INDICATOR
0,301 Commerce Dr. Suite 3701,,,Fort Worth,,,1008536.0,35.53376,-97.52976,211111.0,...,TEXAS,2012.0,76102.0,TEP Barnett USA LLC 415 - Strawn Basin,,,N,,CHESAPEAKE ENERGY CORP (100%),
1,449 Shell E&P Court,,,Gibson,TERREBONNE,22109.0,1005071.0,29.62879,-90.91768,211112.0,...,LOUISIANA,2011.0,70358.0,North Terrebonne Gas Plant,,,N,,ENTERPRISE GAS PROCESSING LLC (64.19%); DCP M...,
2,5494 MALONE ROAD,,,MEMPHIS,SHELBY COUNTY,47157.0,1002466.0,34.999213,-89.906724,562212.0,...,TENNESSEE,2017.0,38116.0,SOUTH SHELBY LANDFILL,,,N,,REPUBLIC SERVICES INC (100%),N
3,1601 WEEDON ISLAND DRIVE,,,SAINT PETERSBURG,Pinellas,12103.0,1001489.0,27.8613,-82.6012,221112.0,...,FLORIDA,2015.0,33702.0,P L Bartow Power Plant,,,N,,DUKE ENERGY CORP (100%),Y
4,,,,Rhome,WISE COUNTY,48497.0,1007509.0,33.0526,-97.4114,211112.0,...,TEXAS,2015.0,76078.0,Ross Compressor Station,,,N,,SWG PIPELINE LLC (100%),Y
