### Data Exploration and Cleanup Process

In [None]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import requests
import time
import json
import gmaps
from scipy.stats import linregress


# Import API key
from config import weather_api_key
from config import g_key

#### CSV Files to Pandas Dataframes

In [None]:
# Load every raw CSV used in this notebook into its own pandas dataframe.

# Country-level inputs: CO2 emissions and GDP (GDP per capita, PPP)
co2_country_df, gdp_country_df = (
    pd.read_csv(country_csv)
    for country_csv in (
        "../project-1-data/co2_emission.csv",
        "../project-1-data/GDP-countries-1990-2018.csv",
    )
)

# State-level inputs: file locations for emissions, median income and
# energy-by-source data
emission_data_path, income_data_path, energy_source_path = (
    "carbon_emission_data.csv",
    "median_income_2018.csv",
    "energy_by_source.csv",
)

# Read the three state-level files, in the same order as the paths above
state_emissions_2018, state_income_2018, energy_source_data = map(
    pd.read_csv,
    (emission_data_path, income_data_path, energy_source_path),
)

In [None]:
# Show the first five rows of the country-level CO2 emissions data
co2_country_df.head(n=5)

In [None]:
# Show the first five rows of the country-level GDP data
gdp_country_df.head(n=5)

In [None]:
# Show the first five rows of the state emissions data
state_emissions_2018.head(n=5)

In [None]:
# Show the first five rows of the state median income data
state_income_2018.head(n=5)

In [None]:
# Show the first five rows of the energy-by-source data
energy_source_data.head(n=5)

#### Cleaning GDP Country Dataframe

In [None]:
# Clean the GDP column labels: strip stray leading/trailing whitespace from
# every label instead of renaming a single hard-coded one. The raw header
# contains "Country " (trailing space); after this step it is "Country".
# Non-string labels, if any, are left untouched.
gdp_country_df = gdp_country_df.rename(
    columns=lambda label: label.strip() if isinstance(label, str) else label
)

# Last expression of the cell, so the cleaned labels are actually displayed
gdp_country_df.columns

In [None]:
# Keep only the country name and the 2016 GDP figure, giving the year column
# a more descriptive label in the same step
gdp_country_2016_df = (
    gdp_country_df
    .loc[:, ["Country", "2016"]]
    .rename(columns={"2016": "gdp_2016"})
)
gdp_country_2016_df.head(n=5)

#### Cleaning Country CO<sub>2</sub> Emissions Dataframe

In [None]:
# Display the column labels of the CO2 emissions dataframe so their exact
# spellings are visible before the data is filtered and renamed
co2_country_df.columns

In [None]:
# List the distinct Entity values to spot rows that are not individual countries
co2_country_df["Entity"].unique()

In [None]:
# Drop the Entity values that are regional aggregates or bookkeeping rows
# rather than individual countries
non_country_entities = [
    'Africa',
    'Americas (other)',
    'Antarctic Fisheries',
    'Asia and Pacific (other)',
    'EU-28',
    'Europe (other)',
    'International transport',
    'Statistical differences',
    'World',
]

# Keep every row whose Entity is not in the exclusion list
co2_country_df = co2_country_df[~co2_country_df["Entity"].isin(non_country_entities)]


In [None]:
# Re-list the distinct Entity values to confirm the excluded rows are gone
co2_country_df["Entity"].unique()

In [None]:
# Restrict the emissions data to 1991 onwards and switch to shorter,
# merge-friendly column names
co2_country_df = (
    co2_country_df[co2_country_df["Year"] >= 1991]
    .rename(columns={
        "Entity": "Country",
        "Annual CO₂ emissions (tonnes )": "annual_co2_tonnes",
    })
)
co2_country_df.head(n=5)

In [None]:
# Pull out the emissions rows recorded for 2016 only
co2_country_2016_df = co2_country_df[co2_country_df["Year"] == 2016]
co2_country_2016_df.head(n=5)

In [None]:
# Combine 2016 emissions with 2016 GDP; the inner join keeps only countries
# that appear in both dataframes
co2_gdp_merge_df = pd.merge(
    co2_country_2016_df,
    gdp_country_2016_df,
    how="inner",
    on="Country",
)
co2_gdp_merge_df.head(n=5)

#### Geocode API Calls and Heat Maps

In [None]:
# Geocode every country in the merged dataframe with the Google Geocoding API
# and store the coordinates in new 'Lat' / 'Lng' columns.

# Start from NaN rather than "" so the columns stay numeric and a failed
# lookup is recorded as a proper missing value
co2_gdp_merge_df['Lat'] = np.nan
co2_gdp_merge_df['Lng'] = np.nan

geocode_url = "https://maps.googleapis.com/maps/api/geocode/json"

for index, row in co2_gdp_merge_df.iterrows():

    country = row.Country

    try:
        # Passing the query via `params` URL-encodes country names that
        # contain spaces, '&', accents, etc.; the timeout stops one stalled
        # request from hanging the whole cell
        response = requests.get(
            geocode_url,
            params={"address": country, "key": g_key},
            timeout=10,
        )
        response.raise_for_status()

        # Extract latitude and longitude of the first match
        location = response.json()["results"][0]["geometry"]["location"]
        lat = location["lat"]
        lng = location["lng"]
    except (requests.exceptions.RequestException, ValueError, LookupError) as error:
        # Network/HTTP failure, undecodable body, or an empty/unexpected
        # result: leave the coordinates as NaN and report which country failed
        print(f"Could not find {country} ({type(error).__name__}).")
        continue

    co2_gdp_merge_df.loc[index, 'Lat'] = lat
    co2_gdp_merge_df.loc[index, 'Lng'] = lng
    print(f"{country} found, adding coordinates")

co2_gdp_merge_df.head()

In [None]:
# Build the inputs for the heat maps: coordinates in `locations`, GDP in
# `gdps` and carbon emissions in `carbons`.
#
# Countries whose geocoding failed (blank or missing coordinates) or that
# lack a GDP / emissions value cannot be drawn, so they are dropped here.
# Filtering one shared frame keeps the three outputs row-aligned.
heatmap_df = co2_gdp_merge_df.assign(
    # errors="coerce" turns blank or otherwise non-numeric coordinates into NaN
    Lat=pd.to_numeric(co2_gdp_merge_df["Lat"], errors="coerce"),
    Lng=pd.to_numeric(co2_gdp_merge_df["Lng"], errors="coerce"),
).dropna(subset=["Lat", "Lng", "gdp_2016", "annual_co2_tonnes"])

locations = heatmap_df[["Lat", "Lng"]]
gdps = heatmap_df["gdp_2016"]
carbons = heatmap_df["annual_co2_tonnes"]

#### Global GDP Heat Map

In [None]:
# Configure gmaps with the Google API key
gmaps.configure(api_key=g_key)

# Create the base map figure
fig = gmaps.figure()

# Create the heat layer, weighting each country by its 2016 GDP.
# Series.max() skips missing values, whereas the builtin max() gives an
# order-dependent result when NaN is present; float() hands gmaps a plain
# Python float.
heat_layer = gmaps.heatmap_layer(locations, weights=gdps,
                                 dissipating=False,
                                 max_intensity=float(gdps.max()),
                                 point_radius=5)

fig.add_layer(heat_layer)
fig

#### Global Carbon Emissions Heat Map

In [None]:
# Point gmaps at the Google API key
gmaps.configure(api_key=g_key)

# Heat layer weighted by each country's annual CO2 emissions; max_intensity
# is pinned to a fixed 10**9 rather than derived from the data
carbon_layer_options = dict(
    weights=carbons,
    dissipating=False,
    max_intensity=10**9,
    point_radius=5,
)
heat_layer = gmaps.heatmap_layer(locations, **carbon_layer_options)

# Base map figure with the layer attached; `fig` as the last expression renders it
fig = gmaps.figure()
fig.add_layer(heat_layer)
fig

#### BRIC Countries Data Cleaning

In [None]:
# Load the historic BRIC climate CSV into a dataframe and preview it
bric_climate_csv = "../project-1-data/BRIC-climate-1991-2016.csv"
bric_climate_df = pd.read_csv(bric_climate_csv)
bric_climate_df.head(n=5)

In [None]:
# Tidy the BRIC column labels (several raw labels start with a space) and
# keep only the columns needed downstream
bric_column_names = {
    "Temperature - (Celsius)": "Temperature_C",
    " Year": "Year",
    " Statistics": "Statistics",
    " Country": "Country",
    " ISO3": "Code",
}

bric_climate_df = (
    bric_climate_df
    .rename(columns=bric_column_names)
    .loc[:, ["Country", "Year", "Code", "Temperature_C"]]
)
bric_climate_df.head(n=5)

In [None]:
# Group the BRIC readings by country and year, then take the maximum of each
# remaining column within every (Country, Year) group
bric_groupby = bric_climate_df.groupby(["Country", "Year"])
bric_max_temp = bric_groupby.max()

# Preview the aggregated temperature and country code
bric_max_temp.loc[:, ["Temperature_C", "Code"]].head(n=5)

In [None]:
# Turn the (Country, Year) group index back into ordinary columns
bric_temp_df = bric_max_temp.reset_index()

# Remove the leading/trailing whitespace from the country and code values so
# they match the keys used in the emissions data. The vectorized .str.strip()
# passes missing values through as NaN instead of raising, as the previous
# per-element str.strip() call did.
bric_temp_df["Country"] = bric_temp_df["Country"].str.strip()
bric_temp_df["Code"] = bric_temp_df["Code"].str.strip()

In [None]:
# Merge BRIC temperatures with BRIC emissions. The left join keeps every
# temperature row even when no emissions row matches on Country/Year/Code.
bric_merge_df = bric_temp_df.merge(co2_country_df, how = 'left', on = ['Country', 'Year', 'Code'])

# Export dataframe as csv
bric_merge_df.to_csv("output-data/bric-merge-data.csv")

# Preview last: only the final expression of a cell is rendered, so the
# preview was never shown while it sat above the export
bric_merge_df.head()

#### Interactive Barplot for BRIC Countries

In [None]:
# Bar plot of BRIC carbon emissions for a user-selected year

# Range of years offered to the user (matches the prompt text below)
first_year, last_year = 1991, 2016

# Get user input for year of interest
year_of_interest = input("Please select a year between 1991 and 2016:").strip()

# Validate the input up front so a typo gives a clear message instead of a
# bare int() error, and an out-of-range year no longer yields an empty chart
try:
    selected_year = int(year_of_interest)
except ValueError:
    raise ValueError(
        f"{year_of_interest!r} is not a valid year; enter a whole number "
        f"between {first_year} and {last_year}."
    ) from None

if not first_year <= selected_year <= last_year:
    raise ValueError(
        f"Year {selected_year} is outside the supported range "
        f"{first_year}-{last_year}."
    )

# Subset data for given year
bar_data = bric_merge_df.loc[bric_merge_df.Year == selected_year, :]

# Creating barplot
plt.bar(bar_data.Country, bar_data.annual_co2_tonnes)
plt.title(f"BRIC Countries: CO2 Emissions ({selected_year})")
plt.xlabel("Country")
plt.ylabel("Annual Carbon Emissions (tonnes)")
plt.show()

#### State Emissions and Income Dataframe Cleaning

Before the state emissions and state income dataframes could be merged, the state abbreviations in the emissions data had to be replaced with full state names using a state abbreviation dictionary. The commas also had to be removed from the median income values, which were then converted to integers so that the data could be plotted.


In [None]:
# Lookup table from full state/territory name to its two-letter postal code
us_state_abbrev = {
    "Alabama": "AL",
    "Alaska": "AK",
    "American Samoa": "AS",
    "Arizona": "AZ",
    "Arkansas": "AR",
    "California": "CA",
    "Colorado": "CO",
    "Connecticut": "CT",
    "Delaware": "DE",
    "District of Columbia": "DC",
    "Florida": "FL",
    "Georgia": "GA",
    "Guam": "GU",
    "Hawaii": "HI",
    "Idaho": "ID",
    "Illinois": "IL",
    "Indiana": "IN",
    "Iowa": "IA",
    "Kansas": "KS",
    "Kentucky": "KY",
    "Louisiana": "LA",
    "Maine": "ME",
    "Maryland": "MD",
    "Massachusetts": "MA",
    "Michigan": "MI",
    "Minnesota": "MN",
    "Mississippi": "MS",
    "Missouri": "MO",
    "Montana": "MT",
    "Nebraska": "NE",
    "Nevada": "NV",
    "New Hampshire": "NH",
    "New Jersey": "NJ",
    "New Mexico": "NM",
    "New York": "NY",
    "North Carolina": "NC",
    "North Dakota": "ND",
    "Northern Mariana Islands": "MP",
    "Ohio": "OH",
    "Oklahoma": "OK",
    "Oregon": "OR",
    "Pennsylvania": "PA",
    "Puerto Rico": "PR",
    "Rhode Island": "RI",
    "South Carolina": "SC",
    "South Dakota": "SD",
    "Tennessee": "TN",
    "Texas": "TX",
    "Utah": "UT",
    "Vermont": "VT",
    "Virgin Islands": "VI",
    "Virginia": "VA",
    "Washington": "WA",
    "West Virginia": "WV",
    "Wisconsin": "WI",
    "Wyoming": "WY",
}

# Inverse lookup: two-letter postal code -> full name
abbrev_us_state = {
    postal_code: full_name
    for full_name, postal_code in us_state_abbrev.items()
}

# Swap the postal codes in the State column for full state names, leaving
# every other column untouched
state_emissions_2018 = state_emissions_2018.assign(
    State=state_emissions_2018["State"].replace(abbrev_us_state)
)

In [None]:
# Join state emissions with state median income on the state name, then
# discard the footnote column and the row labelled 51
# NOTE(review): row 51 is dropped without explanation - presumably a
# non-state summary row; confirm against the CSV
merge_df = (
    state_emissions_2018
    .merge(state_income_2018, how="outer", on="State")
    .drop(columns=["Federal offshore production is not included in the Production Shares."])
    .drop(index=51)
)

# Strip the thousands separators from median income and store it as integers
merge_df["Median income"] = (
    merge_df["Median income"].str.replace(",", "").astype(int)
)
merge_df.head(n=5)

#### Barplot Visualization for State Emissions and Income Data

In [None]:
# Pull the individual value sets out of the merged state dataframe

# Distinct state names, in order of first appearance
state_name = pd.unique(merge_df["State"])

# Integer-cast copies of the production share and median income columns
us_co2_production, median_income = (
    merge_df[column_name].astype(int)
    for column_name in ("Production, U.S. Share", "Median income")
)

# Per-capita consumption as a plain Python list
# NOTE(review): the source column is measured in Million Btu although the
# variable is named co2_per_capita - confirm what it represents
co2_per_capita = merge_df.loc[:, "Consumption per Capita, Million Btu"].tolist()

In [None]:
# One-column dataframe indexed by state, ready for plotting
state_emissions_df = merge_df.set_index("State")[["Production, U.S. Share"]]

# Bar chart of each state's production share
# NOTE(review): the plotted column is "Production, U.S. Share" while the axis
# label says CO2 Production - confirm the label
emissions_axes = state_emissions_df.plot.bar(figsize=(20, 3))
emissions_axes.set_ylabel("CO2 Production")

plt.show()

In [None]:
# Create a state-indexed dataframe holding only the median income
state_median_df = merge_df[["State", "Median income"]]
state_median_df = state_median_df.set_index("State")

# Plot state vs. median income. The y-axis is labelled like the sibling state
# bar charts so the figure can be read on its own.
state_median_df.plot(kind='bar', figsize=(20,3))
plt.ylabel("Median Income")
plt.show()

In [None]:
# One-column dataframe indexed by state, ready for plotting
state_co2_per_capita = merge_df.set_index("State")[["Consumption per Capita, Million Btu"]]

# Bar chart of per-capita consumption by state
# NOTE(review): the plotted column is measured in Million Btu while the axis
# label says CO2 - confirm the label
per_capita_axes = state_co2_per_capita.plot.bar(figsize=(20, 3))
per_capita_axes.set_ylabel("CO2 Consumption per Capita")
plt.show()

#### U.S. Energy Source Data

In [None]:
# Replace the raw column labels of the energy-by-source data with short,
# readable names (one label per existing column, in order)
energy_columns = [
    "Year",
    "Coal",
    "Natural Gas",
    "Crude Oil",
    "Natural Gas Plant Liquids",
    "Nuclear",
    "Renewables",
]
energy_source_data.columns = energy_columns
energy_source_data.head(n=5)

In [None]:
# Reduce the energy source data to the 2018 rows, remove exact duplicate
# rows, and index the result by year
energy_source_data = (
    energy_source_data[energy_source_data["Year"] == 2018]
    .drop_duplicates()
    .set_index("Year")
)

# Bar graph of the 2018 energy sources with a horizontal year tick label
energy_axes = energy_source_data.plot.bar()
energy_axes.set_ylabel("U.S. Energy Consumption (%)")
plt.xticks(rotation="horizontal")

plt.show()