In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression # Linear Regression
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import make_classification
from imblearn.over_sampling import RandomOverSampler
from imblearn.ensemble import BalancedRandomForestClassifier # RandomForestClassifier

In [3]:
%matplotlib inline
from sklearn.datasets import make_blobs

Step One

Importing Severe Weather Test Data

In [4]:
# importing test weather data
weather_df = pd.read_csv("Severe_Weather_TestData.csv", low_memory=False)

In [5]:
# dropping columns not needed
weather_df = weather_df.drop(columns=['INJURIES_DIRECT', 'INJURIES_INDIRECT', 'DEATHS_DIRECT', 'DEATHS_INDIRECT',
                                     'DAMAGE_PROPERTY', 'DAMAGE_CROPS', 'MAGNITUDE', 'TOR_F_SCALE', 'TOR_LENGTH',
                                     'TOR_WIDTH', 'BEGIN_LAT', 'BEGIN_LON'])

In [6]:
# displaying weather df
weather_df.head(30)

Unnamed: 0,EVENT_ID,STATE,STATE_FIPS,YEAR,MONTH_NAME,EVENT_TYPE,CATEGORY
0,10050384,MISSISSIPPI,28,1950,June,Tornado,
1,10086808,OHIO,39,1950,January,Tornado,
2,10120418,TEXAS,48,1950,June,Tornado,
3,9981922,ARKANSAS,5,1950,January,Tornado,
4,10001432,GEORGIA,13,1950,June,Tornado,
5,10049829,MISSISSIPPI,28,1955,October,Tornado,
6,9984208,ARKANSAS,5,1955,October,Tornado,
7,9991373,COLORADO,8,1955,June,Tornado,
8,10121863,TEXAS,48,1955,June,Tornado,
9,9978062,ALABAMA,1,1964,January,Tornado,


Step Two

Create a DF that focuses on events by year

In [7]:
# adding together events by year

year_groups = weather_df.groupby('YEAR', as_index=False).count()
year_groups

Unnamed: 0,YEAR,EVENT_ID,STATE,STATE_FIPS,MONTH_NAME,EVENT_TYPE,CATEGORY
0,1950,5,5,5,5,5,0
1,1955,4,4,4,4,4,0
2,1964,10,10,10,10,10,0
3,1967,26,26,26,26,26,0
4,1973,28,28,28,28,28,0
5,1980,14,14,14,14,14,0
6,1986,5,5,5,5,5,0
7,1989,17,17,17,17,17,0
8,1997,78,78,78,78,78,0
9,2003,16,16,16,16,16,0


In [8]:
# rename columns as counts
year_groups = year_groups.rename(columns={"EVENT_ID": "Number_Of_Events_by_Year"})
year_groups

Unnamed: 0,YEAR,Number_Of_Events_by_Year,STATE,STATE_FIPS,MONTH_NAME,EVENT_TYPE,CATEGORY
0,1950,5,5,5,5,5,0
1,1955,4,4,4,4,4,0
2,1964,10,10,10,10,10,0
3,1967,26,26,26,26,26,0
4,1973,28,28,28,28,28,0
5,1980,14,14,14,14,14,0
6,1986,5,5,5,5,5,0
7,1989,17,17,17,17,17,0
8,1997,78,78,78,78,78,0
9,2003,16,16,16,16,16,0


In [9]:
# drop unnecessary columns
year_groups = year_groups.drop(columns=['MONTH_NAME', 'EVENT_TYPE', 'STATE', 'STATE_FIPS', 'CATEGORY'])

In [10]:
# check work
year_groups

Unnamed: 0,YEAR,Number_Of_Events_by_Year
0,1950,5
1,1955,4
2,1964,10
3,1967,26
4,1973,28
5,1980,14
6,1986,5
7,1989,17
8,1997,78
9,2003,16


In [11]:
# Saving weather data focused on year
file_path = "cleaned_weather_year_nonML.csv"
year_groups.to_csv(file_path, index=False)

Step Three

Create a DF that focuses on events by month

In [12]:
# adding together events by month
month_groups = weather_df.groupby('MONTH_NAME', as_index=False).count()
month_groups

Unnamed: 0,MONTH_NAME,EVENT_ID,STATE,STATE_FIPS,YEAR,EVENT_TYPE,CATEGORY
0,December,29,29,29,29,29,0
1,January,48,48,48,48,48,0
2,June,175,175,175,175,175,0
3,October,37,37,37,37,37,0


In [13]:
# rename columns as counts
month_groups = month_groups.rename(columns={"EVENT_ID": "Number_Of_Events_by_Month"})

In [14]:
# check work
month_groups

Unnamed: 0,MONTH_NAME,Number_Of_Events_by_Month,STATE,STATE_FIPS,YEAR,EVENT_TYPE,CATEGORY
0,December,29,29,29,29,29,0
1,January,48,48,48,48,48,0
2,June,175,175,175,175,175,0
3,October,37,37,37,37,37,0


In [15]:
# drop unnecessary columns
month_groups = month_groups.drop(columns=['YEAR', 'EVENT_TYPE', 'STATE', 'STATE_FIPS', 'CATEGORY'])

In [16]:
# check work
month_groups

Unnamed: 0,MONTH_NAME,Number_Of_Events_by_Month
0,December,29
1,January,48
2,June,175
3,October,37


In [17]:
# Saving weather data focused on month
file_path = "cleaned_weather_month_nonML.csv"
month_groups.to_csv(file_path, index=False)

Step Four

Create a DF that counts the number of events by Type

In [18]:
# adding together events by type

type_groups = weather_df.groupby('EVENT_TYPE', as_index=False).count()
type_groups

Unnamed: 0,EVENT_TYPE,EVENT_ID,STATE,STATE_FIPS,YEAR,MONTH_NAME,CATEGORY
0,Blizzard,20,20,20,20,20,0
1,Heavy Snow,5,5,5,5,5,0
2,Lightning,118,118,118,118,118,0
3,Tornado,146,146,146,146,146,0


In [19]:
# check work
type_groups

Unnamed: 0,EVENT_TYPE,EVENT_ID,STATE,STATE_FIPS,YEAR,MONTH_NAME,CATEGORY
0,Blizzard,20,20,20,20,20,0
1,Heavy Snow,5,5,5,5,5,0
2,Lightning,118,118,118,118,118,0
3,Tornado,146,146,146,146,146,0


In [20]:
# drop unnecessary columns
type_groups = type_groups.drop(columns=['YEAR', 'MONTH_NAME', 'CATEGORY', 'STATE', 'STATE_FIPS'])

In [23]:
# rename columns 
type_groups = type_groups.rename(columns={"EVENT_ID": "Count"})

In [24]:
# check work
type_groups

Unnamed: 0,EVENT_TYPE,Count
0,Blizzard,20
1,Heavy Snow,5
2,Lightning,118
3,Tornado,146


In [25]:
# Saving weather data focused on type
file_path = "cleaned_weather_type_nonML.csv"
type_groups.to_csv(file_path, index=False)

Step Five

Concatanate the groups into a single DF

In [24]:
# gather up all the groups
# year_groups + month_groups + type_groups

In [None]:
# Concat the groups with pandas

In [27]:
concat1 = pd.concat([type_groups, month_groups])
concat1

Unnamed: 0,EVENT_TYPE,Number_Of_Events_by_Type,MONTH_NAME,Number_Of_Events_by_Month
0,Blizzard,20.0,,
1,Heavy Snow,5.0,,
2,Lightning,118.0,,
3,Tornado,146.0,,
0,,,December,29.0
1,,,January,48.0
2,,,June,175.0
3,,,October,37.0


In [29]:
# Drop the null rows
concat1 = concat1.dropna()
concat1

Unnamed: 0,EVENT_TYPE,Number_Of_Events_by_Type,MONTH_NAME,Number_Of_Events_by_Month


Step Six

Create a Severe Weather DF not for ML purposes

In [26]:
# drop null columns
weather_df = weather_df.dropna(axis='columns', how='all')

In [27]:
# Drop the null rows
weather_df = weather_df.dropna()

In [28]:
# show cleaned dataset
weather_df.head(25)

Unnamed: 0,EVENT_ID,STATE,STATE_FIPS,YEAR,MONTH_NAME,EVENT_TYPE
0,10050384,MISSISSIPPI,28,1950,June,Tornado
1,10086808,OHIO,39,1950,January,Tornado
2,10120418,TEXAS,48,1950,June,Tornado
3,9981922,ARKANSAS,5,1950,January,Tornado
4,10001432,GEORGIA,13,1950,June,Tornado
5,10049829,MISSISSIPPI,28,1955,October,Tornado
6,9984208,ARKANSAS,5,1955,October,Tornado
7,9991373,COLORADO,8,1955,June,Tornado
8,10121863,TEXAS,48,1955,June,Tornado
9,9978062,ALABAMA,1,1964,January,Tornado


In [29]:
# Saving cleaned test weather data
file_path = "cleaned_weather_one_nonML.csv"
weather_df.to_csv(file_path, index=False)

In [30]:
# make data for ML

In [31]:
# dropping columns not needed
weather_df = weather_df.drop(columns=['STATE', 'STATE_FIPS'])

In [13]:
# show df for ML
weather_df

Unnamed: 0,EVENT_ID,YEAR,MONTH_NAME,EVENT_TYPE
0,10050384,1950,June,Tornado
1,10086808,1950,January,Tornado
2,10120418,1950,June,Tornado
3,9981922,1950,January,Tornado
4,10001432,1950,June,Tornado
...,...,...,...,...
284,240665,2010,June,Lightning
285,245002,2010,June,Lightning
286,245889,2010,June,Tornado
287,226112,2010,June,Tornado


In [14]:
# Saving cleaned test weather data for ML
file_path = "cleaned_weather_one_ML.csv"
weather_df.to_csv(file_path, index=False)

In [15]:
# Co2 Below This

In [16]:
# importing test co2 data 
carbon_df = pd.read_csv("co2_byYear.csv", low_memory=False)

In [17]:
# display co2 data
carbon_df

Unnamed: 0,year,co2,co2_per_capita,co2_per_unit_energy,coal_co2,cement_co2,flaring_co2,gas_co2,oil_co2,other_industry_co2,...,ghg_per_capita,methane,methane_per_capita,nitrous_oxide,nitrous_oxide_per_capita,population,gdp,primary_energy_consumption,energy_per_capita,energy_per_gdp
0,1950,23193.336,943.286,0.000,15959.768,272.887,223.550,1110.931,5611.231,9.976,...,0.000,0.00,0.000,0.00,0.000,7.570713e+09,1.585967e+13,0.000,0.000,0.000
1,1951,24765.194,1214.978,0.000,16981.583,309.265,243.305,1306.674,5907.881,11.002,...,0.000,0.00,0.000,0.00,0.000,7.712655e+09,7.992201e+12,0.000,0.000,0.000
2,1952,25203.424,834.737,0.000,16927.201,331.430,259.877,1405.429,6263.420,10.710,...,0.000,0.00,0.000,0.00,0.000,7.852545e+09,8.401431e+12,0.000,0.000,0.000
3,1953,25929.893,880.578,0.000,17104.163,368.454,248.979,1498.933,6689.993,12.910,...,0.000,0.00,0.000,0.00,0.000,7.992062e+09,8.884521e+12,0.000,0.000,0.000
4,1954,26744.552,1378.627,0.000,17319.801,400.182,236.584,1588.220,7182.609,11.442,...,0.000,0.00,0.000,0.00,0.000,8.133217e+09,9.099744e+12,0.000,0.000,0.000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
66,2016,124705.642,1156.831,44.924,48325.275,4990.527,1438.122,27011.160,41892.058,1048.509,...,1415.378,17424.08,345.519,6291.93,105.005,2.239131e+10,2.147076e+14,366170.181,5562418.068,208.589
67,2017,126304.716,1156.251,18.863,48587.789,5067.569,1576.265,27603.881,42412.614,1056.584,...,0.000,0.00,0.000,0.00,0.000,2.264281e+10,2.211738e+14,366923.317,3678183.705,0.000
68,2018,128452.538,1152.783,18.444,49368.748,5286.347,1531.205,28781.873,42422.168,1062.207,...,0.000,0.00,0.000,0.00,0.000,2.289184e+10,2.276454e+14,376507.961,3698476.648,0.000
69,2019,128422.887,1165.000,18.142,48731.161,5408.677,1611.674,29061.768,42547.578,1062.016,...,0.000,0.00,0.000,0.00,0.000,2.313895e+10,0.000000e+00,380148.487,3659567.475,0.000


In [18]:
# rename columns for merging purposes
carbon_df = carbon_df.rename(columns={"year": "YEAR"})
carbon_df

Unnamed: 0,YEAR,co2,co2_per_capita,co2_per_unit_energy,coal_co2,cement_co2,flaring_co2,gas_co2,oil_co2,other_industry_co2,...,ghg_per_capita,methane,methane_per_capita,nitrous_oxide,nitrous_oxide_per_capita,population,gdp,primary_energy_consumption,energy_per_capita,energy_per_gdp
0,1950,23193.336,943.286,0.000,15959.768,272.887,223.550,1110.931,5611.231,9.976,...,0.000,0.00,0.000,0.00,0.000,7.570713e+09,1.585967e+13,0.000,0.000,0.000
1,1951,24765.194,1214.978,0.000,16981.583,309.265,243.305,1306.674,5907.881,11.002,...,0.000,0.00,0.000,0.00,0.000,7.712655e+09,7.992201e+12,0.000,0.000,0.000
2,1952,25203.424,834.737,0.000,16927.201,331.430,259.877,1405.429,6263.420,10.710,...,0.000,0.00,0.000,0.00,0.000,7.852545e+09,8.401431e+12,0.000,0.000,0.000
3,1953,25929.893,880.578,0.000,17104.163,368.454,248.979,1498.933,6689.993,12.910,...,0.000,0.00,0.000,0.00,0.000,7.992062e+09,8.884521e+12,0.000,0.000,0.000
4,1954,26744.552,1378.627,0.000,17319.801,400.182,236.584,1588.220,7182.609,11.442,...,0.000,0.00,0.000,0.00,0.000,8.133217e+09,9.099744e+12,0.000,0.000,0.000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
66,2016,124705.642,1156.831,44.924,48325.275,4990.527,1438.122,27011.160,41892.058,1048.509,...,1415.378,17424.08,345.519,6291.93,105.005,2.239131e+10,2.147076e+14,366170.181,5562418.068,208.589
67,2017,126304.716,1156.251,18.863,48587.789,5067.569,1576.265,27603.881,42412.614,1056.584,...,0.000,0.00,0.000,0.00,0.000,2.264281e+10,2.211738e+14,366923.317,3678183.705,0.000
68,2018,128452.538,1152.783,18.444,49368.748,5286.347,1531.205,28781.873,42422.168,1062.207,...,0.000,0.00,0.000,0.00,0.000,2.289184e+10,2.276454e+14,376507.961,3698476.648,0.000
69,2019,128422.887,1165.000,18.142,48731.161,5408.677,1611.674,29061.768,42547.578,1062.016,...,0.000,0.00,0.000,0.00,0.000,2.313895e+10,0.000000e+00,380148.487,3659567.475,0.000


In [19]:
# dropping columns not needed
carbon_df = carbon_df.drop(columns=['co2_per_unit_energy', 'coal_co2', 'cement_co2', 'flaring_co2', 'gas_co2',
                                         'oil_co2', 'other_industry_co2', 'ghg_per_capita', 'methane', 'methane_per_capita',
                                         'nitrous_oxide', 'nitrous_oxide_per_capita', 'population', 'gdp', 'primary_energy_consumption',
                                         'energy_per_capita', 'energy_per_gdp', 'total_ghg'])

In [20]:
# show new dataframe
carbon_df

Unnamed: 0,YEAR,co2,co2_per_capita
0,1950,23193.336,943.286
1,1951,24765.194,1214.978
2,1952,25203.424,834.737
3,1953,25929.893,880.578
4,1954,26744.552,1378.627
...,...,...,...
66,2016,124705.642,1156.831
67,2017,126304.716,1156.251
68,2018,128452.538,1152.783
69,2019,128422.887,1165.000


In [21]:
# Saving cleaned carbon data
file_path = "cleaned_carbon_one_nonML.csv"
weather_df.to_csv(file_path, index=False)

In [22]:
# making data for ML with the carbon data

In [21]:
# dropping columns not needed
# carbon_df = carbon_df.drop(columns=['ISO_CODE', 'COUNTRY'])
# carbon_df

In [22]:
# Saving cleaned carbon data
file_path = "cleaned_carbon_one_ML.csv"
weather_df.to_csv(file_path, index=False)

In [23]:
# checking dtypes

In [24]:
carbon_df.dtypes

YEAR                int64
co2               float64
co2_per_capita    float64
dtype: object

In [25]:
# weather dtypes
weather_df.dtypes

EVENT_ID       int64
YEAR           int64
MONTH_NAME    object
EVENT_TYPE    object
dtype: object

In [26]:
# Merging below this

In [27]:
# Merge attempt one
combo_df = weather_df.merge(carbon_df, left_on='YEAR', right_on='YEAR')

In [28]:
# checking merge
combo_df

Unnamed: 0,EVENT_ID,YEAR,MONTH_NAME,EVENT_TYPE,co2,co2_per_capita
0,10050384,1950,June,Tornado,23193.336,943.286
1,10086808,1950,January,Tornado,23193.336,943.286
2,10120418,1950,June,Tornado,23193.336,943.286
3,9981922,1950,January,Tornado,23193.336,943.286
4,10001432,1950,June,Tornado,23193.336,943.286
...,...,...,...,...,...,...
284,240665,2010,June,Lightning,118593.920,1210.391
285,245002,2010,June,Lightning,118593.920,1210.391
286,245889,2010,June,Tornado,118593.920,1210.391
287,226112,2010,June,Tornado,118593.920,1210.391


In [29]:
# Drop the null columns where all values are null
combo_df = combo_df.dropna(axis='columns', how='all')

In [30]:
# check work
combo_df

Unnamed: 0,EVENT_ID,YEAR,MONTH_NAME,EVENT_TYPE,co2,co2_per_capita
0,10050384,1950,June,Tornado,23193.336,943.286
1,10086808,1950,January,Tornado,23193.336,943.286
2,10120418,1950,June,Tornado,23193.336,943.286
3,9981922,1950,January,Tornado,23193.336,943.286
4,10001432,1950,June,Tornado,23193.336,943.286
...,...,...,...,...,...,...
284,240665,2010,June,Lightning,118593.920,1210.391
285,245002,2010,June,Lightning,118593.920,1210.391
286,245889,2010,June,Tornado,118593.920,1210.391
287,226112,2010,June,Tornado,118593.920,1210.391


In [31]:
# Drop the null rows
combo_df = combo_df.dropna()

In [36]:
# check work
combo_df

Unnamed: 0,EVENT_ID,YEAR,MONTH_NAME,EVENT_TYPE,co2,co2_per_capita
0,10050384,1950,June,Tornado,23193.336,943.286
1,10086808,1950,January,Tornado,23193.336,943.286
2,10120418,1950,June,Tornado,23193.336,943.286
3,9981922,1950,January,Tornado,23193.336,943.286
4,10001432,1950,June,Tornado,23193.336,943.286
...,...,...,...,...,...,...
284,240665,2010,June,Lightning,118593.920,1210.391
285,245002,2010,June,Lightning,118593.920,1210.391
286,245889,2010,June,Tornado,118593.920,1210.391
287,226112,2010,June,Tornado,118593.920,1210.391


In [37]:
# Adding machine learning model parameters below here

In [38]:
target = ["co2"]

In [39]:
# split data into training and testing

# Create our features
X = pd.get_dummies(combo_df.drop(columns="co2"))

# Create our target
y = pd.get_dummies(combo_df["co2"])

In [40]:
X.describe()

Unnamed: 0,EVENT_ID,YEAR,co2_per_capita,MONTH_NAME_December,MONTH_NAME_January,MONTH_NAME_June,MONTH_NAME_October,EVENT_TYPE_Blizzard,EVENT_TYPE_Heavy Snow,EVENT_TYPE_Lightning,EVENT_TYPE_Tornado
count,289.0,289.0,289.0,289.0,289.0,289.0,289.0,289.0,289.0,289.0,289.0
mean,6360170.0,1991.49827,1210.795678,0.100346,0.16609,0.605536,0.128028,0.069204,0.017301,0.408304,0.50519
std,3423148.0,16.311875,112.615095,0.300982,0.372807,0.489583,0.3347,0.254241,0.130617,0.492373,0.50084
min,202850.0,1950.0,943.286,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,5455386.0,1973.0,1140.447,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,5607755.0,1997.0,1210.391,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
75%,10012200.0,2005.0,1245.04,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0
max,10161730.0,2010.0,1503.995,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [42]:
# Check the balance of our target values
y = combo_df['co2']
y.value_counts()

90055.545     78
118593.920    48
107354.795    38
67163.422     28
48536.547     26
86621.769     17
100340.414    16
76976.517     14
43134.197     10
80843.758      5
23193.336      5
29264.525      4
Name: co2, dtype: int64

In [None]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, stratify=y)
Counter(y_train)

In [None]:
# resample the training data with the BalancedRandomForestClassifier

scaler = StandardScaler()
X_scaler = scaler.fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)
brfc = BalancedRandomForestClassifier(n_estimators=100, random_state=1)
brfc.fit(X_train_scaled, y_train)

In [None]:
# display the confusion matrix
from sklearn.metrics import confusion_matrix

y_pred = brfc.predict(X_test)
confusion_matrix(y_test, y_pred)

In [None]:
# calculated the balanced accuracy score
from sklearn.metrics import balanced_accuracy_score

balanced_accuracy_score(y_test, y_pred)

In [None]:
# print the imbalanced classification report
from imblearn.metrics import classification_report_imbalanced
print(classification_report_imbalanced(y_test, y_pred))

In [None]:
# list the features sorted in descending order by feature importance
sorted(zip(brfc.feature_importances_, X.columns), reverse=True)