In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression # Linear Regression
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import make_classification
from imblearn.over_sampling import RandomOverSampler
from imblearn.ensemble import BalancedRandomForestClassifier # RandomForestClassifier

In [3]:
%matplotlib inline
from sklearn.datasets import make_blobs

In [4]:
# importing test weather data
weather_df = pd.read_csv("Severe_Weather_TestData.csv", low_memory=False)

In [5]:
# dropping columns not needed
weather_df = weather_df.drop(columns=['INJURIES_DIRECT', 'INJURIES_INDIRECT', 'DEATHS_DIRECT', 'DEATHS_INDIRECT',
                                     'DAMAGE_PROPERTY', 'DAMAGE_CROPS', 'MAGNITUDE', 'TOR_F_SCALE', 'TOR_LENGTH',
                                     'TOR_WIDTH', 'BEGIN_LAT', 'BEGIN_LON'])

In [6]:
# displaying weather df
weather_df

Unnamed: 0,EVENT_ID,STATE,STATE_FIPS,YEAR,MONTH_NAME,EVENT_TYPE,CATEGORY
0,10050384,MISSISSIPPI,28,1950,June,Tornado,
1,10086808,OHIO,39,1950,January,Tornado,
2,10120418,TEXAS,48,1950,June,Tornado,
3,9981922,ARKANSAS,5,1950,January,Tornado,
4,10001432,GEORGIA,13,1950,June,Tornado,
...,...,...,...,...,...,...,...
284,240665,TENNESSEE,47,2010,June,Lightning,
285,245002,ILLINOIS,17,2010,June,Lightning,
286,245889,MINNESOTA,27,2010,June,Tornado,
287,226112,ILLINOIS,17,2010,June,Tornado,


In [7]:
# Drop the null columns where all values are null
weather_df = weather_df.dropna(axis='columns', how='all')

In [8]:
# Drop the null rows
weather_df = weather_df.dropna()

In [9]:
# show cleaned dataset
weather_df.head(25)

Unnamed: 0,EVENT_ID,STATE,STATE_FIPS,YEAR,MONTH_NAME,EVENT_TYPE
0,10050384,MISSISSIPPI,28,1950,June,Tornado
1,10086808,OHIO,39,1950,January,Tornado
2,10120418,TEXAS,48,1950,June,Tornado
3,9981922,ARKANSAS,5,1950,January,Tornado
4,10001432,GEORGIA,13,1950,June,Tornado
5,10049829,MISSISSIPPI,28,1955,October,Tornado
6,9984208,ARKANSAS,5,1955,October,Tornado
7,9991373,COLORADO,8,1955,June,Tornado
8,10121863,TEXAS,48,1955,June,Tornado
9,9978062,ALABAMA,1,1964,January,Tornado


In [10]:
# Saving cleaned test weather data
file_path = "cleaned_weather_one_nonML.csv"
weather_df.to_csv(file_path, index=False)

In [11]:
# make data for ML

In [12]:
# dropping columns not needed
weather_df = weather_df.drop(columns=['STATE', 'STATE_FIPS'])

In [13]:
# show df for ML
weather_df

Unnamed: 0,EVENT_ID,YEAR,MONTH_NAME,EVENT_TYPE
0,10050384,1950,June,Tornado
1,10086808,1950,January,Tornado
2,10120418,1950,June,Tornado
3,9981922,1950,January,Tornado
4,10001432,1950,June,Tornado
...,...,...,...,...
284,240665,2010,June,Lightning
285,245002,2010,June,Lightning
286,245889,2010,June,Tornado
287,226112,2010,June,Tornado


In [14]:
# Saving cleaned test weather data for ML
file_path = "cleaned_weather_one_ML.csv"
weather_df.to_csv(file_path, index=False)

In [15]:
# Co2 Below This

In [18]:
# importing test co2 data 
carbon_df = pd.read_csv("co2_byYear.csv", low_memory=False)

In [19]:
# display co2 data
carbon_df

Unnamed: 0,year,co2,co2_per_capita,co2_per_unit_energy,coal_co2,cement_co2,flaring_co2,gas_co2,oil_co2,other_industry_co2,...,ghg_per_capita,methane,methane_per_capita,nitrous_oxide,nitrous_oxide_per_capita,population,gdp,primary_energy_consumption,energy_per_capita,energy_per_gdp
0,1950,23193.336,943.286,0.000,15959.768,272.887,223.550,1110.931,5611.231,9.976,...,0.000,0.00,0.000,0.00,0.000,7.570713e+09,1.585967e+13,0.000,0.000,0.000
1,1951,24765.194,1214.978,0.000,16981.583,309.265,243.305,1306.674,5907.881,11.002,...,0.000,0.00,0.000,0.00,0.000,7.712655e+09,7.992201e+12,0.000,0.000,0.000
2,1952,25203.424,834.737,0.000,16927.201,331.430,259.877,1405.429,6263.420,10.710,...,0.000,0.00,0.000,0.00,0.000,7.852545e+09,8.401431e+12,0.000,0.000,0.000
3,1953,25929.893,880.578,0.000,17104.163,368.454,248.979,1498.933,6689.993,12.910,...,0.000,0.00,0.000,0.00,0.000,7.992062e+09,8.884521e+12,0.000,0.000,0.000
4,1954,26744.552,1378.627,0.000,17319.801,400.182,236.584,1588.220,7182.609,11.442,...,0.000,0.00,0.000,0.00,0.000,8.133217e+09,9.099744e+12,0.000,0.000,0.000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
66,2016,124705.642,1156.831,44.924,48325.275,4990.527,1438.122,27011.160,41892.058,1048.509,...,1415.378,17424.08,345.519,6291.93,105.005,2.239131e+10,2.147076e+14,366170.181,5562418.068,208.589
67,2017,126304.716,1156.251,18.863,48587.789,5067.569,1576.265,27603.881,42412.614,1056.584,...,0.000,0.00,0.000,0.00,0.000,2.264281e+10,2.211738e+14,366923.317,3678183.705,0.000
68,2018,128452.538,1152.783,18.444,49368.748,5286.347,1531.205,28781.873,42422.168,1062.207,...,0.000,0.00,0.000,0.00,0.000,2.289184e+10,2.276454e+14,376507.961,3698476.648,0.000
69,2019,128422.887,1165.000,18.142,48731.161,5408.677,1611.674,29061.768,42547.578,1062.016,...,0.000,0.00,0.000,0.00,0.000,2.313895e+10,0.000000e+00,380148.487,3659567.475,0.000


In [20]:
# rename columns for merging purposes
carbon_df = carbon_df.rename(columns={"year": "YEAR"})
carbon_df

Unnamed: 0,YEAR,co2,co2_per_capita,co2_per_unit_energy,coal_co2,cement_co2,flaring_co2,gas_co2,oil_co2,other_industry_co2,...,ghg_per_capita,methane,methane_per_capita,nitrous_oxide,nitrous_oxide_per_capita,population,gdp,primary_energy_consumption,energy_per_capita,energy_per_gdp
0,1950,23193.336,943.286,0.000,15959.768,272.887,223.550,1110.931,5611.231,9.976,...,0.000,0.00,0.000,0.00,0.000,7.570713e+09,1.585967e+13,0.000,0.000,0.000
1,1951,24765.194,1214.978,0.000,16981.583,309.265,243.305,1306.674,5907.881,11.002,...,0.000,0.00,0.000,0.00,0.000,7.712655e+09,7.992201e+12,0.000,0.000,0.000
2,1952,25203.424,834.737,0.000,16927.201,331.430,259.877,1405.429,6263.420,10.710,...,0.000,0.00,0.000,0.00,0.000,7.852545e+09,8.401431e+12,0.000,0.000,0.000
3,1953,25929.893,880.578,0.000,17104.163,368.454,248.979,1498.933,6689.993,12.910,...,0.000,0.00,0.000,0.00,0.000,7.992062e+09,8.884521e+12,0.000,0.000,0.000
4,1954,26744.552,1378.627,0.000,17319.801,400.182,236.584,1588.220,7182.609,11.442,...,0.000,0.00,0.000,0.00,0.000,8.133217e+09,9.099744e+12,0.000,0.000,0.000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
66,2016,124705.642,1156.831,44.924,48325.275,4990.527,1438.122,27011.160,41892.058,1048.509,...,1415.378,17424.08,345.519,6291.93,105.005,2.239131e+10,2.147076e+14,366170.181,5562418.068,208.589
67,2017,126304.716,1156.251,18.863,48587.789,5067.569,1576.265,27603.881,42412.614,1056.584,...,0.000,0.00,0.000,0.00,0.000,2.264281e+10,2.211738e+14,366923.317,3678183.705,0.000
68,2018,128452.538,1152.783,18.444,49368.748,5286.347,1531.205,28781.873,42422.168,1062.207,...,0.000,0.00,0.000,0.00,0.000,2.289184e+10,2.276454e+14,376507.961,3698476.648,0.000
69,2019,128422.887,1165.000,18.142,48731.161,5408.677,1611.674,29061.768,42547.578,1062.016,...,0.000,0.00,0.000,0.00,0.000,2.313895e+10,0.000000e+00,380148.487,3659567.475,0.000


In [21]:
# dropping columns not needed
carbon_df = carbon_df.drop(columns=['co2_per_unit_energy', 'coal_co2', 'cement_co2', 'flaring_co2', 'gas_co2',
                                         'oil_co2', 'other_industry_co2', 'ghg_per_capita', 'methane', 'methane_per_capita',
                                         'nitrous_oxide', 'nitrous_oxide_per_capita', 'population', 'gdp', 'primary_energy_consumption',
                                         'energy_per_capita', 'energy_per_gdp', 'total_ghg'])

In [22]:
# show new dataframe
carbon_df

Unnamed: 0,YEAR,co2,co2_per_capita
0,1950,23193.336,943.286
1,1951,24765.194,1214.978
2,1952,25203.424,834.737
3,1953,25929.893,880.578
4,1954,26744.552,1378.627
...,...,...,...
66,2016,124705.642,1156.831
67,2017,126304.716,1156.251
68,2018,128452.538,1152.783
69,2019,128422.887,1165.000


In [96]:
# Saving cleaned carbon data
file_path = "cleaned_carbon_one_nonML.csv"
weather_df.to_csv(file_path, index=False)

In [97]:
# making data for ML with the carbon data

In [63]:
# dropping columns not needed
carbon_df = carbon_df.drop(columns=['ISO_CODE', 'COUNTRY'])
carbon_df

Unnamed: 0,YEAR,CO2,CUMULATIVE_CO2
0,1950,0.084,0.099
1,1955,0.154,0.649
2,1964,0.839,4.978
3,1967,1.282,8.358
4,1973,1.635,17.252
...,...,...,...
274,1989,5131.927,244082.956
275,1997,5686.465,286766.539
276,2003,6011.837,322176.256
277,2005,6134.521,334424.784


In [64]:
# Saving cleaned carbon data
file_path = "cleaned_carbon_one_ML.csv"
weather_df.to_csv(file_path, index=False)

In [65]:
# checking dtypes

In [66]:
carbon_df.dtypes

YEAR                int64
CO2               float64
CUMULATIVE_CO2    float64
dtype: object

In [67]:
# weather dtypes
weather_df.dtypes

EVENT_ID       int64
YEAR           int64
MONTH_NAME    object
EVENT_TYPE    object
dtype: object

In [68]:
# Merging below this

In [69]:
# Merge attempt one
combo_df = weather_df.merge(carbon_df, left_on='YEAR', right_on='YEAR')

In [70]:
# checking merge
combo_df

Unnamed: 0,EVENT_ID,YEAR,MONTH_NAME,EVENT_TYPE,CO2,CUMULATIVE_CO2
0,10050384,1950,June,Tornado,0.084,0.099
1,10050384,1950,June,Tornado,93.452,1736.488
2,10050384,1950,June,Tornado,0.297,7.464
3,10050384,1950,June,Tornado,476.122,12107.269
4,10050384,1950,June,Tornado,54.739,1340.047
...,...,...,...,...,...,...
6817,246134,2010,June,Lightning,45.049,2635.114
6818,246134,2010,June,Lightning,270.148,6275.885
6819,246134,2010,June,Lightning,294.078,28043.035
6820,246134,2010,June,Lightning,511.632,73998.526


In [71]:
# Drop the null columns where all values are null
combo_df = combo_df.dropna(axis='columns', how='all')

In [72]:
# check work
combo_df

Unnamed: 0,EVENT_ID,YEAR,MONTH_NAME,EVENT_TYPE,CO2,CUMULATIVE_CO2
0,10050384,1950,June,Tornado,0.084,0.099
1,10050384,1950,June,Tornado,93.452,1736.488
2,10050384,1950,June,Tornado,0.297,7.464
3,10050384,1950,June,Tornado,476.122,12107.269
4,10050384,1950,June,Tornado,54.739,1340.047
...,...,...,...,...,...,...
6817,246134,2010,June,Lightning,45.049,2635.114
6818,246134,2010,June,Lightning,270.148,6275.885
6819,246134,2010,June,Lightning,294.078,28043.035
6820,246134,2010,June,Lightning,511.632,73998.526


In [73]:
# Drop the null rows
combo_df = combo_df.dropna()

In [74]:
# check work
combo_df

Unnamed: 0,EVENT_ID,YEAR,MONTH_NAME,EVENT_TYPE,CO2,CUMULATIVE_CO2
0,10050384,1950,June,Tornado,0.084,0.099
1,10050384,1950,June,Tornado,93.452,1736.488
2,10050384,1950,June,Tornado,0.297,7.464
3,10050384,1950,June,Tornado,476.122,12107.269
4,10050384,1950,June,Tornado,54.739,1340.047
...,...,...,...,...,...,...
6817,246134,2010,June,Lightning,45.049,2635.114
6818,246134,2010,June,Lightning,270.148,6275.885
6819,246134,2010,June,Lightning,294.078,28043.035
6820,246134,2010,June,Lightning,511.632,73998.526


In [35]:
# Adding machine learning model parameters below here

In [85]:
target = ["CO2"]

In [86]:
# split data into training and testing

# Create our features
X = pd.get_dummies(combo_df.drop(columns="CO2"))

# Create our target
y = pd.get_dummies(combo_df["CO2"])

In [87]:
X.describe()

Unnamed: 0,EVENT_ID,YEAR,CUMULATIVE_CO2,MONTH_NAME_December,MONTH_NAME_January,MONTH_NAME_June,MONTH_NAME_October,EVENT_TYPE_Blizzard,EVENT_TYPE_Heavy Snow,EVENT_TYPE_Lightning,EVENT_TYPE_Tornado
count,6822.0,6822.0,6822.0,6822.0,6822.0,6822.0,6822.0,6822.0,6822.0,6822.0,6822.0
mean,6298431.0,1991.823952,27324.030389,0.099091,0.166667,0.60642,0.127822,0.070361,0.01759,0.415128,0.496922
std,3412078.0,16.158745,69225.548386,0.298806,0.372705,0.488579,0.333916,0.255773,0.131466,0.49278,0.500027
min,202850.0,1950.0,0.004,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,5453775.0,1980.0,47.921,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,5607396.0,1997.0,655.957,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
75%,10012000.0,2005.0,14578.93,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0
max,10161730.0,2010.0,363675.53,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [88]:
# Check the balance of our target values
y = combo_df['CO2']
y.value_counts()

0.465       92
24.809      78
5686.465    78
521.278     78
1.466       78
            ..
0.461        4
0.850        4
0.663        4
2728.348     4
0.110        4
Name: CO2, Length: 273, dtype: int64

In [89]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, stratify=y)
Counter(y_train)

Counter({277.501: 13,
         2.678: 19,
         1.143: 58,
         21.97: 19,
         5.1: 36,
         8.285: 12,
         220.537: 11,
         0.524: 58,
         1.635: 21,
         558.804: 36,
         170.829: 21,
         6.298: 28,
         3510.169: 58,
         44.652: 12,
         32.519: 19,
         59.222: 21,
         129.144: 19,
         340.243: 58,
         0.103: 3,
         4.232: 12,
         812.903: 58,
         53.992: 36,
         569.962: 28,
         0.48: 21,
         3.784: 4,
         530.038: 21,
         327.11: 58,
         562.333: 58,
         16.907: 8,
         45.788: 28,
         2.589: 19,
         1.084: 58,
         1.466: 58,
         2.547: 13,
         0.997: 36,
         0.517: 36,
         1.902: 36,
         27.065: 3,
         5.416: 58,
         0.465: 69,
         45.049: 36,
         1057.342: 28,
         6134.521: 28,
         40.491: 10,
         0.297: 4,
         24.809: 58,
         65.451: 58,
         442.847: 11,
     

In [90]:
# resample the training data with the BalancedRandomForestClassifier

scaler = StandardScaler()
X_scaler = scaler.fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)
brfc = BalancedRandomForestClassifier(n_estimators=100, random_state=1)
brfc.fit(X_train_scaled, y_train)

ValueError: Unknown label type: 'continuous'

In [81]:
# display the confusion matrix
from sklearn.metrics import confusion_matrix

y_pred = brfc.predict(X_test)
confusion_matrix(y_test, y_pred)

AttributeError: 'BalancedRandomForestClassifier' object has no attribute 'estimators_'

In [82]:
# calculated the balanced accuracy score
from sklearn.metrics import balanced_accuracy_score

balanced_accuracy_score(y_test, y_pred)

NameError: name 'y_pred' is not defined

In [83]:
# print the imbalanced classification report
from imblearn.metrics import classification_report_imbalanced
print(classification_report_imbalanced(y_test, y_pred))

NameError: name 'y_pred' is not defined

In [84]:
# list the features sorted in descending order by feature importance
sorted(zip(brfc.feature_importances_, X.columns), reverse=True)

AttributeError: 'BalancedRandomForestClassifier' object has no attribute 'estimators_'