In [42]:
import warnings
warnings.filterwarnings('ignore')

In [43]:
import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression # Linear Regression
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import make_classification
from imblearn.over_sampling import RandomOverSampler
from imblearn.ensemble import BalancedRandomForestClassifier # RandomForestClassifier

In [44]:
%matplotlib inline
from sklearn.datasets import make_blobs

In [45]:
# importing test weather data
weather_df = pd.read_csv("Severe_Weather_TestData.csv", low_memory=False)

In [46]:
# dropping columns not needed
weather_df = weather_df.drop(columns=['INJURIES_DIRECT', 'INJURIES_INDIRECT', 'DEATHS_DIRECT', 'DEATHS_INDIRECT',
                                     'DAMAGE_PROPERTY', 'DAMAGE_CROPS', 'MAGNITUDE', 'TOR_F_SCALE', 'TOR_LENGTH',
                                     'TOR_WIDTH', 'BEGIN_LAT', 'BEGIN_LON'])

In [47]:
# displaying weather df
weather_df

Unnamed: 0,EVENT_ID,STATE,STATE_FIPS,YEAR,MONTH_NAME,EVENT_TYPE,CATEGORY
0,10050384,MISSISSIPPI,28,1950,June,Tornado,
1,10086808,OHIO,39,1950,January,Tornado,
2,10120418,TEXAS,48,1950,June,Tornado,
3,9981922,ARKANSAS,5,1950,January,Tornado,
4,10001432,GEORGIA,13,1950,June,Tornado,
...,...,...,...,...,...,...,...
284,240665,TENNESSEE,47,2010,June,Lightning,
285,245002,ILLINOIS,17,2010,June,Lightning,
286,245889,MINNESOTA,27,2010,June,Tornado,
287,226112,ILLINOIS,17,2010,June,Tornado,


In [48]:
# Drop the null columns where all values are null
weather_df = weather_df.dropna(axis='columns', how='all')

In [49]:
# Drop the null rows
weather_df = weather_df.dropna()

In [50]:
# show cleaned dataset
weather_df.head(25)

Unnamed: 0,EVENT_ID,STATE,STATE_FIPS,YEAR,MONTH_NAME,EVENT_TYPE
0,10050384,MISSISSIPPI,28,1950,June,Tornado
1,10086808,OHIO,39,1950,January,Tornado
2,10120418,TEXAS,48,1950,June,Tornado
3,9981922,ARKANSAS,5,1950,January,Tornado
4,10001432,GEORGIA,13,1950,June,Tornado
5,10049829,MISSISSIPPI,28,1955,October,Tornado
6,9984208,ARKANSAS,5,1955,October,Tornado
7,9991373,COLORADO,8,1955,June,Tornado
8,10121863,TEXAS,48,1955,June,Tornado
9,9978062,ALABAMA,1,1964,January,Tornado


In [51]:
# Saving cleaned test weather data
file_path = "cleaned_weather_one_nonML.csv"
weather_df.to_csv(file_path, index=False)

In [52]:
# make data for ML

In [53]:
# dropping columns not needed
weather_df = weather_df.drop(columns=['STATE', 'STATE_FIPS'])

In [54]:
# show df for ML
weather_df

Unnamed: 0,EVENT_ID,YEAR,MONTH_NAME,EVENT_TYPE
0,10050384,1950,June,Tornado
1,10086808,1950,January,Tornado
2,10120418,1950,June,Tornado
3,9981922,1950,January,Tornado
4,10001432,1950,June,Tornado
...,...,...,...,...
284,240665,2010,June,Lightning
285,245002,2010,June,Lightning
286,245889,2010,June,Tornado
287,226112,2010,June,Tornado


In [55]:
# Saving cleaned test weather data for ML
file_path = "cleaned_weather_one_ML.csv"
weather_df.to_csv(file_path, index=False)

In [56]:
# Co2 Below This

In [57]:
# importing test co2 data 
carbon_df = pd.read_csv("CO2_Emissions_FixedTestData.csv", low_memory=False)

In [58]:
# display co2 data
carbon_df

Unnamed: 0,iso_code,country,year,co2,cumulative_co2
0,AFG,Afghanistan,1950,0.084,0.099
1,AFG,Afghanistan,1955,0.154,0.649
2,AFG,Afghanistan,1964,0.839,4.978
3,AFG,Afghanistan,1967,1.282,8.358
4,AFG,Afghanistan,1973,1.635,17.252
...,...,...,...,...,...
274,USA,United States,1989,5131.927,244082.956
275,USA,United States,1997,5686.465,286766.539
276,USA,United States,2003,6011.837,322176.256
277,USA,United States,2005,6134.521,334424.784


In [59]:
# rename columns for merging purposes
carbon_df = carbon_df.rename(columns={"iso_code": "ISO_CODE","country": "COUNTRY", "year": "YEAR", "co2": "CO2", 
                          "cumulative_co2": "CUMULATIVE_CO2"})
carbon_df

Unnamed: 0,ISO_CODE,COUNTRY,YEAR,CO2,CUMULATIVE_CO2
0,AFG,Afghanistan,1950,0.084,0.099
1,AFG,Afghanistan,1955,0.154,0.649
2,AFG,Afghanistan,1964,0.839,4.978
3,AFG,Afghanistan,1967,1.282,8.358
4,AFG,Afghanistan,1973,1.635,17.252
...,...,...,...,...,...
274,USA,United States,1989,5131.927,244082.956
275,USA,United States,1997,5686.465,286766.539
276,USA,United States,2003,6011.837,322176.256
277,USA,United States,2005,6134.521,334424.784


In [60]:
# display the new dataframe
carbon_df

Unnamed: 0,ISO_CODE,COUNTRY,YEAR,CO2,CUMULATIVE_CO2
0,AFG,Afghanistan,1950,0.084,0.099
1,AFG,Afghanistan,1955,0.154,0.649
2,AFG,Afghanistan,1964,0.839,4.978
3,AFG,Afghanistan,1967,1.282,8.358
4,AFG,Afghanistan,1973,1.635,17.252
...,...,...,...,...,...
274,USA,United States,1989,5131.927,244082.956
275,USA,United States,1997,5686.465,286766.539
276,USA,United States,2003,6011.837,322176.256
277,USA,United States,2005,6134.521,334424.784


In [61]:
# Saving cleaned carbon data
file_path = "cleaned_carbon_one_nonML.csv"
weather_df.to_csv(file_path, index=False)

In [62]:
# making data for ML with the carbon data

In [63]:
# dropping columns not needed
carbon_df = carbon_df.drop(columns=['ISO_CODE', 'COUNTRY'])
carbon_df

Unnamed: 0,YEAR,CO2,CUMULATIVE_CO2
0,1950,0.084,0.099
1,1955,0.154,0.649
2,1964,0.839,4.978
3,1967,1.282,8.358
4,1973,1.635,17.252
...,...,...,...
274,1989,5131.927,244082.956
275,1997,5686.465,286766.539
276,2003,6011.837,322176.256
277,2005,6134.521,334424.784


In [64]:
# Saving cleaned carbon data
file_path = "cleaned_carbon_one_ML.csv"
weather_df.to_csv(file_path, index=False)

In [65]:
# checking dtypes

In [66]:
carbon_df.dtypes

YEAR                int64
CO2               float64
CUMULATIVE_CO2    float64
dtype: object

In [67]:
# weather dtypes
weather_df.dtypes

EVENT_ID       int64
YEAR           int64
MONTH_NAME    object
EVENT_TYPE    object
dtype: object

In [68]:
# Merging below this

In [69]:
# Merge attempt one
combo_df = weather_df.merge(carbon_df, left_on='YEAR', right_on='YEAR')

In [70]:
# checking merge
combo_df

Unnamed: 0,EVENT_ID,YEAR,MONTH_NAME,EVENT_TYPE,CO2,CUMULATIVE_CO2
0,10050384,1950,June,Tornado,0.084,0.099
1,10050384,1950,June,Tornado,93.452,1736.488
2,10050384,1950,June,Tornado,0.297,7.464
3,10050384,1950,June,Tornado,476.122,12107.269
4,10050384,1950,June,Tornado,54.739,1340.047
...,...,...,...,...,...,...
6817,246134,2010,June,Lightning,45.049,2635.114
6818,246134,2010,June,Lightning,270.148,6275.885
6819,246134,2010,June,Lightning,294.078,28043.035
6820,246134,2010,June,Lightning,511.632,73998.526


In [71]:
# Drop the null columns where all values are null
combo_df = combo_df.dropna(axis='columns', how='all')

In [72]:
# check work
combo_df

Unnamed: 0,EVENT_ID,YEAR,MONTH_NAME,EVENT_TYPE,CO2,CUMULATIVE_CO2
0,10050384,1950,June,Tornado,0.084,0.099
1,10050384,1950,June,Tornado,93.452,1736.488
2,10050384,1950,June,Tornado,0.297,7.464
3,10050384,1950,June,Tornado,476.122,12107.269
4,10050384,1950,June,Tornado,54.739,1340.047
...,...,...,...,...,...,...
6817,246134,2010,June,Lightning,45.049,2635.114
6818,246134,2010,June,Lightning,270.148,6275.885
6819,246134,2010,June,Lightning,294.078,28043.035
6820,246134,2010,June,Lightning,511.632,73998.526


In [73]:
# Drop the null rows
combo_df = combo_df.dropna()

In [74]:
# check work
combo_df

Unnamed: 0,EVENT_ID,YEAR,MONTH_NAME,EVENT_TYPE,CO2,CUMULATIVE_CO2
0,10050384,1950,June,Tornado,0.084,0.099
1,10050384,1950,June,Tornado,93.452,1736.488
2,10050384,1950,June,Tornado,0.297,7.464
3,10050384,1950,June,Tornado,476.122,12107.269
4,10050384,1950,June,Tornado,54.739,1340.047
...,...,...,...,...,...,...
6817,246134,2010,June,Lightning,45.049,2635.114
6818,246134,2010,June,Lightning,270.148,6275.885
6819,246134,2010,June,Lightning,294.078,28043.035
6820,246134,2010,June,Lightning,511.632,73998.526


In [35]:
# Adding machine learning model parameters below here

In [75]:
target = ["CUMULATIVE_CO2"]

In [76]:
# split data into training and testing

# Create our features
X = pd.get_dummies(combo_df.drop(columns="CUMULATIVE_CO2"))

# Create our target
y = pd.get_dummies(combo_df["CUMULATIVE_CO2"])

In [77]:
X.describe()

Unnamed: 0,EVENT_ID,YEAR,CO2,MONTH_NAME_December,MONTH_NAME_January,MONTH_NAME_June,MONTH_NAME_October,EVENT_TYPE_Blizzard,EVENT_TYPE_Heavy Snow,EVENT_TYPE_Lightning,EVENT_TYPE_Tornado
count,6822.0,6822.0,6822.0,6822.0,6822.0,6822.0,6822.0,6822.0,6822.0,6822.0,6822.0
mean,6298431.0,1991.823952,866.573916,0.099091,0.166667,0.60642,0.127822,0.070361,0.01759,0.415128,0.496922
std,3412078.0,16.158745,2365.57757,0.298806,0.372705,0.488579,0.333916,0.255773,0.131466,0.49278,0.500027
min,202850.0,1950.0,0.004,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,5453775.0,1980.0,1.701,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,5607396.0,1997.0,25.943,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
75%,10012000.0,2005.0,404.533,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0
max,10161730.0,2010.0,16530.951,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [78]:
# Check the balance of our target values
y = combo_df['CUMULATIVE_CO2']
y.value_counts()

139.405       78
3055.422      78
64734.085     78
15.894        78
69.083        78
              ..
4.222          4
0.495          4
2586.075       4
189.337        4
104985.972     4
Name: CUMULATIVE_CO2, Length: 279, dtype: int64

In [79]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, stratify=y)
Counter(y_train)

Counter({3004.503: 58,
         655.957: 28,
         36.83: 11,
         20540.884: 58,
         6299.14: 21,
         1582.764: 4,
         816.943: 36,
         131428.029: 8,
         30.433: 36,
         41.949: 36,
         33.071: 19,
         2056.17: 58,
         265593.979: 28,
         31.794: 11,
         20096.836: 13,
         363675.53: 36,
         4.045: 21,
         816.428: 19,
         40.051: 28,
         21085.173: 58,
         3055.422: 58,
         6.896: 21,
         5.368: 11,
         22.255: 58,
         73998.526: 36,
         166.972: 13,
         1703.051: 13,
         48960.532: 19,
         15271.543: 13,
         66774.682: 58,
         60.808: 21,
         20.255: 28,
         9458.922: 21,
         55.202: 13,
         56.579: 13,
         16.495: 58,
         9961.339: 21,
         167.968: 36,
         195.41: 28,
         3720.566: 36,
         168041.793: 21,
         60.058: 36,
         2.693: 21,
         0.594: 19,
         1804.451: 13,
    

In [80]:
# resample the training data with the BalancedRandomForestClassifier

scaler = StandardScaler()
X_scaler = scaler.fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)
brfc = BalancedRandomForestClassifier(n_estimators=100, random_state=1)
brfc.fit(X_train_scaled, y_train)

ValueError: Unknown label type: 'continuous'

In [81]:
# display the confusion matrix
from sklearn.metrics import confusion_matrix

y_pred = brfc.predict(X_test)
confusion_matrix(y_test, y_pred)

AttributeError: 'BalancedRandomForestClassifier' object has no attribute 'estimators_'

In [82]:
# calculated the balanced accuracy score
from sklearn.metrics import balanced_accuracy_score

balanced_accuracy_score(y_test, y_pred)

NameError: name 'y_pred' is not defined

In [83]:
# print the imbalanced classification report
from imblearn.metrics import classification_report_imbalanced
print(classification_report_imbalanced(y_test, y_pred))

NameError: name 'y_pred' is not defined

In [84]:
# list the features sorted in descending order by feature importance
sorted(zip(brfc.feature_importances_, X.columns), reverse=True)

AttributeError: 'BalancedRandomForestClassifier' object has no attribute 'estimators_'