In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression # Linear Regression
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced
from sklearn.datasets import make_classification
from imblearn.over_sampling import RandomOverSampler
from imblearn.ensemble import BalancedRandomForestClassifier # RandomForestClassifier

In [3]:
# Render matplotlib figures inline, beneath the cell that produces them.
%matplotlib inline
# NOTE(review): make_blobs is not referenced by any cell visible here — confirm it is still needed.
from sklearn.datasets import make_blobs

In [4]:
# Load the severe-weather test extract into a DataFrame.
# low_memory=False makes pandas process the file in one pass rather than in
# chunks, which avoids mixed-dtype inference on individual columns.
weather_df = pd.read_csv(
    filepath_or_buffer="Severe_Weather_TestData.csv",
    low_memory=False,
)

In [12]:
# Discard the casualty, damage, magnitude, tornado-geometry and coordinate
# columns; only the event identifiers, location, date and type are kept.
weather_df = weather_df.drop(
    labels=[
        'INJURIES_DIRECT',
        'INJURIES_INDIRECT',
        'DEATHS_DIRECT',
        'DEATHS_INDIRECT',
        'DAMAGE_PROPERTY',
        'DAMAGE_CROPS',
        'MAGNITUDE',
        'TOR_F_SCALE',
        'TOR_LENGTH',
        'TOR_WIDTH',
        'BEGIN_LAT',
        'BEGIN_LON',
    ],
    axis='columns',
)

In [17]:
# Display the weather DataFrame (pandas truncates long frames in the rendered output)
weather_df

Unnamed: 0,EVENT_ID,STATE,STATE_FIPS,YEAR,MONTH_NAME,EVENT_TYPE
0,10050384,MISSISSIPPI,28,1950,June,Tornado
1,10086808,OHIO,39,1950,January,Tornado
2,10120418,TEXAS,48,1950,June,Tornado
3,9981922,ARKANSAS,5,1950,January,Tornado
4,10001432,GEORGIA,13,1950,June,Tornado
...,...,...,...,...,...,...
104,10147627,WASHINGTON,53,1989,June,Tornado
105,9993165,ARKANSAS,5,1989,June,Tornado
106,10001133,FLORIDA,12,1989,June,Tornado
107,10028440,IOWA,19,1989,June,Tornado


In [14]:
# Remove every column whose values are all null
weather_df = weather_df.dropna(how='all', axis=1)

In [15]:
# Remove every row that contains at least one null value
weather_df = weather_df.dropna(axis='index', how='any')

In [16]:
# Preview up to the first 25 rows of the cleaned weather data
weather_df.head(25)

Unnamed: 0,EVENT_ID,STATE,STATE_FIPS,YEAR,MONTH_NAME,EVENT_TYPE
0,10050384,MISSISSIPPI,28,1950,June,Tornado
1,10086808,OHIO,39,1950,January,Tornado
2,10120418,TEXAS,48,1950,June,Tornado
3,9981922,ARKANSAS,5,1950,January,Tornado
4,10001432,GEORGIA,13,1950,June,Tornado
5,10049829,MISSISSIPPI,28,1955,October,Tornado
6,9984208,ARKANSAS,5,1955,October,Tornado
7,9991373,COLORADO,8,1955,June,Tornado
8,10121863,TEXAS,48,1955,June,Tornado
9,9978062,ALABAMA,1,1964,January,Tornado


In [18]:
# Save the cleaned weather data to CSV in the working directory.
# index=False keeps the DataFrame's row index out of the written file.
file_path = "cleaned_weather_one.csv"
weather_df.to_csv(file_path, index=False)

In [None]:
# CO2 emissions data below this point

In [45]:
# Load the CO2 emissions test extract into a DataFrame.
# low_memory=False makes pandas process the file in one pass rather than in chunks.
carbon_df = pd.read_csv(
    filepath_or_buffer="CO2_Emissions_FixedTestData.csv",
    low_memory=False,
)

In [46]:
# Display the CO2 DataFrame as loaded (pandas truncates long frames in the rendered output)
carbon_df

Unnamed: 0,iso_code,country,year,co2,cumulative_co2
0,AFG,Afghanistan,1950,0.084,0.099
1,AFG,Afghanistan,1955,0.154,0.649
2,AFG,Afghanistan,1964,0.839,4.978
3,AFG,Afghanistan,1967,1.282,8.358
4,AFG,Afghanistan,1973,1.635,17.252
...,...,...,...,...,...
274,USA,United States,1989,5131.927,244082.956
275,USA,United States,1997,5686.465,286766.539
276,USA,United States,2003,6011.837,322176.256
277,USA,United States,2005,6134.521,334424.784


In [47]:
# Upper-case the CO2 column names so they follow the weather frame's naming;
# in particular "year" becomes "YEAR", the key later used for the merge.
carbon_df = carbon_df.rename(
    columns={
        name: name.upper()
        for name in ("iso_code", "country", "year", "co2", "cumulative_co2")
    }
)
carbon_df

Unnamed: 0,ISO_CODE,COUNTRY,YEAR,CO2,CUMULATIVE_CO2
0,AFG,Afghanistan,1950,0.084,0.099
1,AFG,Afghanistan,1955,0.154,0.649
2,AFG,Afghanistan,1964,0.839,4.978
3,AFG,Afghanistan,1967,1.282,8.358
4,AFG,Afghanistan,1973,1.635,17.252
...,...,...,...,...,...
274,USA,United States,1989,5131.927,244082.956
275,USA,United States,1997,5686.465,286766.539
276,USA,United States,2003,6011.837,322176.256
277,USA,United States,2005,6134.521,334424.784


In [48]:
# Save the renamed CO2 data to CSV in the working directory.
# Note: file_path is rebound here; it previously held the weather output path.
# index=False keeps the DataFrame's row index out of the written file.
file_path = "cleaned_carbon_one.csv"
carbon_df.to_csv(file_path, index=False)

In [None]:
# checking dtypes

In [51]:
# Column dtypes of the CO2 frame — checked ahead of the merge on YEAR
carbon_df.dtypes

ISO_CODE           object
COUNTRY            object
YEAR                int64
CO2               float64
CUMULATIVE_CO2    float64
dtype: object

In [52]:
# Column dtypes of the weather frame — checked ahead of the merge on YEAR
weather_df.dtypes

EVENT_ID       int64
STATE         object
STATE_FIPS     int64
YEAR           int64
MONTH_NAME    object
EVENT_TYPE    object
dtype: object

In [53]:
# Merging below this

In [54]:
# Merge attempt one: inner-join weather events to CO2 records on YEAR.
# Every event is paired with each CO2 row that shares its year, so a single
# event is repeated once per country/region present for that year.
combo_df = pd.merge(weather_df, carbon_df, how='inner', on='YEAR')

In [58]:
# Display the merged frame to inspect the result of the join
combo_df

Unnamed: 0,EVENT_ID,STATE,STATE_FIPS,YEAR,MONTH_NAME,EVENT_TYPE,ISO_CODE,COUNTRY,CO2,CUMULATIVE_CO2
0,10050384,MISSISSIPPI,28,1950,June,Tornado,AFG,Afghanistan,0.084,0.099
1,10050384,MISSISSIPPI,28,1950,June,Tornado,,Africa,93.452,1736.488
2,10050384,MISSISSIPPI,28,1950,June,Tornado,ALB,Albania,0.297,7.464
3,10050384,MISSISSIPPI,28,1950,June,Tornado,,Asia,476.122,12107.269
4,10050384,MISSISSIPPI,28,1950,June,Tornado,AUS,Australia,54.739,1340.047
...,...,...,...,...,...,...,...,...,...,...
2451,10139048,TEXAS,48,1989,June,Tornado,CHE,Switzerland,39.419,1703.051
2452,10139048,TEXAS,48,1989,June,Tornado,TWN,Taiwan,123.107,1804.451
2453,10139048,TEXAS,48,1989,June,Tornado,UKR,Ukraine,682.617,20096.836
2454,10139048,TEXAS,48,1989,June,Tornado,GBR,United Kingdom,581.576,62102.659


In [59]:
# Remove every merged column whose values are all null
combo_df = combo_df.dropna(how='all', axis=1)

In [63]:
# Display the merged frame after removing all-null columns
combo_df

Unnamed: 0,EVENT_ID,STATE,STATE_FIPS,YEAR,MONTH_NAME,EVENT_TYPE,ISO_CODE,COUNTRY,CO2,CUMULATIVE_CO2
0,10050384,MISSISSIPPI,28,1950,June,Tornado,AFG,Afghanistan,0.084,0.099
2,10050384,MISSISSIPPI,28,1950,June,Tornado,ALB,Albania,0.297,7.464
4,10050384,MISSISSIPPI,28,1950,June,Tornado,AUS,Australia,54.739,1340.047
5,10050384,MISSISSIPPI,28,1950,June,Tornado,BHS,Bahamas,0.055,0.055
6,10050384,MISSISSIPPI,28,1950,June,Tornado,BGD,Bangladesh,0.993,1.556
...,...,...,...,...,...,...,...,...,...,...
2451,10139048,TEXAS,48,1989,June,Tornado,CHE,Switzerland,39.419,1703.051
2452,10139048,TEXAS,48,1989,June,Tornado,TWN,Taiwan,123.107,1804.451
2453,10139048,TEXAS,48,1989,June,Tornado,UKR,Ukraine,682.617,20096.836
2454,10139048,TEXAS,48,1989,June,Tornado,GBR,United Kingdom,581.576,62102.659


In [64]:
# Remove every merged row that contains at least one null value
combo_df = combo_df.dropna(axis='index', how='any')

In [65]:
# Display the merged frame after removing rows with null values
combo_df

Unnamed: 0,EVENT_ID,STATE,STATE_FIPS,YEAR,MONTH_NAME,EVENT_TYPE,ISO_CODE,COUNTRY,CO2,CUMULATIVE_CO2
0,10050384,MISSISSIPPI,28,1950,June,Tornado,AFG,Afghanistan,0.084,0.099
2,10050384,MISSISSIPPI,28,1950,June,Tornado,ALB,Albania,0.297,7.464
4,10050384,MISSISSIPPI,28,1950,June,Tornado,AUS,Australia,54.739,1340.047
5,10050384,MISSISSIPPI,28,1950,June,Tornado,BHS,Bahamas,0.055,0.055
6,10050384,MISSISSIPPI,28,1950,June,Tornado,BGD,Bangladesh,0.993,1.556
...,...,...,...,...,...,...,...,...,...,...
2451,10139048,TEXAS,48,1989,June,Tornado,CHE,Switzerland,39.419,1703.051
2452,10139048,TEXAS,48,1989,June,Tornado,TWN,Taiwan,123.107,1804.451
2453,10139048,TEXAS,48,1989,June,Tornado,UKR,Ukraine,682.617,20096.836
2454,10139048,TEXAS,48,1989,June,Tornado,GBR,United Kingdom,581.576,62102.659


In [74]:
# Keep only the United States rows of the merged frame.
#
# Select with a boolean mask instead of drop(..., inplace=True): with
# inplace=True, DataFrame.drop modifies combo_df and returns None, so assigning
# its result left usa_df as None. The mask returns the matching rows, and
# .copy() makes usa_df independent of combo_df, which stays unmodified so this
# cell can be re-run safely.
usa_df = combo_df.loc[combo_df['COUNTRY'] == 'United States'].copy()

In [75]:
# Adding machine learning model parameters below here

In [None]:
# Target column name, wrapped in a list.
# NOTE(review): "X X X X X " looks like a placeholder, and `target` is not
# referenced by any cell visible here — fill in the real column name before running.
target = ["X X X X X "]

In [None]:
# Build the feature matrix X and the target y.
# NOTE(review): `df` is not defined in any cell visible here — presumably it
# should be the US-only merged frame; confirm before running.
# NOTE(review): "X X X X X " looks like a placeholder column name — replace it
# with the real target column.

# Create our features: every column except the target, with get_dummies applied
X = pd.get_dummies(df.drop(columns="X X X X X "))

# Create our target: the target column passed through get_dummies
y = pd.get_dummies(df["X X X X X "])

In [None]:
# Summary statistics for each feature column
X.describe()

In [None]:
# Check the balance of our target values.
# Note: this rebinds y to the raw target column, replacing the get_dummies
# version of y created in the cell above.
# NOTE(review): `df` and "X X X X X " are unresolved in the visible cells — confirm both.
y = df['X X X X X ']
y.value_counts()

In [None]:
from sklearn.model_selection import train_test_split

# Stratify on y so the train and test splits keep the same class proportions;
# random_state=1 makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=1,
)

# Class counts in the training split
Counter(y_train)

In [None]:
# Scale the features, then fit a BalancedRandomForestClassifier on the scaled
# training data.

# StandardScaler is imported here because no earlier cell imports it; without
# this import the cell fails with a NameError.
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training split only, then apply the same
# transformation to both splits. fit() returns the fitted scaler itself, so
# X_scaler and scaler refer to the same object.
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

# random_state=1 makes the fitted forest reproducible between runs.
brfc = BalancedRandomForestClassifier(n_estimators=100, random_state=1)
brfc.fit(X_train_scaled, y_train)

In [None]:
# display the confusion matrix
from sklearn.metrics import confusion_matrix

# Predict on the scaled test features: brfc was fitted on X_train_scaled, so
# passing the unscaled X_test would compare raw feature values against split
# thresholds learned on standardized values.
y_pred = brfc.predict(X_test_scaled)
confusion_matrix(y_test, y_pred)

In [None]:
# Calculate the balanced accuracy score of the test-set predictions
# (relies on y_pred from the confusion-matrix cell above)
from sklearn.metrics import balanced_accuracy_score

balanced_accuracy_score(y_test, y_pred)

In [None]:
# Print the imbalanced classification report for the test-set predictions.
# print() is used so the report string is shown with its line breaks.
from imblearn.metrics import classification_report_imbalanced
print(classification_report_imbalanced(y_test, y_pred))

In [None]:
# List (importance, feature name) pairs from the fitted model, largest importance first.
# Tuples sort on importance first; ties fall back to the feature name.
sorted(zip(brfc.feature_importances_, X.columns), reverse=True)