In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression # Linear Regression
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import make_classification
from imblearn.over_sampling import RandomOverSampler
from imblearn.ensemble import BalancedRandomForestClassifier # RandomForestClassifier

In [3]:
%matplotlib inline
from sklearn.datasets import make_blobs

Step One

Importing Severe Weather Test Data

In [4]:
# importing test weather data
weather_df = pd.read_csv("Severe_Weather_TestData.csv", low_memory=False)

In [5]:
# dropping columns not needed
weather_df = weather_df.drop(columns=['INJURIES_DIRECT', 'INJURIES_INDIRECT', 'DEATHS_DIRECT', 'DEATHS_INDIRECT',
                                     'DAMAGE_PROPERTY', 'DAMAGE_CROPS', 'MAGNITUDE', 'TOR_F_SCALE', 'TOR_LENGTH',
                                     'TOR_WIDTH', 'BEGIN_LAT', 'BEGIN_LON'])

In [71]:
# displaying weather df
weather_df.head

<bound method NDFrame.head of      EVENT_ID  YEAR MONTH_NAME EVENT_TYPE
0    10050384  1950       June    Tornado
1    10086808  1950    January    Tornado
2    10120418  1950       June    Tornado
3     9981922  1950    January    Tornado
4    10001432  1950       June    Tornado
..        ...   ...        ...        ...
284    240665  2010       June  Lightning
285    245002  2010       June  Lightning
286    245889  2010       June    Tornado
287    226112  2010       June    Tornado
288    246134  2010       June  Lightning

[289 rows x 4 columns]>

Step Two

Create a DF that focuses on events by year

In [7]:
# adding together events by year

year_groups = weather_df.groupby('YEAR', as_index=False).count()
year_groups

Unnamed: 0,YEAR,EVENT_ID,STATE,STATE_FIPS,MONTH_NAME,EVENT_TYPE,CATEGORY
0,1950,5,5,5,5,5,0
1,1955,4,4,4,4,4,0
2,1964,10,10,10,10,10,0
3,1967,26,26,26,26,26,0
4,1973,28,28,28,28,28,0
5,1980,14,14,14,14,14,0
6,1986,5,5,5,5,5,0
7,1989,17,17,17,17,17,0
8,1997,78,78,78,78,78,0
9,2003,16,16,16,16,16,0


In [8]:
# rename columns as counts
year_groups = year_groups.rename(columns={"EVENT_ID": "Number_Of_Events_by_Year"})
year_groups

Unnamed: 0,YEAR,Number_Of_Events_by_Year,STATE,STATE_FIPS,MONTH_NAME,EVENT_TYPE,CATEGORY
0,1950,5,5,5,5,5,0
1,1955,4,4,4,4,4,0
2,1964,10,10,10,10,10,0
3,1967,26,26,26,26,26,0
4,1973,28,28,28,28,28,0
5,1980,14,14,14,14,14,0
6,1986,5,5,5,5,5,0
7,1989,17,17,17,17,17,0
8,1997,78,78,78,78,78,0
9,2003,16,16,16,16,16,0


In [9]:
# drop unnecessary columns
year_groups = year_groups.drop(columns=['MONTH_NAME', 'EVENT_TYPE', 'STATE', 'STATE_FIPS'])

In [10]:
# check work
year_groups

Unnamed: 0,YEAR,Number_Of_Events_by_Year,CATEGORY
0,1950,5,0
1,1955,4,0
2,1964,10,0
3,1967,26,0
4,1973,28,0
5,1980,14,0
6,1986,5,0
7,1989,17,0
8,1997,78,0
9,2003,16,0


In [11]:
# Saving weather data focused on year
file_path = "cleaned_weather_year_nonML.csv"
year_groups.to_csv(file_path, index=False)

Step Three

Create a DF that focuses on events by month

In [68]:
# adding together events by month
month_groups = weather_df.groupby(['YEAR','MONTH_NAME', 'EVENT_TYPE'], as_index=False).count()
month_groups

Unnamed: 0,YEAR,MONTH_NAME,EVENT_TYPE,EVENT_ID
0,1950,January,Tornado,2
1,1950,June,Tornado,3
2,1955,June,Tornado,2
3,1955,October,Tornado,2
4,1964,December,Tornado,2
5,1964,January,Tornado,1
6,1964,June,Tornado,7
7,1967,December,Tornado,9
8,1967,January,Tornado,4
9,1967,June,Tornado,7


In [70]:
# adding together events by groups
group_groups = weather_df.groupby(['YEAR','MONTH_NAME', 'EVENT_TYPE'], as_index=False).count()
group_groups

Unnamed: 0,YEAR,MONTH_NAME,EVENT_TYPE,EVENT_ID
0,1950,January,Tornado,2
1,1950,June,Tornado,3
2,1955,June,Tornado,2
3,1955,October,Tornado,2
4,1964,December,Tornado,2
5,1964,January,Tornado,1
6,1964,June,Tornado,7
7,1967,December,Tornado,9
8,1967,January,Tornado,4
9,1967,June,Tornado,7


In [72]:
# rename columns as counts
group_groups = group_groups.rename(columns={"YEAR": "Year"})
group_groups

Unnamed: 0,Year,MONTH_NAME,EVENT_TYPE,EVENT_ID
0,1950,January,Tornado,2
1,1950,June,Tornado,3
2,1955,June,Tornado,2
3,1955,October,Tornado,2
4,1964,December,Tornado,2
5,1964,January,Tornado,1
6,1964,June,Tornado,7
7,1967,December,Tornado,9
8,1967,January,Tornado,4
9,1967,June,Tornado,7


In [14]:
# check work
month_groups

Unnamed: 0,MONTH_NAME,Number_Of_Events_by_Month,STATE,STATE_FIPS,YEAR,EVENT_TYPE,CATEGORY
0,December,29,29,29,29,29,0
1,January,48,48,48,48,48,0
2,June,175,175,175,175,175,0
3,October,37,37,37,37,37,0


In [15]:
# drop unnecessary columns
month_groups = month_groups.drop(columns=['YEAR', 'EVENT_TYPE', 'STATE', 'STATE_FIPS'])

In [16]:
# check work
month_groups

Unnamed: 0,MONTH_NAME,Number_Of_Events_by_Month,CATEGORY
0,December,29,0
1,January,48,0
2,June,175,0
3,October,37,0


In [17]:
# Saving weather data focused on month
file_path = "cleaned_weather_month_nonML.csv"
month_groups.to_csv(file_path, index=False)

Step Four

Create a DF that counts the number of events by Type

In [18]:
# adding together events by type

type_groups = weather_df.groupby('EVENT_TYPE', as_index=False).count()
type_groups

Unnamed: 0,EVENT_TYPE,EVENT_ID,STATE,STATE_FIPS,YEAR,MONTH_NAME,CATEGORY
0,Blizzard,20,20,20,20,20,0
1,Heavy Snow,5,5,5,5,5,0
2,Lightning,118,118,118,118,118,0
3,Tornado,146,146,146,146,146,0


In [19]:
# check work
type_groups

Unnamed: 0,EVENT_TYPE,EVENT_ID,STATE,STATE_FIPS,YEAR,MONTH_NAME,CATEGORY
0,Blizzard,20,20,20,20,20,0
1,Heavy Snow,5,5,5,5,5,0
2,Lightning,118,118,118,118,118,0
3,Tornado,146,146,146,146,146,0


In [20]:
# drop unnecessary columns
type_groups = type_groups.drop(columns=['YEAR', 'MONTH_NAME', 'STATE', 'STATE_FIPS'])

In [21]:
# rename columns 
type_groups = type_groups.rename(columns={"EVENT_ID": "Count"})

In [22]:
# check work
type_groups

Unnamed: 0,EVENT_TYPE,Count,CATEGORY
0,Blizzard,20,0
1,Heavy Snow,5,0
2,Lightning,118,0
3,Tornado,146,0


In [23]:
# Saving weather data focused on type
file_path = "cleaned_weather_type_nonML.csv"
type_groups.to_csv(file_path, index=False)

Step Five

Concatanate the groups into a single DF

In [24]:
# gather up all the groups
# year_groups + month_groups + type_groups

In [25]:
merge1 = pd.merge(year_groups,month_groups,on='CATEGORY')

In [26]:
# check work
merge1

Unnamed: 0,YEAR,Number_Of_Events_by_Year,CATEGORY,MONTH_NAME,Number_Of_Events_by_Month
0,1950,5,0,December,29
1,1950,5,0,January,48
2,1950,5,0,June,175
3,1950,5,0,October,37
4,1955,4,0,December,29
5,1955,4,0,January,48
6,1955,4,0,June,175
7,1955,4,0,October,37
8,1964,10,0,December,29
9,1964,10,0,January,48


In [27]:
merge2 = pd.merge(merge1,type_groups,on='CATEGORY')

In [28]:
merge2

Unnamed: 0,YEAR,Number_Of_Events_by_Year,CATEGORY,MONTH_NAME,Number_Of_Events_by_Month,EVENT_TYPE,Count
0,1950,5,0,December,29,Blizzard,20
1,1950,5,0,December,29,Heavy Snow,5
2,1950,5,0,December,29,Lightning,118
3,1950,5,0,December,29,Tornado,146
4,1950,5,0,January,48,Blizzard,20
...,...,...,...,...,...,...,...
187,2010,48,0,June,175,Tornado,146
188,2010,48,0,October,37,Blizzard,20
189,2010,48,0,October,37,Heavy Snow,5
190,2010,48,0,October,37,Lightning,118


In [29]:
# rename dataframe
merge_df = merge2

In [30]:
# check work
merge_df

Unnamed: 0,YEAR,Number_Of_Events_by_Year,CATEGORY,MONTH_NAME,Number_Of_Events_by_Month,EVENT_TYPE,Count
0,1950,5,0,December,29,Blizzard,20
1,1950,5,0,December,29,Heavy Snow,5
2,1950,5,0,December,29,Lightning,118
3,1950,5,0,December,29,Tornado,146
4,1950,5,0,January,48,Blizzard,20
...,...,...,...,...,...,...,...
187,2010,48,0,June,175,Tornado,146
188,2010,48,0,October,37,Blizzard,20
189,2010,48,0,October,37,Heavy Snow,5
190,2010,48,0,October,37,Lightning,118


In [31]:
# drop the category column
merge_df = merge_df.drop(columns=['CATEGORY'])
merge_df

Unnamed: 0,YEAR,Number_Of_Events_by_Year,MONTH_NAME,Number_Of_Events_by_Month,EVENT_TYPE,Count
0,1950,5,December,29,Blizzard,20
1,1950,5,December,29,Heavy Snow,5
2,1950,5,December,29,Lightning,118
3,1950,5,December,29,Tornado,146
4,1950,5,January,48,Blizzard,20
...,...,...,...,...,...,...
187,2010,48,June,175,Tornado,146
188,2010,48,October,37,Blizzard,20
189,2010,48,October,37,Heavy Snow,5
190,2010,48,October,37,Lightning,118


In [32]:
# rename columns to look better
merge_df = merge_df.rename(columns={"YEAR": "Year", "MONTH_NAME": "Month", "EVENT_TYPE": "Event Type",
                                   "Count": "Event Count", "Number_Of_Events_by_Year": "Number of Events by Year",
                                   "Number_Of_Events_by_Month": "Number of Events by Month"})
merge_df

Unnamed: 0,Year,Number of Events by Year,Month,Number of Events by Month,Event Type,Event Count
0,1950,5,December,29,Blizzard,20
1,1950,5,December,29,Heavy Snow,5
2,1950,5,December,29,Lightning,118
3,1950,5,December,29,Tornado,146
4,1950,5,January,48,Blizzard,20
...,...,...,...,...,...,...
187,2010,48,June,175,Tornado,146
188,2010,48,October,37,Blizzard,20
189,2010,48,October,37,Heavy Snow,5
190,2010,48,October,37,Lightning,118


In [33]:
# Saving final test weather data
file_path = "final_test_weather_data.csv"
merge_df.to_csv(file_path, index=False)

Continuing upon old code here

Step Six

Create a Severe Weather DF not for ML purposes

In [34]:
# drop null columns
weather_df = weather_df.dropna(axis='columns', how='all')

In [35]:
# Drop the null rows
weather_df = weather_df.dropna()

In [36]:
# show cleaned dataset
weather_df.head(25)

Unnamed: 0,EVENT_ID,STATE,STATE_FIPS,YEAR,MONTH_NAME,EVENT_TYPE
0,10050384,MISSISSIPPI,28,1950,June,Tornado
1,10086808,OHIO,39,1950,January,Tornado
2,10120418,TEXAS,48,1950,June,Tornado
3,9981922,ARKANSAS,5,1950,January,Tornado
4,10001432,GEORGIA,13,1950,June,Tornado
5,10049829,MISSISSIPPI,28,1955,October,Tornado
6,9984208,ARKANSAS,5,1955,October,Tornado
7,9991373,COLORADO,8,1955,June,Tornado
8,10121863,TEXAS,48,1955,June,Tornado
9,9978062,ALABAMA,1,1964,January,Tornado


In [37]:
# Saving cleaned test weather data
file_path = "cleaned_weather_one_nonML.csv"
weather_df.to_csv(file_path, index=False)

In [38]:
# make data for ML

In [39]:
# dropping columns not needed
weather_df = weather_df.drop(columns=['STATE', 'STATE_FIPS'])

In [40]:
# show df for ML
weather_df

Unnamed: 0,EVENT_ID,YEAR,MONTH_NAME,EVENT_TYPE
0,10050384,1950,June,Tornado
1,10086808,1950,January,Tornado
2,10120418,1950,June,Tornado
3,9981922,1950,January,Tornado
4,10001432,1950,June,Tornado
...,...,...,...,...
284,240665,2010,June,Lightning
285,245002,2010,June,Lightning
286,245889,2010,June,Tornado
287,226112,2010,June,Tornado


In [41]:
# Saving cleaned test weather data for ML
file_path = "cleaned_weather_one_ML.csv"
weather_df.to_csv(file_path, index=False)

In [42]:
# Co2 Below This

In [43]:
# importing test co2 data 
carbon_df = pd.read_csv("co2_byYear.csv", low_memory=False)

In [44]:
# display co2 data
carbon_df

Unnamed: 0,year,co2,co2_per_capita,co2_per_unit_energy,coal_co2,cement_co2,flaring_co2,gas_co2,oil_co2,other_industry_co2,...,ghg_per_capita,methane,methane_per_capita,nitrous_oxide,nitrous_oxide_per_capita,population,gdp,primary_energy_consumption,energy_per_capita,energy_per_gdp
0,1950,23193.336,943.286,0.000,15959.768,272.887,223.550,1110.931,5611.231,9.976,...,0.000,0.00,0.000,0.00,0.000,7.570713e+09,1.585967e+13,0.000,0.000,0.000
1,1951,24765.194,1214.978,0.000,16981.583,309.265,243.305,1306.674,5907.881,11.002,...,0.000,0.00,0.000,0.00,0.000,7.712655e+09,7.992201e+12,0.000,0.000,0.000
2,1952,25203.424,834.737,0.000,16927.201,331.430,259.877,1405.429,6263.420,10.710,...,0.000,0.00,0.000,0.00,0.000,7.852545e+09,8.401431e+12,0.000,0.000,0.000
3,1953,25929.893,880.578,0.000,17104.163,368.454,248.979,1498.933,6689.993,12.910,...,0.000,0.00,0.000,0.00,0.000,7.992062e+09,8.884521e+12,0.000,0.000,0.000
4,1954,26744.552,1378.627,0.000,17319.801,400.182,236.584,1588.220,7182.609,11.442,...,0.000,0.00,0.000,0.00,0.000,8.133217e+09,9.099744e+12,0.000,0.000,0.000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
66,2016,124705.642,1156.831,44.924,48325.275,4990.527,1438.122,27011.160,41892.058,1048.509,...,1415.378,17424.08,345.519,6291.93,105.005,2.239131e+10,2.147076e+14,366170.181,5562418.068,208.589
67,2017,126304.716,1156.251,18.863,48587.789,5067.569,1576.265,27603.881,42412.614,1056.584,...,0.000,0.00,0.000,0.00,0.000,2.264281e+10,2.211738e+14,366923.317,3678183.705,0.000
68,2018,128452.538,1152.783,18.444,49368.748,5286.347,1531.205,28781.873,42422.168,1062.207,...,0.000,0.00,0.000,0.00,0.000,2.289184e+10,2.276454e+14,376507.961,3698476.648,0.000
69,2019,128422.887,1165.000,18.142,48731.161,5408.677,1611.674,29061.768,42547.578,1062.016,...,0.000,0.00,0.000,0.00,0.000,2.313895e+10,0.000000e+00,380148.487,3659567.475,0.000


In [45]:
# rename columns for merging purposes
carbon_df = carbon_df.rename(columns={"year": "Year"})
carbon_df

Unnamed: 0,Year,co2,co2_per_capita,co2_per_unit_energy,coal_co2,cement_co2,flaring_co2,gas_co2,oil_co2,other_industry_co2,...,ghg_per_capita,methane,methane_per_capita,nitrous_oxide,nitrous_oxide_per_capita,population,gdp,primary_energy_consumption,energy_per_capita,energy_per_gdp
0,1950,23193.336,943.286,0.000,15959.768,272.887,223.550,1110.931,5611.231,9.976,...,0.000,0.00,0.000,0.00,0.000,7.570713e+09,1.585967e+13,0.000,0.000,0.000
1,1951,24765.194,1214.978,0.000,16981.583,309.265,243.305,1306.674,5907.881,11.002,...,0.000,0.00,0.000,0.00,0.000,7.712655e+09,7.992201e+12,0.000,0.000,0.000
2,1952,25203.424,834.737,0.000,16927.201,331.430,259.877,1405.429,6263.420,10.710,...,0.000,0.00,0.000,0.00,0.000,7.852545e+09,8.401431e+12,0.000,0.000,0.000
3,1953,25929.893,880.578,0.000,17104.163,368.454,248.979,1498.933,6689.993,12.910,...,0.000,0.00,0.000,0.00,0.000,7.992062e+09,8.884521e+12,0.000,0.000,0.000
4,1954,26744.552,1378.627,0.000,17319.801,400.182,236.584,1588.220,7182.609,11.442,...,0.000,0.00,0.000,0.00,0.000,8.133217e+09,9.099744e+12,0.000,0.000,0.000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
66,2016,124705.642,1156.831,44.924,48325.275,4990.527,1438.122,27011.160,41892.058,1048.509,...,1415.378,17424.08,345.519,6291.93,105.005,2.239131e+10,2.147076e+14,366170.181,5562418.068,208.589
67,2017,126304.716,1156.251,18.863,48587.789,5067.569,1576.265,27603.881,42412.614,1056.584,...,0.000,0.00,0.000,0.00,0.000,2.264281e+10,2.211738e+14,366923.317,3678183.705,0.000
68,2018,128452.538,1152.783,18.444,49368.748,5286.347,1531.205,28781.873,42422.168,1062.207,...,0.000,0.00,0.000,0.00,0.000,2.289184e+10,2.276454e+14,376507.961,3698476.648,0.000
69,2019,128422.887,1165.000,18.142,48731.161,5408.677,1611.674,29061.768,42547.578,1062.016,...,0.000,0.00,0.000,0.00,0.000,2.313895e+10,0.000000e+00,380148.487,3659567.475,0.000


In [46]:
# dropping columns not needed
carbon_df = carbon_df.drop(columns=['co2_per_unit_energy', 'coal_co2', 'cement_co2', 'flaring_co2', 'gas_co2',
                                         'oil_co2', 'other_industry_co2', 'ghg_per_capita', 'methane', 'methane_per_capita',
                                         'nitrous_oxide', 'nitrous_oxide_per_capita', 'population', 'gdp', 'primary_energy_consumption',
                                         'energy_per_capita', 'energy_per_gdp', 'total_ghg'])

In [47]:
# show new dataframe
carbon_df

Unnamed: 0,Year,co2,co2_per_capita
0,1950,23193.336,943.286
1,1951,24765.194,1214.978
2,1952,25203.424,834.737
3,1953,25929.893,880.578
4,1954,26744.552,1378.627
...,...,...,...
66,2016,124705.642,1156.831
67,2017,126304.716,1156.251
68,2018,128452.538,1152.783
69,2019,128422.887,1165.000


In [48]:
# Saving cleaned carbon data
file_path = "cleaned_carbon_one_nonML.csv"
weather_df.to_csv(file_path, index=False)

In [49]:
# making data for ML with the carbon data

In [50]:
# dropping columns not needed
# carbon_df = carbon_df.drop(columns=['ISO_CODE', 'COUNTRY'])
# carbon_df

In [79]:
# Saving cleaned carbon data
file_path = "cleaned_carbon_one_ML.csv"
weather_df.to_csv(file_path, index=False)

In [51]:
# checking dtypes

In [52]:
carbon_df.dtypes

Year                int64
co2               float64
co2_per_capita    float64
dtype: object

In [53]:
# weather dtypes
weather_df.dtypes

EVENT_ID       int64
YEAR           int64
MONTH_NAME    object
EVENT_TYPE    object
dtype: object

In [54]:
# merge dtypes
merge_df.dtypes

Year                          int64
Number of Events by Year      int64
Month                        object
Number of Events by Month     int64
Event Type                   object
Event Count                   int64
dtype: object

In [55]:
# Merging below this

In [83]:
# Merge attempt one
combo_df = group_groups.merge(carbon_df, left_on='Year', right_on='Year')

In [84]:
# checking merge
combo_df

Unnamed: 0,Year,MONTH_NAME,EVENT_TYPE,EVENT_ID,co2,co2_per_capita
0,1950,January,Tornado,2,23193.336,943.286
1,1950,June,Tornado,3,23193.336,943.286
2,1955,June,Tornado,2,29264.525,1214.075
3,1955,October,Tornado,2,29264.525,1214.075
4,1964,December,Tornado,2,43134.197,1150.748
5,1964,January,Tornado,1,43134.197,1150.748
6,1964,June,Tornado,7,43134.197,1150.748
7,1967,December,Tornado,9,48536.547,1135.557
8,1967,January,Tornado,4,48536.547,1135.557
9,1967,June,Tornado,7,48536.547,1135.557


In [85]:
# Drop the null columns where all values are null
combo_df = combo_df.dropna(axis='columns', how='all')

In [86]:
# check work
combo_df

Unnamed: 0,Year,MONTH_NAME,EVENT_TYPE,EVENT_ID,co2,co2_per_capita
0,1950,January,Tornado,2,23193.336,943.286
1,1950,June,Tornado,3,23193.336,943.286
2,1955,June,Tornado,2,29264.525,1214.075
3,1955,October,Tornado,2,29264.525,1214.075
4,1964,December,Tornado,2,43134.197,1150.748
5,1964,January,Tornado,1,43134.197,1150.748
6,1964,June,Tornado,7,43134.197,1150.748
7,1967,December,Tornado,9,48536.547,1135.557
8,1967,January,Tornado,4,48536.547,1135.557
9,1967,June,Tornado,7,48536.547,1135.557


In [87]:
# Drop the null rows
combo_df = combo_df.dropna()

In [88]:
# check work
combo_df.head(25)

Unnamed: 0,Year,MONTH_NAME,EVENT_TYPE,EVENT_ID,co2,co2_per_capita
0,1950,January,Tornado,2,23193.336,943.286
1,1950,June,Tornado,3,23193.336,943.286
2,1955,June,Tornado,2,29264.525,1214.075
3,1955,October,Tornado,2,29264.525,1214.075
4,1964,December,Tornado,2,43134.197,1150.748
5,1964,January,Tornado,1,43134.197,1150.748
6,1964,June,Tornado,7,43134.197,1150.748
7,1967,December,Tornado,9,48536.547,1135.557
8,1967,January,Tornado,4,48536.547,1135.557
9,1967,June,Tornado,7,48536.547,1135.557


In [89]:
# drop columns for non-ML code
combo_df2 = combo_df.drop(columns=['MONTH_NAME', 'EVENT_TYPE'])
combo_df2

Unnamed: 0,Year,EVENT_ID,co2,co2_per_capita
0,1950,2,23193.336,943.286
1,1950,3,23193.336,943.286
2,1955,2,29264.525,1214.075
3,1955,2,29264.525,1214.075
4,1964,2,43134.197,1150.748
5,1964,1,43134.197,1150.748
6,1964,7,43134.197,1150.748
7,1967,9,48536.547,1135.557
8,1967,4,48536.547,1135.557
9,1967,7,48536.547,1135.557


In [90]:
# rename columns to look better
combo_df2 = combo_df2.rename(columns={"EVENT_ID": "Count"})
combo_df2

Unnamed: 0,Year,Count,co2,co2_per_capita
0,1950,2,23193.336,943.286
1,1950,3,23193.336,943.286
2,1955,2,29264.525,1214.075
3,1955,2,29264.525,1214.075
4,1964,2,43134.197,1150.748
5,1964,1,43134.197,1150.748
6,1964,7,43134.197,1150.748
7,1967,9,48536.547,1135.557
8,1967,4,48536.547,1135.557
9,1967,7,48536.547,1135.557


In [92]:
combo_df.head(5)

Unnamed: 0,Year,MONTH_NAME,EVENT_TYPE,EVENT_ID,co2,co2_per_capita
0,1950,January,Tornado,2,23193.336,943.286
1,1950,June,Tornado,3,23193.336,943.286
2,1955,June,Tornado,2,29264.525,1214.075
3,1955,October,Tornado,2,29264.525,1214.075
4,1964,December,Tornado,2,43134.197,1150.748


In [91]:
# Saving combined df non ml as csv data
file_path = "combined_test_data_nonML.csv"
combo_df2.to_csv(file_path, index=False)

In [93]:
# work on machine learning data form of combo_df
# rename columns to look better
combo_df = combo_df.rename(columns={"MONTH_NAME": "Month", "EVENT_TYPE": "Event", "EVENT_ID": "Count"})
combo_df

Unnamed: 0,Year,Month,Event,Count,co2,co2_per_capita
0,1950,January,Tornado,2,23193.336,943.286
1,1950,June,Tornado,3,23193.336,943.286
2,1955,June,Tornado,2,29264.525,1214.075
3,1955,October,Tornado,2,29264.525,1214.075
4,1964,December,Tornado,2,43134.197,1150.748
5,1964,January,Tornado,1,43134.197,1150.748
6,1964,June,Tornado,7,43134.197,1150.748
7,1967,December,Tornado,9,48536.547,1135.557
8,1967,January,Tornado,4,48536.547,1135.557
9,1967,June,Tornado,7,48536.547,1135.557


In [94]:
# Saving cleaned combined data
file_path = "comboned_test_data_ML.csv"
combo_df.to_csv(file_path, index=False)

In [37]:
# Adding machine learning model parameters below here

In [95]:
target = ["Count"]

In [96]:
# split data into training and testing

# Create our features
X = pd.get_dummies(combo_df.drop(columns="Count"))

# Create our target
y = pd.get_dummies(combo_df["Count"])

In [97]:
X.describe()

Unnamed: 0,Year,co2,co2_per_capita,Month_December,Month_January,Month_June,Month_October,Event_Blizzard,Event_Heavy Snow,Event_Lightning,Event_Tornado
count,51.0,51.0,51.0,51.0,51.0,51.0,51.0,51.0,51.0,51.0,51.0
mean,1988.588235,82526.373392,1200.262216,0.176471,0.294118,0.27451,0.254902,0.058824,0.098039,0.196078,0.647059
std,18.161692,27379.552749,114.052372,0.385013,0.460179,0.450708,0.440143,0.237635,0.300327,0.400979,0.48264
min,1950.0,23193.336,943.286,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,1973.0,67163.422,1140.447,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,1997.0,90055.545,1210.391,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
75%,2003.0,100340.414,1226.914,0.0,1.0,1.0,0.5,0.0,0.0,0.0,1.0
max,2010.0,118593.92,1503.995,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [98]:
# Check the balance of our target values
y = combo_df['Count']
y.value_counts()

1     16
2      8
3      6
4      4
7      3
9      3
6      2
12     2
5      1
11     1
15     1
18     1
26     1
31     1
33     1
Name: Count, dtype: int64

In [99]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, stratify=y)
Counter(y_train)

ValueError: The least populated class in y has only 1 member, which is too few. The minimum number of groups for any class cannot be less than 2.

In [None]:
# resample the training data with the BalancedRandomForestClassifier

scaler = StandardScaler()
X_scaler = scaler.fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)
brfc = BalancedRandomForestClassifier(n_estimators=100, random_state=1)
brfc.fit(X_train_scaled, y_train)

In [None]:
# display the confusion matrix
from sklearn.metrics import confusion_matrix

y_pred = brfc.predict(X_test)
confusion_matrix(y_test, y_pred)

In [None]:
# calculated the balanced accuracy score
from sklearn.metrics import balanced_accuracy_score

balanced_accuracy_score(y_test, y_pred)

In [None]:
# print the imbalanced classification report
from imblearn.metrics import classification_report_imbalanced
print(classification_report_imbalanced(y_test, y_pred))

In [None]:
# list the features sorted in descending order by feature importance
sorted(zip(brfc.feature_importances_, X.columns), reverse=True)