In [None]:
import warnings
warnings.filterwarnings('ignore')

In [None]:
import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression # Linear Regression
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import make_classification
from imblearn.over_sampling import RandomOverSampler
from imblearn.ensemble import BalancedRandomForestClassifier # RandomForestClassifier

In [None]:
%matplotlib inline
from sklearn.datasets import make_blobs

Step One

Importing Severe Weather Test Data

In [None]:
# Load the severe-weather event records. low_memory=False parses the whole
# file at once so column dtypes are inferred consistently.
weather_df = pd.read_csv(
    filepath_or_buffer="SevereWeatherDetails_appended.csv",
    low_memory=False,
)

In [None]:
# preview the first five rows of the freshly loaded weather data
weather_df.head(n=5)

In [None]:
# Remove the injury/death, damage, magnitude, tornado, location, state and
# category columns that are not needed for the analysis.
weather_df = weather_df.drop(
    [
        'INJURIES_DIRECT', 'INJURIES_INDIRECT',
        'DEATHS_DIRECT', 'DEATHS_INDIRECT',
        'DAMAGE_PROPERTY', 'DAMAGE_CROPS',
        'MAGNITUDE',
        'TOR_F_SCALE', 'TOR_LENGTH', 'TOR_WIDTH',
        'BEGIN_LAT', 'BEGIN_LON',
        'STATE_FIPS', 'STATE',
        'CATEGORY',
    ],
    axis='columns',
)

In [None]:
# preview the weather data after the unused columns were removed
weather_df.head(n=5)

In [None]:
# Count the rows for every (year, month, event type) combination.
# NOTE(review): count() fills every remaining non-key column with its
# per-group non-null count, so columns other than EVENT_ID presumably carry
# (nearly) the same number - confirm they should stay in the frame before
# they are used as model features.
group_groups = weather_df.groupby(
    by=['YEAR', 'MONTH_NAME', 'EVENT_TYPE'],
    as_index=False,
).count()
group_groups.head(n=20)

In [None]:
# relabel columns: the EVENT_ID tally becomes 'Count' and 'YEAR' becomes 'Year'
group_groups = group_groups.rename(
    {'EVENT_ID': 'Count', 'YEAR': 'Year'},
    axis='columns',
)
group_groups.head(n=5)

In [None]:
# CO2 data below this point

In [None]:
# Load the yearly CO2 table. low_memory=False parses the whole file at once
# so column dtypes are inferred consistently.
carbon_df = pd.read_csv(
    filepath_or_buffer="co2_byYear.csv",
    low_memory=False,
)

In [None]:
# preview the CO2 data (first rows only, to keep the cell output short)
carbon_df.head()

In [None]:
# rename the year column so it matches the 'Year' key of the weather counts
carbon_df = carbon_df.rename(columns={"year": "Year"})

# preview only the first rows instead of dumping the whole frame
carbon_df.head()

In [None]:
# Remove the per-fuel CO2, other greenhouse-gas, population/GDP and energy
# columns that are not needed for the analysis.
carbon_df = carbon_df.drop(
    [
        'co2_per_unit_energy',
        'coal_co2', 'cement_co2', 'flaring_co2', 'gas_co2', 'oil_co2',
        'other_industry_co2',
        'ghg_per_capita',
        'methane', 'methane_per_capita',
        'nitrous_oxide', 'nitrous_oxide_per_capita',
        'population', 'gdp',
        'primary_energy_consumption', 'energy_per_capita', 'energy_per_gdp',
        'total_ghg',
    ],
    axis='columns',
)

In [None]:
# show the trimmed CO2 dataframe (first rows only, to keep the output short)
carbon_df.head()

In [None]:
# Join the event counts with the CO2 figures on the shared 'Year' column
# (inner join, so only years present in both frames are kept).
combo_df = pd.merge(
    group_groups,
    carbon_df,
    how='inner',
    left_on='Year',
    right_on='Year',
)

In [None]:
# check the merge by previewing the first rows rather than the whole frame
combo_df.head()

In [None]:
# tally how often each distinct event count occurs
count_counts = combo_df['Count'].value_counts()
count_counts

In [None]:
# the five most frequent Count values
count_counts.head(n=5)

In [None]:
# kernel-density plot of the value-count frequencies
count_counts.plot(kind='density', xlim=(-50, 100))

In [None]:
# Collect the Count values that occur fewer than three times
replace_counts = list(count_counts[count_counts < 3].index)

# Bin all of those rare values into 0 with a single vectorized replace
# (one pass over the column instead of one pass per rare value)
combo_df["Count"] = combo_df["Count"].replace(replace_counts, 0)

# Check to make sure binning was successful
combo_df["Count"].value_counts()



In [None]:
# combo_df.groupby('Count').filter(lambda x : len(x)>3)

In [None]:
# Adding machine learning model parameters below here

In [None]:
# Name of the target column. The visible cells below refer to "Count"
# directly rather than to this list.
target = ["Count"]

In [None]:
# Build the model inputs

# Create our features: everything except the target, with the non-numeric
# columns one-hot encoded
X = pd.get_dummies(combo_df.drop(columns="Count"))

# Create our target: keep the class labels as a single 1-D Series.
# The split, classifiers and metrics in this notebook are all used with one
# label column, so the target must not be one-hot encoded.
y = combo_df["Count"]

In [None]:
# summary statistics for every column of the encoded feature matrix
X.describe()

In [None]:
# Inspect the class distribution of the target labels
y = combo_df.loc[:, 'Count']
y.value_counts()

# Implement BalancedRandomForestClassifier

In [None]:
from sklearn.model_selection import train_test_split

# Stratified train/test split with a fixed seed, followed by the class counts
# of the training labels.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=1,
)
Counter(y_train)

In [None]:
# Standardize the features (statistics learned from the training split only),
# then fit a BalancedRandomForestClassifier on the scaled training data.

scaler = StandardScaler()
X_scaler = scaler.fit(X_train)
X_train_scaled, X_test_scaled = (
    X_scaler.transform(part) for part in (X_train, X_test)
)

brfc = BalancedRandomForestClassifier(random_state=1, n_estimators=100)
brfc.fit(X_train_scaled, y_train)

In [None]:
# display the confusion matrix
from sklearn.metrics import confusion_matrix

# brfc was fitted on the standardized training features, so it has to predict
# on the standardized test features as well (raw X_test is on a different scale)
y_pred = brfc.predict(X_test_scaled)
confusion_matrix(y_test, y_pred)

In [None]:
# calculate the balanced accuracy score for the current predictions
from sklearn.metrics import balanced_accuracy_score

balanced_accuracy_score(y_true=y_test, y_pred=y_pred)

In [None]:
# print the per-class imbalanced classification report
from imblearn.metrics import classification_report_imbalanced

print(classification_report_imbalanced(y_true=y_test, y_pred=y_pred))

In [None]:
# rank the features from most to least important according to the fitted forest
sorted(
    (
        (importance, column)
        for importance, column in zip(brfc.feature_importances_, X.columns)
    ),
    reverse=True,
)

# Easy Ensemble AdaBoost Classifier

In [None]:
from sklearn.model_selection import train_test_split

# Recreate the stratified, seeded train/test split for the second model and
# show the class counts of the training labels.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=1,
)
Counter(y_train)

In [None]:
# Cross-validate the EasyEnsembleClassifier
from sklearn.model_selection import cross_val_score
from sklearn.datasets import load_iris
from sklearn.ensemble import AdaBoostClassifier
from imblearn.ensemble import EasyEnsembleClassifier

# Score the ensemble on the weather/CO2 training split. X and y must not be
# reassigned here (e.g. with a toy dataset) because later cells still read
# X.columns, and the estimator being scored has to be the one defined in this
# cell (eec).
eec = EasyEnsembleClassifier(n_estimators=100, random_state=1)

# cross_val_score fits clones, so eec itself is still unfitted afterwards
scores = cross_val_score(eec, X_train, y_train, cv=5)
scores.mean()

In [None]:
# Rebuild the standardized feature arrays from the current split.
# Note that eec itself is fitted on the unscaled X_train.
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)
X_train_scaled, X_test_scaled = (
    X_scaler.transform(part) for part in (X_train, X_test)
)
eec.fit(X_train, y_train)

In [None]:
# Display the confusion matrix
from sklearn.metrics import confusion_matrix

# Predict with the fitted EasyEnsembleClassifier (eec). It was trained on the
# unscaled X_train, so the unscaled X_test is the matching input.
y_pred = eec.predict(X_test)
confusion_matrix(y_test, y_pred)

In [None]:
# Calculate the balanced accuracy score for the current predictions
from sklearn.metrics import balanced_accuracy_score

balanced_accuracy_score(y_true=y_test, y_pred=y_pred)

In [None]:
# Print the per-class imbalanced classification report
from imblearn.metrics import classification_report_imbalanced

print(classification_report_imbalanced(y_true=y_test, y_pred=y_pred))

In [None]:
# List the features sorted in descending order by importance.
# EasyEnsembleClassifier does not provide a feature_importances_ attribute, so
# the importances are estimated model-agnostically: each feature of the test
# split is shuffled in turn and the resulting drop in balanced accuracy is
# averaged over the repeats.
from sklearn.inspection import permutation_importance

eec_importances = permutation_importance(
    eec,
    X_test,
    y_test,
    scoring="balanced_accuracy",
    n_repeats=5,
    random_state=1,
)

# X_test keeps the DataFrame column labels of the feature matrix it was split from
sorted(zip(eec_importances.importances_mean, X_test.columns), reverse=True)