In [0]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor

In [2]:
# Load S&P 500 df
df_quake_sp500 = pd.read_csv("https://raw.githubusercontent.com/labs13-quake-viewer/ds-data/master/" +
                             "S&P%20500%20Price%20Change%20by%20Earthquake(5.5+).csv", index_col=0)

df_quake_sp500.shape

(28350, 17)

In [3]:
df_quake_sp500.head()

Unnamed: 0,Date,Mag,Lat,Long,Depth,magType,Place,Type,locationSource,magSource,Price_Day_0,Price_Day_7,Price_Day_14,Price_Day_30,Appr_Day_7,Appr_Day_14,Appr_Day_30
0,1950-01-03,6.5,17.576,121.428,30.0,mw,"Luzon, Philippines",earthquake,iscgem,iscgem,16.66,17.030001,16.860001,17.23,2.220894,1.200486,3.421369
1,1950-01-03,6.3,-45.798,-77.077,15.0,mw,"off the coast of Aisen, Chile",earthquake,iscgem,iscgem,16.66,17.030001,16.860001,17.23,2.220894,1.200486,3.421369
2,1950-01-10,6.1,10.031,-103.964,15.0,mw,northern East Pacific Rise,earthquake,iscgem,iscgem,17.030001,16.860001,16.860001,17.280001,-0.998238,-0.998238,1.467998
3,1950-01-19,5.7,27.255,53.029,15.0,mw,southern Iran,earthquake,iscgem,iscgem,16.870001,16.73,17.23,17.200001,-0.829881,2.13396,1.956135
4,1950-01-22,5.7,27.096,52.946,15.0,mw,southern Iran,earthquake,iscgem,iscgem,16.92,17.02,17.32,17.17,0.591017,2.364066,1.477541


In [0]:
dates = []
for i in df_quake_sp500.Date:
  dates.append(int(''.join(c for c in i if c.isdigit())))

In [0]:
df_quake_sp500["magg"] = (df_quake_sp500["Mag"] * 10).astype(int)

In [0]:
df_quake_sp500["dates"] = dates

In [7]:
df_quake_sp500.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 28350 entries, 0 to 28349
Data columns (total 19 columns):
Date              28350 non-null object
Mag               28350 non-null float64
Lat               28350 non-null float64
Long              28350 non-null float64
Depth             28350 non-null float64
magType           28350 non-null object
Place             28350 non-null object
Type              28350 non-null object
locationSource    28350 non-null object
magSource         28350 non-null object
Price_Day_0       28350 non-null float64
Price_Day_7       28350 non-null float64
Price_Day_14      28350 non-null float64
Price_Day_30      28350 non-null float64
Appr_Day_7        28350 non-null float64
Appr_Day_14       28350 non-null float64
Appr_Day_30       28350 non-null float64
magg              28350 non-null int64
dates             28350 non-null int64
dtypes: float64(11), int64(2), object(6)
memory usage: 4.3+ MB


In [8]:
y = df_quake_sp500['Appr_Day_30']
X = df_quake_sp500[['dates', 'Mag', 'Lat', 'Long', 'Depth']]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.25, random_state=42)
print("Original shape:", X.shape, "\n")

print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)
print("y_train shape:", y_train.shape)
print("y_test shape:", y_test.shape)

Original shape: (28350, 5) 

X_train shape: (21262, 5)
X_test shape: (7088, 5)
y_train shape: (21262,)
y_test shape: (7088,)


In [9]:
X_train.sample()

Unnamed: 0,dates,Mag,Lat,Long,Depth
7845,19760407,5.5,17.62,145.547,217.0


In [0]:
# Instantiate model with 100 decision trees
rf = RandomForestRegressor(n_estimators = 100, random_state = 42)

In [11]:
%%time
# Train model on training data
rf.fit(X_train, y_train)

CPU times: user 16.4 s, sys: 165 ms, total: 16.6 s
Wall time: 16.9 s


RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=100,
                      n_jobs=None, oob_score=False, random_state=42, verbose=0,
                      warm_start=False)

In [0]:
# Use forest's predict method on test data
predictions = rf.predict(X_test)

In [0]:
# Calculate absolute errors
errors = abs(predictions - y_test)

In [14]:
# Print out mean absolute error
print('Mean Absolute Error:', round(np.mean(errors), 2), 'degrees.')

Mean Absolute Error: 0.88 degrees.


In [16]:
# Calculate and display accuracy
accuracy = errors.sum() / y_test.sum()
print("For S&P 500, Incident Mag >= 5.5 ({} incidents)".format(df_quake_sp500.shape[0]))
print("Random Forest Regressor Model score:", rf.score(X_train, y_train))
print('Predictive Accuracy:', round(accuracy, 2), '%.')

For S&P 500, Incident Mag >= 5.5 (28350 incidents)
Random Forest Regressor Model score: 0.9862586854952426
Predictive Accuracy: 1.3 %.
