In [0]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor

In [2]:
# Load S&P 500 df
df_quake_gold = pd.read_csv("https://raw.githubusercontent.com/labs13-quake-viewer/ds-data/master/" +
                             "Gold%20Price%20Change%20by%20Earthquake(5.5+).csv", index_col=0)

df_quake_gold.shape

(23510, 17)

In [3]:
df_quake_gold.head()

Unnamed: 0,Date,Mag,Lat,Long,Depth,magType,Place,Type,locationSource,magSource,Price_Day_0,Price_Day_7,Price_Day_14,Price_Day_30,Appr_Day_7,Appr_Day_14,Appr_Day_30
0,1968-04-01,7.5,32.449,132.269,34.2,mw,"Shikoku, Japan",earthquake,iscgem,iscgem,37.7,37.05,38.0,39.2,-1.724138,0.795756,3.97878
1,1968-04-01,6.8,32.241,132.136,30.0,mw,"Shikoku, Japan",earthquake,iscgem,iscgem,37.7,37.05,38.0,39.2,-1.724138,0.795756,3.97878
2,1968-04-07,5.9,51.359,176.55,36.4,mw,"Rat Islands, Aleutian Islands, Alaska",earthquake,iscgem,iscgem,37.05,38.0,38.0,39.3,2.564103,2.564103,6.072874
3,1968-04-09,6.6,33.179833,-116.103,10.0,mw,"5km NNE of Ocotillo Wells, CA",earthquake,ci,ci,37.5,38.0,38.4,39.7,1.333333,2.4,5.866667
4,1968-04-14,5.6,33.514,141.763,24.2,mw,"off the east coast of Honshu, Japan",earthquake,iscgem,iscgem,38.0,38.0,38.75,39.8,0.0,1.973684,4.736842


In [0]:
dates = []
for i in df_quake_gold.Date:
  dates.append(int(''.join(c for c in i if c.isdigit())))

In [0]:
df_quake_gold["magg"] = (df_quake_gold["Mag"] * 10).astype(int)

In [0]:
df_quake_gold["dates"] = dates

In [7]:
df_quake_gold.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 23510 entries, 0 to 23509
Data columns (total 19 columns):
Date              23510 non-null object
Mag               23510 non-null float64
Lat               23510 non-null float64
Long              23510 non-null float64
Depth             23510 non-null float64
magType           23510 non-null object
Place             23510 non-null object
Type              23510 non-null object
locationSource    23510 non-null object
magSource         23510 non-null object
Price_Day_0       23510 non-null float64
Price_Day_7       23510 non-null float64
Price_Day_14      23510 non-null float64
Price_Day_30      23510 non-null float64
Appr_Day_7        23510 non-null float64
Appr_Day_14       23510 non-null float64
Appr_Day_30       23510 non-null float64
magg              23510 non-null int64
dates             23510 non-null int64
dtypes: float64(11), int64(2), object(6)
memory usage: 3.6+ MB


In [8]:
y = df_quake_gold['Appr_Day_30']
X = df_quake_gold[['dates', 'Mag', 'Lat', 'Long', 'Depth']]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.25, random_state=42)
print("Original shape:", X.shape, "\n")

print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)
print("y_train shape:", y_train.shape)
print("y_test shape:", y_test.shape)

Original shape: (23510, 5) 

X_train shape: (17632, 5)
X_test shape: (5878, 5)
y_train shape: (17632,)
y_test shape: (5878,)


In [9]:
X_train.sample()

Unnamed: 0,dates,Mag,Lat,Long,Depth
16453,20050419,5.9,29.642,138.891,425.8


In [0]:
# Instantiate model with 100 decision trees
rf = RandomForestRegressor(n_estimators = 100, random_state = 42)

In [11]:
# Train model on training data
rf.fit(X_train, y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=100,
                      n_jobs=None, oob_score=False, random_state=42, verbose=0,
                      warm_start=False)

In [0]:
# Use forest's predict method on test data
predictions = rf.predict(X_test)

In [0]:
# Calculate absolute errors
errors = abs(predictions - y_test)

In [14]:
# Print out mean absolute error
print('Mean Absolute Error:', round(np.mean(errors), 2), 'degrees.')

Mean Absolute Error: 0.91 degrees.


In [15]:
# Calculate and display accuracy
accuracy = errors.sum() / y_test.sum()
print("For Gold, Incident Mag >= 5.5 ({} incidents)".format(df_quake_gold.shape[0]))
print("Random Forest Regressor Model score:", rf.score(X_train, y_train))
print('Predictive Accuracy:', round(accuracy, 2), '%.')

For Gold, Incident Mag >= 5.5 (23510 incidents)
Random Forest Regressor Model score: 0.9890685065534728
Predictive Accuracy: 1.33 %.
