In [1]:
!pip install -U pip

Requirement already up-to-date: pip in /home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages (19.1.1)


In [2]:
!pip install -U xgboost

Requirement already up-to-date: xgboost in /home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages (0.90)


In [3]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier

In [4]:
# Load S&P 500 df
df_quake_gold = pd.read_csv("https://raw.githubusercontent.com/labs13-quake-viewer/ds-data/master/" +
                             "Gold%20Price%20Change%20by%20Earthquake(5.5+).csv", index_col=0)

df_quake_gold.shape

(23510, 17)

In [5]:
df_quake_gold.head()

Unnamed: 0,Date,Mag,Lat,Long,Depth,magType,Place,Type,locationSource,magSource,Price_Day_0,Price_Day_7,Price_Day_14,Price_Day_30,Appr_Day_7,Appr_Day_14,Appr_Day_30
0,1968-04-01,7.5,32.449,132.269,34.2,mw,"Shikoku, Japan",earthquake,iscgem,iscgem,37.7,37.05,38.0,39.2,-1.724138,0.795756,3.97878
1,1968-04-01,6.8,32.241,132.136,30.0,mw,"Shikoku, Japan",earthquake,iscgem,iscgem,37.7,37.05,38.0,39.2,-1.724138,0.795756,3.97878
2,1968-04-07,5.9,51.359,176.55,36.4,mw,"Rat Islands, Aleutian Islands, Alaska",earthquake,iscgem,iscgem,37.05,38.0,38.0,39.3,2.564103,2.564103,6.072874
3,1968-04-09,6.6,33.179833,-116.103,10.0,mw,"5km NNE of Ocotillo Wells, CA",earthquake,ci,ci,37.5,38.0,38.4,39.7,1.333333,2.4,5.866667
4,1968-04-14,5.6,33.514,141.763,24.2,mw,"off the east coast of Honshu, Japan",earthquake,iscgem,iscgem,38.0,38.0,38.75,39.8,0.0,1.973684,4.736842


In [6]:
dates = []
for i in df_quake_gold.Date:
  dates.append(int(''.join(c for c in i if c.isdigit())))

In [7]:
df_quake_gold["magg"] = (df_quake_gold["Mag"] * 10).astype(int)

In [8]:
df_quake_gold["dates"] = dates

In [9]:
df_quake_gold.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 23510 entries, 0 to 23509
Data columns (total 19 columns):
Date              23510 non-null object
Mag               23510 non-null float64
Lat               23510 non-null float64
Long              23510 non-null float64
Depth             23510 non-null float64
magType           23510 non-null object
Place             23510 non-null object
Type              23510 non-null object
locationSource    23510 non-null object
magSource         23510 non-null object
Price_Day_0       23510 non-null float64
Price_Day_7       23510 non-null float64
Price_Day_14      23510 non-null float64
Price_Day_30      23510 non-null float64
Appr_Day_7        23510 non-null float64
Appr_Day_14       23510 non-null float64
Appr_Day_30       23510 non-null float64
magg              23510 non-null int64
dates             23510 non-null int64
dtypes: float64(11), int64(2), object(6)
memory usage: 3.6+ MB


In [10]:
y = df_quake_gold['Appr_Day_30'].astype(str)
X = df_quake_gold[['dates', 'Mag', 'Lat', 'Long', 'Depth']]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.25, random_state=42)
print("Original shape:", X.shape, "\n")

print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)
print("y_train shape:", y_train.shape)
print("y_test shape:", y_test.shape)

Original shape: (23510, 5) 

X_train shape: (17632, 5)
X_test shape: (5878, 5)
y_train shape: (17632,)
y_test shape: (5878,)


In [11]:
X_train.sample()

Unnamed: 0,dates,Mag,Lat,Long,Depth
1325,19720122,6.3,-20.435,172.94,20.0


In [12]:
# Instantiate model with various parms
xgb = XGBClassifier(objective = 'multi:softmax', booster = 'gbtree',
                    nrounds = 'min.error.idx', num_class = 4,
                    maximize = False, eval_metric = 'merror', eta = .2,
                    max_depth = 4, colsample_bytree = .4, nthread = -1)

In [13]:
%%time
# Train model on training data
xgb.fit(X_train, y_train)

CPU times: user 3d 6h 19min 13s, sys: 17.9 s, total: 3d 6h 19min 31s
Wall time: 1h 15min 16s


XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bynode=1, colsample_bytree=0.4, eta=0.2,
       eval_metric='merror', gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=4, maximize=False, min_child_weight=1, missing=None,
       n_estimators=100, n_jobs=1, nrounds='min.error.idx', nthread=-1,
       num_class=4, objective='multi:softprob', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=None, subsample=1, verbosity=1)

In [14]:
%%time
score = xgb.score(X_train, y_train)

CPU times: user 2d 10h 28min 15s, sys: 1.38 s, total: 2d 10h 28min 16s
Wall time: 55min 33s


In [15]:
%%time
# Use xgb's predict method on test data
predictions = xgb.predict(X_test)

CPU times: user 19h 11min 41s, sys: 388 ms, total: 19h 11min 41s
Wall time: 19min 8s


In [16]:
# Calculate absolute errors
errors = abs(predictions.astype('float64') - y_test.astype('float64'))

In [17]:
# Print out mean absolute error
print('Mean Absolute Error:', round(np.mean(errors), 2), 'degrees.')

Mean Absolute Error: 4.12 degrees.


In [18]:
# Calculate and display accuracy
accuracy = errors.sum() / y_test.astype('float64').sum()
print("For Gold, Incident Mag >= 5.5")
print("XGB Model Score:", score)
print('XGB Model Predictive Accuracy:', round(accuracy, 2), '%.')

For Gold, Incident Mag >= 5.5
XGB Model Score: 0.48548094373865697
XGB Model Predictive Accuracy: 6.05 %.
