In [0]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, mean_squared_error, r2_score

In [2]:
# Load S&P 500 df
df_quake_sp500 = pd.read_csv("https://raw.githubusercontent.com/labs13-quake-viewer/ds-data/master/" +
                             "S&P%20500%20Price%20Change%20by%20Earthquake.csv", index_col=0)
df_quake_sp500.shape

(1870, 9)

In [0]:
df_quake_sp500.head()

Unnamed: 0,Date,Mag,Price_Day_0,Price_Day_7,Price_Day_14,Price_Day_30,Appr_Day_7,Appr_Day_14,Appr_Day_30
0,1950-01-30,6.8,17.02,17.32,17.059999,17.24,1.762632,0.235012,1.292597
1,1950-02-02,6.9,17.23,17.280001,16.99,17.32,0.290197,-1.392919,0.522345
2,1950-02-03,6.7,17.290001,17.24,17.15,17.32,-0.28919,-0.809722,0.173505
3,1950-02-28,7.7,17.219999,17.200001,17.25,17.299999,-0.116132,0.174222,0.464576
4,1950-03-07,6.7,17.200001,17.25,17.450001,17.780001,0.290692,1.453488,3.372093


In [0]:
dates = []
for i in df_quake_sp500.Date:
  dates.append(int(''.join(c for c in i if c.isdigit())))

In [0]:
df_quake_sp500["magg"] = (df_quake_sp500["Mag"] * 10).astype(int)

In [0]:
df_quake_sp500["dates"] = dates

In [6]:
df_quake_sp500.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1870 entries, 0 to 1869
Data columns (total 11 columns):
Date            1870 non-null object
Mag             1870 non-null float64
Price_Day_0     1870 non-null float64
Price_Day_7     1870 non-null float64
Price_Day_14    1870 non-null float64
Price_Day_30    1870 non-null float64
Appr_Day_7      1870 non-null float64
Appr_Day_14     1870 non-null float64
Appr_Day_30     1870 non-null float64
magg            1870 non-null int64
dates           1870 non-null int64
dtypes: float64(8), int64(2), object(1)
memory usage: 175.3+ KB


##Linear Regression

In [0]:
#X = df_quake_gold[['dates', 'magg']]
X = df_quake_sp500[['dates', 'Mag']]
y = df_quake_sp500['Appr_Day_30']

In [18]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.25, random_state=42)
print("Original shape:", X.shape, "\n")

print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)
print("y_train shape:", y_train.shape)
print("y_test shape:", y_test.shape)

Original shape: (1870, 2) 

X_train shape: (1402, 2)
X_test shape: (468, 2)
y_train shape: (1402,)
y_test shape: (468,)


In [0]:
model = LinearRegression()
linear_reg = model.fit(X_train, y_train)
lin_reg_score = linear_reg.score(X_train, y_train)

In [20]:
beta_0 = model.intercept_
beta_i = model.coef_[0]

print("Slope Coefficient: ", beta_i)
print("\nIntercept Value: ", beta_0)

print("\nCoefficients:")
for i in range(X.shape[1]):
  print(X.columns[i], '\t', model.coef_[i])

Slope Coefficient:  -2.0398883440022482e-06

Intercept Value:  42.14837648881234

Coefficients:
dates 	 -2.0398883440022482e-06
Mag 	 -0.13938481292214935


In [0]:
y_test_predict = model.predict(X_test)

In [26]:
RMSE = np.sqrt(mean_squared_error(y_test, y_test_predict))
R2= r2_score(y_test, y_test_predict)

print("For S&P 500, Incident Mag >= 6.7 ({} incidents)".format(df_quake_sp500.shape[0]))
print("Linear Regression Model score:", lin_reg_score)
print('\nLinear Regression Model Predictive Accuracy:')
print('RMSE is {}'.format(RMSE))
print('R^2 is {}'.format(R2))

For S&P 500, Incident Mag >= 6.7 (1870 inclidents)
Linear Regression Model score: 0.010592626515933956

Linear Regression Model Predictive Accuracy:
RMSE is 4.274140089080051
R^2 is -0.012256544903924471


##Logistic Regression

In [23]:
y = df_quake_sp500['Appr_Day_30'].astype(str)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((1402, 2), (468, 2), (1402,), (468,))

In [24]:
X_train.sample()

Unnamed: 0,dates,Mag
979,19900303,7.6


In [28]:
log_reg = LogisticRegression(multi_class='ovr',
                             solver='liblinear',
                             max_iter=100)
log_reg_fit = log_reg.fit(X_train, y_train)
print("For S&P 500, Incident Mag >= 6.7 ({} incidents)".format(df_quake_sp500.shape[0]))
print("Logistic Regression Model score:", log_reg_fit.score(X_train, y_train))
predictions = log_reg.predict(X_test)
print("Logistic Regression prediction accuracy:", accuracy_score(y_test, predictions))

For S&P 500, Incident Mag >= 6.7 (1870 incidents)
Logistic Regression Model score: 0.0028530670470756064
Logistic Regression prediction accuracy: 0.002136752136752137


In [0]:
log_reg.coef_[0]

array([-3.64810555e-07, -1.30921001e-13])