In [0]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import accuracy_score, mean_squared_error, r2_score
from sklearn.preprocessing import LabelEncoder

In [3]:
# Load S&P 500 df
#remember to fix!
df_quake_sp500 = pd.read_csv("https://raw.githubusercontent.com/labs13-quake-viewer/ds-data/master/" +
                             "S&P%20500%20Price%20Change%20by%20Earthquake(5.5+).csv", index_col=0)
#df_quake_gold = pd.read_csv("Gold Price Change by Earthquake(5.5+).csv", index_col=0)

df_quake_sp500.shape

(28350, 17)

In [0]:
dates = []
for i in df_quake_sp500.Date:
  dates.append(int(''.join(c for c in i if c.isdigit())))

In [0]:
df_quake_sp500["magg"] = (df_quake_sp500["Mag"] * 10).astype(int)

In [0]:
df_quake_sp500["dates"] = dates

In [7]:
df_quake_sp500.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 28350 entries, 0 to 28349
Data columns (total 19 columns):
Date              28350 non-null object
Mag               28350 non-null float64
Lat               28350 non-null float64
Long              28350 non-null float64
Depth             28350 non-null float64
magType           28350 non-null object
Place             28350 non-null object
Type              28350 non-null object
locationSource    28350 non-null object
magSource         28350 non-null object
Price_Day_0       28350 non-null float64
Price_Day_7       28350 non-null float64
Price_Day_14      28350 non-null float64
Price_Day_30      28350 non-null float64
Appr_Day_7        28350 non-null float64
Appr_Day_14       28350 non-null float64
Appr_Day_30       28350 non-null float64
magg              28350 non-null int64
dates             28350 non-null int64
dtypes: float64(11), int64(2), object(6)
memory usage: 4.3+ MB


##Linear Regression

In [0]:
X = df_quake_sp500[['dates', 'Mag', 'Lat', 'Long', 'Depth']]
y = df_quake_sp500['Appr_Day_30']

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.25, random_state=42)
print("Original shape:", X.shape, "\n")

print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)
print("y_train shape:", y_train.shape)
print("y_test shape:", y_test.shape)

Original shape: (28350, 5) 

X_train shape: (21262, 5)
X_test shape: (7088, 5)
y_train shape: (21262,)
y_test shape: (7088,)


In [0]:
model = LinearRegression()
linear_reg = model.fit(X_train, y_train)
lin_reg_score = linear_reg.score(X_train, y_train)

In [11]:
beta_0 = model.intercept_
beta_i = model.coef_[0]

print("Slope Coefficient: ", beta_i)
print("\nIntercept Value: ", beta_0)

print("\nCoefficients:")
for i in range(X.shape[1]):
  print(X.columns[i], '\t', model.coef_[i])

Slope Coefficient:  -4.6156180692039706e-07

Intercept Value:  9.859000865561473

Coefficients:
dates 	 -4.6156180692039706e-07
Mag 	 0.004671969358659884
Lat 	 -0.00027899365225175256
Long 	 0.000655624242916738
Depth 	 -0.00027141122842527223


In [12]:
y_test_predict = model.predict(X_test)
RMSE = np.sqrt(mean_squared_error(y_test, y_test_predict))
R2= r2_score(y_test, y_test_predict)

print("For S&P 500, Incident Mag >= 5.5 ({} incidents)".format(df_quake_sp500.shape[0]))
print("Linear Regression Model score:", lin_reg_score)
print('\nLinear Regression Model Predictive Accuracy:')
print('RMSE is {}'.format(RMSE))
print('R^2 is {}'.format(R2))

For S&P 500, Incident Mag >= 5.5 (28350 inclidents)
Linear Regression Model score: 0.0008829997123126486

Linear Regression Model Predictive Accuracy:
RMSE is 4.317046839951411
R^2 is 1.4139265634427467e-05


##Logistic Regression

In [0]:
df = df_quake_sp500

In [14]:
#encode object columns
object_columns = list(df.select_dtypes(include=['object']))
df[object_columns] = df[object_columns].apply(LabelEncoder().fit_transform)
print(df.info())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 28350 entries, 0 to 28349
Data columns (total 19 columns):
Date              28350 non-null int64
Mag               28350 non-null float64
Lat               28350 non-null float64
Long              28350 non-null float64
Depth             28350 non-null float64
magType           28350 non-null int64
Place             28350 non-null int64
Type              28350 non-null int64
locationSource    28350 non-null int64
magSource         28350 non-null int64
Price_Day_0       28350 non-null float64
Price_Day_7       28350 non-null float64
Price_Day_14      28350 non-null float64
Price_Day_30      28350 non-null float64
Appr_Day_7        28350 non-null float64
Appr_Day_14       28350 non-null float64
Appr_Day_30       28350 non-null float64
magg              28350 non-null int64
dates             28350 non-null int64
dtypes: float64(11), int64(8)
memory usage: 4.3 MB
None


In [15]:
y = df['Appr_Day_30'].astype(str)
X = df[['dates', 'Mag', 'Lat', 'Long', 'Depth', 'magType', 'Place', 'Type', 'locationSource', 'magSource']]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((22680, 10), (5670, 10), (22680,), (5670,))

In [16]:
X_train.sample()

Unnamed: 0,dates,Mag,Lat,Long,Depth,magType,Place,Type,locationSource,magSource
23619,20091010,5.6,-24.941,-70.684,20.2,8,2982,0,44,10


In [17]:
%%time
log_reg = LogisticRegression(multi_class='ovr',
                             solver='liblinear',
                             max_iter=100)
log_reg_fit = log_reg.fit(X_train, y_train)

CPU times: user 18min 20s, sys: 511 ms, total: 18min 21s
Wall time: 18min 22s


In [18]:
log_reg

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='ovr', n_jobs=None, penalty='l2',
                   random_state=None, solver='liblinear', tol=0.0001, verbose=0,
                   warm_start=False)

In [19]:
print("For S&P 500, Incident Mag >= 6.7 ({} incidents)".format(df_quake_sp500.shape[0]))
print("Logistic Regression Model score:", log_reg_fit.score(X_train, y_train))
predictions = log_reg.predict(X_test)
print("Logistic Regression prediction accuracy:", accuracy_score(y_test, predictions))

For S&P 500, Incident Mag >= 6.7 (28350 incidents)
Logistic Regression Model score: 0.0050264550264550265
Logistic Regression prediction accuracy: 0.0031746031746031746


In [0]:
log_reg.coef_[0]

array([-3.07945866e-07,  1.26642697e-05, -2.68839639e-04,  1.40788096e-03,
       -4.76735624e-04,  2.01844002e-05, -2.02081592e-03, -5.57768535e-08,
        2.83273227e-05,  5.10713245e-05])