In [0]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, mean_squared_error, r2_score

In [2]:
# Load Gold df
df_quake_gold = pd.read_csv("https://raw.githubusercontent.com/labs13-quake-viewer/ds-data/master/" +
                            "Gold%20Price%20Change%20by%20Earthquake.csv", index_col=0)
df_quake_gold.shape

(1445, 9)

In [0]:
dates = []
for i in df_quake_gold.Date:
  dates.append(int(''.join(c for c in i if c.isdigit())))

In [0]:
df_quake_gold["magg"] = (df_quake_gold["Mag"] * 10).astype(int)

In [0]:
df_quake_gold["dates"] = dates

In [6]:
df_quake_gold.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1445 entries, 0 to 1444
Data columns (total 11 columns):
Date            1445 non-null object
Mag             1445 non-null float64
Price_Day_0     1445 non-null float64
Price_Day_7     1445 non-null float64
Price_Day_14    1445 non-null float64
Price_Day_30    1445 non-null float64
Appr_Day_7      1445 non-null float64
Appr_Day_14     1445 non-null float64
Appr_Day_30     1445 non-null float64
magg            1445 non-null int64
dates           1445 non-null int64
dtypes: float64(8), int64(2), object(1)
memory usage: 135.5+ KB


##Linear Regression

In [0]:
X = df_quake_gold[['dates', 'Mag']]
y = df_quake_gold['Appr_Day_30']

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.25, random_state=42)
print("Original shape:", X.shape, "\n")

print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)
print("y_train shape:", y_train.shape)
print("y_test shape:", y_test.shape)

Original shape: (1445, 2) 

X_train shape: (1083, 2)
X_test shape: (362, 2)
y_train shape: (1083,)
y_test shape: (362,)


In [0]:
model = LinearRegression()
linear_reg = model.fit(X_train, y_train)
lin_reg_score = linear_reg.score(X_train, y_train)

In [10]:
beta_0 = model.intercept_
beta_i = model.coef_[0]

print("Slope Coefficient: ", beta_i)
print("\nIntercept Value: ", beta_0)

print("\nCoefficients:")
for i in range(X.shape[1]):
  print(X.columns[i], '\t', model.coef_[i])

Slope Coefficient:  -2.8952803060960044e-06

Intercept Value:  52.96039168469455

Coefficients:
dates 	 -2.8952803060960044e-06
Mag 	 0.7520889479536156


In [0]:
y_test_predict = model.predict(X_test)

In [18]:
RMSE = np.sqrt(mean_squared_error(y_test, y_test_predict))
R2= r2_score(y_test, y_test_predict)

print("For Gold, Incident Mag >= 6.7 ({} incidents)".format(df_quake_gold.shape[0]))
print("Linear Regression Model score:", lin_reg_score)
print('\nLinear Regression Model Predictive Accuracy:')
print('RMSE is {}'.format(RMSE))
print('R^2 is {}'.format(R2))

For Gold, Incident Mag >= 6.7 (1445 incidents)
Linear Regression Model score: 0.008723354639138758

Linear Regression Model Predictive Accuracy:
RMSE is 5.995585089061758
R^2 is -0.012486586964725932


##Logistic Regression

In [13]:
y = df_quake_gold['Appr_Day_30'].astype(str)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((1083, 2), (362, 2), (1083,), (362,))

In [14]:
X_train.sample()

Unnamed: 0,dates,Mag
410,19841101,7.1


In [19]:
log_reg = LogisticRegression(multi_class='ovr',
                             solver='liblinear',
                             max_iter=100)
log_reg_fit = log_reg.fit(X_train, y_train)
print("For Gold, Incident Mag >= 6.7 ({} incidents)".format(df_quake_gold.shape[0]))
print("Logistic Regression Model score:", log_reg_fit.score(X_train, y_train))
predictions = log_reg.predict(X_test)
print("Logistic Regression prediction accuracy:", accuracy_score(y_test, predictions))

For Gold, Incident Mag >= 6.7 (1445 incidents)
Logistic Regression Model score: 0.003693444136657433
Logistic Regression prediction accuracy: 0.0


In [0]:
log_reg.coef_[0]

array([-3.49901711e-07, -1.25910606e-13])