In [1]:
import csv
import numpy as np
# Read geyser.csv into two parallel lists: X takes column 1 and Y takes
# column 2 of every data row (per the slr docstring these are eruption
# duration and waiting time, both in minutes).
X = []
Y = []

# `with` guarantees the file is closed even if a row fails to parse;
# newline='' is what the csv module expects for files it reads.
with open("geyser.csv", 'r', newline='') as f:
    for row in csv.reader(f):
        # csv.reader yields [] for blank lines; skip them instead of
        # failing on row[1].
        if not row:
            continue
        # The header row is recognised by its "eruptions" column label.
        if row[1] != "eruptions":
            X.append(float(row[1]))
            Y.append(float(row[2]))



In [2]:
def slr(X,Y):
    """
    Fit the best-fit line Y = beta_0 + beta_1 * X for Simple Linear
    Regression using ordinary least squares.

    beta_1 is the slope:     sum((x_i - x_bar) * (y_i - y_bar)) / sum((x_i - x_bar)**2)
    beta_0 is the intercept: y_bar - beta_1 * x_bar

    # Eruption Duration Values (Input) in mins
    # X = [3.1,3.2,2.6,......]

    # Waiting Time Values (Output) in mins
    # Y = [88,65,75,......]

    Args:
        X: one-dimensional sequence of input values.
        Y: sequence of output values, same length as X.

    Returns:
        Tuple (beta_0, beta_1), i.e. (intercept, slope).

    Raises:
        ValueError: if X is empty, X and Y differ in shape, or all X
            values are identical (the slope is undefined in that case).
    """
    x = np.asarray(X, dtype=float)
    y = np.asarray(Y, dtype=float)

    if x.size == 0:
        raise ValueError("X and Y must contain at least one value")
    if x.shape != y.shape:
        raise ValueError("X and Y must have the same length")

    X_bar = np.mean(x)
    Y_bar = np.mean(y)

    # Deviations from the mean, computed once and reused below.
    x_dev = x - X_bar
    y_dev = y - Y_bar

    numerator = np.sum(x_dev * y_dev)
    denominator = np.sum(x_dev ** 2)
    if denominator == 0:
        raise ValueError("X has zero variance; the slope is undefined")

    beta_1 = numerator/denominator
    beta_0 = Y_bar - beta_1 * X_bar
    return (beta_0,beta_1)
    
    

In [3]:
# Fit the regression on the loaded data; slr returns the tuple (beta_0, beta_1).
betas = slr(X,Y)
# Bare expression so the notebook displays the fitted coefficients.
betas


(33.47439702275336, 10.729641395133525)

In [4]:
def predict(x, betas=None):
    """
    This function predicts wait time (in mins) for the next eruption to happen.

    Args:
        x: last eruption duration (mins).
        betas: optional (beta_0, beta_1) tuple of already fitted
            coefficients (intercept, slope). When omitted, the
            regression is fitted on the notebook-level X and Y, which
            costs a full pass over the data on every call.

    Returns:
        The predicted value beta_0 + beta_1 * x.
    """
    if betas is None:
        # Backward-compatible default: fit on the global data set.
        betas = slr(X,Y)
    (beta_0,beta_1) = betas
    pred_y = beta_0 + beta_1*x
    return pred_y


In [5]:
# Example: predicted waiting time (mins) after an eruption lasting 2 mins.
predict(2)

54.933679813020404

## Model Validation with RSE (Residual Standard Error) & R-Squared


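$$RSE = \sqrt{\frac{RSS}{n-2}}$$

where RSS is the Residual Sum of Squares over the $n$ observations, with $\hat{y}_i$ the value predicted for $x_i$:

$$RSS = \sum_{i=1}^{n}(y_i-\hat{y}_i)^2$$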

## Calculate R-Squared

### $$R^2 = 1 -\frac{RSS}{TSS}$$

where TSS is Total Sum of Squares 

### $$ TSS = \sum_{i=1}^{n}(y_i-\bar{y})^2$$


####   where $y_i$ is the $i$-th observed value in the sample
#####  $\bar{y}$ is the mean of the observed values in the sample

In [6]:
def calculate_evaluation_metrics(betas, x_values=None, y_values=None):
    """
    This function calculates and prints RSE (in mins) and R-Squared for a
    fitted line.

        RSS = sum((beta_0 + beta_1 * x_i - y_i)**2)
        TSS = sum((y_i - y_bar)**2)
        RSE = sqrt(RSS / (n - 2)), rounded to 2 decimals
        R-Squared = 1 - RSS / TSS

    Args:
        betas: (beta_0, beta_1) tuple, i.e. (intercept, slope).
        x_values: optional input values; defaults to the notebook-level X.
        y_values: optional observed outputs; defaults to the notebook-level Y.

    Returns:
        None. The two metrics are printed.

    Raises:
        ValueError: if the inputs differ in length or hold fewer than
            3 observations (RSE divides by n - 2).
    """
    # Fall back to the notebook-level data when none is passed explicitly.
    if x_values is None:
        x_values = X
    if y_values is None:
        y_values = Y

    n = len(x_values)
    if n != len(y_values):
        raise ValueError("x_values and y_values must have the same length")
    if n < 3:
        raise ValueError("at least 3 observations are required to compute RSE")

    # Use the supplied coefficients directly instead of refitting the
    # regression once per sample.
    (beta_0, beta_1) = betas

    # The mean does not change inside the loop, so compute it once.
    Y_bar = np.mean(y_values)

    TSS=0
    RSS=0
    for x_i, actual_Y in zip(x_values, y_values):
        pred_Y = beta_0 + beta_1*x_i
        error = pred_Y - actual_Y
        RSS = RSS+(error**2)
        TSS = TSS+(actual_Y - Y_bar)**2

    # Both metrics only make sense on the completed sums.
    R_squared = 1 - (RSS/TSS)
    RSE = round(np.sqrt(RSS/(n-2)),2)
    print(f' RSE is {RSE} mins')
    print(f' R-Squared is {R_squared}')
  

# Print the RSE and R-Squared of the fitted line.
calculate_evaluation_metrics(betas)

 RSE is 5.91 mins
 R-Squared is 0.8114607609733098
