In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import statsmodels.api as sm
import numpy as np

In [3]:
df = pd.read_csv('../class 02/regression.csv')
df.head()

Unnamed: 0,x,y
0,1,3
1,2,4
2,3,8
3,4,4
4,5,6


In [8]:
# Extract y and X from the dataframe
# .values: converts the selected column into a numpy array
# .reshape(-1,1): reshapes the array to have one column and as many rows as needed
y = df['y'].values.reshape(-1,1)
X = df['x'].values.reshape(-1,1)

In [7]:
# X: mxn, y: mx1, Theta: nx1
def grad_descent(X, y, alpha, epsilon, iter=float("inf"), Theta=None):
  iteration = [0]
  i = 0
  m = np.shape(X)[0] # Total number of samples

  # Initialize the model
  if Theta is None:
    Theta = np.random.randn(X.shape[1], 1) * 0.01

  # Compute the initial cost
  cost = [1/(2*m) * np.transpose(X @ Theta - y) @ (X @ Theta -y)]
  delta = 1

  # Keep going until the change in the model is big enough
  while (delta>epsilon or i<iter):
    gradient = (1/m) * (np.transpose(X) @ (X @ Theta - y))
    Theta = Theta - alpha*gradient
    J_Theta = 1/(2*m) * (np.transpose(X @ Theta - y) @ (X @ Theta -y))
    print(J_Theta)
    cost.append(J_Theta)
    delta = abs(cost[i+1][0,0] - cost[i][0,0])
    if ((cost[i+1][0,0] - cost[i][0,0])> 0):
      print("The cost is increasing. DO SOMETHING.")
      break
    i += 1
    iteration.append(i)

  print("Completed in %d iterations." %(i))
  return(Theta)

In [9]:
# Add a column of ones to X to account for the intercept term
X = np.concatenate([np.ones((X.shape[0],1)), X], axis=1)

Theta = grad_descent(X, y, 0.01, 10**-10)
print(Theta)

[[175.44753782]]
[[57.65407943]]
[[34.52209922]]
[[29.94200789]]
[[28.99791438]]
[[28.76661457]]
[[28.6752205]]
[[28.61145712]]
[[28.55331988]]
[[28.49649539]]
[[28.44013734]]
[[28.3840789]]
[[28.32828643]]
[[28.27275238]]
[[28.21747432]]
[[28.16245082]]
[[28.10768066]]
[[28.05316268]]
[[27.99889571]]
[[27.94487858]]
[[27.89111016]]
[[27.83758929]]
[[27.78431484]]
[[27.73128568]]
[[27.67850066]]
[[27.62595867]]
[[27.57365859]]
[[27.52159931]]
[[27.46977972]]
[[27.41819871]]
[[27.36685518]]
[[27.31574804]]
[[27.26487621]]
[[27.2142386]]
[[27.16383413]]
[[27.11366172]]
[[27.06372032]]
[[27.01400885]]
[[26.96452626]]
[[26.9152715]]
[[26.86624351]]
[[26.81744124]]
[[26.76886368]]
[[26.72050976]]
[[26.67237848]]
[[26.62446879]]
[[26.57677969]]
[[26.52931015]]
[[26.48205917]]
[[26.43502574]]
[[26.38820886]]
[[26.34160752]]
[[26.29522075]]
[[26.24904754]]
[[26.20308692]]
[[26.15733791]]
[[26.11179953]]
[[26.06647082]]
[[26.02135081]]
[[25.97643853]]
[[25.93173303]]
[[25.88723337]]
[[25.842938

In [11]:
df = pd.read_csv('./rock.csv')
df.head()

Unnamed: 0,area,peri,shape,perm
0,4990,2791.9,0.09033,6.3
1,7002,3892.6,0.148622,6.3
2,7558,3930.66,0.183312,6.3
3,7352,3869.32,0.117063,6.3
4,7943,3948.54,0.122417,17.1


In [12]:
# Extract y and X from the dataframe
y = df['perm'].values.reshape(-1,1)
X = df[['area', 'peri', 'shape']].values

In [13]:
# Normalize the features
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X = scaler.fit_transform(X)

# Add a column of ones to X to account for the intercept term
X = np.concatenate([np.ones((X.shape[0],1)), X], axis=1)

In [14]:
Theta = grad_descent(X, y, alpha=0.1, epsilon=10**-10)
print(Theta)

[[146542.64533286]]
[[121948.44350795]]
[[103667.68814001]]
[[89878.83518194]]
[[79332.95055066]]
[[71162.77833222]]
[[64757.84023107]]
[[59682.24557023]]
[[55620.22328516]]
[[52339.7239786]]
[[49667.86169974]]
[[47474.16175345]]
[[45658.99339258]]
[[44145.47645107]]
[[42873.73898696]]
[[41796.78407799]]
[[40877.47184207]]
[[40086.28482988]]
[[39399.6515012]]
[[38798.67304745]]
[[38268.14591292]]
[[37795.80409679]]
[[37371.72692784]]
[[36987.87290061]]
[[36637.71056448]]
[[36315.92482382]]
[[36018.1822988]]
[[35740.94325216]]
[[35481.31043653]]
[[35236.90735035]]
[[35005.7800076]]
[[34786.31756648]]
[[34577.18812112]]
[[34377.28670984]]
[[34185.69318151]]
[[34001.63802795]]
[[33824.47465976]]
[[33653.6568989]]
[[33488.72069753]]
[[33329.26928298]]
[[33174.96108123]]
[[33025.49989487]]
[[32880.6269109]]
[[32740.11419409]]
[[32603.75938691]]
[[32471.38138946]]
[[32342.81683591]]
[[32217.91721805]]
[[32096.54653511]]
[[31978.57937141]]
[[31863.899322]]
[[31752.39770155]]
[[31643.97248361]