In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt # plotting

In [None]:
# Load the CalCOFI bottle dataset and keep a small two-column working subset.
# Dataset link -> https://www.kaggle.com/sohier/calcofi
complete_bottle_data = pd.read_csv("sample_data/bottle.csv")
# .loc slicing is label-based and inclusive at both ends, so this selects index
# labels 1 through 1000; label 0 is not part of the subset.
# NOTE(review): confirm that skipping the first row is intentional.
partial_bottle_data = complete_bottle_data.loc[1:1000, ['T_degC','Salnty']]
# Preview the first rows of the subset.
partial_bottle_data.head()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the two columns.
partial_bottle_data.describe()

In [None]:
def z_score(df):
    """Return a z-score standardized copy of ``df``.

    Every column is transformed to ``(x - mean) / std`` using that column's
    own mean and standard deviation (pandas defaults, i.e. the sample
    standard deviation). The input frame is left untouched.

    A column whose standard deviation is exactly zero (a constant column,
    such as an all-ones bias term) would otherwise evaluate 0 / 0 and become
    entirely NaN. Such a column is only centred and therefore comes out as
    all zeros.

    Args:
        df: DataFrame with numeric columns.

    Returns:
        A new DataFrame with the same index and columns, standardized
        column by column.
    """
    df_std = df.copy()
    for column in df_std.columns:
        values = df_std[column]
        spread = values.std()
        # Guard against division by zero for constant columns.
        if spread == 0:
            spread = 1
        df_std[column] = (values - values.mean()) / spread

    return df_std

In [None]:
# Standardize both columns; the name is rebound to the copy returned by z_score.
partial_bottle_data = z_score(partial_bottle_data)

In [None]:
# Discard every row that has a missing value in any column
partial_bottle_data = partial_bottle_data.dropna()

In [None]:
# Give the two columns more readable names
partial_bottle_data = partial_bottle_data.rename(
    columns={"T_degC": "TEMP", "Salnty": "SALINITY"}
)

In [None]:
# Visual check of the data: TEMP plotted against SALINITY
fig, ax = plt.subplots()
ax.scatter(partial_bottle_data['SALINITY'], partial_bottle_data['TEMP'], color='blue')
ax.set_xlabel("SALINITY")
ax.set_ylabel("TEMP")
plt.show()

In [None]:
# Quadratic feature so the linear model can fit a degree-2 curve in SALINITY
partial_bottle_data['SALINITY_2'] = partial_bottle_data['SALINITY']**2

# Constant X0 column (all ones) placed first; it multiplies the intercept term.
# DataFrame.insert raises ValueError when the column already exists, so the
# guard keeps this cell safe to execute more than once.
if 'X0' not in partial_bottle_data.columns:
    partial_bottle_data.insert(loc=0, column='X0', value=1)

In [None]:
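# Disabled: a second standardization pass after feature construction.
# NOTE(review): presumably left off because it would also transform the
# constant X0 intercept column; confirm before re-enabling.
# partial_bottle_data = z_score(partial_bottle_data)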

In [None]:
# Random train/test split: each row lands in the training set with
# probability 0.8, selected through a boolean mask.
# A fixed-seed generator makes the split, and every metric derived from it,
# reproducible across kernel restarts.
msk = np.random.default_rng(42).random(len(partial_bottle_data)) < 0.8
train_raw = partial_bottle_data[msk]
test_raw = partial_bottle_data[~msk]

In [None]:
# Separate the design matrix (every column except TEMP) from the target column.
train_X = train_raw.drop(columns='TEMP').to_numpy()
train_Y = train_raw[['TEMP']].to_numpy()

test_X = test_raw.drop(columns='TEMP').to_numpy()
test_Y = test_raw[['TEMP']].to_numpy()

# Sanity check: X is (rows, features) and Y is (rows, 1).
print(train_X.shape, train_Y.shape)
print(test_X.shape, test_Y.shape)

In [None]:
# One parameter per feature column, all starting at 1.0, as an (n, 1) column vector
m, n = train_X.shape
theta = np.ones((n, 1))

In [None]:
# Display the current parameter vector.
theta

In [None]:
def gradient_descent(X, Y, theta, alpha=0.0005, epoch=10000):
    """Fit linear-regression parameters with batch gradient descent.

    Args:
        X: 2-D design matrix; its first dimension is the sample count ``m``.
        Y: Targets, shaped so that ``X @ theta - Y`` is well defined.
        theta: Starting parameters. The caller's array is not modified; every
            step builds a new array.
        alpha: Learning rate.
        epoch: Number of update steps to perform.

    Returns:
        ``(theta, costs)``: the parameters after the last step, and a list
        holding one cost ``(1 / (2m)) * r.T @ r`` (with ``r = X @ theta - Y``)
        per step, each measured before that step's update.
    """
    sample_count, _ = X.shape

    # alpha * (1/m) * X^T does not change between steps, so build it once.
    scaled_X_t = alpha * (1 / sample_count) * np.transpose(X)

    cost_history = []
    for _step in range(epoch):
        residual = X @ theta - Y
        cost_history.append(
            (1 / (2 * sample_count)) * (np.transpose(residual) @ residual)
        )
        theta = theta - scaled_X_t @ residual

    return theta, cost_history

In [None]:
# Learning rate and number of gradient-descent iterations for this run.
alpha = 0.0005
epoch = 10000
# NOTE: `theta` is both the starting point and the destination of the result,
# so re-running this cell resumes from the previously fitted parameters
# instead of from the initial all-ones vector.
theta, cost_list = gradient_descent(train_X, train_Y, theta, alpha, epoch)

In [None]:
# Show the parameters returned by gradient descent.
print(theta)

In [None]:
# Training curve: the cost recorded at every gradient-descent iteration.
# np.ravel flattens the per-iteration cost entries into a 1-D vector, and the
# x-axis is derived from the history length rather than the global `epoch`,
# which would raise a size-mismatch error if the two ever got out of sync.
cost_values = np.ravel(cost_list)
plt.scatter(x=range(len(cost_values)), y=cost_values)
plt.xlabel("Iteration")
plt.ylabel("Cost J(theta)")
plt.show()

In [None]:
def mean_absolute_percentage_error(Y, Y_hat):
    """Return the mean absolute percentage error (MAPE) between targets and predictions.

    Args:
        Y: Array-like of true values.
        Y_hat: Array-like of predicted values, broadcastable against ``Y``.

    Returns:
        The MAPE as a percentage: ``mean(|Y - Y_hat| / |Y|) * 100``.

    Note:
        Entries of ``Y`` equal to zero make the ratio infinite or NaN; the
        metric is undefined there and no special handling is applied.
    """
    Y = np.asarray(Y, dtype=float)
    Y_hat = np.asarray(Y_hat, dtype=float)
    # The absolute value has to cover the denominator as well: with negative
    # targets, |Y - Y_hat| / Y yields negative terms that cancel positive ones
    # and understate (or even negate) the error.
    return np.mean(np.abs((Y - Y_hat) / Y)) * 100

In [None]:
def testMultivariateLR(X, Y, theta):
    """Score the linear model ``X @ theta`` against the targets ``Y``.

    Returns ``100 - MAPE``, where the MAPE of the predictions is computed by
    ``mean_absolute_percentage_error``.
    """
    error_pct = mean_absolute_percentage_error(Y, X @ theta)
    return 100 - error_pct

In [None]:
# Evaluate the fitted parameters on the held-out rows and print the score
# returned by testMultivariateLR (100 minus the MAPE of the predictions).
print(testMultivariateLR(test_X, test_Y, theta))

In [None]:
# Overlay the fitted curve (red) on all data points (blue).
plt.scatter(partial_bottle_data['SALINITY'], partial_bottle_data['TEMP'], color='blue')
# plt.plot joins points in the order given. The test rows are not ordered by
# salinity, so sort them on the SALINITY column (index 1, right after X0) to
# draw one clean curve instead of a zig-zag.
order = np.argsort(test_X[:, 1])
plt.plot(test_X[order, 1], (test_X @ theta).ravel()[order], color='r')
plt.xlabel("SALINITY")
plt.ylabel("TEMP")
plt.show()