In [2]:
import pandas as pd
import numpy as np
import requests
from io import StringIO
import seaborn as sns
from sklearn.model_selection import train_test_split

In [3]:
def async_read_csv(url):
    response = requests.get(url, stream=True)
    response.raise_for_status() # will raise error if request is not successful
    return pd.read_csv(StringIO(response.text))
    
# Read the streamed content
df = async_read_csv("https://raw.githubusercontent.com/uy-seng/cs4375/main/assignment-1/scripts/convert_to_csv/abalone.csv")
df.head(), df.shape

(  sex  length  diameter  height  whole_weight  shucked_weight  viscera_weight  \
 0   M   0.455     0.365   0.095        0.5140          0.2245          0.1010   
 1   M   0.350     0.265   0.090        0.2255          0.0995          0.0485   
 2   F   0.530     0.420   0.135        0.6770          0.2565          0.1415   
 3   M   0.440     0.365   0.125        0.5160          0.2155          0.1140   
 4   I   0.330     0.255   0.080        0.2050          0.0895          0.0395   
 
    shell_weight  rings  
 0         0.150     15  
 1         0.070      7  
 2         0.210      9  
 3         0.155     10  
 4         0.055      7  ,
 (4177, 9))

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4177 entries, 0 to 4176
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   sex             4177 non-null   object 
 1   length          4177 non-null   float64
 2   diameter        4177 non-null   float64
 3   height          4177 non-null   float64
 4   whole_weight    4177 non-null   float64
 5   shucked_weight  4177 non-null   float64
 6   viscera_weight  4177 non-null   float64
 7   shell_weight    4177 non-null   float64
 8   rings           4177 non-null   int64  
dtypes: float64(7), int64(1), object(1)
memory usage: 293.8+ KB


In [5]:
df.describe()

Unnamed: 0,length,diameter,height,whole_weight,shucked_weight,viscera_weight,shell_weight,rings
count,4177.0,4177.0,4177.0,4177.0,4177.0,4177.0,4177.0,4177.0
mean,0.523992,0.407881,0.139516,0.828742,0.359367,0.180594,0.238831,9.933684
std,0.120093,0.09924,0.041827,0.490389,0.221963,0.109614,0.139203,3.224169
min,0.075,0.055,0.0,0.002,0.001,0.0005,0.0015,1.0
25%,0.45,0.35,0.115,0.4415,0.186,0.0935,0.13,8.0
50%,0.545,0.425,0.14,0.7995,0.336,0.171,0.234,9.0
75%,0.615,0.48,0.165,1.153,0.502,0.253,0.329,11.0
max,0.815,0.65,1.13,2.8255,1.488,0.76,1.005,29.0


In [6]:
# check for null or na values
df.isnull().sum()

sex               0
length            0
diameter          0
height            0
whole_weight      0
shucked_weight    0
viscera_weight    0
shell_weight      0
rings             0
dtype: int64

In [7]:
# check for redundant rows
df.duplicated().sum()

0

In [8]:
df.sex.unique()

array(['M', 'F', 'I'], dtype=object)

In [9]:
# convert categorical variables to numerical variables
df['sex'] = df['sex'].map({'M': 1, 'F': 2, 'I': 3})

In [10]:
# remove attribute that is not correlated to the outcome
correlation_matrix = df.corr()
correlation_matrix['rings'].sort_values(ascending=False)

rings             1.000000
shell_weight      0.627574
diameter          0.574660
height            0.557467
length            0.556720
whole_weight      0.540390
viscera_weight    0.503819
shucked_weight    0.420884
sex              -0.351822
Name: rings, dtype: float64

In [11]:
# remove sex since ring does not correlate with sex
if "sex" in df: del df["sex"]

In [12]:
x = df.drop("rings", axis=1).values
y = df["rings"].values

In [13]:
x.shape

(4177, 7)

In [14]:
x.shape, y.shape

((4177, 7), (4177,))

In [15]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

In [16]:
x_train.shape, x_test.shape, y_train.shape, y_test.shape 

((3341, 7), (836, 7), (3341,), (836,))

In [17]:
def gradient_of_cost_func(x_values, y_values, thetas):
    x_values_with_bias = np.c_[np.ones(len(x_values)), x_values]  # Add bias term
    predictions = np.dot(x_values_with_bias, thetas) 
    errors = predictions - y_values
    return np.dot(x_values_with_bias.T, errors) / len(y_values)

def cost_func(x_values, y_values, thetas):
    x_values_with_bias = np.c_[np.ones(len(x_values)), x_values]  # Add bias term
    predictions = np.dot(x_values_with_bias, thetas)
    return (1 / 2) * np.mean((predictions - y_values) ** 2)

def gradient_descent(x_values, y_values, learning_rate=0.01, threshold=1e-5, max_iterations=100000):
    thetas = np.random.rand(x_values.shape[1] + 1)  # +1 for bias term
    costs = []
    for _ in range(max_iterations):
        delta = -learning_rate * gradient_of_cost_func(x_values, y_values, thetas)
        costs.append(cost_func(x_values, y_values, thetas))
        if np.all(np.abs(delta) <= threshold):
            break
        thetas += delta
    return thetas, costs

In [18]:
thetas, costs = gradient_descent(x, y)

def calculate_r_squared(y_actual, y_predict):
    # Residual Sum of Squares (RSS)
    rss = np.sum((y_actual - y_predict) ** 2)
    # Total Sum of Squares (TSS)
    tss = np.sum((y_actual - np.mean(y_actual)) ** 2)
    # R^2 calculation
    r_squared = 1 - (rss / tss)
    return r_squared

y_predict = np.dot(np.c_[np.ones(len(x_test)), x_test], thetas)

calculate_r_squared(y_test, y_predict)

0.5137434961523606