# Laboratory 6

Error Based Learning (Linear Regression)

In [125]:
Name = "CONCHA, Athea Grace"
Section = "3DSA"

### Question 1

Use the wine quality dataset for this exercise. Import the data, and identify the X and Y columns.
https://archive.ics.uci.edu/dataset/186/wine+quality

In [126]:
# Import Necessary Libraries
import pandas as pd
import numpy as np

In [127]:
### INSERT CODE FOR IMPORT
red_wine = pd.read_csv("winequality-red.csv", sep=';')
white_wine = pd.read_csv("winequality-white.csv", sep=';')

red_wine["wine_color"] = "red"
white_wine["wine_color"] = "white"

wine_df = pd.concat([red_wine, white_wine], ignore_index=True)
wine_df.head()


Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,wine_color
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5,red
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5,red
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5,red
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6,red
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5,red


### Question 2

Make 80 training and test split. Do not use sklearn functions.`

In [128]:
wine_df["wine_color"] = wine_df["wine_color"].map({"red": 0, "white": 1})
wine_data_shuffled = wine_df.sample(frac=1, random_state=42).reset_index(drop=True)

# Split into training and test sets (80/20)
split_index = int(0.8 * len(wine_data_shuffled))
train_data = wine_data_shuffled[:split_index]
test_data = wine_data_shuffled[split_index:]

print("Training set size:", len(train_data))
print("Test set size:", len(test_data))
train_data.head()

Training set size: 5197
Test set size: 1300


Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,wine_color
0,7.0,0.17,0.74,12.8,0.045,24.0,126.0,0.9942,3.26,0.38,12.2,8,1
1,7.7,0.64,0.21,2.2,0.077,32.0,133.0,0.9956,3.27,0.45,9.9,5,0
2,6.8,0.39,0.34,7.4,0.02,38.0,133.0,0.99212,3.18,0.44,12.0,7,1
3,6.3,0.28,0.47,11.2,0.04,61.0,183.0,0.99592,3.12,0.51,9.5,6,1
4,7.4,0.35,0.2,13.9,0.054,63.0,229.0,0.99888,3.11,0.5,8.9,6,1


### Question 3

Make a function `calculate_SSE` that will calculate the SSE based on the predicted vs actual values of Y.

In [129]:
def calculate_SSE(predicted_values, actual_values):
    return np.sum((np.array(actual_values) - np.array(predicted_values)) ** 2)

In [130]:
## ASSERT DO NOT DELETE
predicted = [2, 3, 4]
actual = [1, 5, 2]
expected_sse = 9
assert calculate_SSE(predicted, actual) == expected_sse

### Question 4

Make a function `initialize_weights` that will start the linear regression by having all weights equal to zero. The number of weights will depend on the number of X columns the data has plus one.

In [131]:
def initialize_weights(X_columns):
    return np.zeros(len(X_columns))

### Question 5

Make a function `linear_regression` that will do the following
1. Call the `initialize_weights` to call the initial weights of the equation
2. For each iteration
 - For each data point, calculate predicted value based on the weights
 - Calculate the SSE (add a stopping condition if SSE changed by less than 1% of the previous SSE)
 - For each data point, calculate the Error between predicted and actual value
 - Calculate change in error (delta error) for each weight
 - Update the weights using Gradient Descent Formula

In [132]:
def linear_regression(dataset, learning_rate=0.01, max_iterations=1000):

    X = dataset.iloc[:, :-1]
    y = dataset.iloc[:, -1]
    
    means = X.mean()
    stds = X.std()
    X = (X - means) / stds
    
    linear_regression.means = means
    linear_regression.stds = stds
    
    # Insert bias column (this gives X an extra column)
    X.insert(0, 'bias', 1)
    
    weights = initialize_weights(X.columns)
    prev_sse = float('inf')
    sse_list = []
    
    for iteration in range(max_iterations):
        sse = 0
        gradients = [0] * len(weights)
        
        # Loop through each sample
        for i in range(len(X)):
            x_i = list(X.iloc[i]) 
            y_i = y.iloc[i]
            prediction = sum(w * x for w, x in zip(weights, x_i))
            error = prediction - y_i
            sse += error ** 2
            
            for j in range(len(weights)):
                gradients[j] += error * x_i[j]
        
        sse /= len(X)
        sse_list.append(sse)
        
        if abs(prev_sse - sse) < 0.01 * prev_sse:
            print(f"Stopped at iteration {iteration}, SSE: {sse:.4f}")
            break
        
        prev_sse = sse
        
        for j in range(len(weights)):
            weights[j] -= learning_rate * (2 / len(X)) * gradients[j]
    
    linear_regression.sse_list = sse_list
    return weights

In [133]:
weights = linear_regression(train_data, learning_rate=0.01, max_iterations=1000)

### Test Set Evaluation
X_test = test_data.iloc[:, :-1]
Y_test = test_data.iloc[:, -1]

X_test_std = (X_test - linear_regression.means) / linear_regression.stds
X_test_std.insert(0, 'bias', 1)
X_test_bias = [list(row) for _, row in X_test_std.iterrows()]
predictions = [sum(w * x for w, x in zip(weights, row)) for row in X_test_bias]

print("\nFinal Weights:")
for i, w in enumerate(weights):
    print(f"w{i}: {w:.4f}")

test_total_sse = calculate_SSE(predictions, Y_test)
print(f"\nFinal SSE on Test Set: {test_total_sse:.4f}")

Stopped at iteration 101, SSE: 0.0437

Final Weights:
w0: 0.6497
w1: -0.0776
w2: -0.1072
w3: 0.0282
w4: 0.0541
w5: -0.0572
w6: 0.0184
w7: 0.1282
w8: -0.0966
w9: -0.0603
w10: -0.0610
w11: -0.0067
w12: -0.0071

Final SSE on Test Set: 58.3747


### Question 6-10

Apply steps from Question 1 to 5, but using a Logistic Regression on the Titanic Dataset.
https://www.kaggle.com/c/titanic/data

Use the survival column as your target variable.

Note: For this exercise, please clean the data first, and use only the following columns
- pclass (required OHE)
- sex (required OHE)
- age
- fare
- sibsp
- parch



In [134]:
gender_submission = pd.read_csv("gender_submission.csv")
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

# Merge test with gender_submission on PassengerId
merged_df = pd.merge(test, gender_submission, on="PassengerId")
merged_df.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Survived
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q,0
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S,1
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q,0
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S,0
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S,1


In [135]:
titanic = pd.concat([train, merged_df], ignore_index=True)

print(f"Combined dataset shape: {titanic.shape}")
titanic.head()

Combined dataset shape: (1309, 12)


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


#### **DATA CLEANING**

In [136]:
# missing values
missing_values = titanic.isnull().sum()

# Filter only columns with missing values
missing_values = missing_values[missing_values > 0]
print(missing_values)

Age          263
Fare           1
Cabin       1014
Embarked       2
dtype: int64


In [137]:
titanic2 = titanic.copy()

titanic2['Age'] = titanic2['Age'].fillna(titanic2['Age'].mean())
titanic2['Fare'] = titanic2['Fare'].fillna(titanic2['Fare'].mean())
titanic2 = titanic2.drop(columns=['Cabin'])
titanic2['Embarked'] = titanic2['Embarked'].fillna(titanic2['Embarked'].mode()[0])

print("\nAfter Cleaning, Missing Values:\n", titanic2.isnull().sum())


After Cleaning, Missing Values:
 PassengerId    0
Survived       0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Embarked       0
dtype: int64


In [138]:
print("\nCleaned Data:")
titanic2.head()


Cleaned Data:


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,S


In [139]:
### one-hot encoding

selected_cols = ['Survived', 'Pclass', 'Sex', 'Age', 'Fare', 'SibSp', 'Parch']
cleaned_data = titanic2[selected_cols]

# One-Hot Encoding for 'Pclass' and 'Sex'
cleaned_data = pd.get_dummies(cleaned_data, columns=['Pclass', 'Sex'], drop_first=True, dtype=int)

In [140]:
print("Cleaned & One-Hot Encoded Titanic Dataset:")
cleaned_data.head(10)

Cleaned & One-Hot Encoded Titanic Dataset:


Unnamed: 0,Survived,Age,Fare,SibSp,Parch,Pclass_2,Pclass_3,Sex_male
0,0,22.0,7.25,1,0,0,1,1
1,1,38.0,71.2833,1,0,0,0,0
2,1,26.0,7.925,0,0,0,1,0
3,1,35.0,53.1,1,0,0,0,0
4,0,35.0,8.05,0,0,0,1,1
5,0,29.881138,8.4583,0,0,0,1,1
6,0,54.0,51.8625,0,0,0,0,1
7,0,2.0,21.075,3,1,0,1,1
8,1,27.0,11.1333,0,2,0,1,0
9,1,14.0,30.0708,1,0,1,0,0


Make 80 training and test split. Do not use sklearn functions.

In [141]:
# Shuffle
cleaned_data = cleaned_data.sample(frac=1, random_state=42).reset_index(drop=True)

# Split
split_index = int(0.8 * len(cleaned_data))
train_data = cleaned_data[:split_index]
test_data = cleaned_data[split_index:]

print("Training set size:", len(train_data))
print("Test set size:", len(test_data))


Training set size: 1047
Test set size: 262


In [142]:
def sigmoid(z):
    return 1 / (1 + np.exp(-z))

In [143]:
def calculate_log_loss(y_true, y_pred, epsilon=1e-8):
    y_true = np.array(y_true)
    y_pred = np.clip(np.array(y_pred), epsilon, 1 - epsilon)
    return -np.mean(y_true * np.log(y_pred) + (1 - y_true) * np.log(1 - y_pred))

Make a function `calculate_SSE` that will calculate the SSE based on the predicted vs actual values of Y.

In [144]:
def calculate_SSE(predicted_values, actual_values):
    return np.sum((np.array(actual_values) - np.array(predicted_values)) ** 2)

Make a function `initialize_weights` that will start the linear regression by having all weights equal to zero. The number of weights will depend on the number of X columns the data has plus one.

In [145]:
def initialize_weights(X_columns):
    return np.zeros(len(X_columns))

Using a Logistic Regression on the Titanic Dataset.
https://www.kaggle.com/c/titanic/data

Use the survival column as your target variable.

Note: For this exercise, please clean the data first, and use only the following columns
- pclass (required OHE)
- sex (required OHE)
- age
- fare
- sibsp
- parch

In [146]:
def logistic_regression(dataset, learning_rate=0.01, max_iterations=1000):

    y = dataset['Survived']
    X = dataset.drop(columns=['Survived'])
    
    means = X.mean()
    stds = X.std()
    X = (X - means) / stds
    
    logistic_regression.means = means
    logistic_regression.stds = stds
    
    # Insert bias column.
    X.insert(0, 'bias', 1)
    
    # Initialize weights: number of weights equals number of columns in X.
    weights = initialize_weights(X.columns)
    
    prev_loss = float('inf')
    loss_list = []
    
    # Training loop
    for iteration in range(max_iterations):
        loss = 0
        gradients = np.zeros_like(weights)
        
        # Loop through each training sample.
        for i in range(len(X)):
            x_i = list(X.iloc[i]) 
            y_i = y.iloc[i]
            z = np.dot(weights, x_i)
            prediction = sigmoid(z)
            # Cross-entropy loss for the sample:
            loss += - (y_i * np.log(prediction + 1e-8) + (1 - y_i) * np.log(1 - prediction + 1e-8))
            # Accumulate gradient: (prediction - y_i) * x_i
            gradients += (prediction - y_i) * np.array(x_i)
        
        loss /= len(X)
        loss_list.append(loss)
        
        # Convergence check: stop if relative change in loss is less than 0.1%
        if abs(prev_loss - loss) < 0.001 * prev_loss:
            print(f"Stopped at iteration {iteration}, Loss: {loss:.4f}")
            break
        prev_loss = loss
        
        # Update weights using the averaged gradient.
        weights -= learning_rate * (gradients / len(X))
    
    logistic_regression.loss_list = loss_list
    return weights

In [147]:
weights = logistic_regression(train_data, learning_rate=0.01, max_iterations=1000)

print("\nFinal Weights:")
for i, w in enumerate(weights):
    print(f"w{i}: {w:.4f}")

# Test Set Evaluation:

X_test = test_data.drop(columns=['Survived'])
Y_test = test_data['Survived']

X_test_std = (X_test - logistic_regression.means) / logistic_regression.stds
X_test_std.insert(0, 'bias', 1)

# Convert test DataFrame to numpy array for efficiency.
X_test_array = X_test_std.to_numpy()
# Compute predicted probabilities on the test set.
predictions_prob = sigmoid(np.dot(X_test_array, weights))
# For evaluation, calculate the log loss (cross-entropy) and accuracy.
test_log_loss = calculate_log_loss(Y_test, predictions_prob)
predictions_class = (predictions_prob >= 0.5).astype(int)
accuracy = np.mean(predictions_class == Y_test.values)

print(f"\nFinal Log Loss on Test Set: {test_log_loss:.4f}")
print(f"Test Accuracy: {accuracy * 100:.2f}%")

Stopped at iteration 198, Loss: 0.5114

Final Weights:
w0: -0.1882
w1: -0.0535
w2: 0.1423
w3: -0.0225
w4: 0.0471
w5: 0.0208
w6: -0.1602
w7: -0.5126

Final Log Loss on Test Set: 0.5115
Test Accuracy: 85.11%
