#### 【problem1】Hypothetical function


In [1]:
import numpy as np

class ScratchLinearRegression():
    def __init__(self, num_iter, lr, no_bias=False, verbose=False):
        # Initialize the number of iterations, learning rate, no_bias flag, and verbose flag
        self.num_iter = num_iter
        self.lr = lr
        self.no_bias = no_bias
        self.verbose = verbose
        
        # Prepare arrays to record the loss at each iteration
        self.loss = np.zeros(self.num_iter)
        self.val_loss = np.zeros(self.num_iter)
        
        # Placeholder for the coefficients (parameters), will be initialized during fit
        self.coef_ = None

    def _linear_hypothesis(self, X):
        """
        Compute a linear hypothetical function

        Parameters
        ----------
        X : ndarray, shape (n_samples, n_features)
          Training data

        Returns
        -------
        y_pred : ndarray, shape (n_samples,)
          Estimated result by linear hypothetical function
        """
        # Calculate the predicted values by taking the dot product of X and the coefficients
        return np.dot(X, self.coef_)

    def fit(self, X, y, X_val=None, y_val=None):
        # Add a bias term (column of ones) to the features if no_bias is False
        if not self.no_bias:
            X = np.hstack([np.ones((X.shape[0], 1)), X])
            if X_val is not None:
                X_val = np.hstack([np.ones((X_val.shape[0], 1)), X_val])
        
        # Initialize the coefficients to zeros (including bias term if applicable)
        self.coef_ = np.zeros(X.shape[1])
        
        for i in range(self.num_iter):
            # Compute the predicted values for the current iteration
            y_pred = self._linear_hypothesis(X)
            
            # Calculate the error as the difference between predicted and actual values
            error = y_pred - y
            
            # Update the coefficients using the gradient descent update rule
            self.coef_ -= self.lr * (2 / X.shape[0]) * np.dot(X.T, error)
            
            # Record the mean squared error for the training data
            self.loss[i] = np.mean(error ** 2)
            
            if X_val is not None and y_val is not None:
                # Compute the predicted values for the validation data
                y_val_pred = self._linear_hypothesis(X_val)
                
                # Calculate the validation error
                val_error = y_val_pred - y_val
                
                # Record the mean squared error for the validation data
                self.val_loss[i] = np.mean(val_error ** 2)
            
            # If verbose is True, print the training and validation loss for the current iteration
            if self.verbose:
                print(f"Iteration {i+1}, Training loss: {self.loss[i]}")
                if X_val is not None and y_val is not None:
                    print(f"Validation loss: {self.val_loss[i]}")

    def predict(self, X):
        # Add a bias term (column of ones) to the features if no_bias is False
        if not self.no_bias:
            X = np.hstack([np.ones((X.shape[0], 1)), X])
        
        # Return the predicted values using the hypothetical function
        return self._linear_hypothesis(X)


#### 【problem2】Steepest descent


In [2]:
import numpy as np

class ScratchLinearRegression():
    def __init__(self, num_iter, lr, no_bias=False, verbose=False):
        # Initialize the number of iterations, learning rate, no_bias flag, and verbose flag
        self.num_iter = num_iter
        self.lr = lr
        self.no_bias = no_bias
        self.verbose = verbose
        
        # Prepare arrays to record the loss at each iteration
        self.loss = np.zeros(self.num_iter)
        self.val_loss = np.zeros(self.num_iter)
        
        # Placeholder for the coefficients (parameters), will be initialized during fit
        self.coef_ = None

    def _linear_hypothesis(self, X):
        """
        Compute a linear hypothetical function

        Parameters
        ----------
        X : ndarray, shape (n_samples, n_features)
          Training data

        Returns
        -------
        y_pred : ndarray, shape (n_samples,)
          Estimated result by linear hypothetical function
        """
        # Calculate the predicted values by taking the dot product of X and the coefficients
        return np.dot(X, self.coef_)

    def _gradient_descent(self, X, error):
        """
        Perform one step of gradient descent to update the coefficients

        Parameters
        ----------
        X : ndarray, shape (n_samples, n_features)
          Training data
        error : ndarray, shape (n_samples,)
          Difference between predicted and actual values

        """
        # Update the coefficients based on the gradient of the loss function
        self.coef_ -= self.lr * (2 / X.shape[0]) * np.dot(X.T, error)

    def fit(self, X, y, X_val=None, y_val=None):
        # Add a bias term (column of ones) to the features if no_bias is False
        if not self.no_bias:
            X = np.hstack([np.ones((X.shape[0], 1)), X])
            if X_val is not None:
                X_val = np.hstack([np.ones((X_val.shape[0], 1)), X_val])
        
        # Initialize the coefficients to zeros (including bias term if applicable)
        self.coef_ = np.zeros(X.shape[1])
        
        for i in range(self.num_iter):
            # Compute the predicted values for the current iteration
            y_pred = self._linear_hypothesis(X)
            
            # Calculate the error as the difference between predicted and actual values
            error = y_pred - y
            
            # Update the coefficients using the gradient descent method
            self._gradient_descent(X, error)
            
            # Record the mean squared error for the training data
            self.loss[i] = np.mean(error ** 2)
            
            if X_val is not None and y_val is not None:
                # Compute the predicted values for the validation data
                y_val_pred = self._linear_hypothesis(X_val)
                
                # Calculate the validation error
                val_error = y_val_pred - y_val
                
                # Record the mean squared error for the validation data
                self.val_loss[i] = np.mean(val_error ** 2)
            
            # If verbose is True, print the training and validation loss for the current iteration
            if self.verbose:
                print(f"Iteration {i+1}, Training loss: {self.loss[i]}")
                if X_val is not None and y_val is not None:
                    print(f"Validation loss: {self.val_loss[i]}")

    def predict(self, X):
        # Add a bias term (column of ones) to the features if no_bias is False
        if not self.no_bias:
            X = np.hstack([np.ones((X.shape[0], 1)), X])
        
        # Return the predicted values using the hypothetical function
        return self._linear_hypothesis(X)


#### 【problem 3】Estimated

In [3]:
import numpy as np

class ScratchLinearRegression():
    def __init__(self, num_iter, lr, no_bias=False, verbose=False):
        # Initialize the number of iterations, learning rate, no_bias flag, and verbose flag
        self.num_iter = num_iter
        self.lr = lr
        self.no_bias = no_bias
        self.verbose = verbose
        
        # Prepare arrays to record the loss at each iteration
        self.loss = np.zeros(self.num_iter)
        self.val_loss = np.zeros(self.num_iter)
        
        # Placeholder for the coefficients (parameters), will be initialized during fit
        self.coef_ = None

    def _linear_hypothesis(self, X):
        """
        Compute a linear hypothetical function

        Parameters
        ----------
        X : ndarray, shape (n_samples, n_features)
          Training data

        Returns
        -------
        y_pred : ndarray, shape (n_samples,)
          Estimated result by linear hypothetical function
        """
        # Calculate the predicted values by taking the dot product of X and the coefficients
        return np.dot(X, self.coef_)

    def _gradient_descent(self, X, error):
        """
        Perform one step of gradient descent to update the coefficients

        Parameters
        ----------
        X : ndarray, shape (n_samples, n_features)
          Training data
        error : ndarray, shape (n_samples,)
          Difference between predicted and actual values

        """
        # Update the coefficients based on the gradient of the loss function
        self.coef_ -= self.lr * (2 / X.shape[0]) * np.dot(X.T, error)

    def fit(self, X, y, X_val=None, y_val=None):
        # Add a bias term (column of ones) to the features if no_bias is False
        if not self.no_bias:
            X = np.hstack([np.ones((X.shape[0], 1)), X])
            if X_val is not None:
                X_val = np.hstack([np.ones((X_val.shape[0], 1)), X_val])
        
        # Initialize the coefficients to zeros (including bias term if applicable)
        self.coef_ = np.zeros(X.shape[1])
        
        for i in range(self.num_iter):
            # Compute the predicted values for the current iteration
            y_pred = self._linear_hypothesis(X)
            
            # Calculate the error as the difference between predicted and actual values
            error = y_pred - y
            
            # Update the coefficients using the gradient descent method
            self._gradient_descent(X, error)
            
            # Record the mean squared error for the training data
            self.loss[i] = np.mean(error ** 2)
            
            if X_val is not None and y_val is not None:
                # Compute the predicted values for the validation data
                y_val_pred = self._linear_hypothesis(X_val)
                
                # Calculate the validation error
                val_error = y_val_pred - y_val
                
                # Record the mean squared error for the validation data
                self.val_loss[i] = np.mean(val_error ** 2)
            
            # If verbose is True, print the training and validation loss for the current iteration
            if self.verbose:
                print(f"Iteration {i+1}, Training loss: {self.loss[i]}")
                if X_val is not None and y_val is not None:
                    print(f"Validation loss: {self.val_loss[i]}")

    def predict(self, X):
        # Add a bias term (column of ones) to the features if no_bias is False
        if not self.no_bias:
            X = np.hstack([np.ones((X.shape[0], 1)), X])
        
        # Return the predicted values using the hypothetical function
        return self._linear_hypothesis(X)


#### 【problem 4】Mean squared error

In [4]:
import numpy as np

def MSE(y_pred, y):
    """
    Calculation of mean square error

    Parameters
    ----------
    y_pred : ndarray, shape (n_samples,)
      Estimated value
    y : ndarray, shape (n_samples,)
      Correct answer value

    Returns
    ----------
    mse : numpy.float
      Mean squared error
    """
    # Calculate the squared differences between predicted and actual values
    squared_errors = (y_pred - y) ** 2
    
    # Calculate the mean squared error
    mse = np.mean(squared_errors)
    
    return mse


#### 【problem 5】Objective function


In [5]:
import numpy as np

class ScratchLinearRegression():
    def __init__(self, num_iter, lr, no_bias=False, verbose=False):
        # Initialize the number of iterations, learning rate, no_bias flag, and verbose flag
        self.num_iter = num_iter
        self.lr = lr
        self.no_bias = no_bias
        self.verbose = verbose
        
        # Prepare arrays to record the loss at each iteration
        self.loss = np.zeros(self.num_iter)
        self.val_loss = np.zeros(self.num_iter)
        
        # Placeholder for the coefficients (parameters), will be initialized during fit
        self.coef_ = None

    

    def _linear_hypothesis(self, X):
        """
        Compute a linear hypothetical function

        Parameters
        ----------
        X : ndarray, shape (n_samples, n_features)
          Training data

        Returns
        -------
        y_pred : ndarray, shape (n_samples,)
          Estimated result by linear hypothetical function
        """
        # # Convert X to a NumPy array if it's not already
        # X = np.array(X)
    
        # # Print the shapes of X and coefficients for debugging
        # print("Shape of X:", X.shape)
        # print("Shape of coefficients:", self.coef_.shape)
        # print("Type of elements in coefficients:", self.coef_.dtype)

    
        # # Calculate the predicted values by taking the dot product of X and the coefficients
        # y_pred = np.dot(X, self.coef_)
    
        # return y_pred

        # Convert X to a NumPy array if it's not already
        X = np.array(X)
    
        # Print the shapes of X and coefficients for debugging
        print("Shape of X:", X.shape)
        print("Shape of coefficients:", self.coef_.shape)
    
        # Print the intermediate result of the dot product
        dot_product_result = np.dot(X, self.coef_)
        print("Shape of dot product result:", dot_product_result.shape)
    
        # Calculate the predicted values by taking the dot product of X and the coefficients
        y_pred = dot_product_result
    
        return y_pred



    def _gradient_descent(self, X, error):
        """
        Perform one step of gradient descent to update the coefficients

        Parameters
        ----------
        X : ndarray, shape (n_samples, n_features)
          Training data
        error : ndarray, shape (n_samples,)
          Difference between predicted and actual values

        """
        # Update the coefficients based on the gradient of the loss function
        # self.coef_ -= self.lr * (2 / X.shape[0]) * np.dot(X.T, error)
        # Update the coefficients based on the gradient of the loss function
        self.coef_ -= self.lr * (2 / X.shape[0]) * np.dot(X.T, error)

    def fit(self, X, y, X_val=None, y_val=None):
        """
        Learn linear regression. If validation data is entered, the loss and accuracy for it are also calculated for each iteration.
        Parameters
        ----------
        X : ndarray, shape (n_samples, n_features)
            Features of training data
        y : ndarray, shape (n_samples, )
            Correct answer value of training data
        X_val : ndarray, shape (n_samples, n_features)
            Features of validation data
        y_val : ndarray, shape (n_samples, )
            Correct value of validation data
        """
        if not self.no_bias:
            # Add a bias term (column of ones) to the features if no_bias is False
            X = np.hstack([np.ones((X.shape[0], 1)), X])
            if X_val is not None:
                X_val = np.hstack([np.ones((X_val.shape[0], 1)), X_val])

        # Initialize the coefficients to zeros (including bias term if applicable)
        self.coef_ = np.zeros(X.shape[1])

        for i in range(self.num_iter):
            # Compute the predicted values for the current iteration
            y_pred = self._linear_hypothesis(X)

            # Calculate the error as the difference between predicted and actual values
            error = y_pred - y

            # Update the coefficients using the gradient descent method
            self._gradient_descent(X, error)

            # Record the mean squared error for the training data
            self.loss[i] = np.mean(error ** 2)

            if X_val is not None and y_val is not None:
                # Compute the predicted values for the validation data
                y_val_pred = self._linear_hypothesis(X_val)

                # Calculate the validation error
                val_error = y_val_pred - y_val

                # Record the mean squared error for the validation data
                self.val_loss[i] = np.mean(val_error ** 2)

            # If verbose is True, print the training and validation loss for the current iteration
            if self.verbose:
                print(f"Iteration {i + 1}, Training loss: {self.loss[i]}")
                if X_val is not None and y_val is not None:
                    print(f"Validation loss: {self.val_loss[i]}")

        
       

    def predict(self, X):
        # Add a bias term (column of ones) to the features if no_bias is False
        if not self.no_bias:
            X = np.hstack([np.ones((X.shape[0], 1)), X])
        
        # Return the predicted values using the hypothetical function
        return self._linear_hypothesis(X)


#### Problem 6 Learning and Estimation

In [6]:
import pandas as pd

# Load the data
data = pd.read_csv("train.csv")

# Display the first few rows of the data
print(data.head())

   Id  MSSubClass MSZoning  LotFrontage  LotArea Street Alley LotShape  \
0   1          60       RL         65.0     8450   Pave   NaN      Reg   
1   2          20       RL         80.0     9600   Pave   NaN      Reg   
2   3          60       RL         68.0    11250   Pave   NaN      IR1   
3   4          70       RL         60.0     9550   Pave   NaN      IR1   
4   5          60       RL         84.0    14260   Pave   NaN      IR1   

  LandContour Utilities  ... PoolArea PoolQC Fence MiscFeature MiscVal MoSold  \
0         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      2   
1         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      5   
2         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      9   
3         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      2   
4         Lvl    AllPub  ...        0    NaN   NaN         NaN       0     12   

  YrSold  SaleType  SaleCondition  SalePrice  
0   2008        WD   

In [7]:
data.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [8]:
data.describe()

Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,...,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SalePrice
count,1460.0,1460.0,1201.0,1460.0,1460.0,1460.0,1460.0,1460.0,1452.0,1460.0,...,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0
mean,730.5,56.89726,70.049958,10516.828082,6.099315,5.575342,1971.267808,1984.865753,103.685262,443.639726,...,94.244521,46.660274,21.95411,3.409589,15.060959,2.758904,43.489041,6.321918,2007.815753,180921.19589
std,421.610009,42.300571,24.284752,9981.264932,1.382997,1.112799,30.202904,20.645407,181.066207,456.098091,...,125.338794,66.256028,61.119149,29.317331,55.757415,40.177307,496.123024,2.703626,1.328095,79442.502883
min,1.0,20.0,21.0,1300.0,1.0,1.0,1872.0,1950.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2006.0,34900.0
25%,365.75,20.0,59.0,7553.5,5.0,5.0,1954.0,1967.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,2007.0,129975.0
50%,730.5,50.0,69.0,9478.5,6.0,5.0,1973.0,1994.0,0.0,383.5,...,0.0,25.0,0.0,0.0,0.0,0.0,0.0,6.0,2008.0,163000.0
75%,1095.25,70.0,80.0,11601.5,7.0,6.0,2000.0,2004.0,166.0,712.25,...,168.0,68.0,0.0,0.0,0.0,0.0,0.0,8.0,2009.0,214000.0
max,1460.0,190.0,313.0,215245.0,10.0,9.0,2010.0,2010.0,1600.0,5644.0,...,857.0,547.0,552.0,508.0,480.0,738.0,15500.0,12.0,2010.0,755000.0


In [9]:
# Identify the target variable
target_variable = "SalePrice" 

In [10]:
numerical_features = data.dtypes[data.dtypes != "object"].index
print("Number of Numerical features: ", len(numerical_features))

categorical_features = data.dtypes[data.dtypes == "object"].index
print("Number of Categorical features: ", len(categorical_features))

Number of Numerical features:  38
Number of Categorical features:  43


In [11]:
missing_values = data.isnull().sum()
missing_values

Id                 0
MSSubClass         0
MSZoning           0
LotFrontage      259
LotArea            0
                ... 
MoSold             0
YrSold             0
SaleType           0
SaleCondition      0
SalePrice          0
Length: 81, dtype: int64

In [12]:
missing_values_percentage = (missing_values / len(data)) * 100
missing_values_percentage

Id                0.000000
MSSubClass        0.000000
MSZoning          0.000000
LotFrontage      17.739726
LotArea           0.000000
                   ...    
MoSold            0.000000
YrSold            0.000000
SaleType          0.000000
SaleCondition     0.000000
SalePrice         0.000000
Length: 81, dtype: float64

In [13]:
missing_df = pd.DataFrame({'Total': missing_values, 'missing_ratio': missing_values_percentage})
print("Missing Values for Each Feature:")
missing_df.head(30)

Missing Values for Each Feature:


Unnamed: 0,Total,missing_ratio
Id,0,0.0
MSSubClass,0,0.0
MSZoning,0,0.0
LotFrontage,259,17.739726
LotArea,0,0.0
Street,0,0.0
Alley,1369,93.767123
LotShape,0,0.0
LandContour,0,0.0
Utilities,0,0.0


In [14]:
# Delete features (columns) that have 5 or more missing values
columns_to_drop = missing_df[missing_df['Total'] >= 5].index
df_cleaned = data.drop(columns=columns_to_drop)
print("\nAfter dropping features with 5 or more missing values:")
df_cleaned.head()



After dropping features with 5 or more missing values:


Unnamed: 0,Id,MSSubClass,MSZoning,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,LandSlope,...,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,8450,Pave,Reg,Lvl,AllPub,Inside,Gtl,...,0,0,0,0,0,2,2008,WD,Normal,208500
1,2,20,RL,9600,Pave,Reg,Lvl,AllPub,FR2,Gtl,...,0,0,0,0,0,5,2007,WD,Normal,181500
2,3,60,RL,11250,Pave,IR1,Lvl,AllPub,Inside,Gtl,...,0,0,0,0,0,9,2008,WD,Normal,223500
3,4,70,RL,9550,Pave,IR1,Lvl,AllPub,Corner,Gtl,...,272,0,0,0,0,2,2006,WD,Abnorml,140000
4,5,60,RL,14260,Pave,IR1,Lvl,AllPub,FR2,Gtl,...,0,0,0,0,0,12,2008,WD,Normal,250000


In [15]:
from sklearn.model_selection import train_test_split

# Define features (X) and target variable (y)
X = df_cleaned.drop(columns=['SalePrice'])  # Features
y = df_cleaned['SalePrice']  # Target variable

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [16]:
# Instantiate the ScratchLinearRegression class
model_scratch = ScratchLinearRegression(num_iter=1000, lr=0.01, verbose=True)

# Fit the model on the training data
model_scratch.fit(X_train.values, y_train.values)

# Make predictions on the testing data
y_pred_scratch = model_scratch.predict(X_test.values)


Shape of X: (1168, 63)
Shape of coefficients: (63,)


TypeError: can't multiply sequence by non-int of type 'float'