# <span style="color:pink">Projekt część 2.4.0.1</span>

<span style="color:pink">**_Zofia Różańska, 280526_**</span>

<span style="color:pink">_Dataset: Estimation of Obesity Levels Based On Eating Habits and Physical Condition_</span>


In [1]:
import pandas as pd
import numpy as np

In [2]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [3]:
# Load the dataset
dataset_url = 'https://raw.githubusercontent.com/zosia-r/msid/refs/heads/main/I/dataset.csv'
data = pd.read_csv(dataset_url)

# Display all columns and rows in the DataFrame
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

print(data.head())

   Gender   Age  Height  Weight family_history_with_overweight FAVC  FCVC  \
0  Female  21.0    1.62    64.0                            yes   no   2.0   
1  Female  21.0    1.52    56.0                            yes   no   3.0   
2    Male  23.0    1.80    77.0                            yes   no   2.0   
3    Male  27.0    1.80    87.0                             no   no   3.0   
4    Male  22.0    1.78    89.8                             no   no   2.0   

   NCP       CAEC SMOKE  CH2O  SCC  FAF  TUE        CALC  \
0  3.0  Sometimes    no   2.0   no  0.0  1.0          no   
1  3.0  Sometimes   yes   3.0  yes  3.0  0.0   Sometimes   
2  3.0  Sometimes    no   2.0   no  2.0  1.0  Frequently   
3  3.0  Sometimes    no   2.0   no  2.0  0.0  Frequently   
4  1.0  Sometimes    no   2.0   no  0.0  0.0   Sometimes   

                  MTRANS           NObeyesdad  
0  Public_Transportation        Normal_Weight  
1  Public_Transportation        Normal_Weight  
2  Public_Transportation        

## <span style="color:pink">Linear Regression</span>


### <span style="color:pink">Preprocessing</span>

In [4]:
# Separate features and target
X = data.drop('Weight', axis=1)
Y = data['Weight']
# This solution can be used only for numeric values,
# so I chose Weight(float64) as ground truth vector

In [5]:
def preprocess(X):
    # Identify categorical and numerical columns
    categorical_cols = X.select_dtypes(include=['object']).columns.tolist()
    numerical_cols = X.select_dtypes(include=['int64', 'float64']).columns.tolist()

    # Define the preprocessing steps for numerical and categorical features
    numerical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', StandardScaler())
    ])

    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore'))
    ])

    # Combine preprocessing steps
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numerical_transformer, numerical_cols),
            ('cat', categorical_transformer, categorical_cols)
        ]
    )

    return preprocessor

In [6]:
# Split data into trainig and testing sets
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, random_state=0)

### <span style="color:pink">Closed-Form Solution</span>

$$ \theta = (X^TX)^{-1} X^T Y $$

where:

* $X$ = the matrix of input features
* $Y$ = the vector of ground truth values
* $\theta$ = the vector of model coefficients



In [7]:
# Define the closed-form solution function
def closed_form_solution(X, Y):
    XT = np.transpose(X)
    return np.linalg.inv(XT @ X) @ XT @ Y

In [8]:
# Perform preprocessing on X
preprocessor = preprocess(X)

X_train = preprocessor.fit_transform(X_train)
X_test = preprocessor.transform(X_test)

In [9]:
# Add a bias term (intercept) to the transformed data
X_train = np.hstack([np.ones((X_train.shape[0], 1)), X_train])
X_test = np.hstack([np.ones((X_test.shape[0], 1)), X_test])

In [10]:
# Training the model using closed-form solution
theta = closed_form_solution(X_train, Y_train)
print(f'Theta: {theta}')

Theta: [-6.98426098e+17 -1.68729101e+03 -1.19534426e+03  3.56167529e+03
  8.45379382e+02 -6.57662699e+01 -5.47693524e+02  3.71683730e+02
  2.03361377e+19  2.03361377e+19 -9.28032309e+18 -9.28032309e+18
 -1.03281403e+19 -1.03281403e+19  3.62204242e+18  3.62204242e+18
  3.62204242e+18  3.62204242e+18  1.30366450e+17  1.30366450e+17
  2.92565688e+17  2.92565688e+17  3.96446687e+18  3.96446687e+18
  3.96446687e+18  3.96446687e+18 -8.03868969e+18 -8.03868969e+18
 -8.03868969e+18 -8.03868969e+18 -8.03868969e+18 -2.78683288e+02
  8.57913014e+02  6.90601309e+03  3.06152736e+03  3.96740654e+03
  4.42743358e+03  3.60014533e+03]


In [11]:
# Evaluate the model
Y_pred = X_test @ theta

print(f'Mean Squared Error: {mean_squared_error(Y_test, Y_pred):.3f}')

Mean Squared Error: 24043817.897


### <span style="color:pink">OGRANICZENIA ZAMKNIĘTEJ FORMUŁY</span>
1. Duża złożoność obliczeniowa
    * Odwracanie macierzy $X^TX$ ma złożoność obliczeniową $O(n^3)$, gdzie $n$ to liczba cech (kolumn).
    * Dla dużych zbiorów danych i dla danych o wielu cechach staje się te bardzo kosztowne obliczeniowo.

2. Wysokie zużycie pamięci
    * Obliczanie i przechowywanie dużych macierzy wymaga ogromnych zasobów pamięci RAM.
    * Dla dużych zbiorów danych może to prowadzić do przeciążenia pamięci

3. Niestabilność numeryczna
    * Jeśli macierz $X^TX$ jest bliska osobliwości (nieodwracalna) (np. gdy cechy są silnie skorelowane) - odwrócenie macierzy staje się bardzo niestabilne.
    * Prowadzi to do błędnych wyników.

### <span style="color:pink">Compare with sklearn's LinearRegression</span>

In [12]:
# Compare with sklearn's LinearRegression
from sklearn.linear_model import LinearRegression

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, random_state=0)

model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
])

model.fit(X_train, Y_train)

Y_pred = model.predict(X_test)

print(f'Mean Squared Error: {mean_squared_error(Y_test, Y_pred):.3f}')

Mean Squared Error: 24.829
