Linear regression with scikit-learn models and pure NumPy, using the Kaggle House Prices dataset

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression,Ridge,ElasticNet,Lasso
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
from sklearn.metrics import accuracy_score
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.impute import SimpleImputer
import pandas as pd
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler
# Load the House Prices training data.
df = pd.read_csv('datasets/linear_reg/train.csv')
# 'Id' is only a row identifier. DataFrame.drop returns a new frame, so the
# result has to be assigned; otherwise 'Id' stays in the data and ends up
# among the numeric features fed to the model.
df = df.drop(columns=['Id'])
# Target vector and feature matrix.
y = df['SalePrice']
x = df.drop(columns=['SalePrice'])
# Ordinal features paired with their category scales, each listed from worst
# to best. 'Missing' is the lowest level of every string-valued scale.
_quality_scale = ['Missing', 'Po', 'Fa', 'TA', 'Gd', 'Ex']
_ordinal_scales = {
    'LotShape': ['Missing', 'IR3', 'IR2', 'IR1', 'Reg'],
    'LandSlope': ['Missing', 'Sev', 'Mod', 'Gtl'],
    'OverallQual': list(range(1, 11)),   # integer rating 1-10
    'OverallCond': list(range(1, 11)),   # integer rating 1-10
    'ExterQual': list(_quality_scale),
    'ExterCond': list(_quality_scale),
    'BsmtQual': list(_quality_scale),
    'BsmtCond': list(_quality_scale),
    'HeatingQC': list(_quality_scale),
    'KitchenQual': list(_quality_scale),
    'FireplaceQu': list(_quality_scale),
    'GarageQual': list(_quality_scale),
    'GarageCond': list(_quality_scale),
    'PoolQC': list(_quality_scale),
}
# Dicts keep insertion order, so column i always lines up with scale i.
ordinal_col = list(_ordinal_scales.keys())
ordinal_cats = list(_ordinal_scales.values())

# Unordered categorical features (one-hot encoded downstream).
nominal_col = [
    'MSZoning', 'Street', 'Alley', 'LandContour', 'Utilities', 'LotConfig',
    'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle',
    'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType',
    'Foundation', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Heating',
    'CentralAir', 'Electrical', 'Functional', 'GarageType', 'GarageFinish',
    'PavedDrive', 'Fence', 'MiscFeature', 'SaleType', 'SaleCondition',
]

# Every remaining column of x is treated as numeric; the order of x.columns
# is preserved.
_categorical_names = set(ordinal_col) | set(nominal_col)
numeric_col = [name for name in x.columns if name not in _categorical_names]
# Numeric features: fill gaps with the column mean, then standardize.
num_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

# Ordinal features: replace missing values with the 'Missing' level, then map
# each level to its position in the corresponding scale from ordinal_cats.
ord_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='Missing')),
    ('encoder', OrdinalEncoder(categories=ordinal_cats)),
])

# Nominal features: same 'Missing' placeholder, then one-hot encoding;
# handle_unknown='ignore' is set so unseen levels do not raise at predict time.
nom_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='Missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Route each group of columns through its own sub-pipeline.
preprocessor = ColumnTransformer(transformers=[
    ('ord', ord_pipeline, ordinal_col),
    ('nom', nom_pipeline, nominal_col),
    ('num', num_pipeline, numeric_col),
])

# Full model: preprocessing followed by an ElasticNet regressor.
model = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', ElasticNet(alpha=0.2, l1_ratio=0.9, random_state=123, max_iter=10000)),
])
# Hold out 20% of the rows for testing; the seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=123
)

# Fit the whole pipeline on the training part and predict the held-out part.
model.fit(x_train, y_train)
y_pred = model.predict(x_test)

# Hold-out metrics.
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
for label, value in (('MSE', mse), ('R2', r2)):
    print(f'{label}: {value:.2f}')

# R2 over 5 cross-validation folds of the full data set.
scores = cross_val_score(model, x, y, cv=5, scoring='r2')
print("R2 по кросс-валидации (5 фолдов):", scores)
print("Среднее R2:", scores.mean())

MSE: 729214703.23
R2: 0.88
R2 по кросс-валидации (5 фолдов): [0.89102627 0.83833461 0.86566292 0.86973108 0.69180658]
Среднее R2: 0.8313122924425926


Linear regression based on gradient descent, using the age-experience-income dataset

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
class GDLinearRegression:
    """Linear regression fitted with full-batch gradient descent.

    Each step moves the bias and weights against the gradient of the half
    mean squared error. Iteration stops when the gradient changes by less
    than ``tolerance`` (in every component) between two consecutive steps.

    Parameters
    ----------
    learning_rate : float, default 0.01
        Step size applied to the gradient.
    tolerance : float, default 1e-8
        Threshold on the absolute change of each gradient component between
        consecutive iterations.
    max_iter : int or None, default None
        Upper bound on the number of gradient steps. ``None`` means no bound
        (iterate until the tolerance criterion is met).

    Attributes set by ``fit``
    -------------------------
    bias : float
    weights : numpy.ndarray of shape (n_features,)
    n_iter_ : int
        Number of gradient steps performed.
    converged_ : bool
        True when the tolerance criterion stopped the loop, False when
        ``max_iter`` did.
    """

    def __init__(self, learning_rate=0.01, tolerance=1e-8, max_iter=None):
        self.learning_rate = learning_rate
        self.tolerance = tolerance
        self.max_iter = max_iter

    def fit(self, x, y):
        """Estimate bias and weights from training data.

        Parameters
        ----------
        x : array-like of shape (n_samples, n_features)
        y : array-like of shape (n_samples,) or (n_samples, 1)

        Returns
        -------
        self

        Raises
        ------
        ValueError
            If ``x`` is not 2-D, is empty, or its row count differs from ``y``.
        FloatingPointError
            If the gradient becomes inf/NaN, i.e. the descent diverged.
        """
        # Work on plain float arrays: pandas objects would align on index
        # labels during subtraction, and an (n, 1) target would broadcast
        # against the (n,) predictions into an (n, n) matrix.
        features = np.asarray(x, dtype=float)
        targets = np.asarray(y, dtype=float).ravel()
        if features.ndim != 2:
            raise ValueError("x must be 2-dimensional (n_samples, n_features)")
        n_samples, n_features = features.shape
        if n_samples == 0:
            raise ValueError("x must contain at least one sample")
        if targets.shape[0] != n_samples:
            raise ValueError("x and y must have the same number of samples")

        self.bias, self.weights = 0.0, np.zeros(n_features)
        self.n_iter_ = 0
        self.converged_ = False
        previous_db, previous_dw = 0.0, np.zeros(n_features)

        while self.max_iter is None or self.n_iter_ < self.max_iter:
            residual = features @ self.weights + self.bias - targets
            db = residual.sum() / n_samples
            dw = features.T @ residual / n_samples

            # Once the gradient is inf/NaN every comparison below is False and
            # the loop would never terminate; fail loudly instead.
            if not (np.isfinite(db) and np.all(np.isfinite(dw))):
                raise FloatingPointError(
                    "gradient descent diverged; lower learning_rate or scale the features"
                )

            self.bias -= self.learning_rate * db
            self.weights -= self.learning_rate * dw
            self.n_iter_ += 1

            # Stop when no gradient component moved by tolerance or more since
            # the previous step.
            bias_settled = np.abs(db - previous_db) < self.tolerance
            weights_settled = np.all(np.abs(dw - previous_dw) < self.tolerance)
            if bias_settled and weights_settled:
                self.converged_ = True
                break
            previous_db, previous_dw = db, dw
        return self

    def predict(self, x_test, y_test=None):
        """Predict targets and, optionally, report fit quality.

        Parameters
        ----------
        x_test : array-like of shape (n_samples, n_features)
        y_test : array-like of shape (n_samples,), optional
            True targets. When omitted, the raw predictions are returned.

        Returns
        -------
        predictions, or str
            Without ``y_test``: the predicted values. With ``y_test``: a text
            report containing the predictions, the true values, R2 and MSE.
        """
        y_pred = x_test @ self.weights + self.bias
        if y_test is None:
            return y_pred
        mse = 1 / len(y_test) * np.sum(np.square(y_pred - y_test))
        # R2 = 1 - SS_res / SS_tot
        up = np.sum(np.square(y_test - y_pred))
        down = np.sum(np.square(y_test - np.mean(y_test)))
        r2 = 1 - (up / down)
        return (f"pred: {y_pred} test: {y_test}\nr2: {r2:.2f}\nmse: {mse:.2f}")
# Load the data: 'income' is the target, every other column is a feature.
df = pd.read_csv('multiple_linear_regression_dataset.csv')
x, y = df.drop(columns=['income']), df['income']

# Reproducible 80/20 train/test split.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=123
)

# Fit the gradient-descent model and print its report for the held-out rows.
model = GDLinearRegression(learning_rate=0.001, tolerance=1e-6)
model.fit(x_train, y_train)
result = model.predict(x_test=x_test, y_test=y_test)
print(result)

