<a href="https://colab.research.google.com/github/zabolotnaydev-cloud/ML_internship/blob/main/Liniar_Regression.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

In [None]:
# Boston Housing data set, read straight from the CSV hosted on GitHub.
# Every later cell works off the resulting `df` frame.
url = 'https://raw.githubusercontent.com/selva86/datasets/master/BostonHousing.csv'
df = pd.read_csv(filepath_or_buffer=url)

In [None]:
# Quick sanity check of what was loaded: dimensions first, then the
# first five records (blank line in between comes from the "\n" prefix).
print("Dataset shape:", df.shape)
print("\nFirst rows:", df.head(), sep="\n")

Dataset shape: (506, 14)

First rows:
      crim    zn  indus  chas    nox     rm   age     dis  rad  tax  ptratio  \
0  0.00632  18.0   2.31     0  0.538  6.575  65.2  4.0900    1  296     15.3   
1  0.02731   0.0   7.07     0  0.469  6.421  78.9  4.9671    2  242     17.8   
2  0.02729   0.0   7.07     0  0.469  7.185  61.1  4.9671    2  242     17.8   
3  0.03237   0.0   2.18     0  0.458  6.998  45.8  6.0622    3  222     18.7   
4  0.06905   0.0   2.18     0  0.458  7.147  54.2  6.0622    3  222     18.7   

        b  lstat  medv  
0  396.90   4.98  24.0  
1  396.90   9.14  21.6  
2  392.83   4.03  34.7  
3  394.63   2.94  33.4  
4  396.90   5.33  36.2  


In [None]:
# Pairwise correlation of every column with the target `medv`,
# ordered from the most positive to the most negative coefficient.
correlation_with_target = (
    df.corr()
    .loc[:, 'medv']
    .sort_values(ascending=False)
)
print("\nCorrelation with target:", correlation_with_target, sep="\n")


Correlation with target:
medv       1.000000
rm         0.695360
zn         0.360445
b          0.333461
dis        0.249929
chas       0.175260
age       -0.376955
rad       -0.381626
crim      -0.388305
nox       -0.427321
tax       -0.468536
indus     -0.483725
ptratio   -0.507787
lstat     -0.737663
Name: medv, dtype: float64


In [None]:
# Predictors whose correlation with the target is at least 0.5 in magnitude.
# The target's own entry (self-correlation) is removed after filtering.
high_corr = (
    correlation_with_target
    .loc[correlation_with_target.abs().ge(0.5)]
    .drop(labels='medv')
)
print("\nFeatures with high correlation (|corr| >= 0.5):")
for feature, corr in high_corr.items():
    print(f"\n{feature}: {corr:.4f}")


Features with high correlation (|corr| >= 0.5):

rm: 0.6954

ptratio: -0.5078

lstat: -0.7377


In [None]:
# Column names (target excluded) whose absolute correlation with `medv`
# falls inside the closed interval [0.5, 0.8]; order follows the sorted series.
selected_features = [
    name
    for name, coefficient in correlation_with_target.items()
    if name != 'medv' and 0.5 <= abs(coefficient) <= 0.8
]

print(f"\nSelected features (0.5-0.8): {selected_features}")


Selected features (0.5-0.8): ['rm', 'ptratio', 'lstat']


In [None]:
class LinearRegressionScratch:
    """Least-squares linear regression fitted with batch gradient descent.

    The optimisation runs on internally standardised features (zero mean,
    unit variance per column) and the learned parameters are mapped back to
    the original feature units afterwards.  Running plain gradient descent on
    raw columns with very different magnitudes makes a fixed learning rate
    such as 0.01 overshoot, and the weights overflow to inf/NaN; standardising
    keeps the same ``lr``/``epochs`` settings stable.

    Parameters
    ----------
    lr : float, default 0.01
        Gradient-descent step size (applied in standardised feature space).
    epochs : int, default 1000
        Number of full-batch gradient steps.

    Attributes
    ----------
    w : numpy.ndarray of shape (n_features,) or None
        Coefficients in the ORIGINAL feature units; ``None`` before ``fit``.
    b : float or None
        Intercept in the original units; ``None`` before ``fit``.
    """

    def __init__(self, lr=0.01, epochs=1000):
        self.lr = lr
        self.epochs = epochs
        self.w = None
        self.b = None

    def fit(self, X, y):
        """Estimate ``w`` and ``b`` by minimising the mean squared error.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
        y : array-like of shape (n_samples,) or (n_samples, 1)

        Returns
        -------
        LinearRegressionScratch
            ``self``, so calls can be chained.

        Raises
        ------
        ValueError
            If ``X`` is not 2-D, is empty, or its row count differs from ``y``.
        """
        X = np.asarray(X, dtype=float)
        # Flatten so a column-vector target cannot broadcast the residual
        # into an (n, n) matrix.
        y = np.asarray(y, dtype=float).ravel()

        if X.ndim != 2:
            raise ValueError(f"X must be 2-dimensional, got shape {X.shape}")
        n_samples, n_features = X.shape
        if n_samples == 0:
            raise ValueError("X must contain at least one sample")
        if y.shape[0] != n_samples:
            raise ValueError(
                f"X and y disagree on sample count: {n_samples} vs {y.shape[0]}"
            )

        # Per-column standardisation; constant columns keep scale 1 so they
        # become all-zero (and therefore receive a zero weight) instead of NaN.
        mean = X.mean(axis=0)
        scale = X.std(axis=0)
        scale[scale == 0] = 1.0
        X_std = (X - mean) / scale

        w_std = np.zeros(n_features)
        b_std = 0.0
        step = 2.0 * self.lr / n_samples  # lr times the 2/n factor of the MSE gradient

        for _ in range(self.epochs):
            residual = y - (X_std @ w_std + b_std)
            # Gradient of MSE is -(2/n) * X^T residual, so descending adds it.
            w_std += step * (X_std.T @ residual)
            b_std += step * residual.sum()

        # Undo the standardisation:  X_std @ w_std + b_std == X @ w + b
        self.w = w_std / scale
        self.b = float(b_std - mean @ self.w)
        return self

    def predict(self, X):
        """Return ``X @ w + b`` for array-like ``X`` of shape (n_samples, n_features)."""
        return np.dot(np.asarray(X, dtype=float), self.w) + self.b

    def score(self, X, y):
        """Return the coefficient of determination R^2 of the predictions on ``X``.

        For a constant target (zero total variance) the ratio is undefined;
        in that case 1.0 is returned for perfect predictions and 0.0 otherwise
        rather than NaN/inf.
        """
        y = np.asarray(y, dtype=float).ravel()
        y_pred = self.predict(X)
        ss_res = np.sum((y - y_pred) ** 2)
        ss_tot = np.sum((y - np.mean(y)) ** 2)
        if ss_tot == 0:
            return 1.0 if ss_res == 0 else 0.0
        return float(1 - (ss_res / ss_tot))

In [None]:
# Baseline experiment: use every column except the target `medv` as a predictor.
X_full = df.drop('medv', axis=1)
y = df['medv']

# Split full dataset (80/20, fixed seed so the split is reproducible)
X_train_full, X_test_full, y_train_full, y_test_full = train_test_split(
    X_full, y, test_size=0.2, random_state=42
)

# sklearn
model_sklearn_full = LinearRegression()
model_sklearn_full.fit(X_train_full, y_train_full)
score_sklearn_full = model_sklearn_full.score(X_test_full, y_test_full)

# from-scratch
model_scratch_full = LinearRegressionScratch()
model_scratch_full.fit(X_train_full.values, y_train_full.values)
score_scratch_full = model_scratch_full.score(X_test_full.values, y_test_full.values)

# Report the held-out R^2 of both implementations so the later comparison
# against the selected-feature models can be checked by the reader.
print("\nfull")
print(f"sklearn R^2: {score_sklearn_full:.4f}")
print(f"from-scratch R^2: {score_scratch_full:.4f}")


In [None]:
# Second experiment: only the correlation-selected predictors.
X_selected = df.loc[:, selected_features]

# Same 80/20 split and seed as the full-feature experiment.
X_train_sel, X_test_sel, y_train_sel, y_test_sel = train_test_split(
    X_selected,
    y,
    random_state=42,
    test_size=0.2,
)

# sklearn reference model
model_sklearn_sel = LinearRegression().fit(X_train_sel, y_train_sel)
score_sklearn_sel = model_sklearn_sel.score(X_test_sel, y_test_sel)

# from-scratch model, fed plain NumPy arrays
model_scratch_sel = LinearRegressionScratch().fit(
    X_train_sel.to_numpy(), y_train_sel.to_numpy()
)
score_scratch_sel = model_scratch_sel.score(
    X_test_sel.to_numpy(), y_test_sel.to_numpy()
)

# Held-out R^2 for the reduced feature set.
print(
    "\nselected",
    f"sklearn R^2: {score_sklearn_sel:.4f}",
    f"from-scratch R^2: {score_scratch_sel:.4f}",
    sep="\n",
)

# Results interpretation: full vs. selected (sklearn scores), and agreement
# between the two implementations on the full feature set (tolerance 0.001).
print(
    f"Full dataset works better: {score_sklearn_full > score_sklearn_sel}",
    f"sklearn and from-scratch are similar: {abs(score_sklearn_full - score_scratch_full) < 0.001}",
    sep="\n",
)


selected
sklearn R^2: 0.6303
from-scratch R^2: 0.6303
Full dataset works better: True
sklearn and from-scratch are similar: True
