<a href="https://colab.research.google.com/github/vsevolod-BR/prictice/blob/main/liner_regression(pr_3).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
import numpy as np
import pandas as pd
from sklearn.datasets import fetch_california_housing
from sklearn import datasets
from sklearn import linear_model
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [7]:
# Load the California housing data as a feature frame X and a target Y,
# then split it reproducibly (fixed random_state) into train and test parts.
X, Y = fetch_california_housing(return_X_y=True, as_frame=True)
# NOTE(review): train_size=0.2 keeps only 20% of the rows for training and
# leaves 80% for testing — confirm this is intended and not test_size=0.2.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    train_size=0.2,
    random_state=42,
)

In [16]:
# Standardize the features. The scaler is fitted on the training split only,
# and the same fitted transformation is then applied to both splits.
X_scaler = StandardScaler()
X_scaler.fit(X_train)

# get_feature_names_out returns the feature names that were seen during fit;
# transform applies the already-fitted scaling to the data it is given.
feature_names = X_scaler.get_feature_names_out()

# transform() returns a bare array, so the frames are rebuilt by hand.
# The original row index is passed through explicitly: train_test_split
# shuffles rows, and without it the scaled frames would get a fresh 0..n-1
# index that no longer lines up with Y_train / Y_test by label.
X_train_scaled = pd.DataFrame(
    X_scaler.transform(X_train),
    columns=feature_names,
    index=X_train.index,
)
X_test_scaled = pd.DataFrame(
    X_scaler.transform(X_test),
    columns=feature_names,
    index=X_test.index,
)

In [21]:
# Candidate regularization strengths tried by the Ridge / Lasso / ElasticNet
# searches below, in increasing order.
alphas = [
    0.001, 0.005,
    0.01, 0.05,
    0.1, 0.5,
    1, 5,
    10, 50,
]

In [19]:
# Baseline: plain least-squares regression without any penalty,
# scored with mean squared error on the train and the test split.
model = linear_model.LinearRegression().fit(X_train_scaled, Y_train)
baseline_train_mse = metrics.mean_squared_error(Y_train, model.predict(X_train_scaled))
baseline_test_mse = metrics.mean_squared_error(Y_test, model.predict(X_test_scaled))
print(f'MSE train:{baseline_train_mse}')
print(f'MSE test:{baseline_test_mse}')

MSE train:0.547148403838709
MSE test:0.5218435360402414


In [26]:
# Ridge: fit one model per candidate alpha and remember the alpha with the
# lowest test MSE (on a tie the earlier alpha is kept).
# NOTE(review): the winner is chosen on the test split, so the reported test
# MSE is optimistic — consider a validation split or cross-validation.
best_alpha = None
best_train_mse = best_test_mse = np.inf

for alpha in alphas:
    model = linear_model.Ridge(alpha=alpha).fit(X_train_scaled, Y_train)
    train_mse = metrics.mean_squared_error(Y_train, model.predict(X_train_scaled))
    test_mse = metrics.mean_squared_error(Y_test, model.predict(X_test_scaled))
    if test_mse < best_test_mse:
        best_alpha, best_train_mse, best_test_mse = alpha, train_mse, test_mse

# After the loop `model` is the estimator for the LAST alpha, not for best_alpha.
print(f'Train MSE: {best_train_mse}, Test MSE: {best_test_mse}, Alpha = {best_alpha}')

Train MSE: 0.5471500621410261, Test MSE: 0.5218380901770937, Alpha = 1


In [27]:
# Lasso: fit one model per candidate alpha and remember the alpha with the
# lowest test MSE (on a tie the earlier alpha is kept).
# NOTE(review): the winner is chosen on the test split, so the reported test
# MSE is optimistic — consider a validation split or cross-validation.
best_alpha = None
best_train_mse = best_test_mse = np.inf

for alpha in alphas:
    model = linear_model.Lasso(alpha=alpha).fit(X_train_scaled, Y_train)
    train_mse = metrics.mean_squared_error(Y_train, model.predict(X_train_scaled))
    test_mse = metrics.mean_squared_error(Y_test, model.predict(X_test_scaled))
    if test_mse < best_test_mse:
        best_alpha, best_train_mse, best_test_mse = alpha, train_mse, test_mse

# After the loop `model` is the estimator for the LAST alpha, not for best_alpha.
print(f'Train MSE: {best_train_mse}, Test MSE: {best_test_mse}, Alpha = {best_alpha}')

Train MSE: 0.5472190922587039, Test MSE: 0.5218735345704069, Alpha = 0.001


In [28]:
# ElasticNet: fit one model per candidate alpha and remember the alpha with
# the lowest test MSE (on a tie the earlier alpha is kept).
# NOTE(review): the winner is chosen on the test split, so the reported test
# MSE is optimistic — consider a validation split or cross-validation.
best_alpha = None
best_train_mse = best_test_mse = np.inf

for alpha in alphas:
    model = linear_model.ElasticNet(alpha=alpha).fit(X_train_scaled, Y_train)
    train_mse = metrics.mean_squared_error(Y_train, model.predict(X_train_scaled))
    test_mse = metrics.mean_squared_error(Y_test, model.predict(X_test_scaled))
    if test_mse < best_test_mse:
        best_alpha, best_train_mse, best_test_mse = alpha, train_mse, test_mse

# After the loop `model` is the estimator for the LAST alpha, not for best_alpha.
print(f'Train MSE: {best_train_mse}, Test MSE: {best_test_mse}, Alpha = {best_alpha}')

Train MSE: 0.547191522417984, Test MSE: 0.5218521475774447, Alpha = 0.001


In [45]:
# Report the two features whose coefficients are smallest in absolute value,
# i.e. the ones the currently fitted `model` relies on least.
# NOTE(review): `model` is whichever estimator was fitted last in the kernel;
# after an alpha-search loop that is the model for the last alpha tried, not
# the one for best_alpha — confirm this is the model you mean to inspect.
coef_abs = np.abs(model.coef_)

# Positions in coef_ are mapped onto column names below, so the model must have
# been fitted on exactly the columns X_train_scaled has right now.
assert len(coef_abs) == X_train_scaled.shape[1], (
    "model was fitted on a different set of columns than X_train_scaled; refit it first"
)

# A stable argsort gives a deterministic order (smallest |coef| first); the
# order of the first k items returned by argpartition is unspecified.
ind = np.argsort(coef_abs, kind="stable")[:2]
print(X_train_scaled.columns[ind].tolist())

['Population_squared', 'Population_X_Latitude']


In [39]:
# Disabled feature expansion (kept commented out, so it does not run).
# When enabled it adds, to both X_train_scaled and X_test_scaled in place:
#   - '<col>_squared'      : the square of every original column of X
#   - '<col>_X_<other>'    : the product of every pair of distinct original columns
# NOTE(review): the recorded output of the coefficient cell lists columns such
# as 'Population_squared', which only this code creates — that output was
# produced while this cell was active and cannot be reproduced with it disabled.
# Either re-enable it before the model cells or delete it and refresh the outputs.
#num_feature = X.shape[1]
#for i in range(num_feature):
#  column = X.columns[i]
#  X_train_scaled[f'{column}_squared'] = X_train_scaled[column] ** 2
#  X_test_scaled[f'{column}_squared'] = X_test_scaled[column] ** 2
#  for j in range(1+i, num_feature):
#    other_column = X.columns[j]
#    X_train_scaled[f'{column}_X_{other_column}'] = X_train_scaled[column] * X_train_scaled[other_column]
#    X_test_scaled[f'{column}_X_{other_column}'] = X_test_scaled[column] * X_test_scaled[other_column]