# Data Loading and Initial Processing
Load the datasets, handle missing values, and perform initial data exploration including correlation analysis and outlier detection.

In [None]:
# Importer les bibliothèques nécessaires
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler, PolynomialFeatures
from sklearn.model_selection import GridSearchCV

# Load the training and validation splits.
train_data = pd.read_csv('../ynov-data/train_housing_train.csv')
valid_data = pd.read_csv('../ynov-data/train_housing_valid.csv')

# Split the training frame into features and target; the 'id' column is
# dropped from the features along with the target.
X = train_data.drop(['median_house_value', 'id'], axis=1)
y = train_data['median_house_value']

# Column groups by dtype: object columns are treated as categorical,
# numeric columns as continuous.
categorical_features = X.select_dtypes(include=['object']).columns
numeric_features = X.select_dtypes(include=['number']).columns

# Preprocessing:
#   numeric     -> mean imputation, polynomial expansion, standardisation
#   categorical -> most-frequent imputation, one-hot encoding
preprocessor = ColumnTransformer(
    transformers=[
        ('num', Pipeline([
            ('imputer', SimpleImputer(strategy='mean')),
            ('poly', PolynomialFeatures(degree=2, include_bias=False)),
            ('scaler', StandardScaler())
        ]), numeric_features),
        ('cat', Pipeline([
            ('imputer', SimpleImputer(strategy='most_frequent')),
            # handle_unknown='ignore' encodes a category that was not seen
            # during fit as an all-zero row instead of raising, so CV folds,
            # validation data and test data containing such a category can
            # still be transformed.
            ('encoder', OneHotEncoder(handle_unknown='ignore'))
        ]), categorical_features)
    ])

# Full model: preprocessing followed by ordinary least squares.
pipe = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
])

# Only the polynomial degree of the numeric branch is tuned.
param_grid = {
    'preprocessor__num__poly__degree': [1, 2, 3]
}

# 5-fold cross-validated grid search scored on negative MSE; the best
# configuration is refit on all of X, y and exposed as best_estimator_.
grid_search = GridSearchCV(pipe, param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)
grid_search.fit(X, y)
best_model = grid_search.best_estimator_

# Predictions on the training data.
train_predictions = best_model.predict(X)

# Training metrics (RMSE is the square root of the MSE).
train_rmse = np.sqrt(mean_squared_error(y, train_predictions))
train_r2 = r2_score(y, train_predictions)
print(f'Train RMSE: {train_rmse:.2f}')
print(f'Train R²: {train_r2:.2f}')

# Features and target of the validation split.
X_valid = valid_data.drop(['median_house_value', 'id'], axis=1)
y_valid = valid_data['median_house_value']

# Predictions on the validation data.
valid_predictions = best_model.predict(X_valid)

# Validation metrics.
valid_rmse = np.sqrt(mean_squared_error(y_valid, valid_predictions))
valid_r2 = r2_score(y_valid, valid_predictions)
print(f'Validation RMSE: {valid_rmse:.2f}')
print(f'Validation R²: {valid_r2:.2f}')

# Load the test data (no target column is dropped here, only 'id').
test = pd.read_csv('../ynov-data/test_housing.csv')
X_test = test.drop('id', axis=1)

# Predictions on the test data.
test_predictions = best_model.predict(X_test)

# Fill the prediction column of the existing submission file and write it
# back to the same path (the file is overwritten in place).
# NOTE(review): assumes the rows of submission.csv are in the same order as
# test_housing.csv - confirm.
submission = pd.read_csv('../ynov-data/submission.csv')
submission['median_house_value'] = test_predictions
submission.to_csv('../ynov-data/submission.csv', index=False)

# Show the fitted coefficients next to the names of the transformed features.
model = best_model.named_steps['regressor']
feature_names = best_model.named_steps['preprocessor'].get_feature_names_out()
coefficients = pd.DataFrame({'Feature': feature_names, 'Coefficient': model.coef_})
print("\nCoefficients:")
print(coefficients)

# Feature Engineering and Selection
Create new features through interactions, apply polynomial features selectively, and implement feature scaling and encoding. Focus on features with strong correlations to house prices.

In [None]:
# Importer les bibliothèques nécessaires
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler, PolynomialFeatures
from sklearn.model_selection import GridSearchCV, train_test_split

# Read the training and validation CSV files.
train_data = pd.read_csv('../ynov-data/train_housing_train.csv')
valid_data = pd.read_csv('../ynov-data/train_housing_valid.csv')

# Features are every column except the target and 'id'.
X = train_data.drop(['median_house_value', 'id'], axis=1)
y = train_data['median_house_value']

# Select categorical (object dtype) and numeric columns.
categorical_features = X.select_dtypes(include=['object']).columns
numeric_features = X.select_dtypes(include=['number']).columns

# Define the preprocessor: one sub-pipeline per column group.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', Pipeline([
            ('imputer', SimpleImputer(strategy='mean')),
            ('poly', PolynomialFeatures(degree=2, include_bias=False)),
            ('scaler', StandardScaler())
        ]), numeric_features),
        ('cat', Pipeline([
            ('imputer', SimpleImputer(strategy='most_frequent')),
            # Without handle_unknown='ignore' the encoder raises on any
            # category it did not see during fit, which can happen inside a
            # CV fold or on the validation/test files. With 'ignore' such
            # rows get all-zero indicator columns.
            ('encoder', OneHotEncoder(handle_unknown='ignore'))
        ]), categorical_features)
    ])

# Create the pipeline: preprocessing, then linear regression.
pipe = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
])

# Parameter grid for GridSearchCV: polynomial degrees to compare.
param_grid = {
    'preprocessor__num__poly__degree': [1, 2, 3]
}

# Run the 5-fold grid search (negative MSE scoring, all cores) and keep the
# refitted best estimator.
grid_search = GridSearchCV(pipe, param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)
grid_search.fit(X, y)
best_model = grid_search.best_estimator_

# Predict on the training data.
train_predictions = best_model.predict(X)

# Performance metrics on the training data.
train_rmse = np.sqrt(mean_squared_error(y, train_predictions))
train_r2 = r2_score(y, train_predictions)

print(f'Train RMSE: {train_rmse:.2f}')
print(f'Train R²: {train_r2:.2f}')

# Predict on the validation data.
X_valid = valid_data.drop(['median_house_value', 'id'], axis=1)
y_valid = valid_data['median_house_value']
valid_predictions = best_model.predict(X_valid)

# Performance metrics on the validation data.
valid_rmse = np.sqrt(mean_squared_error(y_valid, valid_predictions))
valid_r2 = r2_score(y_valid, valid_predictions)

print(f'Validation RMSE: {valid_rmse:.2f}')
print(f'Validation R²: {valid_r2:.2f}')

# Load the test data and drop the 'id' column.
test = pd.read_csv('../ynov-data/test_housing.csv')
X_test = test.drop('id', axis=1)

# Predict on the test data.
test_predictions = best_model.predict(X_test)

# Write the predictions into the submission file, overwriting it in place.
# NOTE(review): relies on submission.csv having one row per test row, in the
# same order - confirm.
submission = pd.read_csv('../ynov-data/submission.csv')
submission['median_house_value'] = test_predictions
submission.to_csv('../ynov-data/submission.csv', index=False)

# Display the model coefficients alongside the transformed feature names.
model = best_model.named_steps['regressor']
feature_names = best_model.named_steps['preprocessor'].get_feature_names_out()
coefficients = pd.DataFrame({'Feature': feature_names, 'Coefficient': model.coef_})

print("\nCoefficients:")
print(coefficients)

# Model Pipeline Creation
Build a robust preprocessing pipeline with targeted imputation strategies, feature transformations, and regularization options.

In [None]:
# Importer les bibliothèques nécessaires
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler, PolynomialFeatures
from sklearn.model_selection import GridSearchCV

# Load the data.
train_data = pd.read_csv('../ynov-data/train_housing_train.csv')
valid_data = pd.read_csv('../ynov-data/train_housing_valid.csv')

# Separate features and target ('id' is excluded from the features).
X = train_data.drop(['median_house_value', 'id'], axis=1)
y = train_data['median_house_value']

# Identify categorical (object dtype) and numeric features.
categorical_features = X.select_dtypes(include=['object']).columns
numeric_features = X.select_dtypes(include=['number']).columns

# Define the preprocessor.
preprocessor = ColumnTransformer(
    transformers=[
        # Numeric columns: impute with the mean, expand polynomially, scale.
        ('num', Pipeline([
            ('imputer', SimpleImputer(strategy='mean')),
            ('poly', PolynomialFeatures(degree=2, include_bias=False)),
            ('scaler', StandardScaler())
        ]), numeric_features),
        # Categorical columns: impute with the mode, then one-hot encode.
        ('cat', Pipeline([
            ('imputer', SimpleImputer(strategy='most_frequent')),
            # 'ignore' makes categories unseen at fit time encode as all
            # zeros rather than raising during CV scoring or prediction.
            ('encoder', OneHotEncoder(handle_unknown='ignore'))
        ]), categorical_features)
    ])

# Build the pipeline.
pipe = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
])

# Parameter grid for GridSearchCV (polynomial degree only).
param_grid = {
    'preprocessor__num__poly__degree': [1, 2, 3]
}

# Use GridSearchCV (5 folds, negative MSE) to find the best parameters.
grid_search = GridSearchCV(pipe, param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)
grid_search.fit(X, y)
best_model = grid_search.best_estimator_

# Predict on the training data.
train_predictions = best_model.predict(X)

# Evaluate on the training data.
# RMSE is computed as sqrt(MSE): the `squared=False` keyword of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6,
# while np.sqrt works on every version.
train_rmse = np.sqrt(mean_squared_error(y, train_predictions))
train_r2 = r2_score(y, train_predictions)
print(f'Train RMSE: {train_rmse:.2f}')
print(f'Train R²: {train_r2:.2f}')

# Prepare the validation data.
X_valid = valid_data.drop(['median_house_value', 'id'], axis=1)
y_valid = valid_data['median_house_value']

# Predict on the validation data.
valid_predictions = best_model.predict(X_valid)

# Evaluate on the validation data (same sqrt(MSE) form as above).
valid_rmse = np.sqrt(mean_squared_error(y_valid, valid_predictions))
valid_r2 = r2_score(y_valid, valid_predictions)
print(f'Validation RMSE: {valid_rmse:.2f}')
print(f'Validation R²: {valid_r2:.2f}')

# Load the test data.
test = pd.read_csv('../ynov-data/test_housing.csv')
X_test = test.drop('id', axis=1)

# Predict on the test data.
test_predictions = best_model.predict(X_test)

# Create the submission: the existing file is read, its prediction column is
# replaced, and the file is written back to the same path.
# NOTE(review): assumes submission.csv rows line up with test_housing.csv
# rows - confirm.
submission = pd.read_csv('../ynov-data/submission.csv')
submission['median_house_value'] = test_predictions
submission.to_csv('../ynov-data/submission.csv', index=False)

# Display the model coefficients.
model = best_model.named_steps['regressor']
feature_names = best_model.named_steps['preprocessor'].get_feature_names_out()
coefficients = pd.DataFrame({'Feature': feature_names, 'Coefficient': model.coef_})
print("\nCoefficients:")
print(coefficients)

# Model Training and Tuning
Implement cross-validation, grid search for hyperparameter optimization, and evaluate different polynomial degrees and regularization strengths.

In [None]:
# Importer les bibliothèques nécessaires
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler, PolynomialFeatures
from sklearn.model_selection import GridSearchCV

# Load the training and validation datasets.
train_data = pd.read_csv('../ynov-data/train_housing_train.csv')
valid_data = pd.read_csv('../ynov-data/train_housing_valid.csv')

# Features (everything but the target and 'id') and target.
X = train_data.drop(['median_house_value', 'id'], axis=1)
y = train_data['median_house_value']

# Identify numeric and categorical features from the column dtypes.
categorical_features = X.select_dtypes(include=['object']).columns
numeric_features = X.select_dtypes(include=['number']).columns

# Define the preprocessor.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', Pipeline([
            ('imputer', SimpleImputer(strategy='mean')),
            # The degree set here is only a starting value; the grid search
            # below overrides it.
            ('poly', PolynomialFeatures(degree=2, include_bias=False)),
            ('scaler', StandardScaler())
        ]), numeric_features),
        ('cat', Pipeline([
            ('imputer', SimpleImputer(strategy='most_frequent')),
            # Tolerate categories that were not present when the encoder was
            # fitted (they become all-zero rows) instead of raising an error
            # in a CV fold or at prediction time.
            ('encoder', OneHotEncoder(handle_unknown='ignore'))
        ]), categorical_features)
    ])

# Build the pipeline.
pipe = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
])

# Parameter grid for GridSearchCV.
param_grid = {
    'preprocessor__num__poly__degree': [1, 2, 3]
}

# Grid search with 5-fold cross-validation, scored on negative MSE and run
# on all available cores; best_estimator_ is the refitted winner.
grid_search = GridSearchCV(pipe, param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)
grid_search.fit(X, y)
best_model = grid_search.best_estimator_

# Predict on the training data.
train_predictions = best_model.predict(X)

# Evaluate the model on the training data.
train_rmse = np.sqrt(mean_squared_error(y, train_predictions))
train_r2 = r2_score(y, train_predictions)
print(f'Train RMSE: {train_rmse:.2f}')
print(f'Train R²: {train_r2:.2f}')

# Prepare the validation data.
X_valid = valid_data.drop(['median_house_value', 'id'], axis=1)
y_valid = valid_data['median_house_value']

# Predict on the validation data.
valid_predictions = best_model.predict(X_valid)

# Evaluate the model on the validation data.
valid_rmse = np.sqrt(mean_squared_error(y_valid, valid_predictions))
valid_r2 = r2_score(y_valid, valid_predictions)
print(f'Validation RMSE: {valid_rmse:.2f}')
print(f'Validation R²: {valid_r2:.2f}')

# Load the test data.
test = pd.read_csv('../ynov-data/test_housing.csv')
X_test = test.drop('id', axis=1)

# Predict on the test data.
test_predictions = best_model.predict(X_test)

# Prepare the submission file: overwrite its prediction column and save it
# back to the same location.
# NOTE(review): row order of submission.csv is assumed to match the test
# file - confirm.
submission = pd.read_csv('../ynov-data/submission.csv')
submission['median_house_value'] = test_predictions
submission.to_csv('../ynov-data/submission.csv', index=False)

# Display the model coefficients with their transformed feature names.
model = best_model.named_steps['regressor']
feature_names = best_model.named_steps['preprocessor'].get_feature_names_out()
coefficients = pd.DataFrame({'Feature': feature_names, 'Coefficient': model.coef_})
print("\nCoefficients:")
print(coefficients)

# Model Evaluation and Predictions
Calculate performance metrics, analyze residuals, identify influential features, and generate final predictions for test data.

In [None]:
# Importer les bibliothèques nécessaires
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler, PolynomialFeatures
from sklearn.model_selection import GridSearchCV

# Load the data.
train_data = pd.read_csv('../ynov-data/train_housing_train.csv')
valid_data = pd.read_csv('../ynov-data/train_housing_valid.csv')

# Separate the features from the target; 'id' is not used as a feature.
X = train_data.drop(['median_house_value', 'id'], axis=1)
y = train_data['median_house_value']

# Identify the categorical and numeric features by dtype.
categorical_features = X.select_dtypes(include=['object']).columns
numeric_features = X.select_dtypes(include=['number']).columns

# Define the preprocessor.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', Pipeline([
            ('imputer', SimpleImputer(strategy='mean')),
            ('poly', PolynomialFeatures(degree=2, include_bias=False)),
            ('scaler', StandardScaler())
        ]), numeric_features),
        ('cat', Pipeline([
            ('imputer', SimpleImputer(strategy='most_frequent')),
            # The default handle_unknown='error' raises when transform meets
            # a category absent at fit time; 'ignore' encodes it as all
            # zeros so CV scoring and validation/test prediction cannot fail
            # on it.
            ('encoder', OneHotEncoder(handle_unknown='ignore'))
        ]), categorical_features)
    ])

# Build the pipeline.
pipe = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
])

# Parameter grid for GridSearchCV: candidate polynomial degrees.
param_grid = {
    'preprocessor__num__poly__degree': [1, 2, 3]
}

# Use GridSearchCV to find the best parameters (5 folds, negative MSE).
grid_search = GridSearchCV(pipe, param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)
grid_search.fit(X, y)
best_model = grid_search.best_estimator_

# Predict on the training data.
train_predictions = best_model.predict(X)

# Evaluate the model on the training data.
# np.sqrt(MSE) replaces mean_squared_error(..., squared=False): that keyword
# was deprecated in scikit-learn 1.4 and removed in 1.6.
train_rmse = np.sqrt(mean_squared_error(y, train_predictions))
train_r2 = r2_score(y, train_predictions)

print(f'Train RMSE: {train_rmse:.2f}')
print(f'Train R²: {train_r2:.2f}')

# Predict on the validation data.
X_valid = valid_data.drop(['median_house_value', 'id'], axis=1)
y_valid = valid_data['median_house_value']
valid_predictions = best_model.predict(X_valid)

# Evaluate the model on the validation data (same sqrt(MSE) form).
valid_rmse = np.sqrt(mean_squared_error(y_valid, valid_predictions))
valid_r2 = r2_score(y_valid, valid_predictions)

print(f'Validation RMSE: {valid_rmse:.2f}')
print(f'Validation R²: {valid_r2:.2f}')

# Load the test data.
test = pd.read_csv('../ynov-data/test_housing.csv')
X_test = test.drop('id', axis=1)

# Predict on the test data.
test_predictions = best_model.predict(X_test)

# Create the submission file: read the existing file, replace its
# prediction column, and overwrite the file in place.
# NOTE(review): assumes submission.csv is row-aligned with test_housing.csv
# - confirm.
submission = pd.read_csv('../ynov-data/submission.csv')
submission['median_house_value'] = test_predictions
submission.to_csv('../ynov-data/submission.csv', index=False)

# Display the model coefficients.
model = best_model.named_steps['regressor']
feature_names = best_model.named_steps['preprocessor'].get_feature_names_out()
coefficients = pd.DataFrame({'Feature': feature_names, 'Coefficient': model.coef_})

print("\nCoefficients:")
print(coefficients)