In [53]:
def create_new_features(df):
    df = df.copy()
    # Create derived ratio features
    df['rooms_per_household'] = df['total_rooms'] / df['households']
    df['bedrooms_per_room'] = df['total_bedrooms'] / df['total_rooms']
    df['population_per_household'] = df['population'] / df['households']
    df['income_per_person'] = df['median_income'] / df['population']
    df['bedrooms_per_household'] = df['total_bedrooms'] / df['households']

    # Handle missing values in total_bedrooms
    df['total_bedrooms'].fillna(df['total_bedrooms'].median(), inplace=True)
    df['missing_bedrooms'] = df['total_bedrooms'].isnull().astype(int)

    # Encoder ocean_proximity de manière plus robuste
    ocean_categories = ['INLAND', 'NEAR BAY', 'NEAR OCEAN', '<1H OCEAN']
    
    # Créer une colonne pour chaque catégorie
    for category in ocean_categories:
        df[f'ocean_proximity_{category}'] = (df['ocean_proximity'] == category).astype(int)
    
    # Supprimer la colonne originale
    df = df.drop('ocean_proximity', axis=1)
    
    return df

In [54]:

from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error, root_mean_squared_error

# Charger les données d'entraînement, de validation et de test
train_data = pd.read_csv('./ynov-data/train_housing_train.csv')
valid_data = pd.read_csv('./ynov-data/train_housing_valid.csv')
test_data = pd.read_csv('./ynov-data/test_housing.csv')

# Appliquer les transformations
train_data = create_new_features(train_data)
valid_data = create_new_features(valid_data)
test_data = create_new_features(test_data)


# Créer un pipeline avec un imputeur et le modèle
pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),  # Impute les valeurs manquantes avec la moyenne
    ('model', LinearRegression())
])

# Mettre à jour X_train avec les nouvelles colonnes
X_train = train_data[[ 
    'longitude', 'latitude', 'median_income',   
    'rooms_per_household', 'bedrooms_per_room',
    'population_per_household','ocean_proximity_INLAND',
    'ocean_proximity_NEAR BAY', 
    'ocean_proximity_NEAR OCEAN',
    'ocean_proximity_<1H OCEAN'
]]
y_train = train_data['median_house_value']

# Entraîner le modèle avec le pipeline
pipeline.fit(X_train, y_train)

# Afficher les coefficients du modèle
print("Coefficients:", pipeline.named_steps['model'].coef_)
print("Intercept:", pipeline.named_steps['model'].intercept_)

Coefficients: [-2.91991309e+04 -2.73310382e+04  4.14411131e+04  3.22263012e+03
  3.36676677e+05 -2.71133112e+02 -1.69727463e+05 -1.17716262e+05
 -1.22002084e+05 -1.32284177e+05]
Intercept: -2418243.408941365


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['total_bedrooms'].fillna(df['total_bedrooms'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['total_bedrooms'].fillna(df['total_bedrooms'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because th

In [55]:
# Évaluation sur les données de validation
X_valid = valid_data[[ 
    'longitude', 'latitude', 'median_income',   
    'rooms_per_household', 'bedrooms_per_room',
    'population_per_household','ocean_proximity_INLAND',
    'ocean_proximity_NEAR BAY', 
    'ocean_proximity_NEAR OCEAN',
    'ocean_proximity_<1H OCEAN'
]]
y_valid = valid_data['median_house_value']

# Utiliser le pipeline pour faire des prédictions
valid_predictions = pipeline.predict(X_valid)  # Utiliser le pipeline pour prédire

mae_valid = mean_absolute_error(y_valid, valid_predictions)
print("\nMean Absolute Error sur les données de validation:", mae_valid)
rmse_valid = mean_squared_error(y_valid, valid_predictions, squared=False)
print("\nRoot Mean Squared Error sur les données de validation:", rmse_valid)


Mean Absolute Error sur les données de validation: 52301.458058159864

Root Mean Squared Error sur les données de validation: 72865.63496156107




In [56]:
# Prédictions sur les données de test
X_test = test_data[[ 
    'longitude', 'latitude', 'median_income',   
    'rooms_per_household', 'bedrooms_per_room',
    'population_per_household','ocean_proximity_INLAND',
    'ocean_proximity_NEAR BAY', 
    'ocean_proximity_NEAR OCEAN',
    'ocean_proximity_<1H OCEAN'
]]

# Utiliser pipeline.predict au lieu de model.predict
test_predictions = pipeline.predict(X_test)
print("\nPremières prédictions sur les données de test:", test_predictions[:5])

# Créer un DataFrame avec les prédictions et les ID pour le fichier de soumission
submission = pd.DataFrame({'id': test_data['id'], 'median_house_value': test_predictions})
print(submission)

# Enregistrer le DataFrame dans un fichier CSV
submission.to_csv('./ynov-data/submit.csv', index=False)



Premières prédictions sur les données de test: [313169.81602139 215581.67050806 222079.25201596 207459.04491776
 215262.07416024]
         id  median_house_value
0         3       313169.816021
1        10       215581.670508
2        11       222079.252016
3        12       207459.044918
4        13       215262.074160
...     ...                 ...
8635  20635        24574.314350
8636  20636        69221.831168
8637  20637        32923.557279
8638  20638        44815.746724
8639  20639        65787.293890

[8640 rows x 2 columns]
