In [7]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import Ridge
# from tensorflow.keras.callbacks import EarlyStopping
# from tensorflow.keras.models import Sequential
# from tensorflow.keras.layers import Dense
# from tensorflow.keras.preprocessing.image import ImageDataGenerator

# Load the data
data = pd.read_csv('data.csv')
X = data.drop('MEDV', axis=1)
y = data['MEDV']

# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# 1. Cross-validation
rf = RandomForestRegressor()
scores = cross_val_score(rf, X_train, y_train, cv=5, scoring='r2')
print("Cross-validation R-squared scores:", scores)

# 2. Regularization
ridge = Ridge(alpha=0.1)
ridge.fit(X_train, y_train)
coefficients = pd.DataFrame({'Feature': X.columns, 'Coefficient': ridge.coef_})
print(coefficients)

# 3. Early stopping
# model = Sequential([
#     Dense(32, activation='relu', input_shape=(X_train.shape[1],)),
#     Dense(1)
# ])
# model.compile(optimizer='adam', loss='mse', metrics=['mae'])
# early_stop = EarlyStopping(monitor='val_loss', patience=10)
# history = model.fit(X_train, y_train, epochs=100, validation_split=0.2, callbacks=[early_stop])

# 4. Data augmentation
# datagen = ImageDataGenerator(rotation_range=10, zoom_range=0.2, horizontal_flip=True)
# train_generator = datagen.flow(X_train, y_train, batch_size=32)
# history = model.fit(train_generator, epochs=100, validation_data=(X_val, y_val))

# 5. Model selection
rf = RandomForestRegressor()
param_grid_rf = {'n_estimators': [50, 100, 200], 'max_depth': [None, 5, 10]}
grid_rf = GridSearchCV(rf, param_grid_rf, cv=5)
grid_rf.fit(X_train, y_train)
print("Best random forest model:", grid_rf.best_estimator_)

# Fit the final model
model_rf = grid_rf.best_estimator_
model_rf.fit(X_train, y_train)

# Evaluate the final model
y_pred = model_rf.predict(X_val)
mse = mean_squared_error(y_val, y_pred)
rmse = mse ** 0.5
r2 = r2_score(y_val, y_pred)
print("Mean squared error:", mse)
print("RMSE:", rmse)
print("R-squared:", r2)


Cross-validation R-squared scores: [0.83814915 0.73336192 0.84521898 0.88734874 0.8079846 ]
    Feature  Coefficient
0      CRIM    -0.112400
1        ZN     0.030459
2     INDUS     0.034896
3      CHAS     2.750333
4       NOX   -15.924459
5        RM     4.445779
6       AGE    -0.007305
7       DIS    -1.429608
8       RAD     0.260043
9       TAX    -0.010780
10  PTRATIO    -0.900771
11        B     0.012400
12    LSTAT    -0.510902
Best random forest model: RandomForestRegressor(max_depth=10, n_estimators=200)
Mean squared error: 8.88945094175975
RMSE: 2.9815182276417076
R-squared: 0.8787809579886864
