In [1]:
import pandas as pd
# Load the CSV that every later cell reads from; the path is relative to the
# notebook's working directory.
data = pd.read_csv(filepath_or_buffer='modefied.csv')

In [2]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import RFE
from sklearn.model_selection import train_test_split

In [3]:
# Candidate predictor columns and the regression target, both looked up by
# column name in `data`.
Features1 = [
    'Avg. Area Income',
    'Avg. Area House Age',
    'Avg. Area Number of Rooms',
    'Avg. Area Number of Bedrooms',
    'Area Population',
    'Address',
    'Total_Rooms',
]
target = 'Price'

# X holds the candidate predictors (DataFrame); y holds the target (Series).
X, y = data[Features1], data[target]

In [4]:
# Hold out 20% of the rows before any feature selection so the test split
# never influences which features are kept.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Selector 1: features whose absolute correlation with the target exceeds 0.5.
# Computed on the training split only, like the two model-based selectors
# below; using the full X/y here would leak test rows into the selection.
# (Despite its name this is a Series of per-feature correlations; the name is
# kept so other cells can still refer to it.)
correlation_matrix = X_train.corrwith(y_train)
highly_correlated_features = correlation_matrix[correlation_matrix.abs() > 0.5].index.tolist()

# Selector 2: the three features with the largest random-forest importances.
random_forest = RandomForestRegressor(n_estimators=100, random_state=42)
random_forest.fit(X_train, y_train)
feature_importances = pd.Series(random_forest.feature_importances_, index=X.columns)
important_features = feature_importances.nlargest(3).index.tolist()  # Select top 3 features

# Selector 3: recursive feature elimination down to three features.
estimator = RandomForestRegressor(n_estimators=100, random_state=42)
rfe = RFE(estimator, n_features_to_select=3)  # Select top 3 features
rfe.fit(X_train, y_train)
rfe_selected_features = X.columns[rfe.support_]

# Keep only the features chosen by all three selectors. Iterating over
# X.columns gives the result a stable order (the original column order)
# instead of the arbitrary order of a set intersection.
selected_by_all = (
    set(highly_correlated_features)
    & set(important_features)
    & set(rfe_selected_features)
)
final_selected_features = [column for column in X.columns if column in selected_by_all]
print("Final Selected Features:", final_selected_features)

Final Selected Features: ['Avg. Area Income']


In [7]:
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_absolute_error, r2_score

In [8]:
# Narrow the predictors to a single column and redo the 80/20 split with the
# same seed. Note that this rebinds Features1 and the X_train/X_test/
# y_train/y_test names used by the cells that follow.
Features1 = ['Avg. Area Income']
target = 'Price'
X2, y2 = data[Features1], data[target]
X_train, X_test, y_train, y_test = train_test_split(
    X2,
    y2,
    test_size=0.2,
    random_state=42,
)

In [9]:
from sklearn.linear_model import LinearRegression

# Ordinary least-squares baseline: fit on the training split, score on the
# held-out split. `fit` returns the estimator itself, so it can be chained.
linear_reg_model = LinearRegression().fit(X_train, y_train)
linear_reg_preds = linear_reg_model.predict(X_test)

linear_reg_mae, linear_reg_r2 = (
    mean_absolute_error(y_test, linear_reg_preds),
    r2_score(y_test, linear_reg_preds),
)

print("Linear Regression MAE:", linear_reg_mae)
print("\nLinear Regression R-squared:", linear_reg_r2)

Linear Regression MAE: 0.614095764880655

Linear Regression R-squared: 0.39694865192619766


In [10]:
# Template estimator for the grid search. It is deliberately not fitted here:
# GridSearchCV clones the estimator for every candidate/fold and, with the
# default refit=True, retrains the best configuration on the full training
# split, so a separate fit beforehand would be discarded work.
random_forest = RandomForestRegressor(random_state=42)

# 3 x 3 x 3 = 27 candidate settings, each scored with 5-fold CV on MAE.
rf_param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10]
}
rf_grid_search = GridSearchCV(
    estimator=random_forest,
    param_grid=rf_param_grid,
    cv=5,
    scoring='neg_mean_absolute_error',
)
rf_grid_search.fit(X_train, y_train)

# Best configuration, already refitted on all of X_train / y_train.
best_rf_model = rf_grid_search.best_estimator_

# Score the tuned model on the held-out split.
rf_preds = best_rf_model.predict(X_test)
rf_mae = mean_absolute_error(y_test, rf_preds)
print("Random Forest MAE:", rf_mae)
rf_r2 = r2_score(y_test, rf_preds)
print("Random Forest R-squared:", rf_r2)

Random Forest MAE: 0.6274762585121568
Random Forest R-squared: 0.364879646731914


In [11]:
# Template estimator for the randomized search. It is deliberately not fitted
# here: RandomizedSearchCV clones the estimator for every sampled candidate
# and, with the default refit=True, retrains the best configuration on the
# full training split, so a separate fit beforehand would be discarded work.
gradient_boosting = GradientBoostingRegressor(random_state=42)

# 10 settings are sampled (seeded) from the 27 possible combinations and each
# is scored with 5-fold CV on MAE.
gb_param_dist = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.05, 0.1, 0.2],
    'max_depth': [3, 4, 5]
}
gb_random_search = RandomizedSearchCV(
    estimator=gradient_boosting,
    param_distributions=gb_param_dist,
    n_iter=10,
    cv=5,
    scoring='neg_mean_absolute_error',
    random_state=42,
)
gb_random_search.fit(X_train, y_train)

# Best sampled configuration, already refitted on all of X_train / y_train.
best_gb_model = gb_random_search.best_estimator_

# Score the tuned model on the held-out split.
gb_preds = best_gb_model.predict(X_test)
gb_mae = mean_absolute_error(y_test, gb_preds)
print("Gradient Boosting MAE:", gb_mae)
gb_r2 = r2_score(y_test, gb_preds)
print("Gradient Boosting R-squared:", gb_r2)

Gradient Boosting MAE: 0.6161802678828084
Gradient Boosting R-squared: 0.38822110155851586


In [12]:
# Support-vector regressor with a linear kernel: fit on the training split,
# then score on the held-out split. `fit` returns the estimator itself.
svm_model = SVR(kernel='linear').fit(X_train, y_train)
svm_preds = svm_model.predict(X_test)

svm_mae = mean_absolute_error(y_test, svm_preds)
svm_r2 = r2_score(y_test, svm_preds)

print("SVM MAE:", svm_mae)
print("SVM R-squared:", svm_r2)

SVM MAE: 0.6141761939659601
SVM R-squared: 0.39649575674037885


In [13]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import cross_val_score, KFold
def evaluate_model(model, X, y):
    """Score a fitted regressor on ``X``/``y`` and cross-validate it on the same data.

    Parameters
    ----------
    model : fitted estimator
        Must already be fitted, because ``model.predict(X)`` is called directly.
    X : feature data accepted by ``model.predict`` and ``cross_val_score``.
    y : true target values aligned with ``X``.

    Returns
    -------
    tuple
        ``(mae, rmse, r2, cross_val_mae)`` where the first three come from
        the already-fitted ``model`` and ``cross_val_mae`` is the mean MAE
        over a shuffled, seeded 5-fold split of ``X``/``y``.

    Notes
    -----
    ``cross_val_score`` fits fresh clones of ``model`` on folds of the data
    passed in; it does not reuse the fitted ``model``. If a held-out test
    split is passed as ``X``/``y``, the cross-validated models are therefore
    trained on test rows only.
    """
    predictions = model.predict(X)

    mae = mean_absolute_error(y, predictions)
    # The `squared=False` keyword was deprecated in scikit-learn 1.4 and
    # removed in 1.6. Taking the square root of the MSE gives the same RMSE
    # for a single target and works on every scikit-learn version.
    rmse = mean_squared_error(y, predictions) ** 0.5
    r2 = r2_score(y, predictions)

    # The scorer returns negated MAE (higher is better), so flip the sign
    # back to report a positive error.
    kfold = KFold(n_splits=5, shuffle=True, random_state=42)
    cross_val_mae = -cross_val_score(model, X, y, cv=kfold, scoring='neg_mean_absolute_error').mean()

    return mae, rmse, r2, cross_val_mae


# Evaluate each fitted model on the held-out split. The per-model result
# names are assigned explicitly so they remain available to other cells.
linear_reg_mae, linear_reg_rmse, linear_reg_r2, linear_reg_cross_val_mae = evaluate_model(
    linear_reg_model, X_test, y_test
)
rf_mae, rf_rmse, rf_r2, rf_cross_val_mae = evaluate_model(
    best_rf_model, X_test, y_test
)
gb_mae, gb_rmse, gb_r2, gb_cross_val_mae = evaluate_model(
    best_gb_model, X_test, y_test
)
svm_mae, svm_rmse, svm_r2, svm_cross_val_mae = evaluate_model(
    svm_model, X_test, y_test
)

# One (header, metrics) entry per model, in print order. Every header after
# the first starts with a newline so the sections are separated by a blank line.
report_sections = (
    ("Linear Regression Results:",
     (linear_reg_mae, linear_reg_rmse, linear_reg_r2, linear_reg_cross_val_mae)),
    ("\nRandom Forest Results:",
     (rf_mae, rf_rmse, rf_r2, rf_cross_val_mae)),
    ("\nGradient Boosting Results:",
     (gb_mae, gb_rmse, gb_r2, gb_cross_val_mae)),
    ("\nSVM Results:",
     (svm_mae, svm_rmse, svm_r2, svm_cross_val_mae)),
)
metric_labels = ("MAE:", "RMSE:", "R-squared:", "Cross-Validation MAE:")

for section_header, section_values in report_sections:
    print(section_header)
    for metric_label, metric_value in zip(metric_labels, section_values):
        print(metric_label, metric_value)


Linear Regression Results:
MAE: 0.614095764880655
RMSE: 0.7714557600514983
R-squared: 0.39694865192619766
Cross-Validation MAE: 0.6178299742194231

Random Forest Results:
MAE: 0.6274762585121568
RMSE: 0.7917022787557104
R-squared: 0.364879646731914
Cross-Validation MAE: 0.6734429490816606

Gradient Boosting Results:
MAE: 0.6161802678828084
RMSE: 0.7770180836289108
R-squared: 0.38822110155851586
Cross-Validation MAE: 0.6322871758616422

SVM Results:
MAE: 0.6141761939659601
RMSE: 0.7717453896388473
R-squared: 0.39649575674037885
Cross-Validation MAE: 0.6195717939351397
