In [None]:
from sklearn.datasets import fetch_california_housing

# Fetch the California housing dataset bundle.
california_housing = fetch_california_housing()

# Print the bundled description first, then the feature column names.
print(california_housing.DESCR)
print(california_housing.feature_names)

# Unpack the feature matrix and the target vector from the bundle.
X, y = california_housing.data, california_housing.target

# Sanity check: report the dimensions of both arrays.
print(X.shape, y.shape)


In [None]:
import pandas as pd
import numpy as np

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.datasets import fetch_california_housing
from sklearn.preprocessing import StandardScaler

In [None]:
# Build the working DataFrame: one column per feature, plus the target
# appended as the final "MedHouseVal" column.
df = pd.DataFrame(
    data=california_housing.data,
    columns=california_housing.feature_names,
).assign(MedHouseVal=california_housing.target)

# Preview the top rows as plain text.
print(df.head())

In [None]:
# Exploratory Data Analysis (EDA)
# DataFrame.info() writes its report (dtypes, non-null counts) to stdout itself
# and returns None, so wrapping it in print() only appended a stray "None" line.
df.info()

In [None]:
# Print the DataFrame.describe() table for df as plain text.
print(df.describe())  # Summary statistics

In [None]:
# Count the missing entries in each column.
print(df.isna().sum())

In [None]:
# Handle missing values (if any): discard every row that has at least one.
# The California housing dataset typically has no missing values.
df = df.dropna(axis=0, how="any")

In [None]:
# Feature Engineering: Scaling numerical features
# Select the feature columns by name instead of assuming the target is the
# last column of df.
feature_cols = [col for col in df.columns if col != "MedHouseVal"]

scaler = StandardScaler()
scaled_features = scaler.fit_transform(df[feature_cols])

# Reuse df's index for the scaled frame. The target is assigned by index label
# below, so a fresh 0..n-1 index would misalign it (NaNs / wrong rows) whenever
# the earlier dropna() removed rows and left gaps in df's index.
df_scaled = pd.DataFrame(scaled_features, columns=feature_cols, index=df.index)
df_scaled["MedHouseVal"] = df["MedHouseVal"]

In [None]:
# Correlation Matrix Visualization
# Annotated heatmap of the pairwise correlations between all columns of df.
fig, ax = plt.subplots(figsize=(10, 6))
sns.heatmap(df.corr(), annot=True, fmt=".2f", cmap="coolwarm", ax=ax)
ax.set_title("Feature Correlation Heatmap")
plt.show()

In [None]:
# Pairplot to visualize relationships with the target variable
# Plot a subsample to keep the grid fast. The sample is seeded so the figure is
# identical on every run, and capped at len(df) so a smaller frame does not
# make DataFrame.sample raise.
pairplot_sample = df.sample(n=min(500, len(df)), random_state=42)
sns.pairplot(pairplot_sample, diag_kind="kde", plot_kws={"alpha": 0.5})
plt.show()

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split,GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

In [None]:
# Separate the target column (y) from the remaining feature columns (X).
y = df["MedHouseVal"]
X = df.drop(columns="MedHouseVal")

In [None]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

In [None]:
# Feature scaling
# Fit the scaler on the training split only, then apply that same fitted
# transform to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
# Baseline estimators to compare, keyed by the label used in the results table.
models = dict([
    ("Linear Regression", LinearRegression()),
    ("Decision Tree", DecisionTreeRegressor(random_state=42)),
    ("Random Forest", RandomForestRegressor(n_estimators=100, random_state=42)),
    ("XGBoost", XGBRegressor(objective="reg:squarederror", n_estimators=100, random_state=42)),
])

In [None]:
# Fit each baseline model on the scaled training data and score it on the
# scaled test data, collecting one metrics record per model.
results = []
for name, model in models.items():
    y_pred = model.fit(X_train_scaled, y_train).predict(X_test_scaled)
    results.append(
        {
            "Model": name,
            "RMSE": np.sqrt(mean_squared_error(y_test, y_pred)),
            "MAE": mean_absolute_error(y_test, y_pred),
            "R² Score": r2_score(y_test, y_pred),
        }
    )

In [None]:
# Tabulate the metric records and order them by RMSE, lowest first.
results_df = pd.DataFrame(results)
results_df = results_df.sort_values("RMSE", ascending=True)
print(results_df)

In [None]:
# Bar chart of RMSE per baseline model, using the rows of results_df.
fig, ax = plt.subplots(figsize=(10, 5))
sns.barplot(
    data=results_df,
    x="Model",
    y="RMSE",
    hue="Model",
    palette="viridis",
    legend=False,
    ax=ax,
)
ax.set_title("Model Comparison: RMSE")
ax.set_ylabel("RMSE")
# Tilt the model names so long labels do not overlap.
plt.setp(ax.get_xticklabels(), rotation=45)
plt.show()

In [None]:
### Hyperparameter tuning using GridSearchCV and RandomizedSearchCV

# Random Forest: randomized search that samples 10 combinations from the grid
# below, scored by negative MSE with 3-fold cross-validation.
rf_params = dict(
    n_estimators=[50, 100, 200],
    max_depth=[10, 20, None],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

rf = RandomForestRegressor(random_state=42)
rf_search = RandomizedSearchCV(
    estimator=rf,
    param_distributions=rf_params,
    n_iter=10,
    scoring="neg_mean_squared_error",
    cv=3,
    random_state=42,
    n_jobs=-1,
)
rf_search.fit(X_train_scaled, y_train)

# Keep the best estimator found by the search for later evaluation and saving.
best_rf = rf_search.best_estimator_

In [None]:
# XGBoost: exhaustive grid search over every combination below, scored by
# negative MSE with 3-fold cross-validation.
xgb_params = dict(
    n_estimators=[50, 100, 200],
    learning_rate=[0.01, 0.1, 0.2],
    max_depth=[3, 6, 10],
    subsample=[0.7, 0.8, 1.0],
)

xgb = XGBRegressor(objective="reg:squarederror", random_state=42)
xgb_search = GridSearchCV(
    estimator=xgb,
    param_grid=xgb_params,
    scoring="neg_mean_squared_error",
    cv=3,
    n_jobs=-1,
)
xgb_search.fit(X_train_scaled, y_train)

# Keep the best estimator found by the search for later evaluation and saving.
best_xgb = xgb_search.best_estimator_

In [None]:
# Score the tuned estimators on the scaled test split. They are already
# fitted by their searches, so this loop only predicts.
models = dict([
    ("Optimized Random Forest", best_rf),
    ("Optimized XGBoost", best_xgb),
])

results = []
for name, model in models.items():
    y_pred = model.predict(X_test_scaled)
    results.append(
        {
            "Model": name,
            "RMSE": np.sqrt(mean_squared_error(y_test, y_pred)),
            "MAE": mean_absolute_error(y_test, y_pred),
            "R² Score": r2_score(y_test, y_pred),
        }
    )

In [None]:
# Tabulate the tuned-model metric records and order them by RMSE, lowest first.
results_df = pd.DataFrame(results)
results_df = results_df.sort_values("RMSE", ascending=True)
print(results_df)

In [None]:
# Bar chart of RMSE per tuned model, using the rows of results_df.
fig, ax = plt.subplots(figsize=(10, 5))
sns.barplot(
    data=results_df,
    x="Model",
    y="RMSE",
    hue="Model",
    palette="viridis",
    legend=False,
    ax=ax,
)
ax.set_title("Optimized Model Comparison: RMSE")
ax.set_ylabel("RMSE")
# Tilt the model names so long labels do not overlap.
plt.setp(ax.get_xticklabels(), rotation=45)
plt.show()

In [None]:
import pickle
import joblib

# Save using Pickle
with open("random_forest_model.pkl", "wb") as f:
    pickle.dump(best_rf, f)

with open("xgboost_model.pkl", "wb") as f:
    pickle.dump(best_xgb, f)

# Save using Joblib (recommended for large models)
joblib.dump(best_rf, "random_forest_model.joblib")
joblib.dump(best_xgb, "xgboost_model.joblib")

# Persist the fitted scaler as well: both models were trained on
# scaler-transformed features, so anything that reloads a model for inference
# needs the very same transform to prepare its inputs.
joblib.dump(scaler, "scaler.joblib")

print("Models saved successfully!")


In [None]:
!pip install fastapi uvicorn nest-asyncio pyngrok joblib pydantic scikit-learn



In [None]:
from fastapi import FastAPI
import joblib
import numpy as np
from pydantic import BaseModel

In [None]:
# Load trained model and scaler
import os

# NOTE: joblib.load unpickles the file, which can execute arbitrary code;
# only load artifacts produced by this notebook or another trusted source.
model = joblib.load("random_forest_model.joblib")  # Load the optimized model

# The prediction endpoint transforms every request with `scaler`. Prefer a
# persisted scaler when one is on disk; otherwise fall back to the scaler that
# is already fitted in this kernel, and fail here with a clear message rather
# than with a NameError on the first request if neither exists.
if os.path.exists("scaler.joblib"):
    scaler = joblib.load("scaler.joblib")
elif "scaler" not in globals():
    raise RuntimeError(
        "No fitted scaler available: run the feature-scaling cell first or "
        "provide a 'scaler.joblib' file next to the model."
    )

In [None]:
# Initialize FastAPI app
# `app` is the application object that the /predict route is registered on.
app = FastAPI()

In [None]:
# Define request model
# Request body for POST /predict: eight float fields, declared in the same
# order in which predict_price() lays them out in the model's input row.
# NOTE(review): the field names presumably mirror
# california_housing.feature_names used for training — confirm.
class HouseFeatures(BaseModel):
    MedInc: float
    HouseAge: float
    AveRooms: float
    AveBedrms: float
    Population: float
    AveOccup: float
    Latitude: float
    Longitude: float

In [None]:
# Define prediction endpoint
@app.post("/predict")
def predict_price(features: HouseFeatures):
    """Predict the target value for one set of housing features.

    Args:
        features: Request body validated against ``HouseFeatures``.

    Returns:
        A dict with the single key ``"predicted_price"`` holding the model's
        prediction for the submitted row as a built-in ``float``.
    """
    # Single-row 2D array; the column order follows the HouseFeatures fields.
    input_data = np.array([[features.MedInc, features.HouseAge, features.AveRooms,
                            features.AveBedrms, features.Population, features.AveOccup,
                            features.Latitude, features.Longitude]])

    # Apply the module-level fitted `scaler` before predicting, since the model
    # was trained on scaled features.
    input_scaled = scaler.transform(input_data)

    # Make prediction
    prediction = model.predict(input_scaled)

    # predict() returns a NumPy array; convert its single element to a built-in
    # float so the JSON response does not depend on the NumPy scalar type the
    # loaded model happens to produce (e.g. float32 vs float64).
    return {"predicted_price": float(prediction[0])}

In [None]:
!pip install uvicorn

In [None]:
import asyncio
import threading

import nest_asyncio
import uvicorn

# Allow running Uvicorn inside Colab
nest_asyncio.apply()

# The command-line form `uvicorn app:app` imports a module file named app.py,
# which this notebook never writes; the FastAPI instance exists only in this
# kernel. Serve that in-memory object directly instead of shelling out.
server = uvicorn.Server(uvicorn.Config(app, host="0.0.0.0", port=8000))


def _serve_in_background():
    """Run the Uvicorn server on an event loop owned by the calling thread."""
    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)
    loop.run_until_complete(server.serve())


# Run the FastAPI app in the background: a daemon thread keeps the notebook
# interactive and does not block kernel shutdown. Set `server.should_exit = True`
# to stop the server; re-running this cell while it is still up will fail to
# bind port 8000 again.
server_thread = threading.Thread(
    target=_serve_in_background, name="uvicorn-server", daemon=True
)
server_thread.start()
