### Subtask 1: Load the Dataset
- Load the housing dataset from the specified CSV file into a pandas DataFrame.
- Display the first 5 rows to understand its structure.


In [1]:
import pandas as pd

# Load the housing dataset
file_path = r'F:\ITShoulders\AI_Data_Science_agent\temp_uploads\housing.csv'
housing_df = pd.read_csv(file_path)

# Display the first 5 rows of the dataset
housing_df.head()


Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


### Subtask 2: Initial Data Exploration
- Identify and summarize any missing values in each column.


In [2]:
# Check for missing values in each column
missing_values_summary = housing_df.isnull().sum()
missing_values_summary


longitude               0
latitude                0
housing_median_age      0
total_rooms             0
total_bedrooms        207
population              0
households              0
median_income           0
median_house_value      0
ocean_proximity         0
dtype: int64

### Subtask 3: Statistical Summaries
- Calculate and display basic statistical summaries for numerical columns to gain insights into data distribution.


In [3]:
# Display basic statistical summaries for numerical columns
numerical_summary = housing_df.describe()
numerical_summary


Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
count,20640.0,20640.0,20640.0,20640.0,20433.0,20640.0,20640.0,20640.0,20640.0
mean,-119.569704,35.631861,28.639486,2635.763081,537.870553,1425.476744,499.53968,3.870671,206855.816909
std,2.003532,2.135952,12.585558,2181.615252,421.38507,1132.462122,382.329753,1.899822,115395.615874
min,-124.35,32.54,1.0,2.0,1.0,3.0,1.0,0.4999,14999.0
25%,-121.8,33.93,18.0,1447.75,296.0,787.0,280.0,2.5634,119600.0
50%,-118.49,34.26,29.0,2127.0,435.0,1166.0,409.0,3.5348,179700.0
75%,-118.01,37.71,37.0,3148.0,647.0,1725.0,605.0,4.74325,264725.0
max,-114.31,41.95,52.0,39320.0,6445.0,35682.0,6082.0,15.0001,500001.0


### Subtask 4: Data Visualization
- Create visualizations, such as histograms or box plots, for key numerical variables to understand their distributions and potential impact on house prices.


In [4]:
import matplotlib.pyplot as plt

# Plot histograms for key numerical variables
housing_df.hist(bins=50, figsize=(20, 15))
plt.tight_layout()
plt.show()


  plt.show()


### Subtask 5: Data Preprocessing
- Address missing values, encode categorical variables, and normalize or standardize numerical features as necessary for model training.


In [5]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Handle missing values for 'total_bedrooms'
imputer = SimpleImputer(strategy='median')
housing_df['total_bedrooms'] = imputer.fit_transform(housing_df[['total_bedrooms']])

# Encode categorical variables 'ocean_proximity'
encoder = OneHotEncoder(sparse_output=False)
encoded_features = encoder.fit_transform(housing_df[['ocean_proximity']])
encoded_features_df = pd.DataFrame(encoded_features, columns=encoder.get_feature_names_out(['ocean_proximity']))

# Drop the original categorical column and add the encoded features
housing_df = housing_df.drop('ocean_proximity', axis=1)
housing_df = pd.concat([housing_df, encoded_features_df], axis=1)

# Standardize numerical features
scaler = StandardScaler()
numerical_features = ['longitude', 'latitude', 'housing_median_age', 'total_rooms',
                      'total_bedrooms', 'population', 'households', 'median_income']
housing_df[numerical_features] = scaler.fit_transform(housing_df[numerical_features])

# Display the preprocessed dataset
housing_df.head()


Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity_<1H OCEAN,ocean_proximity_INLAND,ocean_proximity_ISLAND,ocean_proximity_NEAR BAY,ocean_proximity_NEAR OCEAN
0,-1.327835,1.052548,0.982143,-0.804819,-0.972476,-0.974429,-0.977033,2.344766,452600.0,0.0,0.0,0.0,1.0,0.0
1,-1.322844,1.043185,-0.607019,2.04589,1.357143,0.861439,1.669961,2.332238,358500.0,0.0,0.0,0.0,1.0,0.0
2,-1.332827,1.038503,1.856182,-0.535746,-0.827024,-0.820777,-0.843637,1.782699,352100.0,0.0,0.0,0.0,1.0,0.0
3,-1.337818,1.038503,1.856182,-0.624215,-0.719723,-0.766028,-0.733781,0.932968,341300.0,0.0,0.0,0.0,1.0,0.0
4,-1.337818,1.038503,1.856182,-0.462404,-0.612423,-0.759847,-0.629157,-0.012881,342200.0,0.0,0.0,0.0,1.0,0.0


### Subtask 6: Data Splitting
- Split the dataset into separate training and testing sets to prepare for model building and evaluation.


In [6]:
from sklearn.model_selection import train_test_split

# Separate the target variable and features
X = housing_df.drop('median_house_value', axis=1)
y = housing_df['median_house_value']

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Display the shape of the training and testing sets
(X_train.shape, X_test.shape, y_train.shape, y_test.shape)


((16512, 13), (4128, 13), (16512,), (4128,))

### Subtask 7: Model Development
- Develop multiple machine learning models, such as linear regression, decision tree, and random forest models, aimed at predicting house prices.


In [7]:
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

# Initialize the models
linear_regression_model = LinearRegression()
decision_tree_model = DecisionTreeRegressor(random_state=42)
random_forest_model = RandomForestRegressor(random_state=42)

# Store the models in a dictionary for easy access
models = {
    'Linear Regression': linear_regression_model,
    'Decision Tree': decision_tree_model,
    'Random Forest': random_forest_model
}

# Display the initialized models
models


{'Linear Regression': LinearRegression(),
 'Decision Tree': DecisionTreeRegressor(random_state=42),
 'Random Forest': RandomForestRegressor(random_state=42)}

### Subtask 8: Model Training and Evaluation
- Train each model on the training dataset and assess their performance on the testing dataset using metrics like RMSE or R-squared.


In [8]:
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

# Train and evaluate each model
model_performance = {}

for model_name, model in models.items():
    # Train the model
    model.fit(X_train, y_train)
    
    # Make predictions
    predictions = model.predict(X_test)
    
    # Calculate evaluation metrics
    rmse = np.sqrt(mean_squared_error(y_test, predictions))
    r_squared = r2_score(y_test, predictions)
    
    # Store the performance
    model_performance[model_name] = {'RMSE': rmse, 'R-squared': r_squared}

# Display the model performance
model_performance


{'Linear Regression': {'RMSE': np.float64(70060.5218447352),
  'R-squared': 0.6254240620553605},
 'Decision Tree': {'RMSE': np.float64(69116.02657873192),
  'R-squared': 0.6354554029044672},
 'Random Forest': {'RMSE': np.float64(48975.818369986104),
  'R-squared': 0.8169555593071559}}

### Subtask 9: Model Comparison
- Compare the model performances based on evaluation metrics to determine and select the best model.


In [9]:
# Determine the best model based on RMSE
best_model_name = min(model_performance, key=lambda model: model_performance[model]['RMSE'])
best_model_performance = model_performance[best_model_name]

# Display the best model and its performance
best_model_name, best_model_performance


('Random Forest',
 {'RMSE': np.float64(48975.818369986104), 'R-squared': 0.8169555593071559})

### Subtask 10: Production-Ready Inference Function
- Develop a production-ready function that includes necessary preprocessing steps and uses the best model for house price prediction based on new data inputs.


In [10]:
def predict_house_price(new_data):
    """Preprocess input data and predict house prices using the trained Random Forest model."""
    # Ensure new data is a DataFrame
    if not isinstance(new_data, pd.DataFrame):
        new_data = pd.DataFrame(new_data, index=[0])
    
    # Handle missing values
    new_data['total_bedrooms'] = imputer.transform(new_data[['total_bedrooms']])
    
    # Encode categorical variables
    encoded_features = encoder.transform(new_data[['ocean_proximity']])
    encoded_features_df = pd.DataFrame(encoded_features, columns=encoder.get_feature_names_out(['ocean_proximity']))
    
    # Add encoded features and drop original categorical column
    new_data = new_data.drop('ocean_proximity', axis=1)
    new_data = pd.concat([new_data, encoded_features_df], axis=1)
    
    # Standardize numerical features
    new_data[numerical_features] = scaler.transform(new_data[numerical_features])
    
    # Predict using the Random Forest model
    predictions = models['Random Forest'].predict(new_data)
    
    return predictions

# Example usage with dummy data
example_data = {
    'longitude': -122.23,
    'latitude': 37.88,
    'housing_median_age': 41.0,
    'total_rooms': 880.0,
    'total_bedrooms': 129.0,
    'population': 322.0,
    'households': 126.0,
    'median_income': 8.3252,
    'ocean_proximity': 'NEAR BAY'
}

predict_house_price(example_data)


array([431942.36])