### Subtask 1: Load the Dataset
- Load the dataset from the specified CSV file into a pandas DataFrame.


In [1]:
# Load the dataset
import pandas as pd

# Load the dataset into a pandas DataFrame
file_path = r'F:\ITShoulders\AI_Data_Science_agent\temp_uploads\housing.csv'
housing_df = pd.read_csv(file_path)
housing_df.head()


Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


### Subtask 2: Display the First 5 Rows
- Display the first 5 rows of the DataFrame to get an initial understanding of the data structure.


In [2]:
# Display the first 5 rows of the DataFrame
housing_df.head()


Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


### Subtask 3: Check for Missing Values
- Check for missing values in the DataFrame and print a summary of missing data.


In [3]:
# Check for missing values in the DataFrame
missing_values_summary = housing_df.isnull().sum()
missing_values_summary


longitude               0
latitude                0
housing_median_age      0
total_rooms             0
total_bedrooms        207
population              0
households              0
median_income           0
median_house_value      0
ocean_proximity         0
dtype: int64

### Subtask 4: Descriptive Statistics
- Generate and display descriptive statistics of the dataset to understand the distribution of its features.


In [4]:
# Display descriptive statistics of the dataset
descriptive_stats = housing_df.describe()
descriptive_stats


Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
count,20640.0,20640.0,20640.0,20640.0,20433.0,20640.0,20640.0,20640.0,20640.0
mean,-119.569704,35.631861,28.639486,2635.763081,537.870553,1425.476744,499.53968,3.870671,206855.816909
std,2.003532,2.135952,12.585558,2181.615252,421.38507,1132.462122,382.329753,1.899822,115395.615874
min,-124.35,32.54,1.0,2.0,1.0,3.0,1.0,0.4999,14999.0
25%,-121.8,33.93,18.0,1447.75,296.0,787.0,280.0,2.5634,119600.0
50%,-118.49,34.26,29.0,2127.0,435.0,1166.0,409.0,3.5348,179700.0
75%,-118.01,37.71,37.0,3148.0,647.0,1725.0,605.0,4.74325,264725.0
max,-114.31,41.95,52.0,39320.0,6445.0,35682.0,6082.0,15.0001,500001.0


### Subtask 5: Visualizations for Insights
- Create visualizations such as histograms, scatter plots, or pair plots to identify patterns and insights in the dataset.


In [5]:
# Create visualizations to identify patterns and insights
import matplotlib.pyplot as plt
import seaborn as sns

# Plot histograms for numerical features
housing_df.hist(bins=50, figsize=(20, 15))
plt.show()

# Pair plot to observe relationships
sns.pairplot(housing_df)
plt.show()

# Scatter plot to visualize relationship between median_income and median_house_value
plt.figure(figsize=(10, 6))
plt.scatter(housing_df['median_income'], housing_df['median_house_value'], alpha=0.1)
plt.title('Median Income vs Median House Value')
plt.xlabel('Median Income')
plt.ylabel('Median House Value')
plt.grid(True)
plt.show()


  plt.show()


  plt.show()
  plt.show()


### Subtask 6: Data Preprocessing
- Preprocess the data by handling missing values, encoding categorical features, and standardizing numerical features.


In [6]:
# Data preprocessing steps
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Separate features and target variable
X = housing_df.drop('median_house_value', axis=1)
y = housing_df['median_house_value']

# Identify numerical and categorical columns
numerical_cols = X.select_dtypes(include=['int64', 'float64']).columns
categorical_cols = X.select_dtypes(include=['object']).columns

# Preprocessing pipelines for numerical and categorical features
numerical_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

categorical_pipeline = Pipeline([
    ('encoder', OneHotEncoder(sparse_output=False, handle_unknown='ignore'))
])

preprocessor = ColumnTransformer([
    ('numerical', numerical_pipeline, numerical_cols),
    ('categorical', categorical_pipeline, categorical_cols)
])

# Fit and transform the dataset
X_processed = preprocessor.fit_transform(X)
X_processed[:5]


array([[-1.32783522,  1.05254828,  0.98214266, -0.8048191 , -0.97247648,
        -0.9744286 , -0.97703285,  2.34476576,  0.        ,  0.        ,
         0.        ,  1.        ,  0.        ],
       [-1.32284391,  1.04318455, -0.60701891,  2.0458901 ,  1.35714343,
         0.86143887,  1.66996103,  2.33223796,  0.        ,  0.        ,
         0.        ,  1.        ,  0.        ],
       [-1.33282653,  1.03850269,  1.85618152, -0.53574589, -0.82702426,
        -0.82077735, -0.84363692,  1.7826994 ,  0.        ,  0.        ,
         0.        ,  1.        ,  0.        ],
       [-1.33781784,  1.03850269,  1.85618152, -0.62421459, -0.71972345,
        -0.76602806, -0.73378144,  0.93296751,  0.        ,  0.        ,
         0.        ,  1.        ,  0.        ],
       [-1.33781784,  1.03850269,  1.85618152, -0.46240395, -0.61242263,
        -0.75984669, -0.62915718, -0.012881  ,  0.        ,  0.        ,
         0.        ,  1.        ,  0.        ]])

### Subtask 7: Split the Dataset
- Split the dataset into training and testing sets with an appropriate ratio for model evaluation.


In [7]:
# Split the dataset into training and testing sets
from sklearn.model_selection import train_test_split

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X_processed, y, test_size=0.2, random_state=42)

# Output the shape of the datasets
(X_train.shape, X_test.shape, y_train.shape, y_test.shape)


((16512, 13), (4128, 13), (16512,), (4128,))

### Subtask 8: Train Machine Learning Models
- Train several machine learning models, such as Linear Regression, Decision Tree, and Random Forest, on the training set to predict house prices.


In [8]:
# Train various machine learning models
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

# Initialize models
linear_model = LinearRegression()
tree_model = DecisionTreeRegressor(random_state=42)
forest_model = RandomForestRegressor(random_state=42)

# Train the models
linear_model.fit(X_train, y_train)
tree_model.fit(X_train, y_train)
forest_model.fit(X_train, y_train)

# Print model details
('Linear Regression, Decision Tree, and Random Forest models have been trained.')


'Linear Regression, Decision Tree, and Random Forest models have been trained.'

### Subtask 9: Evaluate Model Performance
- Evaluate the performance of each trained model on the test set using metrics like Mean Absolute Error, Mean Squared Error, or R-squared.


In [9]:
# Evaluate model performance
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Function to evaluate models
def evaluate_model(model, X_test, y_test):
    y_pred = model.predict(X_test)
    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    return {'MAE': mae, 'MSE': mse, 'R2': r2}

# Evaluate each model
linear_results = evaluate_model(linear_model, X_test, y_test)
tree_results = evaluate_model(tree_model, X_test, y_test)
forest_results = evaluate_model(forest_model, X_test, y_test)

(linear_results, tree_results, forest_results)


({'MAE': 50670.73824097192,
  'MSE': 4908476721.156617,
  'R2': 0.6254240620553605},
 {'MAE': 43600.67151162791,
  'MSE': 4777025130.031977,
  'R2': 0.6354554029044672},
 {'MAE': 31643.65566860465,
  'MSE': 2398630785.009868,
  'R2': 0.8169555593071559})

### Subtask 10: Compare and Select the Best Model
- Compare the performance of the models and select the best performing model based on evaluation metrics.


In [10]:
# Compare model performances and select the best model
model_performance = {
    'Linear Regression': linear_results['R2'],
    'Decision Tree': tree_results['R2'],
    'Random Forest': forest_results['R2']
}

# Select the best model based on the highest R-squared value
best_model_name = max(model_performance, key=model_performance.get)
best_model = forest_model if best_model_name == 'Random Forest' else (linear_model if best_model_name == 'Linear Regression' else tree_model)

(best_model_name, model_performance)


('Random Forest',
 {'Linear Regression': 0.6254240620553605,
  'Decision Tree': 0.6354554029044672,
  'Random Forest': 0.8169555593071559})

### Subtask 11: Create a Production-Ready Inference Function
- Create a production-ready function for making predictions using the best-performing model, ensuring the function can accept new data and output predictions.


In [11]:
# Create the inference function
def predict_house_price(new_data, preprocessor, model):
    # Transform the new data using the preprocessor
    processed_data = preprocessor.transform(new_data)
    # Predict using the model
    prediction = model.predict(processed_data)
    return prediction

# Example use of the inference function
# Create a sample input similar to the raw data
sample_input = pd.DataFrame({
    'longitude': [-122.23],
    'latitude': [37.88],
    'housing_median_age': [41.0],
    'total_rooms': [880.0],
    'total_bedrooms': [129.0],
    'population': [322.0],
    'households': [126.0],
    'median_income': [8.3252],
    'ocean_proximity': ['NEAR BAY']
})

# Predict the house price for the sample input
predicted_price = predict_house_price(sample_input, preprocessor, best_model)
predicted_price


array([431942.36])

### Conclusion
- Successfully analyzed the housing dataset and built multiple models.
- Random Forest was selected as the best performing model based on R-squared values.
- A production-ready inference function was created for predicting house prices.
- Predicted house price for sample input: $431,942.36


### Conclusion
- Successfully analyzed the housing dataset and built multiple models.
- Random Forest was selected as the best performing model based on R-squared values.
- A production-ready inference function was created for predicting house prices.
- Predicted house price for sample input: $431,942.36


The notebook has been successfully created with all subtasks executed, including data analysis, multiple model training, model evaluation, selecting the best model, and implementing a production-ready inference function. The Random Forest model was identified as the best-performing model based on its R-squared value. The notebook is now ready for sharing or further enhancements.