## House Price Prediction
1. How you approach the dataset.
2. EDA performed, if any
3. Different Models built
4. Parameters you optimized and your final model
5. Final recommendations with respect to the model

Dataset: data.csv

In [None]:
# Path of the input CSV; a relative path is resolved against the notebook's
# current working directory.
file_path = 'data.csv'

In [None]:
import pandas as pd
# Load the dataset into a DataFrame and preview its first rows.
df = pd.read_csv(file_path)
df.head()

In [None]:
# (number of rows, number of columns) of the loaded dataset
df.shape

In [None]:
# Per-column dtype and non-null count summary
df.info()

In [None]:
# Show every column name available in the dataset
column_names = list(df.columns)
print("Columns:", column_names)

In [None]:
# Target column: the value the models are trained to predict
target = 'median_house_value'

In [None]:
# Count missing values per column, most affected columns first
missing_per_column = df.isna().sum()
missing_per_column.sort_values(ascending=False)

In [None]:
# Split the columns into numerical and categorical feature lists.
numeric_frame = df.select_dtypes(include=['int64', 'float64'])
categorical_frame = df.select_dtypes(include=['object'])

num_cols = list(numeric_frame.columns)
cat_cols = list(categorical_frame.columns)

# The target is not a feature: drop it from whichever list contains it
# (the numerical list is checked first, then the categorical one).
for feature_list in (num_cols, cat_cols):
    if target in feature_list:
        feature_list.remove(target)
        break

print("Numerical columns:", num_cols)
print("Categorical columns:", cat_cols)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

In [None]:
# Distribution of the target: histogram (with KDE) next to a boxplot.
fig, (hist_ax, box_ax) = plt.subplots(1, 2, figsize=(14, 6))

# Left panel: histogram
sns.histplot(df[target], kde=True, ax=hist_ax)
hist_ax.set_title('Histogram of Target Variable')

# Right panel: boxplot
sns.boxplot(x=df[target], ax=box_ax)
box_ax.set_title('Boxplot of Target Variable')

fig.tight_layout()
plt.show()

In [None]:
# Pairwise correlations between the int64/float64 columns
numeric_df = df.select_dtypes(include=['int64', 'float64'])
corr_matrix = numeric_df.corr()

# Annotated heatmap of the correlation matrix
fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(
    corr_matrix,
    annot=True,
    fmt='.2f',
    cmap='coolwarm',
    square=True,
    ax=ax,
)
ax.set_title('Correlation Heatmap of Numerical Features')
plt.show()

In [None]:
# One bar chart per categorical feature: mean target value per category
for cat_col in cat_cols:
    fig, ax = plt.subplots(figsize=(10, 4))
    sns.barplot(data=df, x=cat_col, y=target, estimator=np.mean, ax=ax)
    ax.set_title(f'{cat_col} vs. {target}')
    plt.xticks(rotation=45)
    fig.tight_layout()
    plt.show()

In [None]:
# Rank the numerical columns by absolute correlation with the target.
# Six entries are kept because the target correlates perfectly with itself
# and so normally ranks first, leaving its 5 most correlated features.
abs_target_corr = corr_matrix[target].abs()
ranked_corr = abs_target_corr.sort_values(ascending=False)
top_corr = ranked_corr.head(6).index.tolist()

# Pair plot of the selected columns
top_corr_frame = df[top_corr]
sns.pairplot(top_corr_frame)
plt.suptitle("Pair Plot of Top Correlated Numerical Features", y=1.02)
plt.show()

In [None]:
# Scatter plot of the target against each top-correlated feature
# (the target's own entry in top_corr is skipped).
for col in top_corr:
    if col == target:
        continue
    fig, ax = plt.subplots(figsize=(8, 4))
    sns.scatterplot(x=df[col], y=df[target], ax=ax)
    ax.set_title(f'{target} vs {col}')
    fig.tight_layout()
    plt.show()

In [None]:
# Summary statistics of the dataset's columns
df.describe()

In [None]:
# Number of rows per distinct ocean_proximity value
df['ocean_proximity'].value_counts()

In [None]:
# Re-check the per-column missing-value counts before imputing
null_counts = df.isna().sum()
null_counts.sort_values(ascending=False)

In [None]:
# Impute missing total_bedrooms values with the column median.
# NOTE(review): the median is computed over the full dataset, i.e. before any
# train/test split, so test rows influence the imputed value.
bedrooms_median = df['total_bedrooms'].median()
df['total_bedrooms'] = df['total_bedrooms'].fillna(bedrooms_median)

In [None]:
# Verify the missing-value counts after imputation
df.isnull().sum().sort_values(ascending=False)

In [None]:
# Preview the data after imputation
df.head()

In [None]:
# Cast total_bedrooms to int64. This relies on the column having no NaN left
# (NaN cannot be converted to a NumPy integer dtype); any fractional part is
# discarded by the cast.
df['total_bedrooms'] = df['total_bedrooms'].astype('int64')
# Confirm the new dtype
df.info()

In [None]:
# Preview the data after the dtype conversion
df.head()

In [None]:
# Import necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.pipeline import Pipeline

In [None]:
# Build the feature matrix and target vector from the cleaned DataFrame.
#
# StandardScaler only accepts numeric input, so any non-numeric feature
# (e.g. the text column `ocean_proximity`) is one-hot encoded first;
# otherwise fit_transform raises "could not convert string to float".
# dtype=float keeps the whole matrix in a single numeric dtype.
X = pd.get_dummies(df.drop(columns=['median_house_value']), dtype=float)
y = df['median_house_value']

# Hold out 20% of the rows for testing; the fixed seed makes the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize features. The scaler is fitted on the training split only and
# then applied to the test split, so test statistics never leak into training.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)