In [51]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error
from sklearn.impute import SimpleImputer

# Load the dataset
df = pd.read_csv('house_price_bd.csv')

# Define the target column
target_column = 'Price_in_taka'

# Ensure 'target_column' exists in the DataFrame *before* it is accessed below,
# so a missing column is reported together with the columns that are available.
if target_column not in df.columns:
    raise KeyError(f"Column '{target_column}' not found in the DataFrame. Available columns: {df.columns.tolist()}")

# Clean the 'Floor_no' column: keep the first run of digits in each value and
# convert it to numeric. Taking only the first run avoids gluing separate numbers
# together (a value such as '4th to 8th' would otherwise turn into 48).
# Values that contain no digit at all become NaN; the median imputer in the
# numerical pipeline fills them in later.
df['Floor_no'] = pd.to_numeric(
    df['Floor_no'].astype(str).str.extract(r'(\d+)', expand=False),
    errors='coerce',
)

# Clean the 'Price_in_taka' column: strip currency symbols, thousands separators
# and any other non-numeric characters, but keep the decimal point so that a
# fractional amount is not inflated ('1,234.50' must not become 123450).
# errors='coerce' turns empty / unparseable values into NaN instead of raising.
df[target_column] = pd.to_numeric(
    df[target_column].replace(r'[^\d.]', '', regex=True),
    errors='coerce',
).astype(float)

# Separate features and target variable.
# Rows without a usable price cannot be used for fitting or scoring, so they are
# left out of X and y; `df` itself keeps every row for later inspection.
has_price = df[target_column].notna()
X = df.loc[has_price].drop(columns=[target_column])
y = df.loc[has_price, target_column]

# Column groups routed to the two preprocessing branches below
numerical_features = ['Bedrooms', 'Bathrooms', 'Floor_no', 'Floor_area']
categorical_features = ['Title', 'Occupancy_status', 'City', 'Location']

# Numeric branch: fill missing values with the column median, then standardise
numerical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical branch: fill missing values with the most frequent category, then
# one-hot encode; categories unseen during fit are ignored at predict time
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Apply each branch to its own column group
preprocessor = ColumnTransformer([
    ('num', numerical_transformer, numerical_features),
    ('cat', categorical_transformer, categorical_features),
])

# End-to-end estimator: preprocessing followed by a Ridge regressor
model_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', Ridge()),
])

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Train on the training portion, then predict the held-out rows
# (fit() returns the fitted pipeline itself, so the calls can be chained)
y_pred = model_pipeline.fit(X_train, y_train).predict(X_test)

# Score the held-out predictions with mean squared error
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
print(f'Mean Squared Error: {mse}')

# Show the first five predicted prices as a quick sanity check
print(f'Sample Prediction: {y_pred[:5]}')

Mean Squared Error: 255875350704160.88
Sample Prediction: [4294590.77994521 6619390.05104343 8013866.40525855 8254341.0894412
 8774248.10307644]
