<a href="https://colab.research.google.com/github/vamsi160505-design/MLDA/blob/main/T1_DATA_PRE_PROCESSING.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

# 1. Read the CSV into a DataFrame and preview its first rows
data = pd.read_csv("111.csv")
print("Initial data sample:")
print(data.head())

# 2. Remove the columns listed below; errors="ignore" skips any that are absent
unused_columns = ["Posted On", "Area Type"]
data = data.drop(columns=unused_columns, errors="ignore")

# 3. Map the original column headers to the names used in the rest of the script
column_aliases = {
    "Size": "Size (sqft)",
    "Rent": "Price (INR)",
    "BHK": "Rooms",
    "Bathroom": "Bathrooms",
}
data = data.rename(columns=column_aliases)

# 4. Name the prediction target and the input columns
target = 'Price (INR)'
features = [
    'Size (sqft)',
    'Rooms',
    'Bathrooms',
    'City',
    'Furnishing Status',
]

# 5. Build the feature preprocessor: numeric and categorical columns each get
#    their own imputation + encoding pipeline
num_features = ['Size (sqft)', 'Rooms', 'Bathrooms']
cat_features = ['City', 'Furnishing Status']

# Numeric columns: fill missing values with the column mean, then standardize
num_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

# Categorical columns: fill missing values with the most frequent category,
# then one-hot encode into a dense array; categories not seen during fit are
# ignored at transform time rather than raising
cat_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
])

# Route each column group to its pipeline and ask for DataFrame output
# (set_output returns the estimator itself, so the call can be chained)
preprocessor = ColumnTransformer([
    ('num', num_transformer, num_features),
    ('cat', cat_transformer, cat_features),
]).set_output(transform="pandas")

# 6. Separate the model inputs (X) from the target (y)
X = data[features]
y = data[[target]].copy()

# 7. Train-test split on the RAW data, before any statistics are learned.
#    Fitting imputers/scalers on the full dataset lets test-set means and
#    variances leak into training; splitting first avoids that. With the same
#    random_state and row count, the same rows are selected for each side as
#    when splitting after preprocessing.
X_train_raw, X_test_raw, y_train_raw, y_test_raw = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# 8. Preprocess input data: fit on the training rows only, then apply the
#    fitted transformation to the held-out rows
X_train = preprocessor.fit_transform(X_train_raw)
X_test = preprocessor.transform(X_test_raw)

# Full-dataset view, transformed with the train-fitted statistics; kept under
# its original name for inspection and for any later code that refers to it
X_preprocessed = preprocessor.transform(X)
print("\nProcessed feature sample:")
print(X_preprocessed.head())

# 9. Preprocess output data (y): same rule, fit on the training target only
y_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])
y_train = y_transformer.fit_transform(y_train_raw)
y_test = y_transformer.transform(y_test_raw)
y_preprocessed = y_transformer.transform(y)

# 10. Rent per sqft, for descriptive analysis only.
#     This ratio is computed from the target ('Price (INR)'), so it must NOT be
#     added to the feature matrix: doing so hands the model the answer (target
#     leakage). Zero sizes are masked to NaN so the division cannot yield inf.
price_per_sqft = data['Price (INR)'] / data['Size (sqft)'].replace(0, np.nan)

# 11. Display train sample
print("\nX_train sample:")
print(X_train.head())
print("\ny_train sample:")
print(y_train[:5])

Initial data sample:
    Posted On  BHK   Rent  Size            Floor    Area Type  \
0  18-05-2022    2  10000  1100  Ground out of 2   Super Area   
1  13-05-2022    2  20000   800       1 out of 3   Super Area   
2  16-05-2022    2  17000  1000       1 out of 3   Super Area   
3  04-07-2022    2  10000   800       1 out of 2   Super Area   
4  09-05-2022    2   7500   850       1 out of 2  Carpet Area   

              Area Locality     City Furnishing Status  Tenant Preferred  \
0                    Bandel  Kolkata       Unfurnished  Bachelors/Family   
1  Phool Bagan, Kankurgachi  Kolkata    Semi-Furnished  Bachelors/Family   
2   Salt Lake City Sector 2  Kolkata    Semi-Furnished  Bachelors/Family   
3               Dumdum Park  Kolkata       Unfurnished  Bachelors/Family   
4             South Dum Dum  Kolkata       Unfurnished         Bachelors   

   Bathroom Point of Contact  
0         2    Contact Owner  
1         1    Contact Owner  
2         1    Contact Owner  
3      