In [9]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
import joblib
import re


In [11]:
# Load dataset
df = pd.read_csv('Datasets/House_rent/House_Rent_Dataset.csv')

# Drop columns that are not needed or are identifiers
df = df.drop(columns=['Point of Contact'])

# Check for date columns and convert them to numeric if present
# For example, if there's a 'Date' column, convert it:
# if 'Date' in df.columns:
#     df['Date'] = pd.to_datetime(df['Date']).astype(int) / 10**9  # Convert to timestamp

# Ensure categorical variables are encoded
label_encoders = {}
for column in ['Area Type', 'Area Locality', 'City', 'Furnishing Status', 'Tenant Preferred']:
    if column in df.columns:
        le = LabelEncoder()
        df[column] = le.fit_transform(df[column])
        label_encoders[column] = le

# Handle 'Floor' column to extract numeric values
def extract_floor_number(floor_str):
    match = re.match(r'(\d+)', floor_str)
    if match:
        return int(match.group(1))
    return None

def extract_total_floors(floor_str):
    match = re.search(r'out of (\d+)', floor_str)
    if match:
        return int(match.group(1))
    return None

# Apply extraction functions
df['Current Floor'] = df['Floor'].apply(extract_floor_number)
df['Total Floors'] = df['Floor'].apply(extract_total_floors)

# Drop the original 'Floor' column
df = df.drop(columns=['Floor'])

# Convert all columns to numeric, coerce errors to NaN, then drop rows with NaN values
df = df.apply(pd.to_numeric, errors='coerce')
df = df.dropna()

# Normalize numerical features
scaler = StandardScaler()
numerical_features = ['Size', 'Current Floor', 'Total Floors', 'Bathroom']
df[numerical_features] = scaler.fit_transform(df[numerical_features])
joblib.dump(scaler, 'scaler.joblib')

# Split the data into features and target
X = df.drop(columns=['Rent'])
y = df['Rent']

# Convert target variable to float32
y = y.astype('float32')

# Split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Save the test set to a CSV file
test_data = pd.concat([X_test, y_test], axis=1)
test_data.to_csv('test.csv', index=False)

# Reshape data for LSTM
def reshape_for_lstm(X):
    return np.array(X).astype('float32').reshape((X.shape[0], 1, X.shape[1]))

X_train_lstm = reshape_for_lstm(X_train)
X_test_lstm = reshape_for_lstm(X_test)



ValueError: Found array with 0 sample(s) (shape=(0, 4)) while a minimum of 1 is required by StandardScaler.

In [7]:
# Build LSTM model
model = Sequential()
model.add(LSTM(units=50, input_shape=(X_train_lstm.shape[1], X_train_lstm.shape[2]), return_sequences=True))
model.add(LSTM(units=50))
model.add(Dense(1))

# Compile the model
model.compile(optimizer='adam', loss='mean_squared_error')

# Train the model
model.fit(X_train_lstm, y_train, epochs=20, batch_size=32, validation_data=(X_test_lstm, y_test))

# Save the trained model
model.save('lstm_model.h5')


ValueError: Failed to convert a NumPy array to a Tensor (Unsupported object type int).