**Load and preprocess data**

In [24]:
import pandas as pd
import numpy as np
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MultiLabelBinarizer

# Load the data
df = pd.read_json('database.json')
df = df[['address', 'city', 'country', 'name', 'price', 'sqft', 'state']]

# convert to string
df['price'] = df['price'].astype(str)
df['sqft'] = df['sqft'].astype(str)

# Fill missing values
df = df.fillna('Unknown')

# Tokenize text fields
from tensorflow.keras.preprocessing.text import Tokenizer

tokenizer = Tokenizer(num_words=10000)
tokenizer.fit_on_texts(df['address'] + ' ' + df['city'] + ' ' + df['state'] + ' ' + df['name'])
sequences = tokenizer.texts_to_sequences(df['address'] + ' ' + df['city'] + ' ' + df['state'] + ' ' + df['name'])
padded_sequences = pad_sequences(sequences, maxlen=20)

# Scale numerical features
df['price'] = df['price'].astype(float)
df['sqft'] = df['sqft'].astype(float)
numerical_features = df[['price', 'sqft']]

scaler = StandardScaler()
scaled_numerical_features = scaler.fit_transform(numerical_features)

# MultiLabelBinarizer for categorical features
mlb = MultiLabelBinarizer()

# Define which columns are categorical labels
labels = df[['city', 'state', 'name']]
binary_labels = mlb.fit_transform(labels.values)
binary_labels_df = pd.DataFrame(binary_labels, columns=mlb.classes_)

# Combine all features
final_features = np.hstack((padded_sequences, scaled_numerical_features, binary_labels))

# Create DataFrame for combined features
columns = (
    ['tokenized_feature_' + str(i) for i in range(padded_sequences.shape[1])] + 
    ['scaled_price', 'scaled_sqft'] + 
    mlb.classes_.tolist()
)
final_df = pd.DataFrame(final_features, columns=columns)

# Save to CSV
final_df.to_csv('encoded_property_data.csv', index=False)

print("Data saved to 'encoded_property_data.csv'")


Data saved to 'encoded_property_data.csv'


**Build and train model**

In [27]:

from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Dropout
import pickle
# Define the model
input_layer = Input(shape=(final_df.shape[1] - len(mlb.classes_),))
dense_layer = Dense(128, activation='relu')(input_layer)
dropout_layer = Dropout(0.5)(dense_layer)
output_layer = Dense(len(mlb.classes_), activation='sigmoid')(dropout_layer)

model = Model(inputs=input_layer, outputs=output_layer)
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Print model summary
model.summary()

# Split the data into training and validation sets
from sklearn.model_selection import train_test_split

features = final_df.iloc[:, :-len(mlb.classes_)]
labels = final_df.iloc[:, -len(mlb.classes_):]

X_train, X_val, y_train, y_val = train_test_split(features, labels, test_size=0.2, random_state=42)

# Train the model
history = model.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_val, y_val))

Epoch 1/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 555ms/step - accuracy: 0.0000e+00 - loss: 5.0224 - val_accuracy: 0.0000e+00 - val_loss: 4.2256
Epoch 2/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 38ms/step - accuracy: 0.0000e+00 - loss: 4.5588 - val_accuracy: 0.0000e+00 - val_loss: 3.9921
Epoch 3/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step - accuracy: 0.0000e+00 - loss: 4.5572 - val_accuracy: 0.0000e+00 - val_loss: 3.7807
Epoch 4/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35ms/step - accuracy: 0.0000e+00 - loss: 4.1016 - val_accuracy: 0.0000e+00 - val_loss: 3.5675
Epoch 5/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 36ms/step - accuracy: 0.0000e+00 - loss: 3.5948 - val_accuracy: 0.0000e+00 - val_loss: 3.3633
Epoch 6/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26ms/step - accuracy: 0.1667 - loss: 3.5557 - val_accuracy: 0.0000e+00 - val_loss: 3.1752
E

**Evaluate model**

In [31]:
# Evaluate the model
results = model.evaluate(X_val, y_val)
print(f"Validation Loss: {results[0]}, Validation Accuracy: {results[1]}")

model.save('model.h5')
pickle.dump(model, open('mlp_model.h5', 'wb'))

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step - accuracy: 0.0000e+00 - loss: 2.5176




Validation Loss: 2.5176165103912354, Validation Accuracy: 0.0


**Make prediction**

In [13]:
# Make predictions
predictions = model.predict(X_val)

# Convert predictions to binary (using a threshold, e.g., 0.5)
predictions_binary = (predictions > 0.5).astype(int)

# Compare predictions to actual labels
print("Predictions:\n", predictions_binary)
print("Actual Labels:\n", y_val.values)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 57ms/step
Predictions:
 [[1 0 0 1 0 1 0 0 0 1 0 0 0 1 1]
 [1 0 1 0 0 0 0 0 0 1 0 0 0 1 0]]
Actual Labels:
 [[1. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 1. 0. 0. 0.]
 [0. 0. 0. 0. 1. 1. 0. 0. 1. 0. 0. 0. 0. 0. 0.]]


**Analyze performance**

In [14]:
from sklearn.metrics import classification_report

# Generate a classification report
report = classification_report(y_val, predictions_binary, target_names=mlb.classes_)
print(report)


               precision    recall  f1-score   support

     Bungalow       0.50      1.00      0.67         1
       Cheras       0.00      0.00      0.00         0
   Codominium       0.00      0.00      0.00         0
       Dungun       0.00      0.00      0.00         0
       Duplex       0.00      0.00      0.00         1
        Johor       0.00      0.00      0.00         1
        Kedah       0.00      0.00      0.00         0
      Kemaman       0.00      0.00      0.00         0
         Muar       0.00      0.00      0.00         1
      Puchong       0.50      1.00      0.67         1
       Rawang       0.00      0.00      0.00         0
     Selangor       0.00      0.00      0.00         1
   Setia Alam       0.00      0.00      0.00         0
Sungai Petani       0.00      0.00      0.00         0
   Terengganu       0.00      0.00      0.00         0

    micro avg       0.20      0.33      0.25         6
    macro avg       0.07      0.13      0.09         6
 weighte

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


**Save model**

In [15]:
model.save('LSTM_model.h5')



**Test with new input**

In [21]:
new_property = {
    "address": "123 New Street",
    "city": "Kuala Lumpur",
    "state": "Selangor",
    "name": "Condominium",
    "price": 0,
    "sqft": 0
}

# preprocess
# Combine text fields and tokenize
text_data = new_property['city'] + ' ' + new_property['address'] + ' ' + new_property['state'] + ' ' + new_property['name']
text_sequence = tokenizer.texts_to_sequences([text_data])
padded_text_sequence = pad_sequences(text_sequence, maxlen=20)

# Convert and scale numerical features
numerical_data = np.array([[float(new_property['price']), float(new_property['sqft'])]])
scaled_numerical_data = scaler.transform(numerical_data)

# Combine text and numerical features
input_features = np.hstack((padded_text_sequence, scaled_numerical_data))

# Make predictions
prediction = model.predict(input_features)

# Convert prediction to binary (using a threshold, e.g., 0.5)
prediction_binary = (prediction > 0.5).astype(int)

# Convert binary prediction to labels
predicted_labels = mlb.inverse_transform(prediction_binary)
predicted_labels
# print("Predicted Labels:", predicted_labels)



[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step




[('Bungalow',
  'Dungun',
  'Kemaman',
  'Puchong',
  'Selangor',
  'Sungai Petani',
  'Terengganu')]