In [1]:
import pandas as pd
import os
#file_name = 'Downloads/merged_sale_data_updated1.csv'
file_name = 'Downloads/Fintech project/merged_sale_data_updated1.csv'
df = pd.read_csv(file_name,encoding='latin-1')

In [2]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, StandardScaler

categorical_features = ['Type of Sale', 'Type of Area', 'Market Segment', 'Property Type', 'Postal District_x']
# Apply one-hot encoding
encoder = OneHotEncoder(handle_unknown='error')  # Or choose a suitable missing value strategy
encoded_data = encoder.fit_transform(df[categorical_features])
encoded_data = encoded_data[:1000]
# Create a DataFrame from encoded data
individual_rows = []
encoded_data_array = encoded_data.toarray()  # Convert the sparse matrix to a dense array
# Iterate over each row in the encoded data array
for row in encoded_data_array:
    # Convert the row array to a list
    row_data = row.tolist()
    # Append the row data to the list of individual rows
    individual_rows.append(row_data)
# Create a DataFrame from the list of individual rows
df_encoded_individual = pd.DataFrame(individual_rows, columns=encoder.get_feature_names_out(categorical_features))

# Now df_encoded_individual should contain each row as an individual input

# Feature Scaling (Numerical Features)
numerical_features = ['Transacted Price ($)', 'Area (SQFT)', 'Unit Price ($ PSF)']
scaler = StandardScaler()
df[numerical_features] = scaler.fit_transform(df[numerical_features])


# Combine with 'Reference Quarter'
final_data = pd.concat([df_encoded_individual, df[['Reference Quarter', 'Area (SQFT)', 'Unit Price ($ PSF)', 'Remaining Lease']]], axis=1)



In [3]:
# Define features to be used for modeling (excluding unwanted ones)
features_to_use = ['Area (SQFT)', 'Unit Price ($ PSF)', 
                   'Remaining Lease'] + df_encoded_individual.columns.tolist()
df_encoded = final_data[features_to_use]
print(final_data)

        Type of Sale_New Sale  Type of Sale_Resale  Type of Sale_Sub Sale  \
0                         0.0                  1.0                    0.0   
1                         1.0                  0.0                    0.0   
2                         0.0                  1.0                    0.0   
3                         0.0                  1.0                    0.0   
4                         0.0                  1.0                    0.0   
...                       ...                  ...                    ...   
118327                    NaN                  NaN                    NaN   
118328                    NaN                  NaN                    NaN   
118329                    NaN                  NaN                    NaN   
118330                    NaN                  NaN                    NaN   
118331                    NaN                  NaN                    NaN   

        Type of Area_Land  Type of Area_Strata  \
0                     0.0

In [4]:
from sklearn.linear_model import LinearRegression

# Split data into training and testing sets
from sklearn.model_selection import train_test_split
df_temp = df_encoded.drop("Unit Price ($ PSF)", axis = 1, inplace = False)
df_temp = df_temp[:1000]
X_train, X_test, y_train, y_test = train_test_split(df_temp, df['Transacted Price ($)'][:1000], test_size=0.2, random_state=42)

# Train the model
model = LinearRegression()
model.fit(X_train, y_train)

# Make predictions on test set
y_pred = model.predict(X_test)

# Evaluate model performance (e.g., R-squared)
from sklearn.metrics import r2_score
r2 = r2_score(y_test, y_pred)
print("R-squared:", r2)


R-squared: 0.7519116533492539


In [5]:
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

# Load the dataset
# Preprocessing
# Handle missing values
imputer = SimpleImputer(strategy='mean')

# One-hot encoding for categorical variables
categorical_cols = ['Type of Sale', 'Type of Area', 'Market Segment', 'Property Type', 'Postal District_x']
encoder = OneHotEncoder()
encoded_data = encoder.fit_transform(df[categorical_cols])

# Combine encoded data with numerical features
X = torch.tensor(encoded_data.toarray(), dtype=torch.float32)
y = torch.tensor(df['Transacted Price ($)'].values, dtype=torch.float32)

# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the neural network model
class Net(nn.Module):
    def __init__(self, input_size):
        super(Net, self).__init__()
        self.fc1 = nn.Linear(input_size, 64)
        self.fc2 = nn.Linear(64, 32)
        self.fc3 = nn.Linear(32, 1)
        self.relu = nn.ReLU()

    def forward(self, x):
        x = self.relu(self.fc1(x))
        x = self.relu(self.fc2(x))
        x = self.fc3(x)
        return x

# Instantiate the model
model = Net(input_size=X.shape[1])

# Define loss function and optimizer
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training loop
epochs = 20
for epoch in range(epochs):
    model.train()
    optimizer.zero_grad()
    outputs = model(X_train)
    loss = criterion(outputs.squeeze(), y_train)
    loss.backward()
    optimizer.step()

    # Print training loss
    print(f"Epoch {epoch+1}/{epochs}, Loss: {loss.item()}")

# Evaluate the model
model.eval()
with torch.no_grad():
    test_outputs = model(X_test)
    test_loss = criterion(test_outputs.squeeze(), y_test)
    print(f"Test Loss: {test_loss.item()}")


Epoch 1/20, Loss: 1.146941065788269
Epoch 2/20, Loss: 1.1431187391281128
Epoch 3/20, Loss: 1.139675259590149
Epoch 4/20, Loss: 1.1364686489105225
Epoch 5/20, Loss: 1.1335506439208984
Epoch 6/20, Loss: 1.1309758424758911
Epoch 7/20, Loss: 1.128696322441101
Epoch 8/20, Loss: 1.1267131567001343
Epoch 9/20, Loss: 1.125048279762268
Epoch 10/20, Loss: 1.123633623123169
Epoch 11/20, Loss: 1.122431755065918
Epoch 12/20, Loss: 1.1214350461959839
Epoch 13/20, Loss: 1.1206334829330444
Epoch 14/20, Loss: 1.1199429035186768
Epoch 15/20, Loss: 1.119302749633789
Epoch 16/20, Loss: 1.1186631917953491
Epoch 17/20, Loss: 1.117985486984253
Epoch 18/20, Loss: 1.1172871589660645
Epoch 19/20, Loss: 1.116546392440796
Epoch 20/20, Loss: 1.115759015083313
Test Loss: 0.49647605419158936
