In [9]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, RobustScaler
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.metrics import RootMeanSquaredError
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping

In [10]:
# Read the training set from disk into a DataFrame.
file_path = 'train.csv'  # Replace with your CSV file path
data = pd.read_csv(filepath_or_buffer=file_path)

In [11]:
# Restrict the frame to the hand-picked predictors plus the target column.
key_features = ['full_sq', 'life_sq', 'floor', 'leisure_count_500', 'cafe_count_1000_price_high']
# .copy() makes `data` an independent frame rather than a slice of the loaded
# CSV, so later cells that add or modify its columns do not raise
# SettingWithCopyWarning or risk writing to a temporary object.
data = data[key_features + ['price_doc']].copy()

In [12]:
# Add log(1 + x) versions of two features as new columns.
# Maps each new column name to the source column it is computed from.
log_sources = {
    'full_sq_log': 'full_sq',
    'leisure_count_500_log': 'leisure_count_500',
}
for target_col, source_col in log_sources.items():
    data[target_col] = np.log1p(data[source_col])

In [13]:
# Clean up non-finite values: map +/-inf to NaN, then replace every NaN with 0.
# This applies to every column currently in `data`.
# Chained, non-mutating calls are used instead of inplace=True so the cell is
# safe to re-run and never mutates a frame that may be a slice of another.
data = data.replace([np.inf, -np.inf], np.nan).fillna(0)

In [14]:
# Separate the regression target (y) from the predictor columns (X).
y = data['price_doc']
X = data.drop(columns='price_doc')

In [15]:
# Hold out 20% of the rows for validation; random_state fixes the shuffle so
# the same rows land in each split on every run.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [16]:
# Robust scaling of the features.
# The scaler is fitted on the training split only; the fitted statistics are
# then applied to both the training and validation splits.
scaler = RobustScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_val_scaled = scaler.transform(X_val)

In [17]:
# Step size handed to the optimizer; tune as needed.
learning_rate = 1e-2

In [18]:
# Intended mini-batch size for training; adjust as needed.
batch_size = 32

In [19]:
# Neural Network Design
# Seed the Python, NumPy and TensorFlow random generators before any layer is
# created, so weight initialisation and dropout masks start from the same
# state on every "Restart & Run All" (same value as the random_state used for
# the train/validation split). Some GPU kernels may still be non-deterministic.
tf.keras.utils.set_random_seed(42)

# Fully connected regressor: two hidden ReLU layers, each followed by dropout,
# and a single linear output unit for the predicted price.
model = Sequential([
    Dense(256, activation='relu', input_shape=(X_train_scaled.shape[1],)),
    Dropout(0.4),  # drops 40% of the first hidden layer's activations during training
    Dense(128, activation='relu'),
    Dropout(0.3),  # drops 30% of the second hidden layer's activations during training
    Dense(1, activation='linear')  # Output layer for regression
])


In [20]:
# Configure training: Adam with the custom learning rate, mean squared error
# as the objective, and RMSE reported as an extra metric named 'rmse'.
optimizer = Adam(learning_rate=learning_rate)
rmse_metric = RootMeanSquaredError(name='rmse')
model.compile(
    optimizer=optimizer,
    loss='mean_squared_error',
    metrics=[rmse_metric],
)


In [21]:
# Early stopping: halt training once the validation loss has gone 40 epochs
# without improving, then restore the weights from the best epoch.
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=40,
    restore_best_weights=True,
)

In [22]:
# Train the network for up to 150 epochs, evaluating on the validation split
# after each epoch; the early-stopping callback may end training sooner.
# The mini-batch size comes from the `batch_size` variable set in an earlier
# cell rather than a repeated literal, so changing that setting actually
# changes training.
history = model.fit(
    X_train_scaled,
    y_train,
    batch_size=batch_size,
    epochs=150,
    validation_data=(X_val_scaled, y_val),
    callbacks=[early_stopping],
)

Epoch 1/150
Epoch 2/150
Epoch 3/150
Epoch 4/150
Epoch 5/150
Epoch 6/150
Epoch 7/150
Epoch 8/150
Epoch 9/150
Epoch 10/150
Epoch 11/150
Epoch 12/150
Epoch 13/150
Epoch 14/150
Epoch 15/150
Epoch 16/150
Epoch 17/150
Epoch 18/150
Epoch 19/150
Epoch 20/150
Epoch 21/150
Epoch 22/150
Epoch 23/150
Epoch 24/150
Epoch 25/150
Epoch 26/150
Epoch 27/150
Epoch 28/150
Epoch 29/150
Epoch 30/150
Epoch 31/150
Epoch 32/150
Epoch 33/150
Epoch 34/150
Epoch 35/150
Epoch 36/150
Epoch 37/150
Epoch 38/150
Epoch 39/150
Epoch 40/150
Epoch 41/150
Epoch 42/150
Epoch 43/150
Epoch 44/150
Epoch 45/150
Epoch 46/150
Epoch 47/150
Epoch 48/150
Epoch 49/150
Epoch 50/150
Epoch 51/150
Epoch 52/150
Epoch 53/150
Epoch 54/150
Epoch 55/150
Epoch 56/150
Epoch 57/150
Epoch 58/150
Epoch 59/150
Epoch 60/150
Epoch 61/150
Epoch 62/150
Epoch 63/150
Epoch 64/150
Epoch 65/150
Epoch 66/150
Epoch 67/150
Epoch 68/150
Epoch 69/150
Epoch 70/150
Epoch 71/150
Epoch 72/150
Epoch 73/150
Epoch 74/150
Epoch 75/150
Epoch 76/150
Epoch 77/150
Epoch 78

In [26]:
# Model Summary: print each layer with its output shape and parameter count,
# followed by the total / trainable / non-trainable parameter totals.
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 256)               2048      
                                                                 
 dropout (Dropout)           (None, 256)               0         
                                                                 
 dense_1 (Dense)             (None, 128)               32896     
                                                                 
 dropout_1 (Dropout)         (None, 128)               0         
                                                                 
 dense_2 (Dense)             (None, 1)                 129       
                                                                 
Total params: 35073 (137.00 KB)
Trainable params: 35073 (137.00 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [27]:
# Score the trained model on the validation split.
# `results` is [loss, rmse]: the compiled loss first, then the single metric
# registered at compile time, so index 1 is the RMSE.
results = model.evaluate(x=X_val_scaled, y=y_val)
print(f"Validation RMSE: {results[1]}")

Validation RMSE: 13118033.0


In [28]:
# Load the test dataset
test_file_path = 'test.csv'  # Replace with your test CSV file path
test_data = pd.read_csv(test_file_path)

# Apply the same log(1 + x) transforms that were applied to the training frame
test_data['full_sq_log'] = np.log1p(test_data['full_sq'])
test_data['leisure_count_500_log'] = np.log1p(test_data['leisure_count_500'])

# Map +/-inf to NaN, then fill every NaN with 0 (same cleanup as for training).
# Non-mutating calls are used instead of inplace=True so the cell is re-runnable.
test_data = test_data.replace([np.inf, -np.inf], np.nan).fillna(0)

# Take the feature columns (names and order) from the training matrix the
# scaler was fitted on, instead of repeating a hard-coded list that could
# drift out of sync. A missing column raises KeyError here.
feature_columns = list(X_train.columns)
X_test = test_data[feature_columns]

# Scale the test data using the scaler fitted on the training data
X_test_scaled = scaler.transform(X_test)

# Make predictions; the model returns one column, flattened to 1-D below
predictions = model.predict(X_test_scaled)

# Save predictions to a CSV file
submission = pd.DataFrame({
    'row ID': test_data['row ID'],  # Replace 'row ID' with the actual identifier column of your test dataset
    'price_doc': predictions.flatten()
})
submission.to_csv('neural-network-2.csv', index=False)



