In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from tensorflow.keras import layers


In [None]:

# Load the training data
df = pd.read_csv('train.csv')

# Separate the predictors from the regression target
X = df.drop('price_doc', axis=1)
y = df['price_doc']

# Route columns by dtype: object columns are treated as categorical,
# every other column as numeric
categorical_features = X.select_dtypes(include=['object']).columns
numeric_features = X.select_dtypes(exclude=['object']).columns

# StandardScaler ignores NaNs while fitting but leaves them in its output, and
# a NaN input makes the network's loss NaN. Fill numeric gaps with the column
# median before scaling.
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Missing categorical values become their own explicit 'missing' level so the
# encoder never receives NaN mixed with strings.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('encoder', OneHotEncoder(handle_unknown='ignore')),
])

# Preprocessing: impute + scale numeric columns, impute + one-hot encode
# categorical columns. Categories unseen at fit time encode as all zeros.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

X_processed = preprocessor.fit_transform(X)

In [None]:
# Hold out 20% of the rows for validation; the fixed seed keeps the split repeatable
X_train, X_val, y_train, y_val = train_test_split(
    X_processed,
    y,
    test_size=0.2,
    random_state=42,
)

# Feed-forward regressor built layer by layer: 128 -> 64 -> 32 -> 1
n_inputs = X_train.shape[1]
model = tf.keras.Sequential()
model.add(layers.Dense(128, activation='relu', input_shape=[n_inputs]))
model.add(layers.Dropout(0.3))  # regularise the widest hidden layer
model.add(layers.Dense(64, activation='relu'))
model.add(layers.Dropout(0.2))
model.add(layers.Dense(32, activation='relu'))
model.add(layers.Dense(1))  # single linear unit: the predicted target value

# Adam optimiser minimising mean squared error
model.compile(optimizer='adam', loss='mean_squared_error')

# Stop once the monitored metric has not improved by at least 0.001 for
# 10 consecutive epochs, then roll back to the best weights seen
early_stopping = tf.keras.callbacks.EarlyStopping(
    restore_best_weights=True,
    min_delta=0.001,
    patience=10,
)

In [None]:

# Train the model for at most 100 epochs in mini-batches of 32; the
# early_stopping callback may end training sooner
history = model.fit(
    X_train, y_train,
    validation_data=(X_val, y_val),
    epochs=100,
    batch_size=32,
    callbacks=[early_stopping]
)

# Predict on validation data
y_val_pred = model.predict(X_val)

# Calculate RMSE on the held-out validation rows
rmse = np.sqrt(mean_squared_error(y_val, y_val_pred))
print(f'Validation RMSE: {rmse}')

# Load test data. Read from the working directory, like train.csv, instead of
# the unfilled '/path/to/your/test.csv' template placeholder.
test_df = pd.read_csv('test.csv')

# Preprocess the test data with the already-fitted preprocessor. Select exactly
# the columns it was fitted on, by name, rather than dropping a guessed
# identifier column: a missing training column then fails here with a clear
# KeyError, and extra test-only columns are simply left out.
X_test = preprocessor.transform(test_df[X.columns])

# Predict on test data
y_test_pred = model.predict(X_test)

In [None]:

# Assemble the submission table: one row per test record, keyed by the
# 'Row ID' column of the test file
row_ids = test_df['Row ID']
predicted_prices = y_test_pred.flatten()  # collapse the predictions to a 1D array
submission = pd.DataFrame({'Row ID': row_ids, 'price_doc': predicted_prices})

# Write the table to CSV without the DataFrame index
submission.to_csv('neural_network_submission.csv', index=False)