Nueral Network

In [25]:
import pandas as pd   
import time
from sklearn.model_selection import train_test_split #, RepeatedKFold, GridSearchCV, cross_val_score
from sklearn.metrics import roc_curve, roc_auc_score
from sklearn.preprocessing import LabelEncoder, StandardScaler, PolynomialFeatures
import warnings
import os
warnings.filterwarnings('ignore')

In [26]:
train_data = pd.read_csv('train.csv') 
train_data.shape

(181507, 272)

In [27]:
test_data = pd.read_csv('test.csv')
test_data = test_data.drop('row ID', axis=1)
test_data.shape

(77789, 271)

In [28]:
# One-hot encode 'sub_area' column in both datasets
train_data = pd.get_dummies(train_data, columns=['sub_area'], prefix='', prefix_sep='')
test_data = pd.get_dummies(test_data, columns=['sub_area'], prefix='', prefix_sep='')

# Identify sub_areas present in the training data but not in the test data
target_column = 'price_doc'
train_unique_sub_areas = set(train_data.columns) - set(test_data.columns) - {target_column}

# Create 'other_sub_area' column for both datasets
train_data['other_sub_area'] = 0
test_data['other_sub_area'] = 0

# Set values for 'other_sub_area' column in the training data
train_data.loc[:, 'other_sub_area'] = (
    (train_data[list(train_unique_sub_areas)].sum(axis=1) == 0) & (train_data['other_sub_area'] == 0)
).astype(int)

# Remove columns for uncommon sub_areas in the training data
train_data = train_data.drop(columns=list(train_unique_sub_areas))

# Convert data types to integer
# train_data = train_data.astype(int)
# test_data = test_data.astype(int)


In [29]:
categorical_cols = train_data.select_dtypes(include=['object']).columns

In [30]:
for col in categorical_cols:
    le = LabelEncoder()
    train_data[col] = le.fit_transform(train_data[col])


In [31]:
for col in categorical_cols:
    one_hot_encoded = pd.get_dummies(train_data[col], prefix=col, drop_first=True)
    train_data = pd.concat([train_data, one_hot_encoded], axis=1)
    train_data.drop(col, axis=1, inplace=True)

In [32]:
for col in categorical_cols:
    one_hot_encoded = pd.get_dummies(test_data[col], prefix=col, drop_first=True)
    test_data = pd.concat([test_data, one_hot_encoded], axis=1)
    test_data.drop(col, axis=1, inplace=True)

In [33]:
train_data = train_data.astype('float32')
test_data = test_data.astype('float32')

In [34]:
X = train_data.drop('price_doc', axis=1)
y = train_data['price_doc']

In [35]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
trainX, testX, trainy, testy = train_test_split(X_scaled, y, test_size=0.3, random_state=2)

In [36]:
testy.value_counts()

price_doc
1000000.00     1131
2000000.00      876
3480000.00      337
3000000.00      276
6000000.00      177
               ... 
2424427.00        1
3514343.25        1
11174028.00       1
13186980.00       1
10798442.00       1
Name: count, Length: 45880, dtype: int64

In [37]:
trainX.shape

(127054, 2183)

In [38]:
testX.shape

(54453, 2183)

In [39]:
trainX = trainX.astype('float32')
testX = testX.astype('float32')
trainy = trainy.astype('float32')
testy = testy.astype('float32')

In [40]:
import torch
os.environ["KERAS_BACKEND"] = "torch"
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from keras.models import Sequential
from keras.layers import Dense, Dropout
physical_devices = tf.config.list_physical_devices('GPU')
if len(physical_devices) > 0:
    tf.config.experimental.set_memory_growth(physical_devices[0], True)

In [41]:
n_features = trainX.shape[1]
# Create a Sequential model
model = keras.Sequential([
    layers.Dense(256, activation='relu', input_shape=[trainX.shape[1]]),
    layers.Dense(128, activation='relu'),
    layers.Dense(64, activation='relu'),
    layers.Dense(1)
])

In [42]:
model.compile(optimizer='adam', loss='mean_squared_error', metrics=['accuracy', 'mse']) #change optimizer

# Print the model summary
print(model.summary())

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_3 (Dense)             (None, 256)               559104    
                                                                 
 dense_4 (Dense)             (None, 128)               32896     
                                                                 
 dense_5 (Dense)             (None, 64)                8256      
                                                                 
 dense_6 (Dense)             (None, 1)                 65        
                                                                 
Total params: 600321 (2.29 MB)
Trainable params: 600321 (2.29 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
None


In [43]:
model.fit(trainX, trainy, epochs=100, batch_size=1000)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

<keras.src.callbacks.History at 0x2831f6d2890>

In [44]:
model.evaluate(testX, testy, batch_size=1000)



[190800432463872.0, 0.0, 190800432463872.0]

In [45]:
for col in categorical_cols:
    le = LabelEncoder()
    test_data[col] = le.fit_transform(test_data[col])

KeyError: 'product_type'

In [None]:
test_data_scaled = scaler.transform(test_data)

In [None]:
test_predictions = model.predict(test_data_scaled)
print(test_predictions.size)

In [None]:
test_predictions.shape

In [None]:
submission = pd.DataFrame({
    'row ID': range(1, 77790),
    'price_doc': test_predictions.flatten()
})

submission.to_csv('submission.csv', index=False)