In [1]:
import pandas as pd

### Data Extraction

Batch Fetching data if stored in feature store.

In [2]:
df = pd.read_csv('../data/cleaned-data-restaurant.csv')

### Data Validation

In [3]:
# data validation for numeric type

for col in df.columns:
    if df[col].dtype == 'float64' or df[col].dtype == 'int64':
        pass
    else:
        print("Data validation failed for column: ", col, " as it is not of numeric type")
        
# data validation check for missing values
for col in df.columns:
    if df[col].isna().sum() > 0:
        print("Data validation failed for column: ", col, " as it has missing values")

# data validation for rating should be between 0 and 10
for col in df.columns:
    if 'Rating' in col:
        if df[col].min() < 0 or df[col].max() > 10:
            print("Data validation failed for column: `", col, "` as rating is not between 0 and 10")

In [6]:
## Data stability check using evidently

from evidently.test_suite import TestSuite
from evidently.tests.base_test import generate_column_tests
from evidently.test_preset import DataStabilityTestPreset, NoTargetPerformanceTestPreset
from evidently.tests import *

tests = TestSuite(tests=[
    TestNumberOfColumnsWithMissingValues(),
    TestNumberOfRowsWithMissingValues(),
    TestNumberOfConstantColumns(),
    TestNumberOfDuplicatedRows(),
    TestNumberOfDuplicatedColumns(),
    TestColumnsType(),
])

tests.run(reference_data=None, current_data=df)
tests.as_dict()

{'tests': [{'name': 'The Number of Columns With Missing Values',
   'description': 'The number of columns with missing values is 0. The test threshold is eq=0.',
   'status': 'SUCCESS',
   'group': 'data_integrity',
   'parameters': {'condition': {'eq': 0}, 'value': 0.0}},
  {'name': 'The Number Of Rows With Missing Values',
   'description': 'The number of rows with missing values is 0. The test threshold is eq=0.',
   'status': 'SUCCESS',
   'group': 'data_integrity',
   'parameters': {'condition': {'eq': 0}, 'value': 0.0}},
  {'name': 'Number of Constant Columns',
   'description': 'The number of constant columns is 0. The test threshold is eq=0.',
   'status': 'SUCCESS',
   'group': 'data_integrity',
   'parameters': {'condition': {'eq': 0}, 'value': 0.0}},
  {'name': 'Number of Duplicate Rows',
   'description': 'The number of duplicate rows is 0. The test threshold is eq=0.',
   'status': 'SUCCESS',
   'group': 'data_integrity',
   'parameters': {'condition': {'eq': 0}, 'value': 

### Data Preparation

Train and Test Split, Normalization

In [5]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

X = df.drop('Annual Turnover', axis=1)
y = df['Annual Turnover']

X_scaled = pd.DataFrame({col: scaler.fit_transform(X[[col]])[:, 0] for col in X.columns})

X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

### Model Training

In [7]:
# Importing necessary libraries
import tensorflow as tf
from keras import Sequential
from keras.layers import Dense

# Define the neural network architecture
model = Sequential([
    Dense(256, activation='relu', input_shape=(X_train.shape[1],)),
    Dense(128, activation='relu', kernel_initializer='random_uniform'),
    Dense(64, activation='relu', kernel_initializer='random_uniform'),
    Dense(64, activation='relu', kernel_initializer='random_uniform'),
    Dense(32, activation='relu', kernel_initializer='random_uniform'),
    Dense(16, activation='relu', kernel_initializer='random_uniform'),
    Dense(1)  # Output layer with 1 neuron for regression
])

# Compile the model
model.compile(optimizer='adam', loss='mean_squared_error')

# Display the model summary
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_7 (Dense)             (None, 256)               8192      
                                                                 
 dense_8 (Dense)             (None, 128)               32896     
                                                                 
 dense_9 (Dense)             (None, 64)                8256      
                                                                 
 dense_10 (Dense)            (None, 64)                4160      
                                                                 
 dense_11 (Dense)            (None, 32)                2080      
                                                                 
 dense_12 (Dense)            (None, 16)                528       
                                                                 
 dense_13 (Dense)            (None, 1)                

In [8]:
# Train the model
history = model.fit(X_train, y_train, epochs=50, batch_size=32)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


### Model Evaluation

In [9]:
y_pred = model.predict(X_test)



In [10]:
from sklearn.metrics import mean_squared_error
import numpy as np

rmse = np.sqrt(mean_squared_error(y_test, y_pred))

print(rmse)

19928645.127132114


### Model Analysis and Model Validation

In [19]:
test_df = pd.read_csv('../data/test_separate.csv')
test_df.drop('Registration Number', axis=1, inplace=True)
test_df = pd.DataFrame({col: scaler.fit_transform(test_df[[col]])[:, 0] for col in test_df.columns})


In [20]:
y_pred_test = model.predict(test_df.iloc[:10, :])



In [21]:
y_pred_test.reshape(-1)

array([25947874., 35578844., 22312880., 51685184., 52054384., 39747880.,
       30223778., 50242480., 21572546., 15681174.], dtype=float32)

- Neural Network model performing better as compare to decision tree algorithm.