In [109]:
import pandas as pd
import numpy as np

# Make numpy values easier to read.
np.set_printoptions(precision=3, suppress=True)

import tensorflow as tf
from tensorflow.keras import layers

In [110]:
data = pd.read_csv('training_data_student_perf.csv')
data

Unnamed: 0,Hours Studied,Previous Scores,Extracurricular Activities,Sleep Hours,Sample Question Papers Practiced,Performance Index
0,1,54,Yes,8,5,28.0
1,1,87,Yes,7,5,64.0
2,3,84,No,6,6,61.0
3,1,92,Yes,4,6,67.0
4,4,81,Yes,7,0,65.0
...,...,...,...,...,...,...
8995,8,50,Yes,6,6,48.0
8996,4,68,No,9,3,51.0
8997,9,48,No,7,6,44.0
8998,1,47,No,9,0,20.0


In [111]:
train = data.copy()
labels = train.pop('Performance Index')

In [112]:
# Preprocessing the data
inputs = {}

for name, col in train.items():
    dtype = col.dtype
    
    if dtype == object:
        dtype = tf.string
    else:
        dtype = tf.float32
    
    inputs[name] = tf.keras.Input(shape=(1,), name=name, dtype=dtype)
    
inputs

{'Hours Studied': <KerasTensor: shape=(None, 1) dtype=float32 (created by layer 'Hours Studied')>,
 'Previous Scores': <KerasTensor: shape=(None, 1) dtype=float32 (created by layer 'Previous Scores')>,
 'Extracurricular Activities': <KerasTensor: shape=(None, 1) dtype=string (created by layer 'Extracurricular Activities')>,
 'Sleep Hours': <KerasTensor: shape=(None, 1) dtype=float32 (created by layer 'Sleep Hours')>,
 'Sample Question Papers Practiced': <KerasTensor: shape=(None, 1) dtype=float32 (created by layer 'Sample Question Papers Practiced')>}

In [113]:
# concatenate the numeric inputs together, and run them through a normalization layer

numeric_inputs = {name: input for name, input in inputs.items() if input.dtype == tf.float32}

x = layers.Concatenate()(list(numeric_inputs.values()))     # Used to combine the numeric input layers into a single tensor
norm = layers.Normalization()
norm.adapt(np.array(data[numeric_inputs.keys()]))           # Sdapts the normalization layer to the statistics (mean and variance) of the numeric features in the training data
all_numeric_inputs = norm(x)

all_numeric_inputs

<KerasTensor: shape=(None, 4) dtype=float32 (created by layer 'normalization_6')>

In [114]:
preprocessed_inputs = [all_numeric_inputs]

In [115]:
# Dealing with the string inputs

for name, input in inputs.items():
    if input.dtype == tf.float32:
        continue
    
    lookup = layers.StringLookup(vocabulary=np.unique(train[name]))             # A preprocessing layer which maps string features to integer indices.
    one_hot = layers.CategoryEncoding(num_tokens=lookup.vocabulary_size())
    
    x = lookup(input)
    x = one_hot(x)
    
    preprocessed_inputs.append(x)

In [116]:
preprocessed_inputs

[<KerasTensor: shape=(None, 4) dtype=float32 (created by layer 'normalization_6')>,
 <KerasTensor: shape=(None, 3) dtype=float32 (created by layer 'category_encoding_6')>]

In [117]:
# Concatenate all the preprocessed inputs together, and build a model that handles the preprocessing

preprocessed_inputs_cat = layers.Concatenate()(preprocessed_inputs)

preprocessing_model = tf.keras.Model(inputs, preprocessed_inputs_cat)

# This model just contains the input preprocessing.
# Keras models don't automatically convert pandas DataFrames because it's not clear if it should be 
# converted to one tensor or to a dictionary of tensors. So, we convert it into a dictionary of tensors.

In [118]:
train_dict = {name: np.array(value) for name, value in train.items()}

In [119]:
# Passing one batch of data through the preprocessing model to see what happens to it.

features_dict = {name:values[:1] for name, values in train_dict.items()}
preprocessing_model(features_dict)

# We can see that the preprocessing model takes in the raw data and returns a single tensor for each example.
# The numeric features and string one-hots all concatenated together

<tf.Tensor: shape=(1, 7), dtype=float32, numpy=
array([[-1.537, -0.895,  0.858,  0.142,  0.   ,  0.   ,  1.   ]],
      dtype=float32)>

In [120]:
def build_model(preprocessing_head, inputs):
    body = tf.keras.Sequential([
        layers.Dense(32),
        layers.Dropout(0.5),
        layers.Dense(64),
        layers.Dropout(0.4),
        layers.Dense(1)
    ])
    
    preprocessed_inputs = preprocessing_head(inputs)
    result = body(preprocessed_inputs)
    
    model = tf.keras.Model(inputs, result)
    
    model.compile(loss=tf.losses.MeanSquaredError(), optimizer=tf.optimizers.Adam())
    
    return model

model = build_model(preprocessing_model, inputs)

In [121]:
model.fit(train_dict, labels, epochs=5)

""" 
Model v1:

Epoch 1/5
282/282 [==============================] - 8s 17ms/step - loss: 2582.1079
Epoch 2/5
282/282 [==============================] - 5s 18ms/step - loss: 426.6039
Epoch 3/5
282/282 [==============================] - 5s 16ms/step - loss: 8.5628             Overfitting?
Epoch 4/5
282/282 [==============================] - 7s 24ms/step - loss: 4.2276
Epoch 5/5
282/282 [==============================] - 7s 26ms/step - loss: 4.1705

Model v2:

Epoch 1/5
282/282 [==============================] - 6s 17ms/step - loss: 1231.4812
Epoch 2/5
282/282 [==============================] - 6s 22ms/step - loss: 183.0338
Epoch 3/5
282/282 [==============================] - 9s 31ms/step - loss: 162.2968
Epoch 4/5
282/282 [==============================] - 6s 22ms/step - loss: 145.6754
Epoch 5/5
282/282 [==============================] - 6s 21ms/step - loss: 132.4440

"""

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5




In [122]:
model.save('marks_predictor_v2.h5')

In [123]:
# # Saving the model using pickle
# import pickle

# with open('marks_predictor_v1.pkl', 'wb') as f:
#     pickle.dump(model, f)

In [124]:
loaded_model = tf.keras.models.load_model('marks_predictor_v2.h5')

In [125]:
# Preparing the test data in the same way we prepared the training data 

test_data = pd.read_csv('test_data_student_perf.csv')

test_dict = {name: np.array(value) for name, value in test_data.items()}

# Writing the predictions into a csv file
with open('predictions_v1.csv', 'w') as file:
    for pred in loaded_model.predict(test_dict):
        file.write(str(pred[0]) + '\n')


  inputs = self._flatten_to_reference_inputs(inputs)




In [126]:
# Function that takes ID,Hours Studied,Previous Scores,Extracurricular Activities,Sleep Hours,Sample Question Papers Practiced
# as inputs and returns the predicted performance index

def predict(hrs_std, prev_scr, ex_ac, sleep, qp):
    test = pd.DataFrame({'Hours Studied': [hrs_std], 'Previous Scores': [prev_scr], 'Extracurricular Activities': [ex_ac], 'Sleep Hours': [sleep], 'Sample Question Papers Practiced': [qp]})
    test_dict = {name: np.array(value) for name, value in test.items()}
    return model.predict(test_dict)[0][0]

print(predict(5, 90, "Yes", 7, 5))

73.039
