# Total Earnings Predictor

## Load and View Data

In [1]:
# import useful libraries for project
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

In [2]:
# Load the training split and the held-out test split from the exercise files
training_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        'Exercise Files/03/sales_data_training.csv',
        'Exercise Files/03/sales_data_test.csv',
    )
)

In [3]:
# Sanity check: show the first five rows to confirm the CSV was parsed as expected
training_df.head(n=5)

Unnamed: 0,critic_rating,is_action,is_exclusive_to_us,is_portable,is_role_playing,is_sequel,is_sports,suitable_for_kids,total_earnings,unit_price
0,3.5,1,0,1,0,1,0,0,132717,59.99
1,4.5,0,0,0,0,1,1,0,83407,49.99
2,3.0,0,0,0,0,1,1,0,62423,49.99
3,4.5,1,0,0,0,0,0,1,69889,39.99
4,4.0,1,0,1,0,1,0,1,161382,59.99


## Preprocess the Data

In [4]:
# Neural networks train best when every input lies in a small, comparable range,
# so create a scaler that maps each column onto [0, 1].
scaler = MinMaxScaler(feature_range=(0, 1))

In [5]:
# Learn each column's min/max from the training data only, then apply that same
# mapping to both the training data and the test data.
scaler.fit(training_df)
scaled_training_df = scaler.transform(training_df)
scaled_test_df = scaler.transform(test_df)

In [6]:
# Report the scaler's adjustment for the total_earnings column; these two numbers
# are needed later to convert a scaled prediction back to the original units.
# The column is located by name rather than by a hard-coded position, so the
# lookup stays correct if the column order of the CSV ever changes.
earnings_col_idx = training_df.columns.get_loc('total_earnings')
print('The scaler multiplied by {:.10f} and added {:.10f} on the total earnings column'.format(
    scaler.scale_[earnings_col_idx], scaler.min_[earnings_col_idx]))

The scaler multiplied by 0.0000036968 and added -0.1159128297 on the total earnings column


In [7]:
# The scaler returns plain NumPy arrays; wrap them in DataFrames again,
# reusing the original column labels so columns can be selected by name.
scaled_training_df_actual = pd.DataFrame(data=scaled_training_df, columns=training_df.columns)
scaled_test_df_actual = pd.DataFrame(data=scaled_test_df, columns=test_df.columns)

In [8]:
# Show the first five scaled rows; every value should now lie between 0 and 1
scaled_training_df_actual.head(n=5)

Unnamed: 0,critic_rating,is_action,is_exclusive_to_us,is_portable,is_role_playing,is_sequel,is_sports,suitable_for_kids,total_earnings,unit_price
0,0.5,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.374714,1.0
1,0.833333,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.192425,0.5
2,0.333333,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.114852,0.5
3,0.833333,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.142452,0.0
4,0.666667,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.480682,1.0


We can save the DataFrame objects as CSV files with the `df.to_csv` function so that they can be reused later.

## Create and Train Model

In [9]:
# Import the Sequential model and the layers we will be using for this model
from keras.models import Sequential
from keras.layers import Dense

In [10]:
# Separate the training set into the input features (every column except
# total_earnings) and the label (total_earnings), both as NumPy arrays.
y = scaled_training_df_actual['total_earnings'].to_numpy()
X = scaled_training_df_actual.drop(columns=['total_earnings']).to_numpy()

In [11]:
# Define the neural network model: three hidden ReLU layers (50 -> 100 -> 50 units)
# followed by a single linear output unit for the regression target.
# The input width is read from the feature matrix X instead of being hard-coded,
# so the model stays in sync with the number of feature columns.
model_nn = Sequential()
model_nn.add(Dense(50, activation='relu', input_dim=X.shape[1]))
model_nn.add(Dense(100, activation='relu'))
model_nn.add(Dense(50, activation='relu'))
model_nn.add(Dense(1, activation='linear'))

# Print the layer-by-layer overview (output shapes and parameter counts)
model_nn.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 50)                500       
_________________________________________________________________
dense_1 (Dense)              (None, 100)               5100      
_________________________________________________________________
dense_2 (Dense)              (None, 50)                5050      
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 51        
Total params: 10,701
Trainable params: 10,701
Non-trainable params: 0
_________________________________________________________________


The last layer contains only one node because the model's prediction is the output of that single node. The final layer uses a linear activation function because we are predicting a continuous quantitative value rather than a categorical or binary value.

In [12]:
# Configure training: minimise mean squared error using the Adam optimizer
model_nn.compile(loss='mse', optimizer='adam')

In [13]:
# Train for 50 passes over the training set, reshuffling the samples each epoch
# and logging one line per epoch (verbose=2).
model_nn.fit(x=X, y=y, epochs=50, shuffle=True, verbose=2)

Epoch 1/50
32/32 - 0s - loss: 0.0057
Epoch 2/50
32/32 - 0s - loss: 0.0011
Epoch 3/50
32/32 - 0s - loss: 5.8682e-04
Epoch 4/50
32/32 - 0s - loss: 3.3481e-04
Epoch 5/50
32/32 - 0s - loss: 1.8351e-04
Epoch 6/50
32/32 - 0s - loss: 1.3217e-04
Epoch 7/50
32/32 - 0s - loss: 1.1358e-04
Epoch 8/50
32/32 - 0s - loss: 8.4668e-05
Epoch 9/50
32/32 - 0s - loss: 7.1360e-05
Epoch 10/50
32/32 - 0s - loss: 5.0549e-05
Epoch 11/50
32/32 - 0s - loss: 4.6824e-05
Epoch 12/50
32/32 - 0s - loss: 4.9062e-05
Epoch 13/50
32/32 - 0s - loss: 5.0569e-05
Epoch 14/50
32/32 - 0s - loss: 4.4293e-05
Epoch 15/50
32/32 - 0s - loss: 3.4979e-05
Epoch 16/50
32/32 - 0s - loss: 5.2678e-05
Epoch 17/50
32/32 - 0s - loss: 3.2928e-05
Epoch 18/50
32/32 - 0s - loss: 4.3437e-05
Epoch 19/50
32/32 - 0s - loss: 4.4419e-05
Epoch 20/50
32/32 - 0s - loss: 5.8349e-05
Epoch 21/50
32/32 - 0s - loss: 6.0341e-05
Epoch 22/50
32/32 - 0s - loss: 4.1617e-05
Epoch 23/50
32/32 - 0s - loss: 3.0113e-05
Epoch 24/50
32/32 - 0s - loss: 2.9104e-05
Epoch 25/

<keras.callbacks.History at 0x202255d1eb0>

In [14]:
# Separate the test set the same way as the training set: features are every
# column except total_earnings, the label is total_earnings.
y_test = scaled_test_df_actual['total_earnings'].to_numpy()
X_test = scaled_test_df_actual.drop(columns=['total_earnings']).to_numpy()

In [15]:
# Measure the model's loss on the held-out test set. The model was compiled with
# loss='mse' and no extra metrics, so evaluate() returns that single loss value.
test_error_rate = model_nn.evaluate(X_test, y_test, verbose=0)
# Use a precision spec ('.10f'); the previous '10f' was a *width* spec, which
# padded the number with leading spaces and kept only six decimal places.
print('The mean squared error (MSE) for the test data set is {:.10f}'.format(test_error_rate))

The mean squared error (MSE) for the test data set is   0.000089


In [21]:
# Load the feature values of the proposed new product we want a prediction for
predict_df = pd.read_csv(filepath_or_buffer='Exercise Files/04/proposed_new_product.csv')

In [22]:
# Run the model on the proposed product and take the single predicted value
# (first row, first output) out of the returned 2-D array.
# NOTE(review): the features are passed to the model without going through
# `scaler`, so this presumably relies on the CSV already holding scaled values
# in the same column order as the training features - confirm.
prediction = model_nn.predict(predict_df.values)[0, 0]
prediction

0.87243444

In [23]:
# Undo the MinMaxScaler adjustment for the total_earnings column (the scaler
# multiplied by scale_ and added min_) to express the prediction in the
# original earnings units.
# The column is located by name instead of a hard-coded position, and the result
# is stored in a new variable so that re-running this cell cannot rescale an
# already rescaled value.
earnings_col_idx = training_df.columns.get_loc('total_earnings')
predicted_earnings = (prediction - scaler.min_[earnings_col_idx]) / scaler.scale_[earnings_col_idx]
print('Earning prediction for proposed product {}'.format(predicted_earnings))

Earning prediction for proposed product 267352.8774550557


The MSE of the model on the test set is extremely low (note that it is measured on the scaled 0–1 earnings values), so predictions should be fairly accurate. Note that above we rescaled the final prediction back to the original units; this is necessary because the model was trained on data that had been scaled with the MinMaxScaler.

In [24]:
# Persist the trained network to an HDF5 file so it can be reloaded later
model_nn.save(filepath='trained_earning_predictor_model.h5')