In [12]:
import pandas as pd
import numpy as np

# Set the seed for reproducibility
np.random.seed(0)

# Generate a fictional dataset
num_records = 1000

# Features
regions = ['North', 'South', 'East', 'West']
months = np.random.randint(1, 13, num_records)
marketing_spend = np.random.uniform(1000, 50000, num_records)
economic_index = np.random.uniform(0.5, 1.5, num_records) # Assuming a normalized index
competitor_activity = np.random.uniform(0, 10, num_records) # Score out of 10
historical_sales = np.random.randint(20, 500, num_records)

# Sales target variable (with a simple relation for the fictional data)
base_sales = 50
sales = base_sales * economic_index + 0.05 * marketing_spend - 5 * competitor_activity + 0.3 * historical_sales
sales = np.round(sales + np.random.normal(0, 10, num_records)) # Adding some noise

# Create the dataframe
data = {
    'Region': np.random.choice(regions, num_records),
    'Month': months,
    'Marketing Spend': marketing_spend,
    'Economic Index': economic_index,
    'Competitor Activity': competitor_activity,
    'Historical Sales': historical_sales,
    'Sales': sales
}

sales_data = pd.DataFrame(data)

# Saving the dataset as a .pkl file as per the user's preference
training_data_pkl = "data/sales_data.pkl"
sales_data.to_pickle(training_data_pkl)

#filename, sales_data.head()


In [13]:
len(training_data_pkl)

19

In [14]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Load the dataset
sales_data = pd.read_pickle(f"{training_data_pkl}")

# One-hot encode the 'Region' categorical feature
encoder = OneHotEncoder(sparse=False)
region_encoded = encoder.fit_transform(sales_data[['Region']])

# Normalize the numerical features
scaler = StandardScaler()
numerical_features = ['Month', 'Marketing Spend', 'Economic Index', 'Competitor Activity', 'Historical Sales']
scaled_numerical = scaler.fit_transform(sales_data[numerical_features])

# Combine the encoded categorical and scaled numerical features
processed_features = np.hstack((region_encoded, scaled_numerical))

# Define the target
target = sales_data['Sales'].values

# Split the dataset into training (80%) and testing (20%) sets
X_train, X_test, y_train, y_test = train_test_split(processed_features, target, test_size=0.2, random_state=0)

# Show the shape of the training and testing sets
X_train.shape, X_test.shape, y_train.shape, y_test.shape


((800, 9), (200, 9), (800,), (200,))

In [15]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

# Design the neural network model
model = Sequential([
    Dense(64, input_dim=X_train.shape[1], activation='relu'),  # Input layer & 1st Hidden layer with 64 neurons
    Dense(32, activation='relu'),  # 2nd Hidden layer with 32 neurons
    Dense(1, activation='linear')  # Output layer with 1 neuron for regression output
])

# Compile the model with the Adam optimizer and mean squared error loss function
model.compile(optimizer='adam', loss='mean_squared_error')

# Summary of the model to see the structure
model.summary()


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 64)                640       
                                                                 
 dense_1 (Dense)             (None, 32)                2080      
                                                                 
 dense_2 (Dense)             (None, 1)                 33        
                                                                 
Total params: 2,753
Trainable params: 2,753
Non-trainable params: 0
_________________________________________________________________


In [16]:
# Train the model
history = model.fit(X_train, y_train, epochs=100, batch_size=32, validation_split=0.2)


Epoch 1/100


2023-11-07 19:42:30.688011: W tensorflow/tsl/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz


Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 7

In [17]:
# Evaluate the model on the test set
test_loss = model.evaluate(X_test, y_test)

# Make predictions
predictions = model.predict(X_test)

# Calculate additional metrics, like MAE and R-squared
from sklearn.metrics import mean_absolute_error, r2_score
mae = mean_absolute_error(y_test, predictions)
r2 = r2_score(y_test, predictions)

# Display the evaluation results
print(f"Test MSE: {test_loss}")
print(f"Mean Absolute Error: {mae}")
print(f"R-squared: {r2}")


Test MSE: 1457.1072998046875
Mean Absolute Error: 28.04448860168457
R-squared: 0.9971967267376793


In [18]:
from sklearn.metrics import mean_absolute_error, r2_score
import pandas as pd

def evaluate_model_and_add_expectations(model, X_test, y_test):
    # Evaluate the model on the test set
    test_loss = model.evaluate(X_test, y_test, verbose=0)

    # Make predictions
    predictions = model.predict(X_test).flatten()

    # Calculate additional metrics, like MAE and R-squared
    mae = mean_absolute_error(y_test, predictions)
    r2 = r2_score(y_test, predictions)

    # Create a DataFrame for the test set with an additional column for predictions
    test_results = pd.DataFrame(X_test, columns=['Region_North', 'Region_South', 'Region_East', 'Region_West'] + numerical_features)
    test_results['Actual Sales'] = y_test
    test_results['Predicted Sales'] = predictions

    # Return the evaluation results and the test set with expectations
    evaluation_results = {
        'Test MSE': test_loss,
        'Mean Absolute Error': mae,
        'R-squared': r2
    }

    return evaluation_results, test_results

# Assume 'model' is your trained Keras model, 'X_test' is your test features, and 'y_test' are the true sales values
# Call the function with the trained model and test data
evaluation_results, test_results_with_expectations = evaluate_model_and_add_expectations(model, X_test, y_test)

# Now 'test_results_with_expectations' will have the actual and predicted sales




In [19]:
test_results_with_expectations

Unnamed: 0,Region_North,Region_South,Region_East,Region_West,Month,Marketing Spend,Economic Index,Competitor Activity,Historical Sales,Actual Sales,Predicted Sales
0,0.0,0.0,0.0,1.0,1.359084,1.537543,0.470067,-0.063569,0.373728,2481.0,2522.298584
1,0.0,1.0,0.0,0.0,-0.402909,-0.043736,-1.319288,0.418788,-1.241433,1276.0,1235.307495
2,0.0,0.0,0.0,1.0,-0.990240,-0.366504,1.200781,0.377495,-1.397968,1070.0,1060.696777
3,0.0,0.0,1.0,0.0,-0.990240,0.361536,0.789813,-1.141282,-0.223952,1656.0,1636.454712
4,1.0,0.0,0.0,0.0,-0.990240,-1.453435,0.499034,-1.410613,-1.561619,298.0,390.073975
...,...,...,...,...,...,...,...,...,...,...,...
195,0.0,0.0,1.0,0.0,1.359084,-0.850449,0.727438,1.065002,-1.084897,711.0,692.932495
196,1.0,0.0,0.0,0.0,-0.109244,-0.142240,-0.963638,-0.642229,1.419670,1322.0,1297.891724
197,1.0,0.0,0.0,0.0,-0.990240,-0.074166,-1.579489,-0.986084,0.380844,1334.0,1301.346924
198,1.0,0.0,0.0,0.0,0.184422,-1.277243,0.870549,1.544282,1.049677,510.0,503.842590


In [28]:
# Save the entire model to a HDF5 file
model.save('my_model.h5')


In [29]:
import joblib

# Save the encoder and scaler
joblib.dump(encoder, 'encoder.joblib')
joblib.dump(scaler, 'scaler.joblib')

# Load them back
loaded_encoder = joblib.load('encoder.joblib')
loaded_scaler = joblib.load('scaler.joblib')
