In [1]:
# preparation of dataset using GenZDealZ.ai platform, all the plotforms mention are from GenZDealZ.ai
platforms = ['AMAZON PRIME Membership','Flipkart','Amazon Shopping','Zomato','Myntra','Hotstar','McDonald', 
'Pizza Hut','OLA Cabs'
'PVR Cinemas',
'Reliance Digital',
'KFC',
'MakeMyTrip',
'Uber',
'Cafe Coffee Day',
'Blinkit',
'Swiggy',
'AJIO','Vijay Sales',
'Croma',
'Dominos',
'Wildcraft',
'Reliance Smart',
'Max Fashion',
'Netmeds',
'Puma',
'Westside',
'Fastrack',
'Nykaa',
'Bewakoof',
'Zee5',
'Gaana',
'Louis Philippe',
'@Home (Processing)',
'1mg Prescription Medicine',
'Abhibus',
'Absolute Barbecue',
'AD','Aeropostale','Aldo','Aligarh House by Eatfit','Allen Solly','ALT BALAJI','Amazon Kindle Unlimited','American Eagle','AND India','Anita Dongre','Apollo Pharmacy']

In [2]:
platforms

['AMAZON PRIME Membership',
 'Flipkart',
 'Amazon Shopping',
 'Zomato',
 'Myntra',
 'Hotstar',
 'McDonald',
 'Pizza Hut',
 'OLA CabsPVR Cinemas',
 'Reliance Digital',
 'KFC',
 'MakeMyTrip',
 'Uber',
 'Cafe Coffee Day',
 'Blinkit',
 'Swiggy',
 'AJIO',
 'Vijay Sales',
 'Croma',
 'Dominos',
 'Wildcraft',
 'Reliance Smart',
 'Max Fashion',
 'Netmeds',
 'Puma',
 'Westside',
 'Fastrack',
 'Nykaa',
 'Bewakoof',
 'Zee5',
 'Gaana',
 'Louis Philippe',
 '@Home (Processing)',
 '1mg Prescription Medicine',
 'Abhibus',
 'Absolute Barbecue',
 'AD',
 'Aeropostale',
 'Aldo',
 'Aligarh House by Eatfit',
 'Allen Solly',
 'ALT BALAJI',
 'Amazon Kindle Unlimited',
 'American Eagle',
 'AND India',
 'Anita Dongre',
 'Apollo Pharmacy']

Dataset Preparation according to the given Condition

In [3]:
import random

# Function to generate a random sequence of purchases
def generate_purchases(num_purchases):
    return random.sample(platforms, num_purchases)

# Function to create the dataset
def create_dataset(num_users):
    data = []
    for i in range(1, num_users + 1):
        user_id = f'user{i}'
        num_purchases = random.randint(1, len(platforms))  # Random number of purchases
        purchases = generate_purchases(num_purchases)
        data.append({'user': user_id, 'purchases': purchases})
    return data


In [4]:
# Generate a dataset with 50000 users
dataset = create_dataset(50000)

In [5]:
dataset[0]['purchases']

['Flipkart',
 'AND India',
 'Fastrack',
 'Nykaa',
 'Dominos',
 'Netmeds',
 'Aldo',
 'Amazon Shopping',
 'Pizza Hut',
 'Aligarh House by Eatfit',
 '@Home (Processing)',
 'ALT BALAJI',
 'Reliance Digital',
 'American Eagle',
 'Zee5',
 'AJIO',
 'Westside',
 'Apollo Pharmacy',
 'Vijay Sales',
 'AD']

Pre Processing Step

In [6]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import numpy as np

# Encoding dataset
all_purchases = [purchase for entry in dataset for purchase in entry['purchases']]
label_encoder = LabelEncoder()
label_encoder.fit(all_purchases)


In [7]:
len(all_purchases)

1196213

In [8]:

# Convert purchases to numerical data
def encode_purchases(purchases):
    return label_encoder.transform(purchases)

# Function to prepare input-output pairs for the model
def create_input_output_pairs(data):
    X, y = [], []
    for entry in data:
        purchases = encode_purchases(entry['purchases'])
        X.append(purchases[:-1])   # Input sequence
        y.append(purchases[-1])    # Output sequence(last purchase)
    return X, y


In [9]:
X, y = create_input_output_pairs(dataset)

In [10]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Pad sequences to ensure uniform length
max_length = max(len(seq) for seq in X)
X = pad_sequences(X, maxlen=max_length, padding='post')
y = np.array(y)

In [11]:
len(X[990]) # It ensures length of every seuence is equal

46

Model Development

In [12]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [13]:
from tensorflow.keras.utils import to_categorical
y_train = to_categorical(y_train, num_classes=len(label_encoder.classes_))
y_test = to_categorical(y_test, num_classes=len(label_encoder.classes_))


In [14]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Dense, Masking, LSTM, Flatten
from tensorflow.keras.metrics import Precision, Recall

model = Sequential([
    Embedding(input_dim=len(label_encoder.classes_), output_dim=100, input_length=max_length),
    LSTM(128, return_sequences=True),  # LSTM layer with return_sequences=True to pass sequences to dense layers
    Flatten(),  # Flatten the output of LSTM layer
    Dense(256, activation='relu'),  # Add a dense layer with ReLU activation
    Dense(len(label_encoder.classes_), activation='softmax')
])

model.compile(optimizer='adam', 
              loss='categorical_crossentropy',  # Use categorical crossentropy for one-hot encoded labels
              metrics=['accuracy', Precision(), Recall()])

# Display the model summary
model.summary()


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 46, 100)           4700      
                                                                 
 lstm (LSTM)                 (None, 46, 128)           117248    
                                                                 
 flatten (Flatten)           (None, 5888)              0         
                                                                 
 dense (Dense)               (None, 256)               1507584   
                                                                 
 dense_1 (Dense)             (None, 47)                12079     
                                                                 
Total params: 1,641,611
Trainable params: 1,641,611
Non-trainable params: 0
_________________________________________________________________


In [15]:
# Train the model
history = model.fit(X_train, y_train, epochs=10, batch_size=64, validation_split=0.2)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [16]:
# Evaluate the model on the test data
loss, accuracy, precision, recall = model.evaluate(X_test, y_test)
print(f'Loss: {loss}, Accuracy: {accuracy}, Precision: {precision}, Recall: {recall}')

Loss: 8.18142032623291, Accuracy: 0.04780000075697899, Precision: 0.06175335496664047, Recall: 0.03359999880194664


In [17]:
# Test the model with a single example
test_sequence = X_test[0]  # Select the first test sequence for demonstration
test_sequence = np.expand_dims(test_sequence, axis=0)  # Model expects 3D input (batch_size, sequence_length, num_features)
predicted_probabilities = model.predict(test_sequence)
predicted_class = np.argmax(predicted_probabilities, axis=-1)  # Get the class with the highest probability
predicted_platform = label_encoder.inverse_transform(predicted_class)
print(f"Predicted next purchase: {predicted_platform[0]}")

Predicted next purchase: Aligarh House by Eatfit


In [18]:
# The actual next purchase in the test set
actual_next_purchase = label_encoder.inverse_transform([np.argmax(y_test[0])])
print(f"Actual next purchase: {actual_next_purchase[0]}")

Actual next purchase: Zee5
