In [14]:
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from torch import nn
import torch.nn.functional as F
from sklearn.preprocessing import LabelEncoder
import ast

In [15]:
#Load the data
customer_base = pd.read_csv('customer_base.csv')
product_base = pd.read_csv('product_base.csv')
purchase_data = pd.read_csv('purchase_data.csv')

In [16]:
purchase_data

Unnamed: 0.1,Unnamed: 0,customer_idx,list_purch_product,list_purch_month_number,list_purch_txn_count
0,0,1001,"[101, 103, 105]","[1, 3, 5]","[2, 1, 1]"
1,1,1002,"[102, 104]","[2, 4]","[1, 2]"
2,2,1003,"[103, 103, 109]","[1, 2, 3]","[1, 1, 1]"
3,3,1004,"[108, 107]","[5, 6]","[3, 1]"
4,4,1005,"[101, 107, 103]","[1, 8, 12]","[1, 1, 1]"
5,5,1006,[110],[9],[3]


# Encode Customer and Product IDs

# Prepare Sequences

In [19]:
# Encode customer IDs
customer_encoder = LabelEncoder()
customer_base['customer_idx'] = customer_encoder.fit_transform(customer_base['customer_id'])
customer_id_to_idx = dict(zip(customer_base['customer_id'], customer_base['customer_idx']))

# Encode product IDs
product_encoder = LabelEncoder()
product_base['product_idx'] = product_encoder.fit_transform(product_base['product_id'])
product_id_to_idx = dict(zip(product_base['product_id'], product_base['product_idx']))

# Update purchase_data with encoded IDs
purchase_data['customer_idx'] = purchase_data['customer_idx'].map(lambda x: customer_id_to_idx.get(x, x))
print(purchase_data.info())
print(purchase_data['list_purch_product'])
print(product_id_to_idx[103])
def safe_convert(x):
    if isinstance(x, str):
        try:
            return [product_id_to_idx.get(pid, pid) for pid in ast.literal_eval(x)]
        except:
            print(f"Failed to convert: {x}")
            return []
    elif isinstance(x, list):
        return [product_id_to_idx.get(pid, pid) for pid in x]
    else:
        print(f"Unexpected type: {type(x)}, value: {x}")
        return []

purchase_data['list_purch_product'] = purchase_data['list_purch_product'].apply(safe_convert)

print(purchase_data['list_purch_product'])


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6 entries, 0 to 5
Data columns (total 5 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   Unnamed: 0               6 non-null      int64 
 1   customer_idx             6 non-null      int64 
 2   list_purch_product       6 non-null      object
 3   list_purch_month_number  6 non-null      object
 4   list_purch_txn_count     6 non-null      object
dtypes: int64(2), object(3)
memory usage: 368.0+ bytes
None
0    [101, 103, 105]
1         [102, 104]
2    [103, 103, 109]
3         [108, 107]
4    [101, 107, 103]
5              [110]
Name: list_purch_product, dtype: object
2
0    [0, 2, 4]
1       [1, 3]
2    [2, 2, 8]
3       [7, 6]
4    [0, 6, 2]
5          [9]
Name: list_purch_product, dtype: object


In [20]:
purchase_data

Unnamed: 0.1,Unnamed: 0,customer_idx,list_purch_product,list_purch_month_number,list_purch_txn_count
0,0,0,"[0, 2, 4]","[1, 3, 5]","[2, 1, 1]"
1,1,1,"[1, 3]","[2, 4]","[1, 2]"
2,2,2,"[2, 2, 8]","[1, 2, 3]","[1, 1, 1]"
3,3,3,"[7, 6]","[5, 6]","[3, 1]"
4,4,4,"[0, 6, 2]","[1, 8, 12]","[1, 1, 1]"
5,5,5,[9],[9],[3]


In [21]:

sequences = []
for _, row in purchase_data.iterrows():
    customer_idx = row['customer_idx']
    products = row['list_purch_product']
    for i in range(1, len(products)):
        sequences.append((customer_idx, products[:i], products[i]))

In [22]:
sequences

[(0, [0], 2),
 (0, [0, 2], 4),
 (1, [1], 3),
 (2, [2], 2),
 (2, [2, 2], 8),
 (3, [7], 6),
 (4, [0], 6),
 (4, [0, 6], 2)]

In [23]:
# Prepare Features
# Convert features to numpy arrays for quick access
customer_features = customer_base.set_index('customer_id').values
product_features = product_base.set_index('product_id').values

# Map indices to features
customer_features = {idx: feats for idx, feats in enumerate(customer_features)}
product_features = {idx: feats for idx, feats in enumerate(product_features)}

In [24]:
# Get max sequence length and number of products
max_seq_len = max(len(seq[1]) for seq in sequences)
num_products = len(product_id_to_idx)

In [25]:
max_seq_len

2

In [26]:
num_products

10

In [43]:
sequences

[(0, [0], 2),
 (0, [0, 2], 4),
 (1, [1], 3),
 (2, [2], 2),
 (2, [2, 2], 8),
 (3, [7], 6),
 (4, [0], 6),
 (4, [0, 6], 2)]

In [41]:
# Prepare samples
samples = []
negative_samples = 1
import random
for customer_id, seq, pos_product in sequences:
    # Positive sample
    samples.append((customer_id, seq, pos_product, 1))
    # Negative samples
    for _ in range(negative_samples):
        neg_product = random.randint(0, num_products - 1)
        while neg_product in seq:
            neg_product = random.randint(0, num_products - 1)
        samples.append((customer_id, seq, neg_product, 0))

In [42]:
samples

[(0, [0], 2, 1),
 (0, [0], 4, 0),
 (0, [0, 2], 4, 1),
 (0, [0, 2], 8, 0),
 (1, [1], 3, 1),
 (1, [1], 0, 0),
 (2, [2], 2, 1),
 (2, [2], 1, 0),
 (2, [2, 2], 8, 1),
 (2, [2, 2], 9, 0),
 (3, [7], 6, 1),
 (3, [7], 2, 0),
 (4, [0], 6, 1),
 (4, [0], 3, 0),
 (4, [0, 6], 2, 1),
 (4, [0, 6], 1, 0)]

In [48]:
# Function to get a single item
def get_item(idx):
    customer_id, seq, product_id, label = samples[idx]
    
    # Pad sequence
    seq_padded = seq + [0]*(max_seq_len - len(seq)) if len(seq) < max_seq_len else seq[-max_seq_len:]
    print(seq_padded)
    # Get features
    customer_feat = customer_features[customer_id]
    seq_product_feats = [product_features[pid] for pid in seq_padded]
    product_feat = product_features[product_id]
    print(f"customer_feat is equal to : {customer_feat}")
    
    return {
        'customer_feat': torch.tensor(customer_feat, dtype=torch.float),
        'seq_product_feats': torch.tensor(seq_product_feats, dtype=torch.float),
        'sequence': torch.tensor(seq_padded, dtype=torch.long),
        'product_feat': torch.tensor(product_feat, dtype=torch.float),
        'product_id': torch.tensor(product_id, dtype=torch.long),
        'label': torch.tensor(label, dtype=torch.float)
    }

In [49]:
# View a few samples
for i in range(5):
    sample = get_item(i)
    print(f"Sample {i}:")
    for key, value in sample.items():
        print(f"  {key}: {value}")
    print()

[0, 0]
[0 28 'Female' 'New York' '2019-06-15' 75 "['Electronics', 'Books']"
 150.75 25 '2021-08-10' 0]


TypeError: can't convert np.ndarray of type numpy.object_. The only supported types are: float64, float32, float16, complex64, complex128, int64, int32, int16, int8, uint8, and bool.