# Lookalike Dataset creation

In [1]:
! pip install scikit-learn




[notice] A new release of pip is available: 24.2 -> 25.0
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
import pandas as pd 
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics.pairwise import cosine_similarity


In [4]:
# Read the three source tables from the shared Datasets directory, in this order:
# customer profiles, transactions, products.
customers, transactions, products = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../Datasets/Customers.csv',
        '../Datasets/Transactions.csv',
        '../Datasets/Products.csv',
    )
)

In [9]:
# Preview the first five rows of the products table.
products.head(n=5)

Unnamed: 0,ProductID,ProductName,Category,Price
0,P001,ActiveWear Biography,Books,169.3
1,P002,ActiveWear Smartwatch,Electronics,346.3
2,P003,ComfortLiving Biography,Books,44.12
3,P004,BookWorld Rug,Home Decor,95.69
4,P005,TechPro T-Shirt,Clothing,429.31


In [12]:
# Preview the first five rows of the transactions table.
transactions.head(n=5)

Unnamed: 0,TransactionID,CustomerID,ProductID,TransactionDate,Quantity,TotalValue,Price
0,T00001,C0199,P067,2024-08-25 12:38:23,1,300.68,300.68
1,T00112,C0146,P067,2024-05-27 22:23:54,1,300.68,300.68
2,T00166,C0127,P067,2024-04-25 07:38:55,1,300.68,300.68
3,T00272,C0087,P067,2024-03-26 22:55:37,2,601.36,300.68
4,T00363,C0070,P067,2024-03-21 15:10:10,3,902.04,300.68


In [10]:
# Attach product attributes to every transaction. The left join keeps each
# transaction row even when its ProductID has no match in the products table.
# Both tables carry a Price column, so the result holds Price_x (transactions)
# and Price_y (products).
transactions_products = transactions.merge(products, how='left', on='ProductID')


In [19]:
# Preview the first five rows of the transactions-with-products table.
transactions_products.head(n=5)

Unnamed: 0,TransactionID,CustomerID,ProductID,TransactionDate,Quantity,TotalValue,Price_x,ProductName,Category,Price_y
0,T00001,C0199,P067,2024-08-25 12:38:23,1,300.68,300.68,ComfortLiving Bluetooth Speaker,Electronics,300.68
1,T00112,C0146,P067,2024-05-27 22:23:54,1,300.68,300.68,ComfortLiving Bluetooth Speaker,Electronics,300.68
2,T00166,C0127,P067,2024-04-25 07:38:55,1,300.68,300.68,ComfortLiving Bluetooth Speaker,Electronics,300.68
3,T00272,C0087,P067,2024-03-26 22:55:37,2,601.36,300.68,ComfortLiving Bluetooth Speaker,Electronics,300.68
4,T00363,C0070,P067,2024-03-21 15:10:10,3,902.04,300.68,ComfortLiving Bluetooth Speaker,Electronics,300.68


In [11]:
# Preview the first five rows of the raw transactions table.
transactions.head(n=5)

Unnamed: 0,TransactionID,CustomerID,ProductID,TransactionDate,Quantity,TotalValue,Price
0,T00001,C0199,P067,2024-08-25 12:38:23,1,300.68,300.68
1,T00112,C0146,P067,2024-05-27 22:23:54,1,300.68,300.68
2,T00166,C0127,P067,2024-04-25 07:38:55,1,300.68,300.68
3,T00272,C0087,P067,2024-03-26 22:55:37,2,601.36,300.68
4,T00363,C0070,P067,2024-03-21 15:10:10,3,902.04,300.68


In [None]:
# Preview the first five rows of the customers table.
customers.head(n=5)

Unnamed: 0,CustomerID,CustomerName,Region,SignupDate
0,C0001,Lawrence Carroll,South America,2022-07-10
1,C0002,Elizabeth Lutz,Asia,2022-02-13
2,C0003,Michael Rivera,South America,2024-03-07
3,C0004,Kathleen Rodriguez,South America,2022-10-09
4,C0005,Laura Weber,Asia,2022-08-15


In [16]:
# Attach customer profile columns to every transaction row. The left join
# keeps each transaction even when its CustomerID has no match in customers.
final_data = transactions_products.merge(customers, how='left', on='CustomerID')

In [18]:
# Preview the first five rows of the fully merged table.
final_data.head(n=5)

Unnamed: 0,TransactionID,CustomerID,ProductID,TransactionDate,Quantity,TotalValue,Price_x,ProductName,Category,Price_y,CustomerName,Region,SignupDate
0,T00001,C0199,P067,2024-08-25 12:38:23,1,300.68,300.68,ComfortLiving Bluetooth Speaker,Electronics,300.68,Andrea Jenkins,Europe,2022-12-03
1,T00112,C0146,P067,2024-05-27 22:23:54,1,300.68,300.68,ComfortLiving Bluetooth Speaker,Electronics,300.68,Brittany Harvey,Asia,2024-09-04
2,T00166,C0127,P067,2024-04-25 07:38:55,1,300.68,300.68,ComfortLiving Bluetooth Speaker,Electronics,300.68,Kathryn Stevens,Europe,2024-04-04
3,T00272,C0087,P067,2024-03-26 22:55:37,2,601.36,300.68,ComfortLiving Bluetooth Speaker,Electronics,300.68,Travis Campbell,South America,2024-04-11
4,T00363,C0070,P067,2024-03-21 15:10:10,3,902.04,300.68,ComfortLiving Bluetooth Speaker,Electronics,300.68,Timothy Perez,Europe,2022-03-15


In [22]:
# Build one row per customer: total quantity, mean product-table price
# (Price_y), total spend, and the purchased categories joined with spaces.
per_customer_aggregations = {
    'Quantity': 'sum',
    'Price_y': 'mean',
    'TotalValue': 'sum',
    'Category': lambda categories: ' '.join(categories),
}
customers_grouped = final_data.groupby('CustomerID')
customer_features = customers_grouped.agg(per_customer_aggregations).reset_index()

In [23]:
# Encode product categories as per-customer purchase counts.
# One-hot encoding the space-joined category string would create one column per
# distinct purchase sequence, so two customers would only match when they bought
# exactly the same categories in exactly the same order. Instead, each
# transaction's category is one-hot encoded and the indicators are summed per
# customer, which yields one column per actual category.
encoder = OneHotEncoder()
transaction_categories = encoder.fit_transform(final_data[['Category']]).toarray()
transaction_categories_df = pd.DataFrame(
    transaction_categories,
    columns=encoder.get_feature_names_out(['Category']),
)
encoded_categories_df = (
    transaction_categories_df
    # Group by plain values so the grouping is positional, row for row with final_data.
    .groupby(final_data['CustomerID'].to_numpy())
    .sum()
    # Put rows in the same customer order as customer_features, then switch to a
    # 0..n-1 index so a column-wise concat with customer_features lines up.
    .reindex(customer_features['CustomerID'].to_numpy())
    .reset_index(drop=True)
)
encoded_categories = encoded_categories_df.to_numpy()

In [24]:
# Assemble the feature table: the per-customer aggregates followed by the
# encoded category columns. The raw Category text column is not kept.
aggregates_without_text = customer_features.drop(columns=['Category'])
customer_features = pd.concat([aggregates_without_text, encoded_categories_df], axis=1)

In [25]:
# Standardize every feature column, i.e. everything after the leading
# CustomerID column (this includes the encoded category columns).
feature_values = customer_features.iloc[:, 1:]
scaler = StandardScaler()
scaled_features = scaler.fit_transform(feature_values)

In [26]:
# Pairwise cosine similarity between all customers' scaled feature rows;
# entry [i][j] compares row i with row j.
similarity_matrix = cosine_similarity(X=scaled_features, Y=None)


In [27]:
# Customer IDs in feature-row order, so position k here names row k of the similarity matrix.
customer_id_column = customer_features['CustomerID']
customer_ids = customer_id_column.to_list()

In [28]:
# Find the top 3 most similar customers for customers C0001 to C0020.
# - Targets are looked up by ID rather than by row position, so a customer
#   with no feature row cannot shift the window, and a table with fewer than
#   20 rows cannot raise IndexError.
# - The customer's own row is excluded explicitly instead of assuming it sorts
#   first; a tie at the top could otherwise list a customer as their own
#   lookalike and drop a real match.
# - Scores are converted to built-in floats so that the text form of the
#   result does not depend on how NumPy prints its scalar types.
row_of_customer = {customer_id: row for row, customer_id in enumerate(customer_ids)}
target_ids = [f'C{number:04d}' for number in range(1, 21)]

lookalike_map = {}
for customer_id in target_ids:
    row = row_of_customer.get(customer_id)
    if row is None:
        continue  # this customer has no feature row, so nothing to compare
    candidates = [
        (other_row, score)
        for other_row, score in enumerate(similarity_matrix[row])
        if other_row != row
    ]
    # Highest similarity first; the stable sort keeps row order among equal scores.
    candidates.sort(key=lambda pair: pair[1], reverse=True)
    lookalike_map[customer_id] = [
        (customer_ids[other_row], round(float(score), 3))
        for other_row, score in candidates[:3]
    ]

In [30]:
# Reshape the lookalike map into a two-column table: one row per customer,
# with that customer's list of (lookalike ID, score) pairs under 'Lookalikes'.
lookalike_data = []
for source_id, matches in lookalike_map.items():
    lookalike_data.append({'CustomerID': source_id, 'Lookalikes': matches})
lookalike_df = pd.DataFrame(lookalike_data)

In [31]:
# Write the lookalike table to Lookalike.csv in the working directory, omitting the row index.
lookalike_df.to_csv(path_or_buf='Lookalike.csv', index=False)