# Task 2: Lookalike Model

Importing Libraries


In [13]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler, OneHotEncoder
import numpy as np

Loading datasets


In [14]:
# Load the three raw tables; file names are resolved relative to the
# notebook's working directory.
customers, products, transactions = (
    pd.read_csv(csv_name)
    for csv_name in ("Customers.csv", "Products.csv", "Transactions.csv")
)

Merging Data

In [15]:
# Left-join customer and product attributes onto every transaction row.
# Transactions and Products both carry a "Price" column, so the result holds
# Price_x (transaction side) and Price_y (product side).
merged_data = pd.merge(
    pd.merge(transactions, customers, how="left", on="CustomerID"),
    products,
    how="left",
    on="ProductID",
)

In [16]:
# Display the merged transaction-level table to confirm that the customer and
# product columns were attached to each transaction.
merged_data

Unnamed: 0,TransactionID,CustomerID,ProductID,TransactionDate,Quantity,TotalValue,Price_x,CustomerName,Region,SignupDate,ProductName,Category,Price_y
0,T00001,C0199,P067,2024-08-25 12:38:23,1,300.68,300.68,Andrea Jenkins,Europe,2022-12-03,ComfortLiving Bluetooth Speaker,Electronics,300.68
1,T00112,C0146,P067,2024-05-27 22:23:54,1,300.68,300.68,Brittany Harvey,Asia,2024-09-04,ComfortLiving Bluetooth Speaker,Electronics,300.68
2,T00166,C0127,P067,2024-04-25 07:38:55,1,300.68,300.68,Kathryn Stevens,Europe,2024-04-04,ComfortLiving Bluetooth Speaker,Electronics,300.68
3,T00272,C0087,P067,2024-03-26 22:55:37,2,601.36,300.68,Travis Campbell,South America,2024-04-11,ComfortLiving Bluetooth Speaker,Electronics,300.68
4,T00363,C0070,P067,2024-03-21 15:10:10,3,902.04,300.68,Timothy Perez,Europe,2022-03-15,ComfortLiving Bluetooth Speaker,Electronics,300.68
...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,T00496,C0118,P037,2024-10-24 08:30:27,1,459.86,459.86,Jacob Holt,South America,2022-01-22,SoundWave Smartwatch,Electronics,459.86
996,T00759,C0059,P037,2024-06-04 02:15:24,3,1379.58,459.86,Mrs. Kimberly Wright,North America,2024-04-07,SoundWave Smartwatch,Electronics,459.86
997,T00922,C0018,P037,2024-04-05 13:05:32,4,1839.44,459.86,Tyler Haynes,North America,2024-09-21,SoundWave Smartwatch,Electronics,459.86
998,T00959,C0115,P037,2024-09-29 10:16:02,2,919.72,459.86,Joshua Hamilton,Asia,2024-11-11,SoundWave Smartwatch,Electronics,459.86


Step 1: Feature Engineering

Creating customer features from: i) the customer's transaction details

In [17]:
# One row per customer: total spend, mean transaction value, total units
# bought and number of transactions.
customer_features = (
    merged_data
    .groupby("CustomerID")
    .agg(
        total_spent=pd.NamedAgg(column="TotalValue", aggfunc="sum"),
        avg_transaction_value=pd.NamedAgg(column="TotalValue", aggfunc="mean"),
        total_quantity=pd.NamedAgg(column="Quantity", aggfunc="sum"),
        transaction_count=pd.NamedAgg(column="TransactionID", aggfunc="count"),
    )
    .reset_index()
)

In [18]:
# Display the per-customer spend / quantity / transaction-count aggregates.
customer_features

Unnamed: 0,CustomerID,total_spent,avg_transaction_value,total_quantity,transaction_count
0,C0001,3354.52,670.904000,12,5
1,C0002,1862.74,465.685000,10,4
2,C0003,2725.38,681.345000,14,4
3,C0004,5354.88,669.360000,23,8
4,C0005,2034.24,678.080000,7,3
...,...,...,...,...,...
194,C0196,4982.88,1245.720000,12,4
195,C0197,1928.65,642.883333,9,3
196,C0198,931.83,465.915000,3,2
197,C0199,1979.28,494.820000,9,4


ii) the quantity the customer purchased in each product category

In [19]:
# Units purchased per (customer, category), pivoted so that each category
# becomes a column; customer/category pairs with no purchases are filled with 0.
category_preferences = (
    merged_data.groupby(["CustomerID", "Category"])["Quantity"].sum().unstack(fill_value=0)
)

# Drop any category columns left over from a previous run of this cell before
# merging. Without this, re-executing the cell merges the same columns twice
# and pandas renames them to suffixed duplicates (e.g. Books_x / Books_y),
# which would silently corrupt the feature matrix used further down.
customer_features = customer_features.drop(
    columns=category_preferences.columns, errors="ignore"
).merge(category_preferences, on="CustomerID", how="left")

In [20]:
# Display the feature table after the per-category quantity columns were added.
customer_features

Unnamed: 0,CustomerID,total_spent,avg_transaction_value,total_quantity,transaction_count,Books,Clothing,Electronics,Home Decor
0,C0001,3354.52,670.904000,12,5,2,0,7,3
1,C0002,1862.74,465.685000,10,4,0,4,0,6
2,C0003,2725.38,681.345000,14,4,0,4,4,6
3,C0004,5354.88,669.360000,23,8,8,0,6,9
4,C0005,2034.24,678.080000,7,3,0,0,4,3
...,...,...,...,...,...,...,...,...,...
194,C0196,4982.88,1245.720000,12,4,3,4,0,5
195,C0197,1928.65,642.883333,9,3,0,0,6,3
196,C0198,931.83,465.915000,3,2,0,2,1,0
197,C0199,1979.28,494.820000,9,4,0,0,3,6


iii) the customer's region (one-hot encoded)

In [21]:
# One-hot encode each customer's region. The fitted encoder stays in `ohe`
# because recommend_for_new_user() reuses it to encode a new customer's region.
ohe = OneHotEncoder(sparse_output=False)
region_encoded = pd.DataFrame(
    ohe.fit_transform(customers[["Region"]]),
    columns=ohe.get_feature_names_out(["Region"]),
    # Share the customers index so the column-wise concat below pairs every
    # encoded row with its own CustomerID even if `customers` is not indexed
    # 0..n-1 (e.g. after filtering).
    index=customers.index,
)

# Remove region columns left by a previous run of this cell so re-executing it
# does not produce suffixed duplicates (Region_Asia_x / Region_Asia_y).
customer_features = customer_features.drop(
    columns=region_encoded.columns, errors="ignore"
).merge(
    pd.concat([customers["CustomerID"], region_encoded], axis=1),
    on="CustomerID",
    how="left",
)

In [22]:
# Display the feature table after the one-hot region columns were added.
customer_features

Unnamed: 0,CustomerID,total_spent,avg_transaction_value,total_quantity,transaction_count,Books,Clothing,Electronics,Home Decor,Region_Asia,Region_Europe,Region_North America,Region_South America
0,C0001,3354.52,670.904000,12,5,2,0,7,3,0.0,0.0,0.0,1.0
1,C0002,1862.74,465.685000,10,4,0,4,0,6,1.0,0.0,0.0,0.0
2,C0003,2725.38,681.345000,14,4,0,4,4,6,0.0,0.0,0.0,1.0
3,C0004,5354.88,669.360000,23,8,8,0,6,9,0.0,0.0,0.0,1.0
4,C0005,2034.24,678.080000,7,3,0,0,4,3,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
194,C0196,4982.88,1245.720000,12,4,3,4,0,5,0.0,1.0,0.0,0.0
195,C0197,1928.65,642.883333,9,3,0,0,6,3,0.0,1.0,0.0,0.0
196,C0198,931.83,465.915000,3,2,0,2,1,0,0.0,1.0,0.0,0.0
197,C0199,1979.28,494.820000,9,4,0,0,3,6,0.0,1.0,0.0,0.0


Normalizing Features by scaling


In [23]:
# Standardise every feature column. The fitted scaler is kept so that a new
# customer's features can later be transformed in the same way.
scaler = StandardScaler()
scaled_features = scaler.fit_transform(customer_features.drop(columns="CustomerID"))

# Re-wrap the scaled array as a frame keyed by CustomerID. columns[1:] skips
# the leading CustomerID column so the names line up with the scaled values.
customer_features_scaled = pd.DataFrame(
    data=scaled_features,
    index=customer_features["CustomerID"],
    columns=customer_features.columns[1:],
)


Computing Similarity Matrix by cosine similarity


In [24]:
# Pairwise cosine similarity between all customers' scaled feature vectors;
# row/column i corresponds to row i of customer_features_scaled.
similarity_matrix = cosine_similarity(customer_features_scaled)

In [25]:
# Display the similarity matrix (diagonal entries are each customer's
# similarity with itself).
similarity_matrix

array([[ 1.        , -0.30294081,  0.62824386, ..., -0.11261071,
         0.01300488, -0.51789636],
       [-0.30294081,  1.        ,  0.13702542, ...,  0.25994147,
         0.26582844,  0.41256462],
       [ 0.62824386,  0.13702542,  1.        , ..., -0.0560142 ,
         0.1059976 , -0.17716757],
       ...,
       [-0.11261071,  0.25994147, -0.0560142 , ...,  1.        ,
         0.72266855, -0.40279628],
       [ 0.01300488,  0.26582844,  0.1059976 , ...,  0.72266855,
         1.        , -0.48021141],
       [-0.51789636,  0.41256462, -0.17716757, ..., -0.40279628,
        -0.48021141,  1.        ]])

Recommending the 3 most similar customers for an existing customer, based on profile and transaction history.

In [26]:
# Function to Get Lookalikes for Existing Customers
def get_lookalikes(customer_id, top_n=3):
    """Return the ``top_n`` existing customers most similar to ``customer_id``.

    Reads the module-level ``customer_features`` frame (to map CustomerIDs to
    row positions) and ``similarity_matrix`` (pairwise similarity scores in
    the same row order).

    Args:
        customer_id: CustomerID to find lookalikes for.
        top_n: Maximum number of lookalikes to return (default 3). Values
            below zero are treated as zero.

    Returns:
        List of ``(CustomerID, similarity_score)`` tuples ordered from most to
        least similar. The queried customer is never part of the result.

    Raises:
        IndexError: If ``customer_id`` is not present in ``customer_features``.
    """
    customer_ids = customer_features["CustomerID"].to_numpy()

    # similarity_matrix is addressed by row position, so resolve the customer
    # to a position rather than to an index label.
    positions = np.flatnonzero(customer_ids == customer_id)
    if positions.size == 0:
        raise IndexError(f"CustomerID {customer_id!r} not found in customer_features")
    idx = positions[0]

    # Similarity of the queried customer against every customer.
    similarity_scores = similarity_matrix[idx]

    # Rank by descending similarity, then remove the queried customer by
    # position. Slicing off the first entry is not safe: another customer with
    # an identical feature vector ties at 1.0 and may sort ahead of the query
    # customer, which would then be returned as its own lookalike.
    ranked = np.argsort(similarity_scores)[::-1]
    ranked = ranked[ranked != idx][: max(top_n, 0)]

    return [(customer_ids[i], similarity_scores[i]) for i in ranked]

Getting the input data for a new customer

In [27]:
# Functionality to Input New User Data
def recommend_for_new_user(top_n=3):
    """Interactively collect a new customer's profile and find their lookalikes.

    Prompts on stdin for the region, the spend/quantity totals and the
    per-category quantities, encodes and scales them with the already-fitted
    module-level ``ohe`` and ``scaler``, and ranks existing customers by
    cosine similarity against ``customer_features_scaled``.

    Args:
        top_n: Number of lookalikes to print and return (default 3). Values
            below zero are treated as zero.

    Returns:
        List of ``(CustomerID, similarity_score)`` tuples, most similar first.

    Raises:
        ValueError: If a numeric prompt receives non-numeric text, if the
            region is not one the encoder was fitted on, or if the collected
            features do not match the columns the scaler was fitted on.
    """
    print("\n--- Enter New User Details ---")

    # Input demographic and behavioral data
    region = input("Enter Region (e.g., Asia, Europe, etc.): ").strip()

    # Reject an unknown region immediately with a readable message instead of
    # failing inside the encoder after every other value has been typed in.
    known_regions = ohe.categories_[0].tolist()
    if region not in known_regions:
        raise ValueError(f"Unknown region {region!r}; expected one of {known_regions}")

    # Features are collected by name so they can be aligned to the fitted
    # column order below. Dict entries are evaluated top to bottom, which keeps
    # the prompt order unchanged.
    features = {
        "total_spent": float(input("Enter Total Spend (USD): ")),
        "avg_transaction_value": float(input("Enter Average Transaction Value (USD): ")),
        "total_quantity": int(input("Enter Total Quantity Purchased: ")),
        "transaction_count": int(input("Enter Total Number of Transactions: ")),
    }

    # Input product category preferences
    for category in category_preferences.columns:
        features[category] = int(
            input(f"Enter total quantity purchased for category '{category}': ")
        )

    # Encode region. The encoder was fitted on a frame with a "Region" column,
    # so pass a frame with the same column name to avoid sklearn's
    # feature-name mismatch warning.
    region_encoded_input = ohe.transform(pd.DataFrame({"Region": [region]}))[0]
    features.update(zip(ohe.get_feature_names_out(["Region"]), region_encoded_input))

    # Align the collected values to the fitted feature columns by name, so a
    # different column order cannot silently assign values to the wrong feature.
    expected_columns = list(customer_features_scaled.columns)
    if set(features) != set(expected_columns):
        raise ValueError(
            "New-user features do not match the fitted feature columns: "
            f"got {sorted(features)}, expected {sorted(expected_columns)}"
        )
    user_frame = pd.DataFrame([features], columns=expected_columns)

    # Scale the new user features
    user_features_scaled = scaler.transform(user_frame)

    # Compute similarity with all existing customers (single row of scores)
    user_similarity = cosine_similarity(user_features_scaled, customer_features_scaled)[0]

    # Pick the most similar customers; IDs come from the index of the matrix
    # that was actually compared against.
    top_indices = np.argsort(user_similarity)[::-1][: max(top_n, 0)]
    customer_ids = customer_features_scaled.index
    recommendations = [(customer_ids[i], user_similarity[i]) for i in top_indices]

    print(f"\n--- Top {top_n} Lookalike Customers ---")
    for rank, (cust_id, score) in enumerate(recommendations, 1):
        print(f"{rank}. Customer ID: {cust_id}, Similarity Score: {score:.2f}")

    return recommendations

Creating “Lookalike.csv” with the top 3 lookalikes and their similarity scores for the first 20 customers

In [28]:
# Look up the lookalikes for each of the first 20 rows of customer_features,
# keyed by CustomerID.
lookalike_map = {
    customer: get_lookalikes(customer)
    for customer in customer_features["CustomerID"].iloc[:20]
}


In [29]:
# Flatten each customer's lookalike list into one wide row
# (cust_id, lookalike_1, score_1, ..., lookalike_3, score_3) and write the
# table to Lookalike.csv.
lookalike_csv = []
for cust_id, lookalikes in lookalike_map.items():
    row = {"cust_id": cust_id}
    for rank in (1, 2, 3):
        match_id, match_score = lookalikes[rank - 1]
        row[f"lookalike_{rank}"] = match_id
        row[f"score_{rank}"] = round(match_score, 2)  # scores are stored to 2 decimals
    lookalike_csv.append(row)

lookalike_df = pd.DataFrame(lookalike_csv)
lookalike_df.to_csv("Lookalike.csv", index=False)


In [30]:
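# Interactive demo (prompts for input): uncomment to enter a new customer's
# details and print their lookalikes.
# recommend_for_new_user()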