In [1]:
from pathlib import Path

import pandas as pd

In [2]:
# Directory holding the three input CSVs. Kept in one place so the notebook can
# be pointed at another location by editing a single value.
# NOTE(review): "/content" looks like a hosted-notebook working directory — confirm.
DATA_DIR = Path("/content")

# Raw inputs: one frame per CSV, loaded with pandas' default parsing options.
customers = pd.read_csv(DATA_DIR / "zeotap_Customers.csv")
products = pd.read_csv(DATA_DIR / "zeotap_Products.csv")
transactions = pd.read_csv(DATA_DIR / "zeotap_Transactions.csv")

In [3]:
# Attach product attributes to every transaction row. This is an inner join
# (the pandas default), so transactions whose ProductID has no match are dropped.
# validate="many_to_one" makes pandas raise a MergeError if `products` contains a
# repeated ProductID; without it, such a duplicate would silently duplicate
# transaction rows and inflate every per-customer aggregate computed later.
transactions_products = pd.merge(
    transactions, products, on="ProductID", validate="many_to_one"
)

# Attach customer attributes the same way, with the same uniqueness guard on
# CustomerID in `customers`.
complete_data = pd.merge(
    transactions_products, customers, on="CustomerID", validate="many_to_one"
)

In [4]:
# Output column name -> (source column, aggregation) for the per-customer summary.
# The dict order is the column order of the resulting frame.
feature_spec = {
    "total_spending": ("TotalValue", "sum"),
    "total_quantity": ("Quantity", "sum"),
    "unique_products": ("ProductID", "nunique"),
    "unique_categories": ("Category", "nunique"),
    "transactions": ("TransactionID", "nunique"),
    "avg_spending_per_transaction": ("TotalValue", "mean"),
}

# One row per CustomerID; reset_index() turns the group key back into a regular
# first column so later cells can refer to it by name.
per_customer = complete_data.groupby("CustomerID")
customer_features = per_customer.agg(**feature_spec).reset_index()

In [5]:
# Inspect the per-customer aggregate frame; pandas abbreviates the middle rows of
# a long frame when rendering it, as the "..." row in the output shows.
customer_features

Unnamed: 0,CustomerID,total_spending,total_quantity,unique_products,unique_categories,transactions,avg_spending_per_transaction
0,C0001,3354.52,12,5,3,5,670.904000
1,C0002,1862.74,10,4,2,4,465.685000
2,C0003,2725.38,14,4,3,4,681.345000
3,C0004,5354.88,23,8,3,8,669.360000
4,C0005,2034.24,7,3,2,3,678.080000
...,...,...,...,...,...,...,...
194,C0196,4982.88,12,3,3,4,1245.720000
195,C0197,1928.65,9,3,2,3,642.883333
196,C0198,931.83,3,2,2,2,465.915000
197,C0199,1979.28,9,4,2,4,494.820000


In [6]:
# Count, for each customer, how many rows of complete_data fall in each Category
# (rows indexed by CustomerID, one column per Category value).
category_pref = pd.crosstab(complete_data['CustomerID'], complete_data['Category'])

# This cell rebinds customer_features from itself, so running it a second time
# would merge the category columns again and yield "<name>_x"/"<name>_y"
# duplicates that the scaling step would then treat as extra features.
# Dropping any category columns left by an earlier run first makes the cell
# idempotent; on the first run the drop is a no-op (errors="ignore").
# NOTE(review): assumes no Category value shares a name with an aggregate column.
customer_features = (
    customer_features
    .drop(columns=category_pref.columns, errors="ignore")
    .merge(category_pref, on="CustomerID", how="left")
)

In [7]:
from sklearn.preprocessing import MinMaxScaler

# Every column except the first one (CustomerID, the identifier) is rescaled.
normalized_columns = customer_features.columns[1:]

# Fit the scaler on those columns and compute their min-max scaled values
# (MinMaxScaler's default output range is [0, 1]).
scaler = MinMaxScaler()
scaled_values = scaler.fit_transform(customer_features[normalized_columns])

# Write the scaled values into a copy so customer_features keeps the raw numbers.
normalized_features = customer_features.copy()
normalized_features[normalized_columns] = scaled_values

normalized_features.head()

Unnamed: 0,CustomerID,total_spending,total_quantity,unique_products,unique_categories,transactions,avg_spending_per_transaction,Books,Clothing,Electronics,Home Decor
0,C0001,0.308942,0.354839,0.444444,0.666667,0.4,0.474336,0.2,0.0,0.6,0.166667
1,C0002,0.168095,0.290323,0.333333,0.333333,0.3,0.30894,0.0,0.4,0.0,0.333333
2,C0003,0.249541,0.419355,0.333333,0.666667,0.3,0.482751,0.0,0.2,0.2,0.333333
3,C0004,0.497806,0.709677,0.777778,0.666667,0.7,0.473092,0.6,0.0,0.4,0.5
4,C0005,0.184287,0.193548,0.222222,0.333333,0.2,0.48012,0.0,0.0,0.4,0.166667


In [8]:
from sklearn.metrics.pairwise import cosine_similarity

# Customers for whom lookalikes are reported: IDs C0001 through C0020.
first_20_customers = normalized_features[normalized_features["CustomerID"].isin([f"C{str(i).zfill(4)}" for i in range(1, 21)])]

# Numeric feature matrix with CustomerID as the index, so that the rows and
# columns of the similarity matrix can be labelled by customer.
feature_matrix = normalized_features.set_index("CustomerID")

# Pairwise cosine similarity between every pair of customers.
similarity_matrix = cosine_similarity(feature_matrix, feature_matrix)
similarity_df = pd.DataFrame(similarity_matrix, index=feature_matrix.index, columns=feature_matrix.index)

lookalikes = {}
for customer_id in first_20_customers["CustomerID"]:
    # Remove the customer's own entry by label before ranking. Skipping the first
    # sorted position instead is only right when the customer is strictly the
    # top match; with an exact tie, floating-point rounding, or an all-zero
    # feature vector (self-similarity 0) the customer could land elsewhere in the
    # ordering, end up in its own lookalike list and push out a real neighbour.
    similar_customers = similarity_df.loc[customer_id].drop(customer_id).nlargest(3)
    # Store (lookalike CustomerID, similarity score) pairs, best match first.
    lookalikes[customer_id] = list(zip(similar_customers.index, similar_customers.values))

# One row per target customer, with its list of (CustomerID, score) pairs.
lookalike_df = pd.DataFrame([
    {"CustomerID": customer, "Lookalikes": lookalike_list}
    for customer, lookalike_list in lookalikes.items()
])
lookalike_df


Unnamed: 0,CustomerID,Lookalikes
0,C0001,"[(C0035, 0.9873857534475428), (C0146, 0.985006..."
1,C0002,"[(C0134, 0.9852842475481981), (C0133, 0.978912..."
2,C0003,"[(C0166, 0.9958524019017425), (C0031, 0.987234..."
3,C0004,"[(C0075, 0.9787629414206808), (C0012, 0.974398..."
4,C0005,"[(C0197, 0.9963276954272414), (C0007, 0.991835..."
5,C0006,"[(C0135, 0.9882772249561678), (C0187, 0.982595..."
6,C0007,"[(C0005, 0.9918355844667078), (C0197, 0.985355..."
7,C0008,"[(C0162, 0.9836644162757511), (C0098, 0.971559..."
8,C0009,"[(C0056, 0.9420066590515385), (C0034, 0.937457..."
9,C0010,"[(C0077, 0.9785329281414131), (C0034, 0.976553..."
