<a href="https://colab.research.google.com/github/vanshg11/Vansh_Gupta_Zeotap/blob/main/Vansh_Gupta_Lookalike.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
import numpy as np

In [2]:
# Load the three raw input tables (customer master data, transaction log,
# product catalogue) from CSV files in the current working directory.
customers, transactions, products = map(
    pd.read_csv,
    ('Customers_Zeotap.csv', 'Transactions_Zeotap.csv', 'Products_Zeotap.csv'),
)

In [3]:
# Attach the product attributes to every transaction row via an inner join on
# ProductID (transactions whose ProductID is absent from `products` are dropped).
transactions_products = pd.merge(transactions, products, on='ProductID')

In [4]:
# Total spend per (customer, category) pair, in long format.
category_spending = (
    transactions_products
    .groupby(['CustomerID', 'Category'])['TotalValue']
    .sum()
    .reset_index()
)

# Wide format: one row per customer, one column per category.
# A category the customer never bought from counts as zero spend.
category_features = category_spending.pivot(
    index='CustomerID', columns='Category', values='TotalValue'
).fillna(0)

# Overall spend and item count per customer.
customer_transactions = transactions.groupby('CustomerID').agg({
    'TotalValue': 'sum',
    'Quantity': 'sum'
}).reset_index()

# Only the spend-derived columns may be defaulted to 0. A frame-wide fillna(0)
# would also overwrite missing customer attributes (e.g. Region) with the
# integer 0, mixing types in columns that are encoded later.
spend_columns = ['TotalValue', 'Quantity', *category_features.columns]

# Left joins keep every customer, including those without any transaction;
# their spend columns come out as NaN and are zero-filled below.
customer_data = (
    customers
    .merge(customer_transactions, on='CustomerID', how='left')
    .merge(category_features, on='CustomerID', how='left')
    .fillna({column: 0 for column in spend_columns})
)

In [5]:
# One-hot encode the customer's region (dense output so it fits in a DataFrame).
encoder = OneHotEncoder(sparse_output=False)
region_encoded = encoder.fit_transform(customer_data[['Region']])
# Reuse customer_data's index: pd.concat(axis=1) aligns rows by index label,
# so a default RangeIndex here would misalign (or add NaN rows) whenever
# customer_data carries any other index.
region_encoded_df = pd.DataFrame(
    region_encoded,
    columns=encoder.get_feature_names_out(['Region']),
    index=customer_data.index,
)

# Assemble the model matrix: overall spend/quantity, region dummies and
# per-category spend, side by side with one row per customer.
numerical_features = customer_data[['TotalValue', 'Quantity']]
product_category_features = customer_data[category_features.columns]
features = pd.concat(
    [numerical_features, region_encoded_df, product_category_features],
    axis=1,
)


# Rescale each feature column with MinMaxScaler's default settings so that
# large monetary columns do not dominate the similarity computation.
scaler = MinMaxScaler()
scaler.fit(features)
features_scaled = scaler.transform(features)

# Pairwise cosine similarity between customers: entry [i, j] compares
# row i of the scaled feature matrix with row j.
similarity_matrix = cosine_similarity(features_scaled)

# Lookalikes are computed for the first 20 customers only; the candidate pool
# is every customer in customer_data.
all_customer_ids = customer_data['CustomerID'].values
customer_ids = all_customer_ids[:20]
lookalike_results = {}

for idx, customer_id in enumerate(customer_ids):
    row = similarity_matrix[idx]
    # Exclude the customer explicitly rather than assuming it sorts first:
    # another customer with an identical feature vector ties with self, and
    # the tie could otherwise list the customer as its own lookalike.
    candidates = np.delete(np.arange(row.shape[0]), idx)
    # Descending similarity; the stable sort breaks ties by row order so the
    # result is deterministic across runs.
    ranked = candidates[np.argsort(-row[candidates], kind='stable')]
    similar_indices = ranked[:3]
    similar_customers = all_customer_ids[similar_indices]
    similarity_scores = row[similar_indices]
    # Maps CustomerID -> [(lookalike CustomerID, similarity score), ...], best first.
    lookalike_results[customer_id] = list(zip(similar_customers, similarity_scores))


def _to_native(value):
    """Return the builtin Python equivalent of a NumPy scalar; pass anything else through."""
    return value.item() if isinstance(value, np.generic) else value


# One row per scored customer. The 'Lookalikes' cell is written to the CSV as
# the string form of a list of (CustomerID, score) tuples, so NumPy scalars are
# converted first; otherwise NumPy >= 2 renders them as "np.float64(...)".
lookalike_df = pd.DataFrame([
    {
        'CustomerID': cust_id,
        'Lookalikes': [
            (_to_native(other_id), _to_native(score))
            for other_id, score in matches
        ],
    }
    for cust_id, matches in lookalike_results.items()
])

lookalike_df.to_csv('Vansh_Gupta_Lookalike.csv', index=False)
print("Lookalike model results saved to 'Vansh_Gupta_Lookalike.csv'")

Lookalike model results saved to 'Vansh_Gupta_Lookalike.csv'
