Lookalike Model 
Build a Lookalike Model that takes a user's information as input and recommends 3 similar 
customers based on their profile and transaction history. The model should: 
● Use both customer and product information. 
● Assign a similarity score to each recommended customer. 
Deliverables: 
● Give the top 3 lookalikes with their similarity scores for the first 20 customers 
(CustomerID: C0001 - C0020) in Customers.csv. Form a “Lookalike.csv” which has 
just one map: Map<cust_id, List<cust_id, score>>   
● A Jupyter Notebook/Python script explaining your model development.

In [50]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

In [52]:
# Load the raw customer table from "Customers.csv" (path is relative to the
# notebook's working directory) and preview its first rows.
customers = pd.read_csv("Customers.csv")
customers.head()

Unnamed: 0,CustomerID,CustomerName,Region,SignupDate
0,C0001,Lawrence Carroll,South America,2022-07-10
1,C0002,Elizabeth Lutz,Asia,2022-02-13
2,C0003,Michael Rivera,South America,2024-03-07
3,C0004,Kathleen Rodriguez,South America,2022-10-09
4,C0005,Laura Weber,Asia,2022-08-15


In [54]:
# Load the raw product catalogue from "Products.csv" (path is relative to the
# notebook's working directory) and preview its first rows.
products = pd.read_csv("Products.csv")
products.head()

Unnamed: 0,ProductID,ProductName,Category,Price
0,P001,ActiveWear Biography,Books,169.3
1,P002,ActiveWear Smartwatch,Electronics,346.3
2,P003,ComfortLiving Biography,Books,44.12
3,P004,BookWorld Rug,Home Decor,95.69
4,P005,TechPro T-Shirt,Clothing,429.31


In [56]:
# Preprocessing customer data
def preprocess_customer_data(customers, reference_date=None):
    """Derive customer tenure and drop columns not used as model features.

    The input frame is left untouched; a new frame is returned. (The previous
    implementation modified the caller's frame in place, which made the
    function fail with ``KeyError`` when called twice on the same object.)

    Parameters
    ----------
    customers : pandas.DataFrame
        Must contain the columns ``'SignupDate'`` (anything
        ``pd.to_datetime`` can parse) and ``'CustomerName'``.
    reference_date : datetime-like, optional
        The date tenure is measured against. Defaults to the current
        timestamp (``pd.Timestamp.now()``), which matches the original
        behaviour. Pass a fixed date to get reproducible
        ``DaysSinceSignup`` values.

    Returns
    -------
    pandas.DataFrame
        A copy of ``customers`` without ``'CustomerName'`` and
        ``'SignupDate'`` and with an added integer ``'DaysSinceSignup'``
        column appended after the remaining columns.

    Raises
    ------
    KeyError
        If ``'SignupDate'`` or ``'CustomerName'`` is missing.
    """
    if reference_date is None:
        reference_date = pd.Timestamp.now()
    else:
        reference_date = pd.Timestamp(reference_date)

    signup_dates = pd.to_datetime(customers['SignupDate'])
    days_since_signup = (reference_date - signup_dates).dt.days

    # assign/drop both return new frames, so the caller's data is never mutated.
    return (
        customers
        .assign(DaysSinceSignup=days_since_signup)
        .drop(columns=['CustomerName', 'SignupDate'])
    )

# Preprocessing product data
def preprocess_product_data(products):
    """Return the product table without the free-text ``'ProductName'`` column.

    The input frame is left untouched; a new frame is returned. (The previous
    implementation dropped the column in place, so calling it twice on the
    same object raised ``KeyError``.)

    Parameters
    ----------
    products : pandas.DataFrame
        Must contain a ``'ProductName'`` column.

    Returns
    -------
    pandas.DataFrame
        A copy of ``products`` with ``'ProductName'`` removed.

    Raises
    ------
    KeyError
        If ``'ProductName'`` is missing.
    """
    return products.drop(columns=['ProductName'])

# Apply both preprocessing helpers, rebinding the original names to the
# processed frames.
# NOTE: this cell is not re-runnable on its own. After the first run
# `customers` no longer has 'SignupDate'/'CustomerName' and `products` no
# longer has 'ProductName', so a second run raises KeyError; re-run the
# loading cells first.
customers = preprocess_customer_data(customers)
products = preprocess_product_data(products)


In [58]:
# Simulating the transaction history
# NOTE(review): these transactions are randomly generated, not loaded from a
# file, so every downstream feature and similarity score reflects random
# purchases. Swap in the real transaction data when it is available.
N_SIMULATED_TRANSACTIONS = 1000
SIMULATION_SEED = 42

# A seeded generator makes the simulated history (and therefore the final
# lookalike scores) identical on every Restart & Run All.
simulation_rng = np.random.default_rng(SIMULATION_SEED)

transaction_history = pd.DataFrame({
    # Each simulated transaction pairs a uniformly sampled customer with a
    # uniformly sampled product; sampling is with replacement.
    'CustomerID': simulation_rng.choice(
        customers['CustomerID'].to_numpy(), size=N_SIMULATED_TRANSACTIONS, replace=True
    ),
    'ProductID': simulation_rng.choice(
        products['ProductID'].to_numpy(), size=N_SIMULATED_TRANSACTIONS, replace=True
    ),
})

In [60]:
# Attach the product attributes to every transaction. A left join keeps each
# transaction row even when its ProductID has no match in `products`.
transaction_data = pd.merge(
    transaction_history,
    products,
    how='left',
    on='ProductID',
)

In [62]:
# Aggregate transaction data by customer
def _first_mode(values):
    """Return the first entry of ``values.mode()``, or None if it is empty."""
    modes = values.mode()
    if len(modes) > 0:
        return modes[0]
    return None


# One row per customer: mean price of the products they bought and their
# most frequent product category.
per_customer_aggregations = {
    'Price': 'mean',
    'Category': _first_mode,
}
customer_product_data = (
    transaction_data
    .groupby('CustomerID')
    .agg(per_customer_aggregations)
    .reset_index()
)


In [64]:
# Combine the customer profile with the per-customer purchase summary. The
# left join keeps customers that have no rows in the summary; their summary
# columns come out as missing values.
customer_data = pd.merge(
    customers,
    customer_product_data,
    how='left',
    on='CustomerID',
)

In [66]:
# Replace missing purchase-summary values with explicit placeholders:
# a price of 0 and the category label "Unknown".
customer_data = customer_data.fillna({'Price': 0, 'Category': "Unknown"})

In [68]:
# Feature preprocessing for the mixed numeric / categorical customer table.
numeric_columns = ['DaysSinceSignup', 'Price']
categorical_columns = ['Region', 'Category']

# Numeric features: fill any missing value with the column mean, then standardize.
numeric_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

# Categorical features: one-hot encode; categories not seen during fit are
# ignored at transform time instead of raising.
categorical_encoder = OneHotEncoder(handle_unknown='ignore')

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_pipeline, numeric_columns),
        ('cat', categorical_encoder, categorical_columns),
    ]
)

In [70]:
# Preprocessing data
# Fit the ColumnTransformer on the merged customer table and transform it in
# one step. The result has one row per row of `customer_data`, in the same
# order, which the later index-based lookups depend on.
# NOTE(review): the output may be a SciPy sparse matrix depending on the
# encoder's output density -- confirm before indexing it directly.
customer_features = preprocessor.fit_transform(customer_data)

In [72]:
# Computing similarity scores using cosine similarity
# Pairwise similarity between all feature rows: entry [i, j] compares customer
# row i with customer row j, so the diagonal holds each customer compared with
# itself.
similarity_matrix = cosine_similarity(customer_features)

In [76]:
# Generating lookalike recommendations for the first 20 customers
N_TARGET_CUSTOMERS = 20   # rows 0..19 of customer_data
TOP_K_LOOKALIKES = 3      # neighbours reported per target customer

customer_ids = customer_data['CustomerID'].to_numpy()

lookalike_map = {}
for idx, customer_id in enumerate(customer_ids[:N_TARGET_CUSTOMERS]):
    # Rank all customers by descending similarity. The stable sort on the
    # negated scores breaks ties by original row order, so results are
    # deterministic.
    ranked_indices = np.argsort(-similarity_matrix[idx], kind='stable')

    # Remove the customer's own row explicitly. Relying on "self is always the
    # single highest score" is unsafe: another customer with identical features
    # (or float round-off) can tie with or outrank the self-similarity, which
    # previously let a customer appear as their own lookalike while a genuine
    # neighbour was dropped.
    neighbour_indices = ranked_indices[ranked_indices != idx][:TOP_K_LOOKALIKES]

    # Cast to built-in str/float so the stringified list written to the CSV
    # contains plain literals (NumPy >= 2 would otherwise render scores as
    # "np.float64(0.94)").
    lookalike_map[customer_id] = [
        (str(customer_ids[i]), round(float(similarity_matrix[idx, i]), 2))
        for i in neighbour_indices
    ]

# Saving lookalike map to CSV: one row per target customer, with the list of
# (lookalike_id, score) pairs stored as its string representation.
lookalike_df = pd.DataFrame([
    {'CustomerID': cust_id, 'Lookalikes': str(lookalikes)}
    for cust_id, lookalikes in lookalike_map.items()
])
lookalike_df.to_csv('Lookalike.csv', index=False)

# Printing the lookalike map
print(lookalike_df)

   CustomerID                                         Lookalikes
0       C0001  [('C0184', 0.94), ('C0004', 0.92), ('C0104', 0...
1       C0002  [('C0134', 1.0), ('C0162', 0.98), ('C0159', 0....
2       C0003  [('C0181', 0.98), ('C0195', 0.97), ('C0052', 0...
3       C0004  [('C0184', 0.98), ('C0174', 0.96), ('C0001', 0...
4       C0005  [('C0021', 0.84), ('C0173', 0.83), ('C0045', 0...
5       C0006  [('C0076', 0.99), ('C0013', 0.98), ('C0137', 0...
6       C0007  [('C0161', 0.99), ('C0045', 0.91), ('C0173', 0...
7       C0008  [('C0050', 0.97), ('C0189', 0.94), ('C0036', 0...
8       C0009  [('C0119', 0.97), ('C0051', 0.95), ('C0170', 0...
9       C0010  [('C0141', 0.97), ('C0037', 0.96), ('C0121', 0...
10      C0011  [('C0102', 0.98), ('C0071', 0.95), ('C0025', 0...
11      C0012  [('C0013', 0.98), ('C0137', 0.94), ('C0006', 0...
12      C0013  [('C0137', 0.99), ('C0012', 0.98), ('C0076', 0...
13      C0014  [('C0182', 0.97), ('C0089', 0.94), ('C0064', 0...
14      C0015  [('C0059',