In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [5]:
customers = pd.read_csv('Customers.csv')
products = pd.read_csv('Products.csv')
transactions = pd.read_csv('Transactions.csv')

merge = transactions.merge(customers, on='CustomerID', how='left').merge(products, on='ProductID', how='left')
merged=merge.drop(['Price_y'], axis=1)
merged.head()

Unnamed: 0,TransactionID,CustomerID,ProductID,TransactionDate,Quantity,TotalValue,Price_x,CustomerName,Region,SignupDate,ProductName,Category
0,T00001,C0199,P067,2024-08-25 12:38:23,1,300.68,300.68,Andrea Jenkins,Europe,2022-12-03,ComfortLiving Bluetooth Speaker,Electronics
1,T00112,C0146,P067,2024-05-27 22:23:54,1,300.68,300.68,Brittany Harvey,Asia,2024-09-04,ComfortLiving Bluetooth Speaker,Electronics
2,T00166,C0127,P067,2024-04-25 07:38:55,1,300.68,300.68,Kathryn Stevens,Europe,2024-04-04,ComfortLiving Bluetooth Speaker,Electronics
3,T00272,C0087,P067,2024-03-26 22:55:37,2,601.36,300.68,Travis Campbell,South America,2024-04-11,ComfortLiving Bluetooth Speaker,Electronics
4,T00363,C0070,P067,2024-03-21 15:10:10,3,902.04,300.68,Timothy Perez,Europe,2022-03-15,ComfortLiving Bluetooth Speaker,Electronics


**Feature Engineering**

In [8]:
# Aggregate transaction data for each customer
customer_features = merged.groupby('CustomerID').agg(
    TotalSpending=('TotalValue', 'sum'),
    TotalQuantity=('Quantity', 'sum'),
    AveragePrice=('Price_x', 'mean'),
    UniqueProducts=('ProductID', 'nunique'),
    CategoryPreferences=('Category', lambda x: ','.join(x))
).reset_index()

customer_features

Unnamed: 0,CustomerID,TotalSpending,TotalQuantity,AveragePrice,UniqueProducts,CategoryPreferences
0,C0001,3354.52,12,278.334000,5,"Books,Home Decor,Electronics,Electronics,Elect..."
1,C0002,1862.74,10,208.920000,4,"Home Decor,Home Decor,Clothing,Clothing"
2,C0003,2725.38,14,195.707500,4,"Home Decor,Home Decor,Clothing,Electronics"
3,C0004,5354.88,23,240.636250,8,"Books,Home Decor,Home Decor,Home Decor,Books,B..."
4,C0005,2034.24,7,291.603333,3,"Home Decor,Electronics,Electronics"
...,...,...,...,...,...,...
194,C0196,4982.88,12,416.992500,3,"Books,Clothing,Home Decor,Home Decor"
195,C0197,1928.65,9,227.056667,3,"Home Decor,Electronics,Electronics"
196,C0198,931.83,3,239.705000,2,"Electronics,Clothing"
197,C0199,1979.28,9,250.610000,4,"Electronics,Home Decor,Home Decor,Electronics"


In [10]:
# 2. Encode the 'Region' and 'Category' columns
category_dummies = pd.get_dummies(merged['Category'], prefix='Category').groupby(merged['CustomerID']).sum().reset_index()
#category_dummies.head()

In [11]:
# Merge all features
customers['CustomerID'] = customers['CustomerID'].astype(str)

customer_features = customer_features.merge(category_dummies, on='CustomerID', how='left')
customer_features.head()

Unnamed: 0,CustomerID,TotalSpending,TotalQuantity,AveragePrice,UniqueProducts,CategoryPreferences,Category_Books,Category_Clothing,Category_Electronics,Category_Home Decor
0,C0001,3354.52,12,278.334,5,"Books,Home Decor,Electronics,Electronics,Elect...",1,0,3,1
1,C0002,1862.74,10,208.92,4,"Home Decor,Home Decor,Clothing,Clothing",0,2,0,2
2,C0003,2725.38,14,195.7075,4,"Home Decor,Home Decor,Clothing,Electronics",0,1,1,2
3,C0004,5354.88,23,240.63625,8,"Books,Home Decor,Home Decor,Home Decor,Books,B...",3,0,2,3
4,C0005,2034.24,7,291.603333,3,"Home Decor,Electronics,Electronics",0,0,2,1


In [13]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
numerical_cols = ['TotalSpending', 'TotalQuantity', 'AveragePrice', 'UniqueProducts']
customer_features[numerical_cols] = scaler.fit_transform(customer_features[numerical_cols])
customer_features.head()

Unnamed: 0,CustomerID,TotalSpending,TotalQuantity,AveragePrice,UniqueProducts,CategoryPreferences,Category_Books,Category_Clothing,Category_Electronics,Category_Home Decor
0,C0001,-0.061701,-0.122033,0.09467,0.050047,"Books,Home Decor,Electronics,Electronics,Elect...",1,0,3,1
1,C0002,-0.877744,-0.448,-0.904016,-0.424204,"Home Decor,Home Decor,Clothing,Clothing",0,2,0,2
2,C0003,-0.405857,0.203934,-1.094109,-0.424204,"Home Decor,Home Decor,Clothing,Electronics",0,1,1,2
3,C0004,1.032547,1.670787,-0.447702,1.472798,"Books,Home Decor,Home Decor,Home Decor,Books,B...",3,0,2,3
4,C0005,-0.783929,-0.936951,0.285581,-0.898455,"Home Decor,Electronics,Electronics",0,0,2,1


In [17]:
#Similarity Calculation
# Create a similarity matrix using cosine similarity
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import OneHotEncoder

# Create a OneHotEncoder object
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore') # sparse=False for dense output

# Fit and transform the 'Category' column
encoded_categories = encoder.fit_transform(customer_features[['CategoryPreferences']])

# Create a DataFrame from the encoded categories
encoded_categories_df = pd.DataFrame(encoded_categories,
                                      columns=encoder.get_feature_names_out(['CategoryPreferences']),
                                      index=customer_features.index)

# Concatenate the encoded categories with the numerical features
feature_matrix = pd.concat([customer_features.drop(columns=['CustomerID', 'CategoryPreferences']),
                           encoded_categories_df], axis=1).fillna(0)

# Calculate the similarity matrix
similarity_matrix = cosine_similarity(feature_matrix)
print(similarity_matrix)

[[1.         0.17428553 0.47977942 ... 0.29555398 0.72510089 0.47600979]
 [0.17428553 1.         0.76615994 ... 0.48166099 0.51190009 0.48551541]
 [0.47977942 0.76615994 1.         ... 0.36741853 0.71425336 0.5043656 ]
 ...
 [0.29555398 0.48166099 0.36741853 ... 1.         0.48615094 0.10252277]
 [0.72510089 0.51190009 0.71425336 ... 0.48615094 1.         0.31009395]
 [0.47600979 0.48551541 0.5043656  ... 0.10252277 0.31009395 1.        ]]


In [18]:
recommendations = {}
for idx, customer_id in enumerate(customer_features['CustomerID']):
    # Get similarity scores for the current customer
    scores = list(enumerate(similarity_matrix[idx]))
    # Sort by similarity score in descending order, exclude self-match
    scores = sorted(scores, key=lambda x: x[1], reverse=True)
    top_3 = [(customer_features['CustomerID'][i], round(score, 3)) for i, score in scores[1:4]]
    recommendations[customer_id] = top_3

# Filter for the first 20 customers
lookalike_map = {cust_id: recommendations[cust_id] for cust_id in customers['CustomerID'][:20]}

In [21]:
lookalike_df = pd.DataFrame({'CustomerID': lookalike_map.keys(), 'Recommendations': lookalike_map.values()})
lookalike_df.to_csv('Tarunkumar_Singamsetti_Lookalike.csv', index=False)

In [23]:
pf=pd.read_csv('Tarunkumar_Singamsetti_Lookalike.csv')
pf.head(20)

Unnamed: 0,CustomerID,Recommendations
0,C0001,"[('C0069', 0.882), ('C0127', 0.878), ('C0035',..."
1,C0002,"[('C0133', 0.891), ('C0134', 0.847), ('C0159',..."
2,C0003,"[('C0031', 0.829), ('C0166', 0.828), ('C0106',..."
3,C0004,"[('C0017', 0.924), ('C0113', 0.922), ('C0075',..."
4,C0005,"[('C0197', 0.942), ('C0007', 0.837), ('C0199',..."
5,C0006,"[('C0187', 0.839), ('C0135', 0.83), ('C0139', ..."
6,C0007,"[('C0005', 0.837), ('C0146', 0.817), ('C0120',..."
7,C0008,"[('C0162', 0.942), ('C0113', 0.895), ('C0039',..."
8,C0009,"[('C0198', 0.815), ('C0150', 0.779), ('C0061',..."
9,C0010,"[('C0176', 0.878), ('C0077', 0.85), ('C0030', ..."
