# Task 2: Lookalike Model


In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler


In [3]:
# Read the three input tables; paths are relative to the notebook's working directory
customers_df, transactions_df, products_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        "Downloads/Customers.csv",
        "Downloads/Transactions.csv",
        "Downloads/Products.csv",
    )
)

# Preview each table: a heading followed by its first five rows
print("Customers Dataset:\n", customers_df.head(5))
print("Transactions Dataset:\n", transactions_df.head(5))
print("Products Dataset:\n", products_df.head(5))


Customers Dataset:
   CustomerID        CustomerName         Region  SignupDate
0      C0001    Lawrence Carroll  South America  2022-07-10
1      C0002      Elizabeth Lutz           Asia  2022-02-13
2      C0003      Michael Rivera  South America  2024-03-07
3      C0004  Kathleen Rodriguez  South America  2022-10-09
4      C0005         Laura Weber           Asia  2022-08-15
Transactions Dataset:
   TransactionID CustomerID ProductID      TransactionDate  Quantity  \
0        T00001      C0199      P067  2024-08-25 12:38:23         1   
1        T00112      C0146      P067  2024-05-27 22:23:54         1   
2        T00166      C0127      P067   2024-04-25 7:38:55         1   
3        T00272      C0087      P067  2024-03-26 22:55:37         2   
4        T00363      C0070      P067  2024-03-21 15:10:10         3   

   TotalValue   Price  
0      300.68  300.68  
1      300.68  300.68  
2      300.68  300.68  
3      601.36  300.68  
4      902.04  300.68  
Products Dataset:
   Produ

In [4]:
# Attach customer and product attributes to every transaction.
# - how="inner" is the pandas default, spelled out so it is obvious that
#   transactions without a matching customer/product row are dropped.
# - validate="many_to_one" makes pandas raise MergeError when CustomerID or
#   ProductID is duplicated in its lookup table; without the check such
#   duplicates would silently duplicate transaction rows and inflate any
#   per-customer totals computed from the result.
# Both the transactions and the products tables carry a "Price" column, so the
# merged frame exposes them as Price_x (transactions) and Price_y (products).
merged_df = (
    transactions_df
    .merge(customers_df, on="CustomerID", how="inner", validate="many_to_one")
    .merge(products_df, on="ProductID", how="inner", validate="many_to_one")
)

# Parse TransactionDate strings into datetime64 values
merged_df["TransactionDate"] = pd.to_datetime(merged_df["TransactionDate"])

# Display merged dataset
print(merged_df.head())


  TransactionID CustomerID ProductID     TransactionDate  Quantity  \
0        T00001      C0199      P067 2024-08-25 12:38:23         1   
1        T00112      C0146      P067 2024-05-27 22:23:54         1   
2        T00166      C0127      P067 2024-04-25 07:38:55         1   
3        T00272      C0087      P067 2024-03-26 22:55:37         2   
4        T00363      C0070      P067 2024-03-21 15:10:10         3   

   TotalValue  Price_x     CustomerName         Region  SignupDate  \
0      300.68   300.68   Andrea Jenkins         Europe  2022-12-03   
1      300.68   300.68  Brittany Harvey           Asia  2024-09-04   
2      300.68   300.68  Kathryn Stevens         Europe  2024-04-04   
3      601.36   300.68  Travis Campbell  South America  2024-04-11   
4      902.04   300.68    Timothy Perez         Europe  2022-03-15   

                       ProductName     Category  Price_y  
0  ComfortLiving Bluetooth Speaker  Electronics   300.68  
1  ComfortLiving Bluetooth Speaker  Elec

## Feature Engineering

In [5]:
# Collapse the transaction-level frame to one row per customer:
#   total_spent            - sum of TotalValue
#   avg_purchase_value     - mean of TotalValue
#   num_transactions       - count of TransactionID
#   unique_products_bought - number of distinct ProductID values
customer_features = (
    merged_df
    .groupby("CustomerID")
    .agg(
        total_spent=pd.NamedAgg(column="TotalValue", aggfunc="sum"),
        avg_purchase_value=pd.NamedAgg(column="TotalValue", aggfunc="mean"),
        num_transactions=pd.NamedAgg(column="TransactionID", aggfunc="count"),
        unique_products_bought=pd.NamedAgg(column="ProductID", aggfunc="nunique"),
    )
    .reset_index()
)

# Preview the engineered features
print(customer_features.head())


  CustomerID  total_spent  avg_purchase_value  num_transactions  \
0      C0001      3354.52             670.904                 5   
1      C0002      1862.74             465.685                 4   
2      C0003      2725.38             681.345                 4   
3      C0004      5354.88             669.360                 8   
4      C0005      2034.24             678.080                 3   

   unique_products_bought  
0                       5  
1                       4  
2                       4  
3                       8  
4                       3  


## Standardize Features

In [6]:
# Scale every feature column (everything after the leading CustomerID column)
# with StandardScaler so no single feature dominates the similarity measure
scaler = StandardScaler()
scaled_features = scaler.fit_transform(
    customer_features[customer_features.columns[1:]]
)

# Rebuild a labelled frame and append CustomerID as the last column
customer_features_scaled = pd.DataFrame(
    data=scaled_features,
    columns=customer_features.columns[1:],
).assign(CustomerID=customer_features["CustomerID"])

# Preview the scaled features
print(customer_features_scaled.head())


   total_spent  avg_purchase_value  num_transactions  unique_products_bought  \
0    -0.061701           -0.070263         -0.011458                0.050047   
1    -0.877744           -0.934933         -0.467494               -0.424204   
2    -0.405857           -0.026271         -0.467494               -0.424204   
3     1.032547           -0.076769          1.356650                1.472798   
4    -0.783929           -0.040028         -0.923530               -0.898455   

  CustomerID  
0      C0001  
1      C0002  
2      C0003  
3      C0004  
4      C0005  


## Compute Similarity Scores

In [7]:
# Pairwise cosine similarity between customers, computed on every column
# except the trailing CustomerID column
similarity_matrix = cosine_similarity(
    customer_features_scaled[customer_features_scaled.columns[:-1]]
)

# Label both axes with the customer ids so rows/columns can be looked up by id
similarity_df = pd.DataFrame(
    data=similarity_matrix,
    index=customer_features["CustomerID"],
    columns=customer_features["CustomerID"],
)

# Preview the similarity matrix
print(similarity_df.head())


CustomerID     C0001     C0002     C0003     C0004     C0005     C0006  \
CustomerID                                                               
C0001       1.000000  0.681909  0.137497 -0.000631  0.104379 -0.817502   
C0002       0.681909  1.000000  0.726043 -0.649737  0.713370 -0.574811   
C0003       0.137497  0.726043  1.000000 -0.990094  0.999224  0.145356   
C0004      -0.000631 -0.649737 -0.990094  1.000000 -0.994545 -0.243356   
C0005       0.104379  0.713370  0.999224 -0.994545  1.000000  0.162666   

CustomerID     C0007     C0008     C0009     C0010  ...     C0191     C0192  \
CustomerID                                          ...                       
C0001      -0.333941  0.386237  0.624453  0.709473  ...  0.919641  0.624733   
C0002       0.251699 -0.287615  0.995853  0.998487  ...  0.857196  0.994684   
C0003       0.847614 -0.845892  0.755674  0.687131  ...  0.287739  0.792995   
C0004      -0.892865  0.904547 -0.688581 -0.607304  ... -0.174444 -0.723908   
C0005  

## Find Lookalike Customers

In [8]:
# Function to get the top-N most similar customers for each requested customer
def get_top_similar(customers, top_n=3, similarity=None):
    """Return the ``top_n`` most similar *other* customers for each customer.

    Parameters
    ----------
    customers : iterable
        Customer labels to look up. Each must be a column label of the
        similarity matrix.
    top_n : int, default 3
        Number of lookalikes to return per customer.
    similarity : pandas.DataFrame, optional
        Similarity matrix whose rows and columns are labelled by customer.
        When omitted, the notebook-level ``similarity_df`` is used.

    Returns
    -------
    dict
        Maps each requested customer to a list of ``(other_customer, score)``
        tuples ordered from most to least similar.

    Raises
    ------
    KeyError
        If a requested customer is not a column of the similarity matrix.
    """
    if similarity is None:
        similarity = similarity_df

    similar_customers = {}
    for customer in customers:
        # Remove the customer's own entry by label. Taking the top (n + 1) and
        # discarding the first row is only correct when the self-similarity is
        # ranked first; a tie at 1.0 with an earlier row (or floating-point
        # rounding) would discard a real match and return the customer as its
        # own lookalike.
        scores = similarity[customer].drop(labels=customer, errors="ignore")
        top_similars = scores.nlargest(top_n)
        similar_customers[customer] = list(top_similars.items())
    return similar_customers

# Look up lookalikes (default top_n) for the first 20 rows of customer_features
lookalike_results = get_top_similar(customer_features["CustomerID"].head(20))

# Show the resulting {customer: [(lookalike, score), ...]} mapping
print(lookalike_results)


{'C0001': [('C0137', 0.9962112629754638), ('C0152', 0.9810644990181483), ('C0056', 0.9482579835224701)], 'C0002': [('C0029', 0.9995348566666873), ('C0199', 0.99858484559402), ('C0010', 0.9984869862496091)], 'C0003': [('C0178', 0.9996894278379866), ('C0005', 0.999223799679947), ('C0073', 0.9984785252854043)], 'C0004': [('C0021', 0.9997854801171627), ('C0075', 0.9995851081107424), ('C0067', 0.999206594708233)], 'C0005': [('C0073', 0.999666986623941), ('C0063', 0.9994007672469035), ('C0159', 0.9992953646426336)], 'C0006': [('C0079', 0.9999771688934571), ('C0117', 0.9978898632254388), ('C0196', 0.9887961815413687)], 'C0007': [('C0085', 0.999857675584863), ('C0140', 0.99853477231425), ('C0070', 0.9960789693978743)], 'C0008': [('C0194', 0.9963830439832919), ('C0154', 0.9941613135252719), ('C0090', 0.9910418377744797)], 'C0009': [('C0077', 0.9997954982200685), ('C0032', 0.9974983679777031), ('C0199', 0.9969684191191535)], 'C0010': [('C0029', 0.9996995348192107), ('C0025', 0.9991885304713041),

## Save Results to CSV

In [9]:
# One row per customer; the list of (lookalike, score) tuples is stored as its
# string representation in the "Lookalikes" column
lookalike_df = pd.DataFrame(
    [
        {"CustomerID": customer_id, "Lookalikes": str(matches)}
        for customer_id, matches in lookalike_results.items()
    ]
)

# Write the table to the working directory without the row index
lookalike_df.to_csv("Lookalike.csv", index=False)

print("Lookalike recommendations saved to Lookalike.csv")


Lookalike recommendations saved to Lookalike.csv


In [13]:
# Also write a copy into the "Downloads" folder. The path is relative to the
# working directory, the same convention the read_csv calls use for the input
# files, instead of a machine- and user-specific absolute Windows path, so the
# cell runs unchanged on another machine.
lookalike_df.to_csv("Downloads/Lookalike.csv", index=False)
