In [1]:
import pandas as pd
import numpy as np
from datetime import datetime
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics.pairwise import cosine_similarity

## LOAD THE DATASETS

In [2]:
customers_df = pd.read_csv('Venkataraman_Ranganath_Customers.csv')
products_df = pd.read_csv('Venkataraman_Ranganath_Products.csv')

In [3]:
customers_df.head()

Unnamed: 0,CustomerID,CustomerName,Region,SignupDate
0,C0001,Lawrence Carroll,South America,2022-07-10
1,C0002,Elizabeth Lutz,Asia,2022-02-13
2,C0003,Michael Rivera,South America,2024-03-07
3,C0004,Kathleen Rodriguez,South America,2022-10-09
4,C0005,Laura Weber,Asia,2022-08-15


In [4]:
products_df.head()

Unnamed: 0,ProductID,ProductName,Category,Price
0,P001,ActiveWear Biography,Books,169.3
1,P002,ActiveWear Smartwatch,Electronics,346.3
2,P003,ComfortLiving Biography,Books,44.12
3,P004,BookWorld Rug,Home Decor,95.69
4,P005,TechPro T-Shirt,Clothing,429.31


### CREATE LOOKALIKE MODEL BASED ON COSINE SIMILARITY SCORES

In [5]:
def create_lookalike_model(customers_df, products_df):
    # Convert SignupDate to datetime
    customers_df['SignupDate'] = pd.to_datetime(customers_df['SignupDate'])
    
    # Extract features from SignupDate
    customers_df['SignupYear'] = customers_df['SignupDate'].dt.year
    customers_df['SignupMonth'] = customers_df['SignupDate'].dt.month
    
    # Encode categorical variables
    le_region = LabelEncoder()
    customers_df['Region_encoded'] = le_region.fit_transform(customers_df['Region'])
    
    # Create feature matrix
    features = customers_df[['Region_encoded', 'SignupYear', 'SignupMonth']].values
    
    # Calculate similarity matrix using cosine similarity
    similarity_matrix = cosine_similarity(features)
    
    return similarity_matrix, customers_df

### GET TOP LOOKALIKES

In [6]:
def get_top_lookalikes(customer_id, similarity_matrix, customers_df, n=3):
    # Get customer index
    customer_idx = customers_df[customers_df['CustomerID'] == customer_id].index[0]
    
    # Get similarity scores for this customer
    similarity_scores = similarity_matrix[customer_idx]
    
    # Get indices of top n similar customers (excluding self)
    similar_indices = np.argsort(similarity_scores)[::-1][1:n+1]
    
    # Get customer IDs and scores
    similar_customers = []
    for idx in similar_indices:
        similar_customers.append({
            'customer_id': customers_df.iloc[idx]['CustomerID'],
            'similarity_score': round(similarity_scores[idx], 3)
        })
    
    return similar_customers

### GENERATE RECOMMENDATIONS

In [7]:
def generate_lookalike_recommendations(customers_df, products_df):
    # Create similarity matrix
    similarity_matrix, customers_df = create_lookalike_model(customers_df, products_df)
    
    # Generate recommendations for first 20 customers
    recommendations = {}
    for customer_id in customers_df['CustomerID'][:20]:
        lookalikes = get_top_lookalikes(customer_id, similarity_matrix, customers_df)
        recommendations[customer_id] = lookalikes
    
    # Create output dataframe
    output_rows = []
    for cust_id, lookalikes in recommendations.items():
        lookalike_str = "|".join([
            f"{rec['customer_id']},{rec['similarity_score']}" 
            for rec in lookalikes
        ])
        output_rows.append({
            'customer_id': cust_id,
            'lookalikes': lookalike_str
        })
    
    output_df = pd.DataFrame(output_rows)
    return output_df

In [8]:
output_df = generate_lookalike_recommendations(customers_df, products_df)

# Save to CSV
output_df.to_csv('Lookalike.csv', index=False)

# Print results
for _, row in output_df.iterrows():
    print(f"\nCustomer {row['customer_id']} lookalikes:")
    lookalikes = row['lookalikes'].split('|')
    for i, lookalike in enumerate(lookalikes, 1):
        cust_id, score = lookalike.split(',')
        print(f"{i}. {cust_id} (similarity score: {score})")


Customer C0001 lookalikes:
1. C0112 (similarity score: 1.0)
2. C0071 (similarity score: 1.0)
3. C0025 (similarity score: 1.0)

Customer C0002 lookalikes:
1. C0002 (similarity score: 1.0)
2. C0045 (similarity score: 1.0)
3. C0043 (similarity score: 1.0)

Customer C0003 lookalikes:
1. C0091 (similarity score: 1.0)
2. C0095 (similarity score: 1.0)
3. C0155 (similarity score: 1.0)

Customer C0004 lookalikes:
1. C0077 (similarity score: 1.0)
2. C0039 (similarity score: 1.0)
3. C0151 (similarity score: 1.0)

Customer C0005 lookalikes:
1. C0005 (similarity score: 1.0)
2. C0110 (similarity score: 1.0)
3. C0084 (similarity score: 1.0)

Customer C0006 lookalikes:
1. C0099 (similarity score: 1.0)
2. C0120 (similarity score: 1.0)
3. C0118 (similarity score: 1.0)

Customer C0007 lookalikes:
1. C0200 (similarity score: 1.0)
2. C0028 (similarity score: 1.0)
3. C0177 (similarity score: 1.0)

Customer C0008 lookalikes:
1. C0157 (similarity score: 1.0)
2. C0030 (similarity score: 1.0)
3. C0016 (similar