In [2]:
# Part 1: Load necessary libraries and datasets
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler

# Read the three input tables; the relative paths resolve against the
# kernel's current working directory.
customers, transactions, products = (
    pd.read_csv(csv_path)
    for csv_path in ('Customers.csv', 'Transactions.csv', 'Products.csv')
)


In [3]:
# Part 2: Attach customer attributes to every transaction row.
# Inner join on CustomerID: rows whose CustomerID has no match in the other
# frame are dropped.
merged_data = transactions.merge(customers, how='inner', on='CustomerID')


In [4]:
# Part 3: Feature Engineering

# Extract demographic features for customers.
# .copy() makes `demographics` an independent frame, so the column assignment
# below cannot raise SettingWithCopyWarning or write through to `customers`.
demographics = customers[['CustomerID', 'Region', 'SignupDate']].copy()

# Encode Region as integer category codes (missing values become -1).
# Plain column assignment replaces the whole column, dtype included; the
# previous `.loc[:, 'Region'] = ...` form writes into the existing column and,
# on pandas 2.x, may leave the codes stored with the original object dtype.
# NOTE(review): the codes are arbitrary labels, not an ordering. Downstream
# scaling treats them as a numeric feature -- consider one-hot encoding.
demographics['Region'] = demographics['Region'].astype('category').cat.codes


In [5]:
# Part 4: Per-customer transaction summary.
# Produces one row per CustomerID with:
#   total_spent    - sum of TotalValue
#   avg_quantity   - mean of Quantity over that customer's transaction rows
#   total_products - number of distinct ProductID values
customer_transactions = (
    merged_data
    .groupby('CustomerID')
    .agg(
        total_spent=pd.NamedAgg(column='TotalValue', aggfunc='sum'),
        avg_quantity=pd.NamedAgg(column='Quantity', aggfunc='mean'),
        total_products=pd.NamedAgg(column='ProductID', aggfunc='nunique'),
    )
    .reset_index()  # turn the CustomerID group key back into a regular column
)


In [6]:
# Part 5: Align demographic data with the per-customer transaction summary.
# Inner join: only customers present in both frames get a row, so the row
# order of everything derived from `aligned_data` follows this frame.
aligned_data = demographics.merge(customer_transactions, how='inner', on='CustomerID')

# Standardize each feature group column by column.
# The same scaler object is refit for the second group, so after this cell
# `scaler` holds the statistics of the transaction features only.
scaler = StandardScaler()
demographics_scaled = scaler.fit_transform(aligned_data.loc[:, ['Region']])
transactions_scaled = scaler.fit_transform(
    aligned_data.loc[:, ['total_spent', 'avg_quantity', 'total_products']]
)

# Stack both groups side by side: one row per customer, one column per feature
customer_profile = np.concatenate((demographics_scaled, transactions_scaled), axis=1)


In [7]:
# Part 6: Pairwise cosine similarity between all rows of the profile matrix.
# Entry [i, j] compares profile row i with profile row j.
similarity_matrix = cosine_similarity(X=customer_profile)


In [8]:
# Part 7: Generate recommendations for first 20 customers (C0001 to C0020)
#
# Rows of `similarity_matrix` follow `aligned_data` (the inner join of
# demographics and transaction summaries), NOT `customers`. A customer without
# transactions has no row there, so positions in `customers` cannot be used to
# index the matrix or to translate matrix rows back into CustomerIDs.
N_TARGET_CUSTOMERS = 20  # how many leading customers to produce lookalikes for
N_LOOKALIKES = 3         # recommendations per customer

# CustomerID for each matrix row, plus the reverse lookup.
# NOTE(review): assumes CustomerID is unique in `aligned_data` -- confirm.
profile_ids = aligned_data['CustomerID'].to_numpy()
row_of_customer = {cust: row for row, cust in enumerate(profile_ids)}

lookalike_recommendations = {}

for customer_id in customers['CustomerID'].head(N_TARGET_CUSTOMERS):
    row = row_of_customer.get(customer_id)
    if row is None:
        # No profile row (no transactions survived the join): nothing to compare.
        lookalike_recommendations[customer_id] = []
        continue

    similarities = similarity_matrix[row]

    # Rank all rows by descending similarity (stable, so ties are deterministic),
    # then drop the customer's own row explicitly instead of assuming it sorts
    # first -- a tie or rounding at 1.0 could otherwise recommend the customer
    # to itself.
    ranked = np.argsort(-similarities, kind='stable')
    similar_customers_idx = ranked[ranked != row][:N_LOOKALIKES]

    similar_customer_ids = profile_ids[similar_customers_idx]
    similar_scores = similarities[similar_customers_idx]

    # Store recommendations as (CustomerID, similarity score) pairs
    lookalike_recommendations[customer_id] = list(zip(similar_customer_ids, similar_scores))


In [11]:
# Part 8: Flatten, export, and display the recommendations.

# One record per (customer, recommended customer) pair:
# [CustomerID, RecommendedCustomerID, SimilarityScore]
lookalike_data = [
    [cust_id, similar_id, score]
    for cust_id, recs in lookalike_recommendations.items()
    for similar_id, score in recs
]

# Tabulate the records
lookalike_df = pd.DataFrame(
    lookalike_data,
    columns=['CustomerID', 'RecommendedCustomerID', 'SimilarityScore'],
)

# Write the table to Lookalike2.csv without the index column
lookalike_df.to_csv('Lookalike2.csv', index=False)

# Echo every customer's recommendations, one blank line between customers
for cust_id, recs in lookalike_recommendations.items():
    print(f"Customer {cust_id}:")
    for rec in recs:
        print(f"  Recommended Customer: {rec[0]}, Similarity Score: {rec[1]:.4f}")
    print()


Customer C0001:
  Recommended Customer: C0107, Similarity Score: 0.9932
  Recommended Customer: C0174, Similarity Score: 0.9809
  Recommended Customer: C0048, Similarity Score: 0.9772

Customer C0002:
  Recommended Customer: C0185, Similarity Score: 0.9609
  Recommended Customer: C0142, Similarity Score: 0.9586
  Recommended Customer: C0005, Similarity Score: 0.9546

Customer C0003:
  Recommended Customer: C0091, Similarity Score: 0.9676
  Recommended Customer: C0160, Similarity Score: 0.9280
  Recommended Customer: C0026, Similarity Score: 0.9239

Customer C0004:
  Recommended Customer: C0113, Similarity Score: 0.9829
  Recommended Customer: C0104, Similarity Score: 0.9526
  Recommended Customer: C0099, Similarity Score: 0.9474

Customer C0005:
  Recommended Customer: C0185, Similarity Score: 0.9976
  Recommended Customer: C0135, Similarity Score: 0.9738
  Recommended Customer: C0140, Similarity Score: 0.9696

Customer C0006:
  Recommended Customer: C0168, Similarity Score: 0.9694
  R