In [1]:
import pandas as pd
import numpy as np

# Load datasets
customers_df = pd.read_csv('/kaggle/input/customers/Customers.csv')
products_df = pd.read_csv('/kaggle/input/products/Products.csv')
transactions_df = pd.read_csv('/kaggle/input/transactions/Transactions.csv')

# Merge dataframes: transactions -> customers -> products
merged_df = pd.merge(transactions_df, customers_df, on='CustomerID', how='left')
merged_df = pd.merge(merged_df, products_df, on='ProductID', how='left')

# Check for missing values
merged_df.isna().sum()


TransactionID      0
CustomerID         0
ProductID          0
TransactionDate    0
Quantity           0
TotalValue         0
Price_x            0
CustomerName       0
Region             0
SignupDate         0
ProductName        0
Category           0
Price_y            0
dtype: int64

In [2]:
# One-hot encoding for 'Region'
customer_region = pd.get_dummies(customers_df['Region'], drop_first=True)

# Transaction frequency for each product category
category_frequency = merged_df.groupby(['CustomerID', 'Category'])['Quantity'].sum().unstack().fillna(0)

# Total spending per customer
customer_spend = merged_df.groupby('CustomerID')['TotalValue'].sum()

# Combine demographic features and transaction features
customer_features = pd.concat([customer_region, category_frequency, customer_spend], axis=1)

# Check the created feature dataframe
customer_features.head()


  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()


Unnamed: 0,Europe,North America,South America,Books,Clothing,Electronics,Home Decor,TotalValue
0,False,False,True,,,,,
1,False,False,False,,,,,
2,False,False,True,,,,,
3,False,False,True,,,,,
4,False,False,False,,,,,


In [3]:
# Handle missing values by imputing with the mean for numerical columns
customer_features_imputed = customer_features.fillna(customer_features.mean())

# Check again if there are any missing values
customer_features_imputed.isna().sum()


Europe           0
North America    0
South America    0
Books            0
Clothing         0
Electronics      0
Home Decor       0
TotalValue       0
dtype: int64

In [6]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
customer_features_scaled = scaler.fit_transform(customer_features_imputed)


In [7]:
from sklearn.metrics.pairwise import cosine_similarity

# Calculate cosine similarity between customers
cosine_sim = cosine_similarity(customer_features_scaled)

# Convert the cosine similarity matrix to a DataFrame for better readability
cosine_sim_df = pd.DataFrame(cosine_sim, index=customer_features_imputed.index, columns=customer_features_imputed.index)

# Preview the cosine similarity matrix
cosine_sim_df.head()


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,C0191,C0192,C0193,C0194,C0195,C0196,C0197,C0198,C0199,C0200
0,1.0,-0.206527,1.0,1.0,-0.206527,1.0,-0.206527,-0.473522,-0.50762,-0.50762,...,-8.507883000000001e-17,-4.3298380000000004e-17,-6.589951000000001e-17,-2.604141e-16,-4.0224070000000004e-17,-7.303729e-18,-7.527872000000001e-17,5.746491e-17,-7.632847e-17,2.5717990000000003e-17
1,-0.206527,1.0,-0.206527,-0.206527,1.0,-0.206527,1.0,-0.119615,-0.143144,-0.143144,...,-3.413587e-17,1.021194e-16,-2.6501950000000002e-17,1.240153e-16,4.383369e-17,1.620016e-16,6.352836e-18,2.152012e-16,6.01898e-18,1.984575e-16
2,1.0,-0.206527,1.0,1.0,-0.206527,1.0,-0.206527,-0.473522,-0.50762,-0.50762,...,-8.507883000000001e-17,-4.3298380000000004e-17,-6.589951000000001e-17,-2.604141e-16,-4.0224070000000004e-17,-7.303729e-18,-7.527872000000001e-17,5.746491e-17,-7.632847e-17,2.5717990000000003e-17
3,1.0,-0.206527,1.0,1.0,-0.206527,1.0,-0.206527,-0.473522,-0.50762,-0.50762,...,-8.507883000000001e-17,-4.3298380000000004e-17,-6.589951000000001e-17,-2.604141e-16,-4.0224070000000004e-17,-7.303729e-18,-7.527872000000001e-17,5.746491e-17,-7.632847e-17,2.5717990000000003e-17
4,-0.206527,1.0,-0.206527,-0.206527,1.0,-0.206527,1.0,-0.119615,-0.143144,-0.143144,...,-3.413587e-17,1.021194e-16,-2.6501950000000002e-17,1.240153e-16,4.383369e-17,1.620016e-16,6.352836e-18,2.152012e-16,6.01898e-18,1.984575e-16


In [8]:
def get_top_3_similar_customers(cust_id, cosine_sim_df):
    # Get the similarity scores for the customer
    sim_scores = cosine_sim_df[cust_id].sort_values(ascending=False)
    
    # Remove the customer itself (highest similarity)
    sim_scores = sim_scores[1:]
    
    # Get the top 3 most similar customers
    top_3 = sim_scores.head(3)
    
    # Return the customer IDs and similarity scores as a list of tuples
    return [(cust_id, score) for cust_id, score in top_3.items()]

# Get recommendations for the first 20 customers (C0001 - C0020)
recommendations = {}
for i in range(1, 21):  # For customers C0001 - C0020
    cust_id = f'C{i:04d}'
    recommendations[cust_id] = get_top_3_similar_customers(cust_id, cosine_sim_df)

# Print recommendations for the first 5 customers as an example
for cust_id, recs in recommendations.items():
    print(f"Customer {cust_id} Recommendations:")
    for rec in recs:
        print(f"  Lookalike: {rec[0]} | Similarity Score: {rec[1]:.4f}")
    print()


Customer C0001 Recommendations:
  Lookalike: C0069 | Similarity Score: 0.9330
  Lookalike: C0026 | Similarity Score: 0.9260
  Lookalike: C0120 | Similarity Score: 0.8898

Customer C0002 Recommendations:
  Lookalike: C0159 | Similarity Score: 0.9747
  Lookalike: C0178 | Similarity Score: 0.9563
  Lookalike: C0133 | Similarity Score: 0.9467

Customer C0003 Recommendations:
  Lookalike: C0195 | Similarity Score: 0.8457
  Lookalike: C0166 | Similarity Score: 0.8137
  Lookalike: C0031 | Similarity Score: 0.7838

Customer C0004 Recommendations:
  Lookalike: C0065 | Similarity Score: 0.9370
  Lookalike: C0075 | Similarity Score: 0.9160
  Lookalike: C0012 | Similarity Score: 0.9097

Customer C0005 Recommendations:
  Lookalike: C0007 | Similarity Score: 0.9655
  Lookalike: C0085 | Similarity Score: 0.9603
  Lookalike: C0197 | Similarity Score: 0.9437

Customer C0006 Recommendations:
  Lookalike: C0024 | Similarity Score: 0.8951
  Lookalike: C0135 | Similarity Score: 0.8125
  Lookalike: C0108 | 

In [9]:
# Prepare data for the CSV output
lookalike_data = []
for cust_id, recs in recommendations.items():
    for rec in recs:
        lookalike_data.append([cust_id, rec[0], rec[1]])

# Create a DataFrame for Lookalike.csv
lookalike_df = pd.DataFrame(lookalike_data, columns=['CustomerID', 'LookalikeCustomerID', 'SimilarityScore'])

# Save to CSV
lookalike_df.to_csv('Lookalike.csv', index=False)

# Preview the output
lookalike_df.head()


Unnamed: 0,CustomerID,LookalikeCustomerID,SimilarityScore
0,C0001,C0069,0.933006
1,C0001,C0026,0.926038
2,C0001,C0120,0.889766
3,C0002,C0159,0.974678
4,C0002,C0178,0.956297
