In [50]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import precision_score, recall_score, f1_score, mean_squared_error
import numpy as np

In [4]:
# Read the customer, product, and transaction tables from CSV files.
# The paths are relative, so they resolve against the kernel's working directory.
customers_df, products_df, transactions_df = map(
    pd.read_csv,
    ('Customers.csv', 'Products.csv', 'Transactions.csv'),
)

In [6]:
# Build one wide table: each transaction row is joined to its customer record
# (on CustomerID) and then to its product record (on ProductID).
# No `how=` is given, so both joins use merge's default (inner): rows without
# a match on the key are dropped.
transactions_customers = transactions_df.merge(customers_df, on='CustomerID')
merged_data = transactions_customers.merge(products_df, on='ProductID')

In [30]:
# Collapse the merged transaction rows into one feature row per customer:
#   TotalValue - summed over the customer's rows (total spending)
#   Quantity   - summed over the customer's rows (total quantity purchased)
#   Category   - first entry of the mode (most frequently purchased category)
#   Region     - first value seen for the customer
customer_features = (
    merged_data
    .groupby('CustomerID')
    .agg(
        TotalValue=('TotalValue', 'sum'),
        Quantity=('Quantity', 'sum'),
        Category=('Category', lambda purchased: purchased.mode()[0]),
        Region=('Region', 'first'),
    )
    .reset_index()
)

customer_features.head()

Unnamed: 0,CustomerID,TotalValue,Quantity,Category,Region
0,C0001,3354.52,12,Electronics,South America
1,C0002,1862.74,10,Clothing,Asia
2,C0003,2725.38,14,Home Decor,South America
3,C0004,5354.88,23,Books,South America
4,C0005,2034.24,7,Electronics,Asia


In [32]:
# One-hot encode the categorical features (Region, Category).
#
# This cell rebinds `customer_features`, so after one run the raw 'Region' and
# 'Category' columns no longer exist. Encoding only the columns that are still
# present keeps the cell idempotent: running it a second time is a no-op
# instead of a KeyError from get_dummies.
categorical_columns = [
    column
    for column in ('Region', 'Category')
    if column in customer_features.columns
]
if categorical_columns:
    # drop_first=True drops the first level of each encoded feature, so each
    # feature contributes one fewer indicator column than it has levels.
    # NOTE(review): the dropped level is then represented only implicitly
    # (all indicators False); confirm that is intended for similarity scoring.
    customer_features = pd.get_dummies(
        customer_features, columns=categorical_columns, drop_first=True
    )

customer_features.head()

Unnamed: 0,CustomerID,TotalValue,Quantity,Region_Europe,Region_North America,Region_South America,Category_Clothing,Category_Electronics,Category_Home Decor
0,C0001,3354.52,12,False,False,True,False,True,False
1,C0002,1862.74,10,False,False,False,True,False,False
2,C0003,2725.38,14,False,False,True,False,False,True
3,C0004,5354.88,23,False,False,True,False,False,False
4,C0005,2034.24,7,False,False,False,False,True,False


In [34]:
# Standardize every feature column before computing similarities.
# CustomerID is only an identifier, so it is removed from the matrix first.
feature_frame = customer_features.drop('CustomerID', axis=1)
scaler = StandardScaler()
scaled_features = scaler.fit_transform(feature_frame)

In [42]:
# Pairwise cosine similarity between the rows of the scaled feature matrix;
# row/column positions follow the row order of customer_features.
similarity_matrix = cosine_similarity(X=scaled_features)

In [44]:
# Find the top 3 lookalikes for (up to) the first 20 customers.
#
# Result: `lookalike_results` maps each target CustomerID to a list of
# (similar CustomerID, similarity score rounded to 4 decimals) tuples,
# ordered from most to least similar.
NUM_TARGET_CUSTOMERS = 20
TOP_K_LOOKALIKES = 3

# Pull the ID column out once instead of doing a row lookup per candidate.
customer_ids = customer_features['CustomerID'].to_numpy()
lookalike_results = {}

# Bound the loop by the number of customers so that a dataset with fewer
# than 20 customers does not raise IndexError.
for idx in range(min(NUM_TARGET_CUSTOMERS, len(customer_ids))):
    customer_id = customer_ids[idx]
    scores = similarity_matrix[idx]

    # Descending order by score; the stable sort keeps tied scores in
    # ascending row order, so the ranking is deterministic.
    ranked = np.argsort(-scores, kind='stable')

    # Exclude the customer's own row by index. Slicing off position 0 is not
    # safe: another customer with an equal top score can sort ahead of the
    # customer's own row, which would leave the customer listed as their
    # own lookalike.
    ranked = ranked[ranked != idx][:TOP_K_LOOKALIKES]

    top_lookalikes = [
        (customer_ids[j], round(scores[j], 4))
        for j in ranked
    ]
    lookalike_results[customer_id] = top_lookalikes

In [48]:
# Flatten the lookalike mapping into one record per (customer, lookalike)
# pair and write the records to Lookalike.csv.
lookalike_data = [
    {
        "CustomerID": cust_id,
        "SimilarCustomerID": similar_cust,
        "SimilarityScore": score,
    }
    for cust_id, lookalikes in lookalike_results.items()
    for similar_cust, score in lookalikes
]

lookalike_df = pd.DataFrame(lookalike_data)
# index=False keeps the DataFrame's row index out of the file.
lookalike_df.to_csv('Lookalike.csv', index=False)

print("Lookalike.csv has been created successfully!")

Lookalike.csv has been created successfully!


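# Calculating accuracy metrics of this model (self-consistency check only: the similarity matrix is compared with a recomputed copy of itself, not with independent ground truth)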

In [55]:
# Similarity score above which two customers are classified as "similar".
threshold = 0.8

# NOTE(review): this is the same cosine_similarity call on the same
# scaled_features that produced similarity_matrix, so the "true" matrix is a
# copy of the prediction rather than independent ground truth. Any metric
# that compares the two will come out perfect by construction.
true_similarity_matrix = cosine_similarity(X=scaled_features)

# 0/1 version of the similarity scores (1 where the score exceeds the threshold).
# NOTE(review): this variable does not appear to be read by the evaluation
# cells that follow - confirm whether it is still needed.
binary_recommendations = np.greater(similarity_matrix, threshold).astype(int)

In [57]:
# Per-customer accumulators for the evaluation loop: one entry per evaluated
# customer for precision, recall, F1-score, and mean squared error.
# Each name gets its own list object (no aliasing between them).
precision_list, recall_list, f1_list, mse_list = [], [], [], []

In [59]:
# Evaluate the recommendations for (up to) the first 20 customers.
#
# For each customer the "true" and predicted similarity rows are binarized
# with `threshold`, the customer's own entry is cleared, and precision,
# recall, F1-score and MSE are appended to the metric lists.
#
# NOTE(review): true_similarity_matrix is built from the same
# cosine_similarity(scaled_features) call as similarity_matrix, so both rows
# are identical here and the scores cannot show anything but a perfect match.
# A meaningful evaluation needs an independent source of ground truth.
NUM_EVAL_CUSTOMERS = 20

# Start from empty lists inside this cell so that re-running it does not
# accumulate entries left over from an earlier run.
precision_list, recall_list, f1_list, mse_list = [], [], [], []

# Bound the loop by the matrix size so that fewer than 20 customers does not
# raise IndexError.
for idx in range(min(NUM_EVAL_CUSTOMERS, similarity_matrix.shape[0])):
    # Extracting the true similarity vector and predicted similarity vector for this customer
    true_similarities = true_similarity_matrix[idx]
    predicted_similarities = similarity_matrix[idx]

    # Binarizing true and predicted similarities using the threshold defined above
    # (astype returns new arrays, so the matrices themselves are not modified below)
    true_binary = (true_similarities > threshold).astype(int)
    predicted_binary = (predicted_similarities > threshold).astype(int)

    # Excluding self-similarity (diagonal)
    true_binary[idx] = 0
    predicted_binary[idx] = 0

    # Calculating precision, recall, and F1-score for this customer;
    # zero_division=0 scores a customer with no positives as 0 instead of warning
    precision_list.append(precision_score(true_binary, predicted_binary, zero_division=0))
    recall_list.append(recall_score(true_binary, predicted_binary, zero_division=0))
    f1_list.append(f1_score(true_binary, predicted_binary, zero_division=0))
    # MSE is taken on the raw (unbinarized) similarity scores
    mse_list.append(mean_squared_error(true_similarities, predicted_similarities))

In [61]:
# Average each per-customer metric list into a single summary value.
average_precision, average_recall, average_f1, average_mse = (
    np.mean(per_customer_values)
    for per_customer_values in (precision_list, recall_list, f1_list, mse_list)
)

In [63]:
# Report the averaged metrics, one per line, formatted to four decimal places.
report_lines = [
    "Model Evaluation Metrics:",
    f"Average Precision: {average_precision:.4f}",
    f"Average Recall: {average_recall:.4f}",
    f"Average F1-Score: {average_f1:.4f}",
    f"Average Mean Squared Error (MSE): {average_mse:.4f}",
]
print(*report_lines, sep="\n")

Model Evaluation Metrics:
Average Precision: 1.0000
Average Recall: 1.0000
Average F1-Score: 1.0000
Average Mean Squared Error (MSE): 0.0000
