In [2]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

In [3]:
# Load the three source tables.
# The Windows paths are raw strings: in an ordinary string literal "\E", "\d",
# "\C", ... are invalid escape sequences (a SyntaxWarning on Python 3.12+), and
# a folder name starting with "t" or "n" would silently become a tab/newline.
# TODO: point DATA_DIR at a project-relative folder so the notebook runs elsewhere.
DATA_DIR = r"C:\E\data_analytics"

customers = pd.read_csv(DATA_DIR + r"\Customers.csv")
# Keep each name bound to its own file: the profile-building cell joins
# `transactions` to `products` on ProductID, so swapping the two breaks it.
products = pd.read_csv(DATA_DIR + r"\Products.csv")
transactions = pd.read_csv(DATA_DIR + r"\Transactions.csv")

In [4]:
# Build one profile row per customer from the transaction history.


def _most_frequent(values):
    """Return the most common non-null value in `values`, or NaN if there is none."""
    counts = values.value_counts()  # NaN entries are excluded by default
    # A customer whose products all failed to match in the left merge has no
    # category at all; indexing [0] on the empty result would raise IndexError.
    return counts.index[0] if len(counts) else np.nan


# Attach each transaction's product category. Any existing 'Category' column is
# dropped first so that re-running this cell does not produce
# Category_x / Category_y and break the aggregation below.
transactions = (
    transactions
    .drop(columns=['Category'], errors='ignore')
    .merge(products[['ProductID', 'Category']], on='ProductID', how='left')
)

# Aggregate per customer: total spend, number of transactions, favourite category.
customer_profiles = (
    transactions
    .groupby('CustomerID')
    .agg(
        TotalSpending=('TotalValue', 'sum'),
        TransactionCount=('TransactionID', 'count'),
        PreferredCategory=('Category', _most_frequent),
    )
    .reset_index()
)

In [5]:
# Attach every column of `customers` to the per-customer profiles. The left
# join keeps each profile row even when no matching customer record exists.
# NOTE(review): later cells reference 'Region', 'CustomerName' and 'SignupDate';
# no 'SignupYear' column is created anywhere in the visible code — confirm.
customer_profiles = pd.merge(customer_profiles, customers, how='left', on='CustomerID')

In [6]:
# One-hot encode the categorical columns. drop_first=True omits the first
# level of each column, so that level is represented by all-zero indicators.
categorical_features = ['Region', 'PreferredCategory']
customer_profiles = pd.get_dummies(
    customer_profiles,
    columns=categorical_features,
    drop_first=True,
)

In [7]:
# Standardise the numeric features with StandardScaler so spending and
# transaction count are on a comparable scale before similarity is computed.
numerical_features = ['TotalSpending', 'TransactionCount']
scaler = StandardScaler()
scaled_values = scaler.fit_transform(customer_profiles[numerical_features])
customer_profiles[numerical_features] = scaled_values

In [8]:
# Pairwise cosine similarity between customers.
# The identifier/descriptive columns listed below are removed; every remaining
# column is used as a feature.
# NOTE(review): the previous comment mentioned 'SignupYear', but only these
# three columns are dropped — confirm no other non-feature column remains.
non_feature_columns = ['CustomerID', 'CustomerName', 'SignupDate']
feature_frame = customer_profiles.drop(columns=non_feature_columns)
similarity_matrix = cosine_similarity(feature_frame)

# Label both axes with CustomerID so scores can be looked up by customer.
customer_ids = customer_profiles['CustomerID']
similarity_df = pd.DataFrame(similarity_matrix, index=customer_ids, columns=customer_ids)

In [9]:
# Find the TOP_N most similar customers for the first N_CUSTOMERS customers.
N_CUSTOMERS = 20  # how many customers (in profile order) get recommendations
TOP_N = 3         # lookalikes kept per customer

lookalike_results = {}
for cust_id in customer_profiles['CustomerID'][:N_CUSTOMERS]:
    # Remove the customer's own entry by label. Slicing positions 1..3 of the
    # sorted scores assumed "self" always sorts first; that fails when another
    # customer ties at 1.0 or floating-point rounding puts self slightly below
    # it, in which case the customer was listed as their own lookalike.
    similar_customers = similarity_df[cust_id].drop(labels=cust_id).nlargest(TOP_N)
    # Stored as a list of (SimilarCustomerID, score) tuples, best match first.
    lookalike_results[cust_id] = list(similar_customers.items())

In [11]:
# Flatten {customer: [(similar_id, score), ...]} into one record per pair,
# then build the output table from those records.
lookalike_output = [
    {'CustomerID': source_id, 'SimilarCustomerID': match_id, 'Score': match_score}
    for source_id, matches in lookalike_results.items()
    for match_id, match_score in matches
]
lookalike_df = pd.DataFrame(lookalike_output)

In [12]:
# Persist the lookalike table as CSV in the working directory, omitting the
# DataFrame index, then confirm completion.
output_path = "Lookalike.csv"
lookalike_df.to_csv(output_path, index=False)

print("Lookalike model completed! Results saved to Lookalike.csv.")

Lookalike model completed! Results saved to Lookalike.csv.
