In [8]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity

# Input locations: the three CSV files sit together in one data folder.
data_folder = "Data"
customers_file = f"{data_folder}/Customers.csv"
products_file = f"{data_folder}/Products.csv"
transactions_file = f"{data_folder}/Transactions.csv"

# Read each CSV into its own DataFrame, in the order
# customers -> products -> transactions.
customers_df, products_df, transactions_df = map(
    pd.read_csv,
    (customers_file, products_file, transactions_file),
)

# Build one profile row per customer from the transaction log alone:
#   total_spent        - sum of TotalValue
#   total_items_bought - sum of Quantity
#   unique_products    - number of distinct ProductID values
# Only customers that appear in transactions_df get a row here.
profile_aggregations = {
    'total_spent': ('TotalValue', 'sum'),
    'total_items_bought': ('Quantity', 'sum'),
    'unique_products': ('ProductID', 'nunique'),
}
customer_profile = (
    transactions_df
    .groupby('CustomerID')
    .agg(**profile_aggregations)
    .reset_index()
)

# Report how many missing values each profile column contains
print(customer_profile.isnull().sum())

# Replace any missing values with 0 (mean/median imputation would be alternatives)
customer_profile.fillna(0, inplace=True)

# Standardize the three profile metrics so that no single metric dominates
# the similarity computation purely because of its scale.
feature_columns = ['total_spent', 'total_items_bought', 'unique_products']
scaler = StandardScaler()
scaled_features = scaler.fit_transform(customer_profile[feature_columns])

# Pairwise cosine similarity between every pair of scaled customer rows
similarity_matrix = cosine_similarity(scaled_features)

# Label both axes with CustomerID so scores can be looked up by customer ID
customer_ids = customer_profile['CustomerID']
similarity_df = pd.DataFrame(
    similarity_matrix,
    index=customer_ids,
    columns=customer_ids,
)

# Extract the top 3 lookalikes for the first 20 customers in customer_profile.
# NOTE: this takes the first 20 profile rows; they correspond to
# C0001 - C0020 only if every one of those customers has a profile row
# (verify against the data).
TOP_N = 3
NUM_CUSTOMERS = 20
lookalike_recommendations = {}

for customer in customer_profile['CustomerID'][:NUM_CUSTOMERS]:
    # Remove the customer's own score by label instead of assuming it sorts
    # first: another customer can tie at the top score, in which case a
    # positional slice could keep "self" as one of its own lookalikes.
    peer_scores = similarity_df[customer].drop(labels=customer)

    # Highest-scoring peers, in descending score order
    similar_customers = peer_scores.nlargest(TOP_N)

    # Pad so every row holds exactly TOP_N IDs followed by TOP_N scores,
    # even when fewer than TOP_N other customers exist.
    missing = TOP_N - len(similar_customers)
    lookalike_recommendations[customer] = (
        similar_customers.index.tolist() + [None] * missing
        + similar_customers.values.tolist() + [float('nan')] * missing
    )

# One row per customer: lookalike_1..3 (IDs) followed by score_1..3
column_names = (
    [f'lookalike_{rank}' for rank in range(1, TOP_N + 1)]
    + [f'score_{rank}' for rank in range(1, TOP_N + 1)]
)
lookalike_df = pd.DataFrame.from_dict(
    lookalike_recommendations,
    orient='index',
    columns=column_names,
)

# Save the lookalikes to a CSV file (customer IDs are written as the index column)
lookalike_df.to_csv('Lookalike.csv')

# Display the first few rows of the recommendations
print(lookalike_df.head())


CustomerID            0
total_spent           0
total_items_bought    0
unique_products       0
dtype: int64
      lookalike_1 lookalike_2 lookalike_3   score_1   score_2   score_3
C0001       C0164       C0137       C0069  0.968410  0.962081  0.955071
C0002       C0029       C0031       C0094  0.999762  0.999013  0.993555
C0003       C0176       C0027       C0010  0.890640  0.863579  0.829717
C0004       C0075       C0175       C0195  0.997674  0.994084  0.993556
C0005       C0058       C0123       C0015  0.999798  0.999706  0.999689
