In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import KMeans
from sklearn.metrics import davies_bouldin_score
import warnings

# Silence every warning for the rest of the session so the cell output stays
# readable. NOTE(review): this also hides deprecation notices from pandas and
# scikit-learn - consider narrowing the filter once the notebook is stable.
warnings.filterwarnings('ignore')

# Load the datasets
customers_df = pd.read_csv('Customers.csv')
products_df = pd.read_csv('Products.csv')
transactions_df = pd.read_csv('Transactions.csv')

# Display the first few rows of each dataset
print(customers_df.head())
print(products_df.head())
print(transactions_df.head())

# Check for missing values (count of nulls per column)
print(customers_df.isnull().sum())
print(products_df.isnull().sum())
print(transactions_df.isnull().sum())

# Basic information about the datasets.
# DataFrame.info() writes its summary to stdout itself and returns None, so it
# must not be wrapped in print() - doing so emits a stray "None" line after
# every summary.
customers_df.info()
products_df.info()
transactions_df.info()

  CustomerID        CustomerName         Region  SignupDate
0      C0001    Lawrence Carroll  South America  2022-07-10
1      C0002      Elizabeth Lutz           Asia  2022-02-13
2      C0003      Michael Rivera  South America  2024-03-07
3      C0004  Kathleen Rodriguez  South America  2022-10-09
4      C0005         Laura Weber           Asia  2022-08-15
  ProductID              ProductName     Category   Price
0      P001     ActiveWear Biography        Books  169.30
1      P002    ActiveWear Smartwatch  Electronics  346.30
2      P003  ComfortLiving Biography        Books   44.12
3      P004            BookWorld Rug   Home Decor   95.69
4      P005          TechPro T-Shirt     Clothing  429.31
  TransactionID CustomerID ProductID      TransactionDate  Quantity  \
0        T00001      C0199      P067  2024-08-25 12:38:23         1   
1        T00112      C0146      P067  2024-05-27 22:23:54         1   
2        T00166      C0127      P067  2024-04-25 07:38:55         1   
3       

In [2]:
# Build the per-customer profile: index by CustomerID, parse the signup date
# and derive the account age in whole days relative to the current time.
customer_features = customers_df.set_index('CustomerID').assign(
    SignupDate=lambda frame: pd.to_datetime(frame['SignupDate']),
    DaysSinceSignup=lambda frame: (pd.Timestamp.now() - frame['SignupDate']).dt.days,
)

# Per-customer purchase summary: total spend and how many transactions were made
customer_transactions = transactions_df.groupby('CustomerID').agg(
    TotalValue=('TotalValue', 'sum'),
    TransactionCount=('TransactionID', 'count'),
)

# Attach the purchase summary to every customer; customers without any
# transaction come out of the left join as NaN and are filled with 0.
customer_data = customer_features.join(customer_transactions, how='left').fillna(0)

# Columns that take part in the similarity calculation
features = ['DaysSinceSignup', 'TotalValue', 'TransactionCount']

# Standardise each feature (zero mean, unit variance) so none dominates by scale
scaler = StandardScaler()
normalized_features = scaler.fit_transform(customer_data[features])

# Pairwise cosine similarity between customers; row/column order follows
# customer_data's index order.
similarity_matrix = cosine_similarity(normalized_features)

# Function to get top 3 lookalikes
def get_lookalikes(customer_id, n=3):
    """Return the ``n`` customers most similar to ``customer_id``.

    Reads the module-level ``customer_data`` (its index supplies the customer
    labels) and ``similarity_matrix`` (row ``i`` holds the similarities of the
    ``i``-th customer in ``customer_data`` to every customer).

    Parameters
    ----------
    customer_id : label
        Index label of the customer in ``customer_data``.
    n : int, default 3
        Maximum number of lookalikes to return. Fewer are returned when there
        are not enough other customers; values <= 0 yield an empty list.

    Returns
    -------
    list of (label, similarity) tuples, ordered from most to least similar.
    The queried customer is never part of the result.

    Raises
    ------
    KeyError
        If ``customer_id`` is not present in ``customer_data.index``.
    """
    customer_index = customer_data.index.get_loc(customer_id)
    similarities = similarity_matrix[customer_index]

    # Rank all customers by descending similarity. A stable sort keeps ties in
    # index order so the result is deterministic.
    ranked = (-similarities).argsort(kind='stable')

    # Drop the customer itself *by position in the data*, not by assuming it
    # sorted first: another customer can tie with the self-similarity, and a
    # customer whose feature vector is all zeros does not score highest
    # against itself.
    ranked = ranked[ranked != customer_index][:max(n, 0)]

    return [(customer_data.index[i], similarities[i]) for i in ranked]

# Collect the top matches for the first 20 customers listed in customers_df,
# keyed by customer ID.
lookalikes = {}
for customer_id in customers_df['CustomerID'].head(20):
    lookalikes[customer_id] = get_lookalikes(customer_id)

# One row per customer, one column per ranked match; each cell holds the
# (matched customer ID, similarity score) pair. Persist it as Lookalike.csv.
lookalike_df = pd.DataFrame.from_dict(
    lookalikes,
    orient='index',
    columns=['Lookalike1', 'Lookalike2', 'Lookalike3'],
)
lookalike_df.to_csv('Lookalike.csv')

print(lookalike_df)

                        Lookalike1                   Lookalike2  \
C0001  (C0152, 0.9997963560178006)  (C0160, 0.9889458102649686)   
C0002  (C0029, 0.9971101247482109)  (C0166, 0.9888846114317507)   
C0003  (C0177, 0.9928770716093563)  (C0052, 0.9897133902589554)   
C0004   (C0175, 0.998247607435891)   (C0173, 0.995065244286069)   
C0005   (C0073, 0.999969657519672)   (C0159, 0.999775395490758)   
C0006  (C0066, 0.8944086332724719)  (C0117, 0.8675816793276873)   
C0007  (C0125, 0.9993778718273251)  (C0193, 0.9960865905218719)   
C0008   (C0090, 0.980925668512243)  (C0017, 0.9609593091686617)   
C0009  (C0077, 0.9931403254860873)  (C0128, 0.9771010506970863)   
C0010   (C0199, 0.994822012921529)  (C0083, 0.9909033339887625)   
C0011   (C0107, 0.999397237275431)  (C0152, 0.9697979690220843)   
C0012  (C0046, 0.9968413833349494)   (C0126, 0.996542443961925)   
C0013  (C0143, 0.9999592960993378)  (C0028, 0.9955122586325639)   
C0014  (C0033, 0.9990824139388639)  (C0063, 0.9964918325839992