In [14]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline


In [15]:
# Read the three source tables from CSV files in the working directory
customers, products, transactions = (
    pd.read_csv(csv_name)
    for csv_name in ('Customers.csv', 'Products.csv', 'Transactions.csv')
)


In [19]:
# Preview the head of every table, each under its own heading
print("Customers Data:", customers.head(), sep="\n")
print("\nProducts Data:", products.head(), sep="\n")
print("\nTransactions Data:", transactions.head(), sep="\n")


Customers Data:
  CustomerID        CustomerName         Region  SignupDate
0      C0001    Lawrence Carroll  South America  2022-07-10
1      C0002      Elizabeth Lutz           Asia  2022-02-13
2      C0003      Michael Rivera  South America  2024-03-07
3      C0004  Kathleen Rodriguez  South America  2022-10-09
4      C0005         Laura Weber           Asia  2022-08-15

Products Data:
  ProductID              ProductName     Category   Price
0      P001     ActiveWear Biography        Books  169.30
1      P002    ActiveWear Smartwatch  Electronics  346.30
2      P003  ComfortLiving Biography        Books   44.12
3      P004            BookWorld Rug   Home Decor   95.69
4      P005          TechPro T-Shirt     Clothing  429.31

Transactions Data:
  TransactionID CustomerID ProductID      TransactionDate  Quantity  \
0        T00001      C0199      P067  2024-08-25 12:38:23         1   
1        T00112      C0146      P067  2024-05-27 22:23:54         1   
2        T00166      C0127 

In [20]:
# Start the unified dataset: attach customer attributes to every transaction.
# The join is an inner join on CustomerID (the pandas default, spelled out here).
data = transactions.merge(customers, how='inner', on='CustomerID')


In [21]:
# Attach product attributes to the transaction/customer rows via ProductID.
# Both sides carry a Price column, so pandas keeps them apart as Price_x
# (from the transaction side) and Price_y (from the product side).
data = data.merge(products, how='inner', on='ProductID')


In [22]:
# Preview the merged transaction-level dataset
print("\nMerged Data:", data.head(), sep="\n")




Merged Data:
  TransactionID CustomerID ProductID      TransactionDate  Quantity  \
0        T00001      C0199      P067  2024-08-25 12:38:23         1   
1        T00112      C0146      P067  2024-05-27 22:23:54         1   
2        T00166      C0127      P067  2024-04-25 07:38:55         1   
3        T00272      C0087      P067  2024-03-26 22:55:37         2   
4        T00363      C0070      P067  2024-03-21 15:10:10         3   

   TotalValue  Price_x     CustomerName         Region  SignupDate  \
0      300.68   300.68   Andrea Jenkins         Europe  2022-12-03   
1      300.68   300.68  Brittany Harvey           Asia  2024-09-04   
2      300.68   300.68  Kathryn Stevens         Europe  2024-04-04   
3      601.36   300.68  Travis Campbell  South America  2024-04-11   
4      902.04   300.68    Timothy Perez         Europe  2022-03-15   

                       ProductName     Category  Price_y  
0  ComfortLiving Bluetooth Speaker  Electronics   300.68  
1  ComfortLiving Blu

In [23]:
# Feature Engineering
# Build per-customer behavioural features from the merged data
# 1. Total spending per customer: sum of TotalValue over the customer's rows
customer_spending = (
    data.groupby('CustomerID')['TotalValue']
    .sum()
    .rename('TotalSpending')
    .reset_index()
)



In [24]:
# 2. Number of transactions per customer: count of TransactionID values per CustomerID
customer_transactions = (
    data.groupby('CustomerID')['TransactionID']
    .count()
    .rename('TransactionCount')
    .reset_index()
)


In [25]:
# 3. Average transaction value per customer: mean of TotalValue per CustomerID
customer_avg_value = (
    data.groupby('CustomerID')['TotalValue']
    .mean()
    .rename('AvgTransactionValue')
    .reset_index()
)



In [26]:
# 4. Favorite product category: the category with the most transaction rows
#    for each customer (rows are counted, Quantity is not taken into account).
#    idxmax keeps the first maximal row per customer, so a tie goes to
#    whichever category comes first in the grouped counts.
customer_favorite_category = (
    data.groupby(['CustomerID', 'Category'])
    .size()
    .reset_index(name='CategoryCount')
    .pipe(lambda counts: counts.loc[
        counts.groupby('CustomerID')['CategoryCount'].idxmax(),
        ['CustomerID', 'Category'],
    ])
    .rename(columns={'Category': 'FavoriteCategory'})
)


In [27]:
# Combine the four per-customer feature tables into one profile row per customer
customer_profile = (
    customer_spending
    .merge(customer_transactions, on='CustomerID')
    .merge(customer_avg_value, on='CustomerID')
    .merge(customer_favorite_category, on='CustomerID')
)



In [28]:
# Enrich each profile with the columns from Customers.csv
# (CustomerName, Region, SignupDate), matched on CustomerID
customer_profile = customer_profile.merge(customers, on='CustomerID')



In [29]:
# Preview the finished customer profile table
print("\nCustomer Profile Data:", customer_profile.head(), sep="\n")



Customer Profile Data:
  CustomerID  TotalSpending  TransactionCount  AvgTransactionValue  \
0      C0001        3354.52                 5              670.904   
1      C0002        1862.74                 4              465.685   
2      C0003        2725.38                 4              681.345   
3      C0004        5354.88                 8              669.360   
4      C0005        2034.24                 3              678.080   

  FavoriteCategory        CustomerName         Region  SignupDate  
0      Electronics    Lawrence Carroll  South America  2022-07-10  
1         Clothing      Elizabeth Lutz           Asia  2022-02-13  
2       Home Decor      Michael Rivera  South America  2024-03-07  
3            Books  Kathleen Rodriguez  South America  2022-10-09  
4      Electronics         Laura Weber           Asia  2022-08-15  


In [30]:
# Preprocessing for similarity calculation
# Columns of customer_profile that feed the similarity model, split by type
# so each group can get its own transformer.
numerical_features = [
    'TotalSpending',
    'TransactionCount',
    'AvgTransactionValue',
]
categorical_features = [
    'FavoriteCategory',
    'Region',
]



In [31]:
# Build the preprocessing step: standardise the numeric columns and one-hot
# encode the categorical ones, so both kinds of feature land in one numeric matrix.
preprocessor = ColumnTransformer([
    ('num', StandardScaler(), numerical_features),
    ('cat', OneHotEncoder(), categorical_features),
])


In [32]:
# Fit the preprocessor on the customer profiles and transform them in one step.
# Only the columns listed in numerical_features / categorical_features are
# handed to the transformers; the preprocessor does not set `remainder`, so the
# other columns (e.g. CustomerID, CustomerName, SignupDate) should be dropped by
# ColumnTransformer's default -- NOTE(review): confirm against the installed
# scikit-learn version.
customer_features = preprocessor.fit_transform(customer_profile)



In [33]:
# Calculate cosine similarity between customers.
# Called with a single argument, so every row of customer_features is compared
# with every other row (and with itself); rows are in customer_profile order.
similarity_matrix = cosine_similarity(customer_features)


In [34]:
# Label both axes of the similarity matrix with customer IDs so that the
# scores for one customer can be read as the column similarity_df[customer_id]
similarity_df = pd.DataFrame(
    data=similarity_matrix,
    index=customer_profile['CustomerID'],
    columns=customer_profile['CustomerID'],
)


In [35]:

# Function to get the top N most similar customers (3 by default)
def get_top_similar_customers(customer_id, similarity_df, top_n=3):
    """Return the `top_n` customers most similar to `customer_id`.

    Parameters
    ----------
    customer_id : hashable
        Label of the customer to look up; must be a column of `similarity_df`.
    similarity_df : pandas.DataFrame
        Square similarity table whose index and columns are customer IDs.
    top_n : int, default 3
        Maximum number of lookalikes to return.

    Returns
    -------
    list[list]
        `[other_customer_id, score]` pairs ordered from most to least similar.
        The list is shorter than `top_n` when fewer other customers exist.

    Raises
    ------
    KeyError
        If `customer_id` is not a column of `similarity_df`.
    """
    # Remove the customer by label rather than assuming it sorts first: another
    # customer with an identical score can otherwise take the first position,
    # which would return the customer as its own lookalike.
    scores = similarity_df[customer_id].drop(labels=customer_id, errors='ignore')

    # A stable sort keeps tied scores in their original order, so results are
    # reproducible; head() never over-runs when there are fewer than top_n rows.
    top_matches = scores.sort_values(ascending=False, kind='stable').head(top_n)

    return [[other_id, float(score)] for other_id, score in top_matches.items()]


In [36]:
# Generate recommendations for customers C0001 - C0020.
# The IDs are built explicitly instead of slicing the first 20 profile rows:
# profiles are derived from the merged transaction data, so a customer without
# a profile would otherwise be silently replaced by a later customer ID.
lookalike_map = {}
for customer_id in [f'C{number:04d}' for number in range(1, 21)]:
    if customer_id not in similarity_df.columns:
        # No similarity scores exist for this customer; report it and move on.
        print(f"Skipping {customer_id}: no customer profile available")
        continue
    similar_customers = get_top_similar_customers(customer_id, similarity_df)
    lookalike_map[customer_id] = similar_customers


In [37]:
# Turn the mapping into a two-column table: one row per customer, with the
# list of [lookalike_id, score] pairs kept as a single cell value
lookalike_df = pd.DataFrame(
    [(source_id, matches) for source_id, matches in lookalike_map.items()],
    columns=['CustomerID', 'SimilarCustomers'],
)


In [38]:
# Save the results to a CSV file in the working directory, without the row index.
# NOTE(review): each SimilarCustomers cell holds a nested Python list, so it is
# presumably written as its string representation -- confirm that downstream
# readers parse it accordingly.
lookalike_df.to_csv('Lookalike.csv', index=False)



In [39]:
# Show the lookalike table that was written to disk
print("\nLookalike Recommendations:", lookalike_df, sep="\n")




Lookalike Recommendations:
   CustomerID                                   SimilarCustomers
0       C0001  [[C0190, 0.968215451295126], [C0048, 0.9410720...
1       C0002  [[C0088, 0.9663574397998078], [C0134, 0.941709...
2       C0003  [[C0052, 0.9847977904024425], [C0152, 0.926264...
3       C0004  [[C0165, 0.9711437718179057], [C0155, 0.962082...
4       C0005  [[C0186, 0.97879054193451], [C0146, 0.95985083...
5       C0006  [[C0168, 0.9732537429499297], [C0171, 0.951338...
6       C0007  [[C0140, 0.9764156563035169], [C0115, 0.934222...
7       C0008  [[C0109, 0.8700104206236068], [C0139, 0.811768...
8       C0009  [[C0010, 0.9760669630706748], [C0198, 0.952035...
9       C0010  [[C0009, 0.9760669630706748], [C0111, 0.970850...
10      C0011  [[C0137, 0.9611944701177747], [C0169, 0.920395...
11      C0012  [[C0104, 0.9659896786844347], [C0113, 0.926666...
12      C0013  [[C0099, 0.9855644363688849], [C0108, 0.919846...
13      C0014  [[C0060, 0.9763044912298495], [C0151, 0.908484.