In [73]:
# Data Preparation:
#    Merge Customers, Products, and Transactions datasets into a unified view (df).
#    This provides a comprehensive view of each customer's purchasing behavior and profile.

# Feature Engineering:
#    Aggregate transaction data to compute key features for each customer:
#       Total spending (sum of TotalValue)
#       Transaction count
#       Average price of purchased items
#       Favorite product category (most frequent)
#       Region
#       Customer lifetime (based on SignupDate)
#    One-hot encode categorical features (FavoriteCategory and Region)
#    Standardize numerical features using StandardScaler

# Similarity Calculation:
#    Use cosine similarity to compare customer profiles
#    Generate a similarity matrix where each cell represents the similarity between two customers

# Recommendation Generation:
#    For each customer (focusing on C0001 to C0020):
#       Identify the top 3 most similar customers based on similarity scores
#       Store these recommendations with their similarity scores

# Output:
#    Creating Lookalike.csv file containing:
#       CustomerID
#       List of top 3 similar customer IDs with their similarity scores

In [84]:
#importING essential libraries and modules
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics.pairwise import cosine_similarity

In [85]:
# Loading data from CSV to Dataframes
customers = pd.read_csv("Customers.csv", parse_dates=["SignupDate"])
products = pd.read_csv("Products.csv")
transactions = pd.read_csv("Transactions.csv", parse_dates=["TransactionDate"])

In [86]:
# Merge datasets
df = transactions.merge(customers, on='CustomerID').merge(products, on='ProductID')

In [87]:
print(customers.info())
print(products.info())
print(transactions.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   CustomerID    200 non-null    object        
 1   CustomerName  200 non-null    object        
 2   Region        200 non-null    object        
 3   SignupDate    200 non-null    datetime64[ns]
dtypes: datetime64[ns](1), object(3)
memory usage: 6.4+ KB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   ProductID    100 non-null    object 
 1   ProductName  100 non-null    object 
 2   Category     100 non-null    object 
 3   Price        100 non-null    float64
dtypes: float64(1), object(3)
memory usage: 3.3+ KB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 7 columns):
 #   Column    

In [88]:
df.info

<bound method DataFrame.info of     TransactionID CustomerID ProductID     TransactionDate  Quantity  \
0          T00001      C0199      P067 2024-08-25 12:38:23         1   
1          T00112      C0146      P067 2024-05-27 22:23:54         1   
2          T00166      C0127      P067 2024-04-25 07:38:55         1   
3          T00272      C0087      P067 2024-03-26 22:55:37         2   
4          T00363      C0070      P067 2024-03-21 15:10:10         3   
..            ...        ...       ...                 ...       ...   
995        T00496      C0118      P037 2024-10-24 08:30:27         1   
996        T00759      C0059      P037 2024-06-04 02:15:24         3   
997        T00922      C0018      P037 2024-04-05 13:05:32         4   
998        T00959      C0115      P037 2024-09-29 10:16:02         2   
999        T00992      C0024      P037 2024-04-21 10:52:24         1   

     TotalValue  Price_x          CustomerName         Region SignupDate  \
0        300.68   300.68   

In [90]:
# Feature engineering
customer_features = df.groupby('CustomerID').agg({
    'TotalValue': 'sum',
    'TransactionID': 'count',
    'Price_x': 'mean',
    'Category': lambda x: x.mode().iloc[0] if not x.mode().empty else None,
    'Region': 'first',
    'SignupDate': 'first'
}).reset_index()

customer_features.columns = ['CustomerID', 'TotalSpending', 'TransactionCount', 'AvgPrice', 'FavoriteCategory', 'Region', 'SignupDate']

In [91]:
customer_features

Unnamed: 0,CustomerID,TotalSpending,TransactionCount,AvgPrice,FavoriteCategory,Region,SignupDate
0,C0001,3354.52,5,278.334000,Electronics,South America,2022-07-10
1,C0002,1862.74,4,208.920000,Clothing,Asia,2022-02-13
2,C0003,2725.38,4,195.707500,Home Decor,South America,2024-03-07
3,C0004,5354.88,8,240.636250,Books,South America,2022-10-09
4,C0005,2034.24,3,291.603333,Electronics,Asia,2022-08-15
...,...,...,...,...,...,...,...
194,C0196,4982.88,4,416.992500,Home Decor,Europe,2022-06-07
195,C0197,1928.65,3,227.056667,Electronics,Europe,2023-03-21
196,C0198,931.83,2,239.705000,Clothing,Europe,2022-02-27
197,C0199,1979.28,4,250.610000,Electronics,Europe,2022-12-03


In [92]:
# Calculate customer lifetime (in days)
current_date = datetime(2025, 2, 2)  # Current date from your information
customer_features['CustomerLifetime'] = (current_date - customer_features['SignupDate']).dt.days

In [93]:
# One-hot encode categorical features
customer_features = pd.get_dummies(customer_features, columns=['FavoriteCategory', 'Region'])

In [94]:
# Standardize numerical features
scaler = StandardScaler()
numerical_features = ['TotalSpending', 'TransactionCount', 'AvgPrice', 'CustomerLifetime']
customer_features[numerical_features] = scaler.fit_transform(customer_features[numerical_features])

In [95]:
# Calculate cosine similarity
similarity_matrix = cosine_similarity(customer_features.drop(['CustomerID', 'SignupDate'], axis=1))

In [101]:
similarity_matrix

array([[ 1.        ,  0.39344655,  0.02910708, ...,  0.34771341,
         0.53893666,  0.04177989],
       [ 0.39344655,  1.        ,  0.08276792, ...,  0.78614456,
         0.49581498,  0.30481123],
       [ 0.02910708,  0.08276792,  1.        , ...,  0.10223502,
         0.10159524, -0.22949276],
       ...,
       [ 0.34771341,  0.78614456,  0.10223502, ...,  1.        ,
         0.73954444,  0.01123023],
       [ 0.53893666,  0.49581498,  0.10159524, ...,  0.73954444,
         1.        , -0.19727426],
       [ 0.04177989,  0.30481123, -0.22949276, ...,  0.01123023,
        -0.19727426,  1.        ]])

In [96]:
# Function to get top 3 similar customers
def get_top_3_similar(customer_id, customer_features, similarity_matrix):
    idx = customer_features.index[customer_features['CustomerID'] == customer_id].tolist()[0]
    similar_indices = similarity_matrix[idx].argsort()[::-1][1:4]
    similar_customers = customer_features.iloc[similar_indices]['CustomerID'].tolist()
    similarity_scores = similarity_matrix[idx][similar_indices].tolist()
    return list(zip(similar_customers, similarity_scores))

In [97]:
# Generate recommendations for customers C0001 to C0020
recommendations = {}
for cust_id in [f'C{str(i).zfill(4)}' for i in range(1, 21)]:
    recommendations[cust_id] = get_top_3_similar(cust_id, customer_features, similarity_matrix)

In [102]:
recommendations

{'C0001': [('C0192', 0.8946394502886589),
  ('C0184', 0.8439526517204075),
  ('C0120', 0.788382696262193)],
 'C0002': [('C0134', 0.9421769087781718),
  ('C0106', 0.9281192197486611),
  ('C0029', 0.8344376946853383)],
 'C0003': [('C0031', 0.9398506720448854),
  ('C0052', 0.9318370062367196),
  ('C0151', 0.8808666772270531)],
 'C0004': [('C0165', 0.9742530245757335),
  ('C0153', 0.8612990710020779),
  ('C0175', 0.84638163754151)],
 'C0005': [('C0007', 0.9271524635204714),
  ('C0140', 0.8730944205744388),
  ('C0186', 0.791708743668009)],
 'C0006': [('C0187', 0.912019187878102),
  ('C0137', 0.8560606075183769),
  ('C0085', 0.8559799988718523)],
 'C0007': [('C0005', 0.9271524635204714),
  ('C0040', 0.7957440271770984),
  ('C0140', 0.766609083031755)],
 'C0008': [('C0065', 0.8365803505743955),
  ('C0156', 0.7472396970816853),
  ('C0109', 0.7260492694269917)],
 'C0009': [('C0061', 0.9284990231871001),
  ('C0062', 0.7837616896004123),
  ('C0058', 0.7578155874207212)],
 'C0010': [('C0062', 0.91

In [100]:
# Create Lookalike.csv
lookalike_df = pd.DataFrame([(k, v) for k, v in recommendations.items()], columns=['CustomerID', 'Recommendations'])
lookalike_df.to_csv('Lookalike.csv', index=False)

print("Lookalike.csv has been created with recommendations for customers C0001 to C0020.")

Lookalike.csv has been created with recommendations for customers C0001 to C0020.


In [103]:
lookalike_df

Unnamed: 0,CustomerID,Recommendations
0,C0001,"[(C0192, 0.8946394502886589), (C0184, 0.843952..."
1,C0002,"[(C0134, 0.9421769087781718), (C0106, 0.928119..."
2,C0003,"[(C0031, 0.9398506720448854), (C0052, 0.931837..."
3,C0004,"[(C0165, 0.9742530245757335), (C0153, 0.861299..."
4,C0005,"[(C0007, 0.9271524635204714), (C0140, 0.873094..."
5,C0006,"[(C0187, 0.912019187878102), (C0137, 0.8560606..."
6,C0007,"[(C0005, 0.9271524635204714), (C0040, 0.795744..."
7,C0008,"[(C0065, 0.8365803505743955), (C0156, 0.747239..."
8,C0009,"[(C0061, 0.9284990231871001), (C0062, 0.783761..."
9,C0010,"[(C0062, 0.9190097305640931), (C0060, 0.860064..."
