#  This notebook explains the process of building a Lookalike Model that recommends similar customers based on their profiles and transaction history.

In [2]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


In [3]:
# Load the datasets: Customers, Products, and Transactions
customers = pd.read_csv("C:\\Users\\Vikra\\Downloads\\Customers.csv")
products= pd.read_csv("C:\\Users\\Vikra\\Downloads\\Products.csv")
transactions = pd.read_csv("C:\\Users\\Vikra\\Downloads\\Transactions.csv")

# Display the first few rows of the data
customers.head(), products.head(), transactions.head()


(  CustomerID        CustomerName         Region  SignupDate
 0      C0001    Lawrence Carroll  South America  2022-07-10
 1      C0002      Elizabeth Lutz           Asia  2022-02-13
 2      C0003      Michael Rivera  South America  2024-03-07
 3      C0004  Kathleen Rodriguez  South America  2022-10-09
 4      C0005         Laura Weber           Asia  2022-08-15,
   ProductID              ProductName     Category   Price
 0      P001     ActiveWear Biography        Books  169.30
 1      P002    ActiveWear Smartwatch  Electronics  346.30
 2      P003  ComfortLiving Biography        Books   44.12
 3      P004            BookWorld Rug   Home Decor   95.69
 4      P005          TechPro T-Shirt     Clothing  429.31,
   TransactionID CustomerID ProductID      TransactionDate  Quantity  \
 0        T00001      C0199      P067  2024-08-25 12:38:23         1   
 1        T00112      C0146      P067  2024-05-27 22:23:54         1   
 2        T00166      C0127      P067   2024-04-25 7:38:55    

In [4]:
# Merge the data to get a combined dataset for analysis
data = transactions.merge(customers, on="CustomerID").merge(products, on="ProductID")

# Show the first few rows of the combined data
data.head()


Unnamed: 0,TransactionID,CustomerID,ProductID,TransactionDate,Quantity,TotalValue,Price_x,CustomerName,Region,SignupDate,ProductName,Category,Price_y
0,T00001,C0199,P067,2024-08-25 12:38:23,1,300.68,300.68,Andrea Jenkins,Europe,2022-12-03,ComfortLiving Bluetooth Speaker,Electronics,300.68
1,T00112,C0146,P067,2024-05-27 22:23:54,1,300.68,300.68,Brittany Harvey,Asia,2024-09-04,ComfortLiving Bluetooth Speaker,Electronics,300.68
2,T00166,C0127,P067,2024-04-25 7:38:55,1,300.68,300.68,Kathryn Stevens,Europe,2024-04-04,ComfortLiving Bluetooth Speaker,Electronics,300.68
3,T00272,C0087,P067,2024-03-26 22:55:37,2,601.36,300.68,Travis Campbell,South America,2024-04-11,ComfortLiving Bluetooth Speaker,Electronics,300.68
4,T00363,C0070,P067,2024-03-21 15:10:10,3,902.04,300.68,Timothy Perez,Europe,2022-03-15,ComfortLiving Bluetooth Speaker,Electronics,300.68


In [5]:
# Group data by CustomerID and aggregate features like TotalValue and Region
customer_features = data.groupby('CustomerID').agg({
    'TotalValue': 'sum',  # Sum of all purchases for the customer
    'Region': 'first'  # Get the region for the customer
}).reset_index()

# Combine features into a 'combined_features' column for text vectorization
customer_features['combined_features'] = customer_features['Region'] + ' ' + customer_features['TotalValue'].astype(str)

# Show the first few rows of customer features
customer_features.head()


Unnamed: 0,CustomerID,TotalValue,Region,combined_features
0,C0001,3354.52,South America,South America 3354.52
1,C0002,1862.74,Asia,Asia 1862.74
2,C0003,2725.38,South America,South America 2725.38
3,C0004,5354.88,South America,South America 5354.88
4,C0005,2034.24,Asia,Asia 2034.24


In [6]:
# Vectorize the customer profile using TF-IDF to convert text features into numerical form
vectorizer = TfidfVectorizer()
feature_matrix = vectorizer.fit_transform(customer_features['combined_features'])

# Show the shape of the feature matrix
feature_matrix.shape


(199, 294)

In [7]:
# Calculate the cosine similarity between the customer profiles
similarity_matrix = cosine_similarity(feature_matrix)

# Show the similarity matrix (first 5 rows)
similarity_matrix[:5, :5]


array([[1.        , 0.        , 0.12317137, 0.12317137, 0.        ],
       [0.        , 1.        , 0.        , 0.        , 0.09601045],
       [0.12317137, 0.        , 1.        , 0.11938449, 0.        ],
       [0.12317137, 0.        , 0.11938449, 1.        , 0.        ],
       [0.        , 0.09601045, 0.        , 0.        , 1.        ]])

In [8]:
# Generate the top 3 most similar customers for each customer
lookalike_map = {}

for idx, row in customer_features.iterrows():
    customer_id = row['CustomerID']
    
    # Get similarity scores for this customer
    similarity_scores = similarity_matrix[idx]
    
    # Get indices of top 3 most similar customers, excluding the customer itself
    similar_indices = similarity_scores.argsort()[-4:-1]  # Top 3 excluding self
    similar_customers = customer_features.iloc[similar_indices]
    
    # Map lookalike customers with similarity scores
    lookalike_map[customer_id] = [
        (row.CustomerID, similarity_scores[idx]) for idx, row in zip(similar_indices, similar_customers.itertuples())
    ]


In [9]:
# Format the recommendations and save to a CSV file
lookalike_data = []
for customer_id, similar_customers in lookalike_map.items():
    for similar_customer_id, score in similar_customers:
        lookalike_data.append([customer_id, similar_customer_id, score])

# Convert the data into a DataFrame
lookalike_df = pd.DataFrame(lookalike_data, columns=['cust_id', 'lookalike_cust_id', 'similarity_score'])

# Save to CSV
lookalike_df.to_csv('FirstName_LastName_Lookalike.csv', index=False)

# Display the first few rows of the generated CSV
lookalike_df.head()


Unnamed: 0,cust_id,lookalike_cust_id,similarity_score
0,C0001,C0080,0.346615
1,C0001,C0121,0.348448
2,C0001,C0158,0.469805
3,C0002,C0106,0.125864
4,C0002,C0200,0.125864


In [10]:

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


In [11]:
# Load the datasets: Customers, Products, and Transactions
customers = pd.read_csv(r"C:\Downloads\Customers.csv")
products = pd.read_csv(r"C:\Downloads\Products.csv")
transactions = pd.read_csv(r"C:\Downloads\Transactions.csv")

# Display the first few rows of the data
customers.head(), products.head(), transactions.head()


FileNotFoundError: [Errno 2] No such file or directory: 'C:\\Downloads\\Customers.csv'

In [None]:
# Merge the data to get a combined dataset for analysis
data = transactions.merge(customers, on="CustomerID").merge(products, on="ProductID")

# Show the first few rows of the combined data
data.head()


In [None]:
# Group data by CustomerID and aggregate features like TotalValue and Region
customer_features = data.groupby('CustomerID').agg({
    'TotalValue': 'sum',  # Sum of all purchases for the customer
    'Region': 'first'  # Get the region for the customer
}).reset_index()

# Combine features into a 'combined_features' column for text vectorization
customer_features['combined_features'] = customer_features['Region'] + ' ' + customer_features['TotalValue'].astype(str)

# Show the first few rows of customer features
customer_features.head()


In [None]:
# Vectorize the customer profile using TF-IDF to convert text features into numerical form
vectorizer = TfidfVectorizer()
feature_matrix = vectorizer.fit_transform(customer_features['combined_features'])

# Show the shape of the feature matrix
feature_matrix.shape
