<a href="https://colab.research.google.com/github/venkateshblks/Data-Science-Assignment-eCommerce-Transactions/blob/main/Bantupalli_Venkateswararao_Lookalike.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import gdown
ids = {
    '1bu_--mo79VdUG9oin4ybfFGRUSXAe-WE': 'customers.csv',
    '1IKuDizVapw-hyktwfpoAoaGtHtTNHfd0': 'products.csv',
    '1saEqdbBB-vuk2hxoAf4TzDEsykdKlzbF': 'transactions.csv',
}

for file_id, output_filename in ids.items():
    url = f'https://drive.google.com/uc?id={file_id}'
    gdown.download(url, output_filename, quiet=False)
    print(f'Downloaded {output_filename}')

Downloading...
From: https://drive.google.com/uc?id=1bu_--mo79VdUG9oin4ybfFGRUSXAe-WE
To: /content/customers.csv
100%|██████████| 8.54k/8.54k [00:00<00:00, 21.8MB/s]


Downloaded customers.csv


Downloading...
From: https://drive.google.com/uc?id=1IKuDizVapw-hyktwfpoAoaGtHtTNHfd0
To: /content/products.csv
100%|██████████| 4.25k/4.25k [00:00<00:00, 5.47MB/s]


Downloaded products.csv


Downloading...
From: https://drive.google.com/uc?id=1saEqdbBB-vuk2hxoAf4TzDEsykdKlzbF
To: /content/transactions.csv
100%|██████████| 54.7k/54.7k [00:00<00:00, 44.0MB/s]

Downloaded transactions.csv





# Lookalike Model

In [5]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler
import pandas as pd

In [6]:
customers=pd.read_csv('customers.csv')
products=pd.read_csv('products.csv')
transactions=pd.read_csv('transactions.csv')
display(customers.head(3))
display(customers.shape)
display(products.head(3))
display(products.shape)
display(transactions.head(3))
transactions.shape

Unnamed: 0,CustomerID,CustomerName,Region,SignupDate
0,C0001,Lawrence Carroll,South America,2022-07-10
1,C0002,Elizabeth Lutz,Asia,2022-02-13
2,C0003,Michael Rivera,South America,2024-03-07


(200, 4)

Unnamed: 0,ProductID,ProductName,Category,Price
0,P001,ActiveWear Biography,Books,169.3
1,P002,ActiveWear Smartwatch,Electronics,346.3
2,P003,ComfortLiving Biography,Books,44.12


(100, 4)

Unnamed: 0,TransactionID,CustomerID,ProductID,TransactionDate,Quantity,TotalValue,Price
0,T00001,C0199,P067,2024-08-25 12:38:23,1,300.68,300.68
1,T00112,C0146,P067,2024-05-27 22:23:54,1,300.68,300.68
2,T00166,C0127,P067,2024-04-25 07:38:55,1,300.68,300.68


(1000, 7)

In [8]:
merged_df = transactions.merge(customers, on='CustomerID', how='left')
merged_df = merged_df.merge(products, on='ProductID', how='left')
merged_df = merged_df.drop(columns=['Price_y'])
merged_df = merged_df.rename(columns={'Price_x': 'Price'})
merged_df.head(3)

Unnamed: 0,TransactionID,CustomerID,ProductID,TransactionDate,Quantity,TotalValue,Price,CustomerName,Region,SignupDate,ProductName,Category
0,T00001,C0199,P067,2024-08-25 12:38:23,1,300.68,300.68,Andrea Jenkins,Europe,2022-12-03,ComfortLiving Bluetooth Speaker,Electronics
1,T00112,C0146,P067,2024-05-27 22:23:54,1,300.68,300.68,Brittany Harvey,Asia,2024-09-04,ComfortLiving Bluetooth Speaker,Electronics
2,T00166,C0127,P067,2024-04-25 07:38:55,1,300.68,300.68,Kathryn Stevens,Europe,2024-04-04,ComfortLiving Bluetooth Speaker,Electronics


In [9]:
transactions=merged_df.groupby('CustomerID').agg({
    'TransactionID': 'count',  # Number of purchases
    'TotalValue': 'sum',  # Total spend
    'Price': 'mean'  # Average price of items bought
})
transactions

Unnamed: 0_level_0,TransactionID,TotalValue,Price
CustomerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
C0001,5,3354.52,278.334000
C0002,4,1862.74,208.920000
C0003,4,2725.38,195.707500
C0004,8,5354.88,240.636250
C0005,3,2034.24,291.603333
...,...,...,...
C0196,4,4982.88,416.992500
C0197,3,1928.65,227.056667
C0198,2,931.83,239.705000
C0199,4,1979.28,250.610000


In [10]:
category_spend = pd.pivot_table(merged_df, index='CustomerID', columns='Category', values='TotalValue', aggfunc='sum', fill_value=0)
category_spend

Category,Books,Clothing,Electronics,Home Decor
CustomerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
C0001,114.60,0.00,2827.30,412.62
C0002,0.00,1025.46,0.00,837.28
C0003,0.00,122.36,1385.20,1217.82
C0004,1888.48,0.00,1355.74,2110.66
C0005,0.00,0.00,1180.38,853.86
...,...,...,...,...
C0196,1310.67,1585.36,0.00,2086.85
C0197,0.00,0.00,914.92,1013.73
C0198,0.00,904.84,26.99,0.00
C0199,0.00,0.00,594.38,1384.90


In [11]:
scaler = StandardScaler()
transaction_features = transactions.join(category_spend, on='CustomerID')
scaled = scaler.fit_transform(transaction_features)

In [14]:
transaction_features.head()

Unnamed: 0_level_0,TransactionID,TotalValue,Price,Books,Clothing,Electronics,Home Decor
CustomerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
C0001,5,3354.52,278.334,114.6,0.0,2827.3,412.62
C0002,4,1862.74,208.92,0.0,1025.46,0.0,837.28
C0003,4,2725.38,195.7075,0.0,122.36,1385.2,1217.82
C0004,8,5354.88,240.63625,1888.48,0.0,1355.74,2110.66
C0005,3,2034.24,291.603333,0.0,0.0,1180.38,853.86


In [12]:
similarity_matrix=cosine_similarity(scaled)

In [13]:
similarity_matrix.shape

(199, 199)

In [15]:
lookalike_recommendations = {}

for i in range(len(customers)-1):
    cust_id = customers.iloc[i]['CustomerID']
    similarity_scores = similarity_matrix[i]
    similar_customer_indices = similarity_scores.argsort()[-4:-1]  # Top 3 excluding self
    similar_customer_ids = customers.iloc[similar_customer_indices]['CustomerID']
    similar_scores = similarity_scores[similar_customer_indices]

    lookalike_recommendations[cust_id] = list(zip(similar_customer_ids, similar_scores))
lookalike_df = pd.DataFrame.from_dict(lookalike_recommendations, orient='index')

In [16]:
lookalike_df.columns = ['(Lookalike1, Score)', '(Lookalike2, Score)', '(Lookalike3, Score)']

In [17]:
Lookalike=lookalike_df.head(20)
Lookalike

Unnamed: 0,"(Lookalike1, Score)","(Lookalike2, Score)","(Lookalike3, Score)"
C0001,"(C0120, 0.8743786159035025)","(C0180, 0.8939086131691001)","(C0069, 0.9793580666094183)"
C0002,"(C0062, 0.8729839461266474)","(C0159, 0.9205634390349692)","(C0036, 0.9345842357809427)"
C0003,"(C0031, 0.8447183439552948)","(C0196, 0.8763914813839384)","(C0166, 0.8971896341140942)"
C0004,"(C0065, 0.9119887592773301)","(C0090, 0.937003377942026)","(C0075, 0.9784687475152085)"
C0005,"(C0085, 0.8726654503746094)","(C0140, 0.8807008597141818)","(C0007, 0.8808970676907522)"
C0006,"(C0199, 0.8156855722717157)","(C0195, 0.8360901667873226)","(C0184, 0.8523580968399286)"
C0007,"(C0026, 0.8065135084621552)","(C0005, 0.8808970676907522)","(C0085, 0.9860753164482993)"
C0008,"(C0175, 0.7687445281438812)","(C0173, 0.7701536374218426)","(C0109, 0.9306004167922095)"
C0009,"(C0020, 0.9229800131076652)","(C0130, 0.9496809055507356)","(C0150, 0.9517324168772279)"
C0010,"(C0176, 0.9340827063904426)","(C0029, 0.9636197384467877)","(C0111, 0.9673431433564499)"


In [18]:
transaction_features[(transaction_features.index=='C0007') | (transaction_features.index=='C0085')].sort_values(by='CustomerID')

Unnamed: 0_level_0,TransactionID,TotalValue,Price,Books,Clothing,Electronics,Home Decor
CustomerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
C0007,3,2579.82,350.02,0.0,0.0,1360.06,1219.76
C0085,3,2598.33,350.45,277.86,0.0,1312.95,1007.52


**Based on transaction count, total value, and category distribution,  customers are similar**

In [20]:
Lookalike.to_csv('Bantupalli_Venkateswararao_Lookalike.csv',index_label='CustomerID')

In [22]:
cust_sim=merged_df.pivot_table(index='CustomerID', columns='ProductName', values='Quantity', aggfunc='sum', fill_value=0)
cosine_sim=cosine_similarity(cust_sim)
cosine_sim.shape
lookalike_recommendations = {}

for i in range(len(cust_sim)):
    cust_id = cust_sim.index[i]
    similarity_scores = cosine_sim[i]
    similar_customer_indices = similarity_scores.argsort()[-4:-1]  # Top 3 excluding self
    similar_customer_ids = cust_sim.index[similar_customer_indices]
    similar_scores = similarity_scores[similar_customer_indices]

    lookalike_recommendations[cust_id] = list(zip(similar_customer_ids, similar_scores))
lookalike_df1 = pd.DataFrame.from_dict(lookalike_recommendations, orient='index')
lookalike_df1.columns = ['(Lookalike1, Score)', '(Lookalike2,Score)', '(Lookalike3,Score)']

In [23]:
lookalike_df1

Unnamed: 0,"(Lookalike1, Score)","(Lookalike2,Score)","(Lookalike3,Score)"
C0001,"(C0105, 0.5465943944999485)","(C0097, 0.5477225575051661)","(C0050, 0.5838742081211422)"
C0002,"(C0091, 0.3339135484518728)","(C0032, 0.3418817293789138)","(C0030, 0.6115766297251506)"
C0003,"(C0140, 0.5487954724560282)","(C0164, 0.5547001962252291)","(C0181, 0.6102160571171791)"
C0004,"(C0078, 0.46188021535170054)","(C0070, 0.4988876515698588)","(C0065, 0.533113989983183)"
C0005,"(C0055, 0.5144957554275265)","(C0164, 0.5707817929853929)","(C0096, 0.6482037235521645)"
...,...,...,...
C0196,"(C0138, 0.5986710947139653)","(C0016, 0.6414269805898185)","(C0123, 0.6708203932499368)"
C0197,"(C0026, 0.5396661147204319)","(C0075, 0.5420821648636565)","(C0084, 0.5851918189130048)"
C0198,"(C0054, 0.4103913408340616)","(C0106, 0.44112877325628463)","(C0096, 0.47809144373375745)"
C0199,"(C0104, 0.5268684479505137)","(C0121, 0.5656854249492381)","(C0097, 0.8)"


**Based on product,  customers are similar**

In [24]:
merged_df[(merged_df['CustomerID']=='C0199') | (merged_df['CustomerID']=='C0097')]

Unnamed: 0,TransactionID,CustomerID,ProductID,TransactionDate,Quantity,TotalValue,Price,CustomerName,Region,SignupDate,ProductName,Category
0,T00001,C0199,P067,2024-08-25 12:38:23,1,300.68,300.68,Andrea Jenkins,Europe,2022-12-03,ComfortLiving Bluetooth Speaker,Electronics
428,T00051,C0097,P022,2024-03-23 05:58:54,1,137.54,137.54,Tina Ford,Asia,2023-12-18,HomeSense Wall Art,Home Decor
439,T00761,C0199,P022,2024-10-01 05:57:09,4,550.16,137.54,Andrea Jenkins,Europe,2022-12-03,HomeSense Wall Art,Home Decor
918,T00626,C0199,P079,2024-08-17 12:06:08,2,834.74,417.37,Andrea Jenkins,Europe,2022-12-03,ActiveWear Rug,Home Decor
940,T00963,C0199,P008,2024-10-26 00:01:58,2,293.7,146.85,Andrea Jenkins,Europe,2022-12-03,BookWorld Bluetooth Speaker,Electronics
