# Task 2

In [2]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# For demonstration (visual checks, not strictly required)
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

#### Loaded & Merged Datasets

In [3]:
# Reading the raw data
customers_df = pd.read_csv('Customers.csv')
products_df = pd.read_csv('Products.csv')
transactions_df = pd.read_csv('Transactions.csv')

# Merged transactions with products
trans_products_df = pd.merge(
    transactions_df,
    products_df,
    on='ProductID',
    how='left',
    suffixes=('_trans', '_prod')
)

# Merged with customers
full_df = pd.merge(
    trans_products_df,
    customers_df,
    on='CustomerID',
    how='left'
)

print(full_df.head())

  TransactionID CustomerID ProductID      TransactionDate  Quantity  \
0        T00001      C0199      P067  2024-08-25 12:38:23         1   
1        T00112      C0146      P067  2024-05-27 22:23:54         1   
2        T00166      C0127      P067  2024-04-25 07:38:55         1   
3        T00272      C0087      P067  2024-03-26 22:55:37         2   
4        T00363      C0070      P067  2024-03-21 15:10:10         3   

   TotalValue  Price_trans                      ProductName     Category  \
0      300.68       300.68  ComfortLiving Bluetooth Speaker  Electronics   
1      300.68       300.68  ComfortLiving Bluetooth Speaker  Electronics   
2      300.68       300.68  ComfortLiving Bluetooth Speaker  Electronics   
3      601.36       300.68  ComfortLiving Bluetooth Speaker  Electronics   
4      902.04       300.68  ComfortLiving Bluetooth Speaker  Electronics   

   Price_prod     CustomerName         Region  SignupDate  
0      300.68   Andrea Jenkins         Europe  2022-12-0

 ### Created Customer-Level Feature Vectors

#### Spending by Category

In [4]:
# Aggregate total spending by Category
category_spend = full_df.groupby(['CustomerID', 'Category'])['TotalValue'].sum().reset_index()

#### Total quantity purchased

In [5]:
# Pivot: Rows = CustomerID, Columns = Categories, Values = Sum of Spend
category_spend_pivot = category_spend.pivot(index='CustomerID', columns='Category', values='TotalValue').fillna(0)

#### Average order value

In [6]:
# Added more features like overall purchase frequency or average transaction value
purchase_count = full_df.groupby('CustomerID')['TransactionID'].nunique()
category_spend_pivot['PurchaseCount'] = purchase_count

#### Number of transactions

In [7]:
# Added total spend column
total_spend = full_df.groupby('CustomerID')['TotalValue'].sum()
category_spend_pivot['TotalSpend'] = total_spend

#### SignupDate (converted to days since signup)

In [8]:
# Added days since signup or year of signup
customers_df['SignupDate'] = pd.to_datetime(customers_df['SignupDate'])
customers_df['DaysSinceSignup'] = (pd.to_datetime('today') - customers_df['SignupDate']).dt.days
category_spend_pivot = category_spend_pivot.merge(
    customers_df[['CustomerID', 'DaysSinceSignup']],
    how='left',
    on='CustomerID'
)

#### Feature matrix

In [9]:
feature_df = category_spend_pivot.copy()
feature_df.head()

Unnamed: 0,CustomerID,Books,Clothing,Electronics,Home Decor,PurchaseCount,TotalSpend,DaysSinceSignup
0,C0001,114.6,0.0,2827.3,412.62,5,3354.52,930
1,C0002,0.0,1025.46,0.0,837.28,4,1862.74,1077
2,C0003,0.0,122.36,1385.2,1217.82,4,2725.38,324
3,C0004,1888.48,0.0,1355.74,2110.66,8,5354.88,839
4,C0005,0.0,0.0,1180.38,853.86,3,2034.24,894


#### Computed Similarities

In [10]:
# Maked sure CustomerID is the index
feature_df.reset_index(inplace=True)
feature_df.set_index('CustomerID', inplace=True)

#### Converted to matrix

In [11]:
# shape (num_customers, num_features)
feature_matrix = feature_df.values

#### Calculated pairwise cosine similarity

In [12]:
similarity_matrix = cosine_similarity(feature_matrix, feature_matrix)

#### Transformed into a DataFrame for easier readability

In [13]:
sim_df = pd.DataFrame(
    similarity_matrix,
    index=feature_df.index,
    columns=feature_df.index
)

In [14]:
# small sample of the matrix
sim_df.iloc[:5, :5]

CustomerID,C0001,C0002,C0003,C0004,C0005
CustomerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
C0001,1.0,0.6674,0.929938,0.838569,0.947797
C0002,0.6674,1.0,0.78601,0.79909,0.814453
C0003,0.929938,0.78601,1.0,0.931812,0.967762
C0004,0.838569,0.79909,0.931812,1.0,0.903991
C0005,0.947797,0.814453,0.967762,0.903991,1.0


### Found Top 3 Lookalikes for Each Customer

####  Helper function that, given a customer ID, returns the top N similar customers

In [15]:
def get_top_n_similar(customersim_df, cust_id, n=3):

    scores = customersim_df.loc[cust_id].copy()         # got all similarity scores for the target customer
    scores.drop(index=cust_id, inplace=True)            # removed self similarity
    top_n = scores.sort_values(ascending=False).head(n) # sorted descending by similarity
    return list(zip(top_n.index, top_n.values))         # returned a list of tuples (similar_cust_id, similarity_score)

#### Generated Lookalikes for the First 20 Customers

In [16]:
lookalike_results = []

In [17]:
# for cust_id from C0001 to C0020
for i in range(1, 21):
    # Construct the string 'C0001', 'C0002', etc.
    cust_id = f"C{str(i).zfill(4)}"

    # Getting top 3
    top_3 = get_top_n_similar(sim_df, cust_id, n=3)

    # Prepared CSV
    # stored a single row for each cust_id with its top 3 lookalikes and scores
    lookalike_results.append({
        "CustomerID": cust_id,
        "Lookalikes": top_3  # list of (cust_id, score) pairs
    })

#### Converted to DataFrame

In [20]:
lookalike_df = pd.DataFrame(lookalike_results)
lookalike_df.head(10)

Unnamed: 0,CustomerID,Lookalikes
0,C0001,"[(C0140, 0.9963959141465663), (C0091, 0.996099..."
1,C0002,"[(C0134, 0.9911381185502217), (C0159, 0.990385..."
2,C0003,"[(C0163, 0.9986899247441967), (C0085, 0.992239..."
3,C0004,"[(C0075, 0.9976146166883211), (C0090, 0.993058..."
4,C0005,"[(C0007, 0.9970394217473295), (C0197, 0.988203..."
5,C0006,"[(C0081, 0.997229092486295), (C0185, 0.9965652..."
6,C0007,"[(C0005, 0.9970394217473295), (C0197, 0.995208..."
7,C0008,"[(C0055, 0.9956513564960103), (C0170, 0.993289..."
8,C0009,"[(C0032, 0.970797524091056), (C0027, 0.9654228..."
9,C0010,"[(C0029, 0.995071354106565), (C0042, 0.9931131..."


In [23]:
lookalike_df.to_csv('Lookalike_Before_Formatting.csv', index=False)

 ### Formatting & Saving the “Lookalike.csv”

As asked : " just one map: Map<cust_id, List<cust_id, score>> "

#### stored it as a JSON-like string.

 ##### Converted [(C0022, 0.93), (C0017, 0.87), (C0045, 0.85)] into "C0022:0.93, C0017:0.87, C0045:0.85"

In [24]:
def format_lookalikes(lookalikes_list):
    return ", ".join([f"{t[0]}:{t[1]:.15f}" for t in lookalikes_list])

In [25]:
lookalike_df['LookalikeMap'] = lookalike_df['Lookalikes'].apply(format_lookalikes)

In [26]:
final_lookalike_df = lookalike_df[['CustomerID', 'LookalikeMap']]
final_lookalike_df.head()

Unnamed: 0,CustomerID,LookalikeMap
0,C0001,"C0140:0.996395914146566, C0091:0.9960997794890..."
1,C0002,"C0134:0.991138118550222, C0159:0.9903852930552..."
2,C0003,"C0163:0.998689924744197, C0085:0.9922390132778..."
3,C0004,"C0075:0.997614616688321, C0090:0.9930580747653..."
4,C0005,"C0007:0.997039421747330, C0197:0.9882039808850..."


#### Saving the Lookalike.csv

In [27]:
final_lookalike_df.to_csv('Lookalike.csv', index=False)