# Task 2: Lookalike model

## Import the basic libraries

In [23]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity

# Read the data

In [24]:

# Load the three source tables; the bare file names resolve against the
# notebook's current working directory.
products_df, customers_df, transactions_df = (
    pd.read_csv(csv_path)
    for csv_path in ('Products.csv', 'Customers.csv', 'Transactions.csv')
)

# Convert dates to datetime

In [25]:

# Parse the date columns (read from CSV as text) into datetime64 so that
# date arithmetic such as "days since signup" works downstream.
for frame, date_column in (
    (customers_df, 'SignupDate'),
    (transactions_df, 'TransactionDate'),
):
    frame[date_column] = pd.to_datetime(frame[date_column])

# Create customer feature matrix

In [26]:

def create_customer_features(transactions=None, products=None, customers=None,
                             reference_date=None):
    """Build one row of purchase-behaviour features per customer.

    Features (columns of the returned frame):
      * one column per product ``Category`` holding the customer's total
        ``TotalValue`` in that category (0 where the customer bought nothing
        in the category but has other transactions),
      * ``AvgOrderValue``          - mean ``TotalValue`` per transaction,
      * ``TransactionCount``       - number of transactions,
      * ``AvgItemsPerTransaction`` - mean ``Quantity`` per transaction,
      * ``DaysSinceSignup``        - whole days between ``SignupDate`` and
        ``reference_date``.

    Parameters
    ----------
    transactions : pandas.DataFrame, optional
        Needs ``CustomerID``, ``ProductID``, ``Quantity`` and ``TotalValue``.
        Defaults to the notebook-level ``transactions_df``.
    products : pandas.DataFrame, optional
        Needs ``ProductID`` and ``Category``. Defaults to ``products_df``.
    customers : pandas.DataFrame, optional
        Needs ``CustomerID`` and ``SignupDate``. Defaults to ``customers_df``.
    reference_date : datetime-like, optional
        The "today" used for ``DaysSinceSignup``. Defaults to the current
        time; pass a fixed date to make the raw feature values reproducible.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``CustomerID``. Customers present in ``customers`` but
        without any (product-matched) transaction get NaN in the
        transaction-derived columns.

    Notes
    -----
    Side effect: a ``DaysSinceSignup`` column is written (or overwritten) on
    the ``customers`` frame. This is kept from the original implementation so
    that any later cell reading that column keeps working.
    """
    # Fall back to the notebook-level frames so the original zero-argument
    # call ``create_customer_features()`` behaves as before.
    if transactions is None:
        transactions = transactions_df
    if products is None:
        products = products_df
    if customers is None:
        customers = customers_df
    if reference_date is None:
        reference_date = pd.Timestamp.now()
    else:
        reference_date = pd.Timestamp(reference_date)

    # Merge transactions with products to get category information.
    # Inner join: transactions whose ProductID is not in `products` are dropped.
    trans_prod = pd.merge(transactions, products, on='ProductID')

    # 1. Total spend by category
    category_spend = trans_prod.pivot_table(
        index='CustomerID',
        columns='Category',
        values='TotalValue',
        aggfunc='sum',
        fill_value=0
    )

    per_customer = trans_prod.groupby('CustomerID')

    # 2. Average order value
    avg_order = per_customer['TotalValue'].mean()

    # 3. Total number of transactions
    transaction_count = per_customer.size()

    # 4. Average items per transaction
    avg_items = per_customer['Quantity'].mean()

    # 5. Days since signup (to_datetime is a no-op on already-parsed dates)
    signup_dates = pd.to_datetime(customers['SignupDate'])
    customers['DaysSinceSignup'] = (reference_date - signup_dates).dt.days

    # Combine all features; concat aligns on CustomerID (outer join), which is
    # where the NaNs for customers without transactions come from.
    return pd.concat([
        category_spend,
        avg_order.rename('AvgOrderValue'),
        transaction_count.rename('TransactionCount'),
        avg_items.rename('AvgItemsPerTransaction'),
        customers.set_index('CustomerID')['DaysSinceSignup']
    ], axis=1)

# Create feature matrix

In [27]:
# Build the per-customer feature table (indexed by CustomerID) from the loaded frames.
customer_features = create_customer_features()

# Normalize features

In [28]:
# Customers without transactions have NaN features; treat those as 0 before
# standardizing each column with StandardScaler.
features_filled = customer_features.fillna(0)
scaler = StandardScaler()
normalized_features = scaler.fit_transform(features_filled)

# Re-attach the customer IDs and feature names to the scaled array.
normalized_features_df = pd.DataFrame(
    data=normalized_features,
    columns=customer_features.columns,
    index=customer_features.index,
)

# Calculate similarity matrix

In [29]:
# Pairwise cosine similarity between all customers' scaled feature vectors,
# labelled on both axes by CustomerID so scores can be looked up by ID.
customer_ids = customer_features.index
similarity_matrix = cosine_similarity(normalized_features)
similarity_df = pd.DataFrame(similarity_matrix, index=customer_ids, columns=customer_ids)

# Function to get top lookalikes

In [30]:
def get_lookalikes(customer_id, n=3, similarity=None):
    """Return the ``n`` customers most similar to ``customer_id``.

    Parameters
    ----------
    customer_id : hashable
        Label of the customer to look up in the similarity matrix.
    n : int, default 3
        Maximum number of lookalikes to return.
    similarity : pandas.DataFrame, optional
        Square similarity matrix labelled by customer ID on both axes.
        Defaults to the notebook-level ``similarity_df``.

    Returns
    -------
    list[tuple]
        ``(other_customer_id, score)`` pairs, highest score first, scores
        rounded to 4 decimals. Empty if ``customer_id`` is unknown or
        ``n <= 0``; shorter than ``n`` if fewer other customers exist.
    """
    if similarity is None:
        similarity = similarity_df

    if n <= 0 or customer_id not in similarity.index:
        return []

    # Remove the customer's own entry by label. Skipping "the first row after
    # sorting" is wrong when another customer ties at the top (identical
    # feature vectors, or float rounding): the customer could then be listed
    # as their own lookalike and a genuine match dropped.
    other_scores = similarity.loc[customer_id].drop(labels=customer_id)

    # Stable sort keeps tied customers in matrix order, so output is deterministic.
    top_similar = other_scores.sort_values(ascending=False, kind='stable').head(n)

    return [(cust_id, round(score, 4)) for cust_id, score in top_similar.items()]


# Generate lookalikes for first 20 customers

In [31]:
# Map each of the first 20 customers (in file order) to its list of
# (lookalike_id, score) pairs.
lookalike_results = {
    cust_id: get_lookalikes(cust_id)
    for cust_id in customers_df['CustomerID'][:20]
}

# Create Lookalike.csv

In [32]:
# One CSV row per customer; the lookalikes are flattened into a single
# "id:score;id:score;..." string.
lookalike_data = [
    {
        'CustomerID': cust_id,
        'Lookalikes': ';'.join(f"{match_id}:{match_score}" for match_id, match_score in recommendations),
    }
    for cust_id, recommendations in lookalike_results.items()
]

lookalike_df = pd.DataFrame(lookalike_data)
lookalike_df.to_csv('Sunil_Kumar_Lookalike.csv', index=False)


# Print results

In [33]:
# Assemble the whole report first, then emit it with a single print; the
# text produced is the same as printing line by line.
report_lines = ["Lookalike Recommendations for first 20 customers:"]
for customer, matches in lookalike_results.items():
    report_lines.append(f"\n{customer}:")
    report_lines.extend(f"  {match_id}: {match_score}" for match_id, match_score in matches)
print("\n".join(report_lines))

Lookalike Recommendations for first 20 customers:

C0001:
  C0125: 0.8156
  C0023: 0.8054
  C0120: 0.7923

C0002:
  C0029: 0.8726
  C0106: 0.8633
  C0159: 0.8487

C0003:
  C0144: 0.8461
  C0026: 0.7927
  C0110: 0.7847

C0004:
  C0075: 0.9474
  C0113: 0.8216
  C0104: 0.7837

C0005:
  C0007: 0.8801
  C0166: 0.8433
  C0199: 0.7771

C0006:
  C0185: 0.9404
  C0200: 0.8516
  C0138: 0.8155

C0007:
  C0005: 0.8801
  C0166: 0.8181
  C0197: 0.7682

C0008:
  C0024: 0.8476
  C0098: 0.7874
  C0194: 0.7697

C0009:
  C0180: 0.9785
  C0097: 0.9583
  C0058: 0.9497

C0010:
  C0062: 0.9098
  C0029: 0.8795
  C0027: 0.8213

C0011:
  C0153: 0.8737
  C0099: 0.7747
  C0074: 0.7675

C0012:
  C0059: 0.8815
  C0195: 0.8488
  C0163: 0.8111

C0013:
  C0046: 0.9441
  C0099: 0.8646
  C0022: 0.8178

C0014:
  C0033: 0.978
  C0032: 0.9152
  C0015: 0.9139

C0015:
  C0014: 0.9139
  C0123: 0.9135
  C0128: 0.9035

C0016:
  C0126: 0.6635
  C0191: 0.6345
  C0044: 0.629

C0017:
  C0065: 0.8664
  C0090: 0.7877
  C0075: 0.7823
