### Calculating Similarity Score for segmentation, marketing, or recommendation tasks

In [7]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Load the three raw tables, then attach product and customer attributes to
# every transaction row.
customers = pd.read_csv('Customers.csv')
products = pd.read_csv('Products.csv')
transactions = pd.read_csv('Transactions.csv')
merged_data = transactions.merge(products, on='ProductID')
merged_data = merged_data.merge(customers, on='CustomerID')

# Collapse the transactions into one feature row per customer.
customer_features = merged_data.groupby('CustomerID').agg(
    total_spent=pd.NamedAgg(column='TotalValue', aggfunc='sum'),
    avg_transaction_value=pd.NamedAgg(column='TotalValue', aggfunc='mean'),
    total_transactions=pd.NamedAgg(column='TransactionID', aggfunc='count'),
    unique_products=pd.NamedAgg(column='ProductID', aggfunc='nunique'),
    # mode() can return several values on a tie; keep the first one.
    favorite_category=pd.NamedAgg(
        column='Category',
        aggfunc=lambda categories: categories.mode()[0],
    ),
).reset_index()
customer_features = customer_features.merge(
    customers[['CustomerID', 'Region']],
    on='CustomerID',
)

numerical_features = ['total_spent', 'avg_transaction_value', 'total_transactions', 'unique_products']
categorical_features = ['favorite_category', 'Region']

# Standardise the numeric columns and one-hot encode the categorical ones so
# both kinds of feature feed the same similarity computation.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(), categorical_features),
    ],
)
X = preprocessor.fit_transform(customer_features)

# Pairwise cosine similarity between every pair of customer feature rows.
similarity_matrix = cosine_similarity(X)

def get_top_3_lookalikes(customer_index, similarity_matrix, customer_ids):
    """Return the three customers most similar to the one at ``customer_index``.

    Args:
        customer_index: Row position of the query customer in
            ``similarity_matrix`` and in ``customer_ids``.
        similarity_matrix: Row-indexable matrix of pairwise similarity
            scores; row ``i`` holds customer ``i``'s score against every
            customer.
        customer_ids: Sequence mapping row positions to customer IDs.

    Returns:
        A list of at most three ``(customer_id, score)`` tuples ordered from
        highest to lowest score. The query customer itself is never included.
    """
    # Exclude the query customer by position rather than assuming it sorts
    # first: a different customer with an equal score and a lower index would
    # otherwise take slot 0 and push the query customer into the results.
    candidates = [
        (i, score)
        for i, score in enumerate(similarity_matrix[customer_index])
        if i != customer_index
    ]
    # The sort is stable, so customers with equal scores stay in index order.
    candidates.sort(key=lambda pair: pair[1], reverse=True)
    return [(customer_ids[i], score) for i, score in candidates[:3]]
customer_ids = customer_features['CustomerID'].tolist()
lookalike_results = {}

# Only the first 20 customers (in feature-table order) get a lookalike list.
for i, customer_id in enumerate(customer_ids[:20]):
    lookalike_results[customer_id] = get_top_3_lookalikes(i, similarity_matrix, customer_ids)

lookalike_df = pd.DataFrame([
    {'CustomerID': cust_id, 'Lookalikes': lookalikes}
    for cust_id, lookalikes in lookalike_results.items()
])

# Each cell of 'Lookalikes' is a list of dicts that to_csv() writes via its
# repr. Scores are cast to built-in float before rounding: round() on a NumPy
# scalar returns a NumPy scalar, whose repr under NumPy 2 is
# 'np.float64(...)', which would otherwise end up verbatim in the CSV.
lookalike_df['Lookalikes'] = lookalike_df['Lookalikes'].apply(
    lambda matches: [
        {'CustomerID': match_id, 'Score': round(float(score), 4)}
        for match_id, score in matches
    ]
)
lookalike_df.to_csv('Lookalike.csv', index=False)

print("Lookalike model completed")


Lookalike model completed


#### Cross-checking the similarity score for customers who are in the same list

In [8]:
# Rebuild the per-customer feature table, the fitted preprocessor and the
# similarity matrix from the already-loaded tables, so this cell has
# everything the pairwise check below needs.
merged_data = (
    transactions
    .merge(products, on='ProductID')
    .merge(customers, on='CustomerID')
)

customer_features = (
    merged_data
    .groupby('CustomerID')
    .agg(**{
        'total_spent': ('TotalValue', 'sum'),
        'avg_transaction_value': ('TotalValue', 'mean'),
        'total_transactions': ('TransactionID', 'count'),
        'unique_products': ('ProductID', 'nunique'),
        # mode() can return several values on a tie; keep the first one.
        'favorite_category': ('Category', lambda categories: categories.mode()[0]),
    })
    .reset_index()
    .merge(customers[['CustomerID', 'Region']], on='CustomerID')
)

numerical_features = ['total_spent', 'avg_transaction_value', 'total_transactions', 'unique_products']
categorical_features = ['favorite_category', 'Region']

# Numeric columns are standardised, categorical columns one-hot encoded.
preprocessor = ColumnTransformer([
    ('num', StandardScaler(), numerical_features),
    ('cat', OneHotEncoder(), categorical_features),
])
X = preprocessor.fit_transform(customer_features)
similarity_matrix = cosine_similarity(X)


def calculate_pair_similarity(customer_id1, customer_id2, customer_features, preprocessor):
    """Return the cosine similarity between two customers' feature rows.

    Args:
        customer_id1: ID of the first customer.
        customer_id2: ID of the second customer.
        customer_features: DataFrame with a ``'CustomerID'`` column plus the
            feature columns ``preprocessor`` expects.
        preprocessor: Transformer exposing ``transform()``; only ``transform``
            is called here, so it is expected to be fitted already.

    Returns:
        The cosine similarity between the two transformed feature rows. If an
        ID matches several rows, only the first match of each is compared.

    Raises:
        ValueError: If either customer ID is absent from ``customer_features``.
    """
    matches = []
    for customer_id in (customer_id1, customer_id2):
        rows = customer_features[customer_features['CustomerID'] == customer_id]
        # Fail early with the offending ID instead of handing an empty frame
        # to the preprocessor and surfacing an unrelated-looking error.
        if rows.empty:
            raise ValueError(f"CustomerID {customer_id!r} not found in customer_features")
        matches.append(rows)

    customer1_processed = preprocessor.transform(matches[0])
    customer2_processed = preprocessor.transform(matches[1])
    # cosine_similarity returns a 2-D array; [0][0] is the first-row pair.
    return cosine_similarity(customer1_processed, customer2_processed)[0][0]

# Cross-check the similarity for one specific pair of customer IDs.
customer_id1, customer_id2 = 'C0002', 'C0088'
similarity_score = calculate_pair_similarity(
    customer_id1,
    customer_id2,
    customer_features,
    preprocessor,
)
print(f"Similarity score between {customer_id1} and {customer_id2}: {similarity_score:.4f}")


Similarity score between C0002 and C0088: 0.9388
