## Step 1: Loading Required Libraries

In [9]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics.pairwise import cosine_similarity
import datetime


## Step 2: Loading the Data

In [10]:
# Load the datasets
customers = pd.read_csv('Customers.csv')
products = pd.read_csv('Products.csv')
transactions = pd.read_csv('Transactions.csv')


## Step 3: Preprocessing Customer Data

In [11]:
# Preprocess customer data: one-hot encode the Region and calculate days since signup
customers['SignupDate'] = pd.to_datetime(customers['SignupDate'])
customers['SignupDuration'] = (pd.to_datetime('today') - customers['SignupDate']).dt.days

# One-hot encode Region
encoder = OneHotEncoder(sparse_output=False)  # Updated to sparse_output=False
region_encoded = encoder.fit_transform(customers[['Region']])
region_df = pd.DataFrame(region_encoded, columns=encoder.categories_[0])

# Combine customer data with region encoding
customers_encoded = pd.concat([customers[['CustomerID', 'SignupDuration']], region_df], axis=1)

# Preprocess product data: One-hot encode product categories
product_encoder = OneHotEncoder(sparse_output=False)
product_encoded = product_encoder.fit_transform(products[['Category']])
product_df = pd.DataFrame(product_encoded, columns=product_encoder.categories_[0])

# Now let's merge the transaction data with customer and product data
# Join transaction with customer and product info
transaction_data = transactions.merge(customers[['CustomerID', 'SignupDuration']], on='CustomerID', how='left')
transaction_data = transaction_data.merge(products[['ProductID', 'Category']], on='ProductID', how='left')

# One-hot encode the 'Category' column in transaction data
transaction_data['Category'] = transaction_data['Category'].fillna('Unknown')  # Ensure no NaN values
category_encoded = product_encoder.transform(transaction_data[['Category']])
category_df = pd.DataFrame(category_encoded, columns=product_encoder.categories_[0])

## Step 4: Preprocess Transaction Data
Aggregating the transaction data to get transaction features like the total number of transactions, average transaction value, recency of the last purchase, and average quantity purchased.

In [12]:
# Merge transactions with product information
transactions = transactions.merge(products[['ProductID', 'Category']], on='ProductID', how='left')

# Aggregate transaction features per customer
transaction_agg = transactions.groupby('CustomerID').agg(
    num_transactions=('TransactionID', 'count'),
    avg_transaction_value=('TotalValue', 'mean'),
    most_frequent_category=('Category', lambda x: x.mode()[0]),  # Most frequent product category
    last_purchase_recency=('TransactionDate', lambda x: (pd.to_datetime('today') - pd.to_datetime(x.max())).days),
    avg_quantity=('Quantity', 'mean')
).reset_index()


## Step 5: Merging Customer and Transaction Features


In [18]:
features = pd.concat([transaction_data[['SignupDuration']], category_df], axis=1)

## Step 6: Building Feature Vectors

In [19]:
# Assuming 'features' DataFrame already includes the 'CustomerID' column
# and that all features are already encoded

# Retain CustomerID in the features dataframe to use it for groupby
features['CustomerID'] = transaction_data['CustomerID']

# Now group by CustomerID and take the mean of features for each customer
feature_matrix = features.groupby('CustomerID').mean()

# Calculate cosine similarity between the customers
similarity_matrix = cosine_similarity(feature_matrix)

# Check if the feature_matrix is correct and shape is as expected
print(feature_matrix.shape)

# Now similarity_matrix will have the similarity scores between customers


(199, 5)


## Step 7: Calculating Cosine Similarity

In [20]:
# Calculate cosine similarity between all customers
assert similarity_matrix.shape[0] == similarity_matrix.shape[1]

# Print similarity matrix for verification (optional)
print(similarity_matrix)


[[1.         0.99999963 0.99999881 ... 0.99999983 0.99999989 0.99999971]
 [0.99999963 1.         0.99999909 ... 0.99999978 0.99999968 0.99999986]
 [0.99999881 0.99999909 1.         ... 0.99999874 0.99999929 0.99999913]
 ...
 [0.99999983 0.99999978 0.99999874 ... 1.         0.99999967 0.99999986]
 [0.99999989 0.99999968 0.99999929 ... 0.99999967 1.         0.99999963]
 [0.99999971 0.99999986 0.99999913 ... 0.99999986 0.99999963 1.        ]]


## Step 8: Generate Lookalike Recommendations

In [21]:
# Prepare the dictionary to store recommendations
lookalike_recommendations = {}

for i, customer_id in enumerate(customer_features['CustomerID']):
    # Get the similarity scores for this customer with all other customers
    similarity_scores = similarity_matrix[i]
    
    # Create a list of (customer_id, similarity_score) pairs, excluding the customer itself
    similar_customers = [
        (customer_features['CustomerID'][j], similarity_scores[j]) 
        for j in range(len(similarity_scores)) if customer_features['CustomerID'][j] != customer_id
    ]
    
    # Sort the similar customers by similarity score (descending) and get the top 3
    similar_customers_sorted = sorted(similar_customers, key=lambda x: x[1], reverse=True)[:3]
    
    # Store the top 3 similar customers and their similarity scores
    lookalike_recommendations[customer_id] = similar_customers_sorted


## Step 9: Prepare the Lookalike.csv Output

In [22]:
# Prepare data for the Lookalike.csv file
lookalike_data = []
for customer_id, recommendations in lookalike_recommendations.items():
    for recommended_customer_id, score in recommendations:
        lookalike_data.append([customer_id, recommended_customer_id, score])

# Create DataFrame for lookalike recommendations
lookalike_df = pd.DataFrame(lookalike_data, columns=['CustomerID', 'LookalikeCustomerID', 'SimilarityScore'])

# Save the results to a CSV file
lookalike_df.to_csv('Lookalike.csv', index=False)


## Step 10: Jupyter Notebook Explanation


### 1. Data Preprocessing:
In this section, we focus on cleaning and transforming the raw data into a suitable format for analysis.

One-Hot Encoding for Region:
The Region column contains categorical values representing the continents where customers are located (e.g., 'Asia', 'Europe', etc.). Since machine learning algorithms cannot directly work with categorical data, we use one-hot encoding to transform this column into multiple binary columns (one for each continent). This helps represent each region as a numerical feature.

encoder = OneHotEncoder(sparse=False)
region_encoded = encoder.fit_transform(customers[['Region']])
region_df = pd.DataFrame(region_encoded, columns=encoder.categories_[0])
Calculating SignupDuration:
To capture how long each customer has been signed up, we calculate the duration from their signup date to the current date. This is represented as the number of days since they signed up.

customers['SignupDuration'] = (pd.to_datetime('today') - customers['SignupDate']).dt.days
By applying these transformations, we ensure that the data is ready for feature engineering, which allows us to capture meaningful insights about the customers’ profiles.

### 2. Feature Engineering:
This section focuses on creating meaningful features from the transaction data, which can help identify similar customers.

Aggregating Transaction Data: We aggregate transaction data at the customer level by calculating several key metrics:

Number of Transactions: How many times the customer has made a purchase.
Average Transaction Value: The mean value of each transaction, which helps to understand the customer's spending behavior.
Most Frequent Product Category: The most commonly purchased category by the customer, indicating their primary area of interest.
Recency of Last Purchase: The time difference in days from the customer’s last purchase to the current date. This helps in identifying active vs. dormant customers.
Average Quantity Purchased: The average number of items bought in each transaction, which reflects how much the customer typically purchases per order.

customer_transactions = transactions.groupby('CustomerID').agg(
    num_transactions=('TransactionID', 'count'),
    avg_transaction_value=('TotalValue', 'mean'),
    most_frequent_category=('Category', lambda x: x.mode()[0]),
    recency_of_last_purchase=('TransactionDate', lambda x: (pd.to_datetime('today') - pd.to_datetime(x.max())).days),
    avg_quantity_purchased=('Quantity', 'mean')
).reset_index()
These aggregated metrics are essential for creating a comprehensive customer profile, which can be used to compare and recommend similar customers.

### 3. Cosine Similarity:
Cosine similarity measures the cosine of the angle between two vectors, which quantifies how similar the two vectors are. The cosine similarity ranges from -1 (completely dissimilar) to 1 (completely similar), with 0 indicating orthogonal vectors (no similarity).

Feature Matrix Construction:
Once the feature vectors for each customer are constructed (containing the aggregated transaction data and preprocessed customer information), we calculate the cosine similarity between these customer feature vectors.

feature_matrix = pd.concat([customers.drop('CustomerID', axis=1), customer_transactions.drop('CustomerID', axis=1)], axis=1)
similarity_matrix = cosine_similarity(feature_matrix)
 
The result is a similarity matrix, where each element represents the similarity between two customers. This helps us understand how similar any two customers are based on their features.

### 4. Recommendation Logic:
In this step, we use the cosine similarity values to recommend the top 3 most similar customers for a given customer.

Selecting Top 3 Similar Customers: For each customer (from CustomerID C0001 to C0020), we compute the top 3 customers with the highest similarity scores. This is done by sorting the similarity matrix for each customer and selecting the top 3 most similar customers.

recommendations = {}

for customer_id in range(1, 21):  # For C0001 to C0020
    # Get the similarity scores for the current customer
    customer_similarities = similarity_matrix[customer_id - 1]
    
    # Get the indices of the top 3 most similar customers
    top_similar_customers = customer_similarities.argsort()[-4:-1][::-1]
    
    # Store the recommendations
    recommendations[f'C{str(customer_id).zfill(4)}'] = [
        (f'C{str(i+1).zfill(4)}', customer_similarities[i]) for i in top_similar_customers
    ]
Here, for each customer, the argsort() function is used to sort the similarity scores in descending order, and the top 3 customers are selected for recommendation.

### 5. Output:
The final output of the model is a CSV file that contains the recommended similar customers for each target customer (C0001 - C0020), along with their corresponding similarity scores.

Saving to Lookalike.csv:
The Lookalike.csv will have the format Map<CustomerID, List<RecommendedCustomerID, SimilarityScore>>. Each row corresponds to a target customer and their 3 most similar customers along with the similarity score.