In [1]:

import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity
import json

# Seed NumPy's global generator so the synthetic tables come out identical on
# every Restart & Run All. The draws below must stay in this order, because
# each one consumes the shared random stream.
np.random.seed(42)


def _make_ids(prefix, count):
    """Return `count` zero-padded identifiers: <prefix>0001, <prefix>0002, ..."""
    return [f'{prefix}{number:04d}' for number in range(1, count + 1)]


# 100 customers with random demographics.
customers_df = pd.DataFrame({
    'CustomerID': _make_ids('C', 100),
    'Age': np.random.randint(18, 80, 100),
    'Gender': np.random.choice(['M', 'F'], 100),
    'Location': np.random.choice(['NY', 'CA', 'TX', 'FL'], 100),
})

# 50 products, each with a random category and price.
products_df = pd.DataFrame({
    'ProductID': _make_ids('P', 50),
    'Category': np.random.choice(['Books', 'Electronics', 'Clothing', 'Home Decor'], 50),
    'Price': np.random.uniform(10, 1000, 50),
})

# 1000 transactions linking a random customer to a random product.
# TotalValue is drawn independently; it is not derived from Price * Quantity.
n_transactions = 1000
transactions_df = pd.DataFrame({
    'TransactionID': _make_ids('T', n_transactions),
    'CustomerID': np.random.choice(customers_df['CustomerID'], n_transactions),
    'ProductID': np.random.choice(products_df['ProductID'], n_transactions),
    'Quantity': np.random.randint(1, 10, n_transactions),
    'TotalValue': np.random.uniform(10, 1000, n_transactions),
})

print("Sample data created successfully")
for label, frame in (("Customers", customers_df),
                     ("Products", products_df),
                     ("Transactions", transactions_df)):
    print(f"{label} DataFrame Head:")
    print(frame.head())

Sample data created successfully
Customers DataFrame Head:
  CustomerID  Age Gender Location
0      C0001   56      F       NY
1      C0002   69      F       FL
2      C0003   46      F       FL
3      C0004   32      F       CA
4      C0005   60      M       NY
Products DataFrame Head:
  ProductID     Category       Price
0     P0001        Books  522.573837
1     P0002   Home Decor  839.333005
2     P0003  Electronics  678.933216
3     P0004   Home Decor  737.863958
4     P0005  Electronics  216.980905
Transactions DataFrame Head:
  TransactionID CustomerID ProductID  Quantity  TotalValue
0         T0001      C0051     P0004         8  821.812503
1         T0002      C0054     P0001         2  697.933550
2         T0003      C0008     P0008         6  537.167772
3         T0004      C0027     P0029         5  454.364541
4         T0005      C0027     P0039         9   93.548457


In [2]:

def create_customer_features(transactions_df, customers_df, products_df):
    """Build one feature row per customer that appears in `transactions_df`.

    Parameters
    ----------
    transactions_df : DataFrame
        Needs the columns CustomerID, TransactionID, ProductID, Quantity and
        TotalValue.
    customers_df : DataFrame
        Needs a CustomerID column; every other column is carried over as-is.
    products_df : DataFrame
        Needs the columns ProductID and Category.

    Returns
    -------
    DataFrame
        Columns, in order: CustomerID, transaction_count, total_spend,
        avg_transaction_value, total_quantity, avg_quantity, one column per
        product category holding that category's share of the customer's
        purchased quantity, then the remaining `customers_df` columns.
        Category shares are always numeric: a customer with no purchase that
        matches `products_df` gets 0 in every category column instead of NaN.
    """
    # Named aggregation ties each output column to its source column and
    # statistic, instead of renaming a flattened MultiIndex by position.
    customer_transaction_features = (
        transactions_df.groupby('CustomerID')
        .agg(
            transaction_count=('TransactionID', 'count'),   # Purchase frequency
            total_spend=('TotalValue', 'sum'),               # Spending patterns
            avg_transaction_value=('TotalValue', 'mean'),
            total_quantity=('Quantity', 'sum'),              # Purchase volume
            avg_quantity=('Quantity', 'mean'),
        )
        .reset_index()
    )

    # Inner join: transactions whose ProductID is not in products_df carry no
    # category and therefore drop out of the preference table.
    trans_products = pd.merge(transactions_df, products_df, on='ProductID')

    category_preferences = pd.crosstab(
        trans_products['CustomerID'],
        trans_products['Category'],
        values=trans_products['Quantity'],
        aggfunc='sum'
    ).fillna(0)

    # Turn quantities into per-customer shares. A zero row total would give
    # 0/0 = NaN, so those rows are masked out and reported as all-zero shares.
    row_totals = category_preferences.sum(axis=1)
    category_preferences = category_preferences.div(
        row_totals.where(row_totals != 0), axis=0
    ).fillna(0)
    category_cols = list(category_preferences.columns)

    customer_features = pd.merge(customer_transaction_features,
                               category_preferences.reset_index(),
                               on='CustomerID', how='left')

    # Customers dropped by the inner join above come back from the left merge
    # with NaN shares; NaN would make the later similarity computation fail.
    if category_cols:
        customer_features = customer_features.fillna(
            {column: 0 for column in category_cols}
        )

    customer_features = pd.merge(customer_features, customers_df, on='CustomerID', how='left')

    return customer_features

# Assemble the per-customer feature table from the three sample frames,
# then show its first rows as a sanity check.
customer_features = create_customer_features(
    transactions_df,
    customers_df,
    products_df,
)

feature_preview = customer_features.head()
print("Customer Features Head:")
print(feature_preview)

Customer Features Head:
  CustomerID  transaction_count  total_spend  avg_transaction_value  \
0      C0001                 16  7595.012775             474.688298   
1      C0002                  8  4433.774031             554.221754   
2      C0003                  7  3662.938922             523.276989   
3      C0004                 10  6374.857998             637.485800   
4      C0005                 15  6474.788648             431.652577   

   total_quantity  avg_quantity     Books  Clothing  Electronics  Home Decor  \
0              73      4.562500  0.424658  0.082192     0.328767    0.164384   
1              44      5.500000  0.295455  0.704545     0.000000    0.000000   
2              25      3.571429  0.000000  0.160000     0.480000    0.360000   
3              48      4.800000  0.125000  0.208333     0.187500    0.479167   
4              89      5.933333  0.112360  0.179775     0.213483    0.494382   

   Age Gender Location  
0   56      F       NY  
1   69      F     

In [3]:
# Function to calculate similarity scores
def calculate_similarity_scores(customer_features, target_customer_id, top_n=3):
    """Return the customers most similar to `target_customer_id`.

    The numerical and category-share columns listed below are standardized
    with StandardScaler, then compared to the target row by cosine similarity.

    Parameters
    ----------
    customer_features : DataFrame
        Needs a CustomerID column plus every column named in `numerical_cols`
        and `category_cols` below.
    target_customer_id
        CustomerID value to find lookalikes for.
    top_n : int, default 3
        Number of most-similar customers to return.

    Returns
    -------
    DataFrame
        Columns CustomerID and similarity_score, sorted by score descending,
        with the target itself removed and at most `top_n` rows.

    Raises
    ------
    IndexError
        If `target_customer_id` does not occur in `customer_features`.
    """
    numerical_cols = ['transaction_count', 'total_spend', 'avg_transaction_value',
                     'total_quantity', 'avg_quantity', 'Age']
    category_cols = ['Books', 'Clothing', 'Electronics', 'Home Decor']

    # Find the target by *position*. The rows of the scaled matrix are
    # positional, so an index label (what `.index[0]` yields) only happens to
    # work while the frame still has its default 0..n-1 index.
    is_target = (customer_features['CustomerID'] == target_customer_id).to_numpy()
    if not is_target.any():
        raise IndexError(
            f"CustomerID {target_customer_id!r} not found in customer_features"
        )
    target_pos = int(np.flatnonzero(is_target)[0])

    X = customer_features[numerical_cols + category_cols].values

    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    # Only the target's row is needed, so compare that single row against all
    # rows rather than building the full n x n similarity matrix.
    similarity_scores = cosine_similarity(
        X_scaled[target_pos:target_pos + 1], X_scaled
    )[0]

    similarity_df = pd.DataFrame({
        'CustomerID': customer_features['CustomerID'],
        'similarity_score': similarity_scores
    })

    similarity_df = similarity_df[similarity_df['CustomerID'] != target_customer_id]
    similarity_df = similarity_df.sort_values('similarity_score', ascending=False)

    return similarity_df.head(top_n)

# Collect lookalike recommendations for the first five customers in the
# feature table, keyed by the target customer's ID.
lookalike_results = {}
target_customers = customer_features['CustomerID'].head().tolist()

for customer_id in target_customers:
    # Get top 3 similar customers
    similar_customers = calculate_similarity_scores(customer_features, customer_id)

    # Pair the two columns directly instead of DataFrame.apply(axis=1):
    # on a frame with no rows, apply hands back a DataFrame, which has no
    # .tolist(), so a feature table holding a single customer would crash here.
    recommendations = [
        {'customer_id': similar_id, 'similarity_score': round(score, 3)}
        for similar_id, score in zip(similar_customers['CustomerID'].tolist(),
                                     similar_customers['similarity_score'].tolist())
    ]

    lookalike_results[customer_id] = recommendations

print("Sample Lookalike Recommendations:")
for customer_id, recommendations in lookalike_results.items():
    print(f"Customer {customer_id}:")
    for i, rec in enumerate(recommendations, 1):
        print(f"  {i}. Customer {rec['customer_id']}: Similarity Score = {rec['similarity_score']}")

# Save results to CSV: one row per target customer, with the recommendation
# list stored as a JSON string in the second column.
results_df = pd.DataFrame([(k, json.dumps(v)) for k, v in lookalike_results.items()],
                         columns=['CustomerID', 'Recommendations'])
results_df.to_csv('lookalike_recommendations.csv', index=False)
print("Recommendations saved to lookalike_recommendations.csv")

Sample Lookalike Recommendations:
Customer C0001:
  1. Customer C0099: Similarity Score = 0.777
  2. Customer C0026: Similarity Score = 0.77
  3. Customer C0038: Similarity Score = 0.756
Customer C0002:
  1. Customer C0098: Similarity Score = 0.798
  2. Customer C0010: Similarity Score = 0.79
  3. Customer C0068: Similarity Score = 0.778
Customer C0003:
  1. Customer C0044: Similarity Score = 0.781
  2. Customer C0022: Similarity Score = 0.737
  3. Customer C0041: Similarity Score = 0.714
Customer C0004:
  1. Customer C0013: Similarity Score = 0.791
  2. Customer C0069: Similarity Score = 0.77
  3. Customer C0008: Similarity Score = 0.734
Customer C0005:
  1. Customer C0090: Similarity Score = 0.886
  2. Customer C0027: Similarity Score = 0.835
  3. Customer C0099: Similarity Score = 0.822
Recommendations saved to lookalike_recommendations.csv
