In [4]:
import pandas as pd
import numpy as np
customers = pd.read_csv('Customers.csv')
products = pd.read_csv('Products.csv')
transactions = pd.read_csv('Transactions.csv')
customers['SignupDate'] = pd.to_datetime(customers['SignupDate'])
transactions['TransactionDate'] = pd.to_datetime(transactions['TransactionDate'])

from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity

# Prepare customer features
customer_features = customers.merge(transactions.groupby('CustomerID').agg({
    'TotalValue': 'sum',
    'TransactionID': 'count'
}), on='CustomerID', how='left')

customer_features['TotalValue'].fillna(0, inplace=True)
customer_features['TransactionID'].fillna(0, inplace=True)

# Encode categorical variables
customer_features = pd.get_dummies(customer_features, columns=['Region'])

# Select features for similarity calculation
features = ['TotalValue', 'TransactionID'] + [col for col in customer_features.columns if col.startswith('Region_')]

# Normalize features
scaler = StandardScaler()
customer_features_scaled = scaler.fit_transform(customer_features[features])

# Calculate similarity matrix
similarity_matrix = cosine_similarity(customer_features_scaled)

# Function to get top 3 lookalikes
def get_lookalikes(customer_id, n=3):
    idx = customer_features.index[customer_features['CustomerID'] == customer_id].tolist()[0]
    similar_indices = similarity_matrix[idx].argsort()[-n-1:-1][::-1]
    return [(customer_features.iloc[i]['CustomerID'], similarity_matrix[idx][i]) for i in similar_indices]

# Get lookalikes for first 20 customers
lookalikes = {}
for cust_id in customers['CustomerID'][:20]:
    lookalikes[cust_id] = get_lookalikes(cust_id)

# Create DataFrame and save to CSV
lookalike_df = pd.DataFrame([(k, v) for k, v in lookalikes.items()], columns=['CustomerID', 'Lookalikes'])
lookalike_df.to_csv('Lookalike.csv', index=False)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  customer_features['TotalValue'].fillna(0, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  customer_features['TransactionID'].fillna(0, inplace=True)
