In [10]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np


# Load the three raw tables.
customers = pd.read_csv('/content/Customers.csv')
products = pd.read_csv('/content/Products.csv')
transactions = pd.read_csv('/content/Transactions.csv')

# Parse the date columns so they are real datetimes rather than strings.
customers['SignupDate'] = pd.to_datetime(customers['SignupDate'])
transactions['TransactionDate'] = pd.to_datetime(transactions['TransactionDate'])

# One row per transaction, enriched with the customer and product attributes.
data = (
    transactions
    .merge(customers, on='CustomerID', how='left')
    .merge(products, on='ProductID', how='left')
)


def _most_frequent(values):
    """Return the most frequent non-null value, or NaN when there is none."""
    modes = values.mode()
    # ``mode()`` is empty when every value is missing; indexing it with [0]
    # would raise instead of yielding a missing favourite category.
    return modes.iat[0] if not modes.empty else np.nan


# Per-customer aggregates; each Series is indexed by CustomerID.
transactions_by_customer = data.groupby('CustomerID')
customer_spend = transactions_by_customer['TotalValue'].sum().rename('TotalSpend')
customer_aov = transactions_by_customer['TotalValue'].mean().rename('AverageOrderValue')
# ``size()`` always yields an index named after the grouping key, so the merge
# below can find 'CustomerID' regardless of how ``value_counts`` names its
# index in the installed pandas version.
customer_freq = transactions_by_customer.size().rename('TransactionFrequency')
favorite_category = (
    transactions_by_customer['Category'].agg(_most_frequent).rename('FavoriteCategory')
)

# Start from the full customer list so customers without any transaction
# still get a row (their aggregates are NaN after the left merges).
customer_features = (
    customers[['CustomerID']]
    .merge(customer_spend, on='CustomerID', how='left')
    .merge(customer_aov, on='CustomerID', how='left')
    .merge(customer_freq, on='CustomerID', how='left')
    .merge(favorite_category, on='CustomerID', how='left')
)

# Encode the favourite category as an integer. The encoder is fitted on the
# known categories only: feeding NaN to LabelEncoder either raises or silently
# creates a "nan" class depending on the scikit-learn version. Customers with
# no favourite category get the explicit extra code ``len(le.classes_)``.
le = LabelEncoder()
has_category = customer_features['FavoriteCategory'].notna()
known_categories = customer_features.loc[has_category, 'FavoriteCategory']
le.fit(known_categories)
category_codes = pd.Series(len(le.classes_), index=customer_features.index, dtype='int64')
category_codes[has_category] = le.transform(known_categories)
customer_features['FavoriteCategory'] = category_codes

# Customers without transactions have no spend / order value / frequency.
customer_features = customer_features.fillna(0)

X = customer_features.drop(columns=['CustomerID'])
y = customer_features['CustomerID']

# NOTE(review): the target is CustomerID itself, so (assuming IDs are unique
# in Customers.csv) every class has a single sample and the held-out IDs are
# never seen during training. The fitted model is not used by the similarity
# code below; it is kept only so the names defined here remain available.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# Pairwise cosine similarity between customers; row/column i corresponds to
# row i of customer_features.
# NOTE(review): the features are not scaled and the category is an ordinal
# code, so the largest-magnitude column dominates the similarity — consider
# standardising / one-hot encoding before relying on these scores.
feature_matrix = X.to_numpy()
similarity_matrix = cosine_similarity(feature_matrix)


def get_top_3_similar(customers_list, sim_matrix, customer_ids):
    """Return the three most similar other customers for each target customer.

    Args:
        customers_list: IDs of the target customers; ``customers_list[k]``
            is described by row ``k`` of ``sim_matrix``.
        sim_matrix: 2-D array-like of similarity scores; column ``j`` holds
            the similarity to ``customer_ids[j]``.
        customer_ids: IDs labelling the columns of ``sim_matrix``.

    Returns:
        A dict mapping each target ID to a list of up to three
        ``(other_customer_id, score)`` tuples, ordered from most to least
        similar. The target itself is never included.
    """
    # First column at which each ID appears, used to find a target's own column.
    column_of = {}
    for col, cid in enumerate(customer_ids):
        column_of.setdefault(cid, col)

    recommendations = {}
    for idx, cust_id in enumerate(customers_list):
        similarity_scores = np.asarray(sim_matrix[idx])

        # Highest score first; the stable sort breaks ties by column order so
        # the result is deterministic.
        ranked = np.argsort(-similarity_scores, kind="stable")

        # Drop the target's own column explicitly. Simply skipping the first
        # ranked entry is wrong whenever the self-similarity is not the strict
        # maximum (e.g. an all-zero feature row, or an exact tie with another
        # customer): it would discard the best match or recommend the target
        # to itself.
        self_col = column_of.get(cust_id)
        if self_col is not None:
            ranked = ranked[ranked != self_col]

        recommendations[cust_id] = [
            (customer_ids[i], similarity_scores[i]) for i in ranked[:3]
        ]
    return recommendations


# Targets are the first 20 customers; their similarity rows are the first 20
# rows of the matrix, while the columns span every customer.
target_customers = customers['CustomerID'].iloc[:20].to_numpy()
all_customer_ids = customer_features['CustomerID'].to_numpy()
lookalike_recommendations = get_top_3_similar(
    target_customers,
    similarity_matrix[:20],
    all_customer_ids,
)

# One record per target customer, holding its list of (id, score) pairs.
lookalike_data = [
    {'CustomerID': customer, 'Recommendations': matches}
    for customer, matches in lookalike_recommendations.items()
]

# Persist the recommendations without the DataFrame index column.
lookalike_df = pd.DataFrame(lookalike_data)
lookalike_df.to_csv('Vansh_Kodwani_Lookalike.csv', index=False)

print("Lookalike.csv has been created!")


Lookalike.csv has been created!
