In [None]:
import pandas as pd

# Load the three raw tables. DATA_DIR is the single place to edit when the
# CSV files live somewhere other than the directory this notebook was
# originally run against.
DATA_DIR = '/content'
transactions = pd.read_csv(f'{DATA_DIR}/Transactions.csv')
customers = pd.read_csv(f'{DATA_DIR}/Customers.csv')
products = pd.read_csv(f'{DATA_DIR}/Products.csv')

In [None]:

# Report the per-column count of missing values for each raw table,
# in the order: transactions, customers, products.
for raw_table in (transactions, customers, products):
    print(raw_table.isna().sum())



TransactionID      0
CustomerID         0
ProductID          0
TransactionDate    0
Quantity           0
TotalValue         0
Price              0
dtype: int64
CustomerID      0
CustomerName    0
Region          0
SignupDate      0
dtype: int64
ProductID      0
ProductName    0
Category       0
Price          0
dtype: int64


In [None]:
# Enrich every transaction with its product attributes, then with the buying
# customer's profile. Left joins keep every transaction row.
# 'Price' exists in both transactions and products, so pandas' default
# suffixes split it into Price_x (transactions) and Price_y (products).
transactions_products = pd.merge(transactions, products, how='left', on='ProductID')
full_data = pd.merge(transactions_products, customers, how='left', on='CustomerID')
full_data.head()


Unnamed: 0,TransactionID,CustomerID,ProductID,TransactionDate,Quantity,TotalValue,Price_x,ProductName,Category,Price_y,CustomerName,Region,SignupDate
0,T00001,C0199,P067,2024-08-25 12:38:23,1,300.68,300.68,ComfortLiving Bluetooth Speaker,Electronics,300.68,Andrea Jenkins,Europe,2022-12-03
1,T00112,C0146,P067,2024-05-27 22:23:54,1,300.68,300.68,ComfortLiving Bluetooth Speaker,Electronics,300.68,Brittany Harvey,Asia,2024-09-04
2,T00166,C0127,P067,2024-04-25 07:38:55,1,300.68,300.68,ComfortLiving Bluetooth Speaker,Electronics,300.68,Kathryn Stevens,Europe,2024-04-04
3,T00272,C0087,P067,2024-03-26 22:55:37,2,601.36,300.68,ComfortLiving Bluetooth Speaker,Electronics,300.68,Travis Campbell,South America,2024-04-11
4,T00363,C0070,P067,2024-03-21 15:10:10,3,902.04,300.68,ComfortLiving Bluetooth Speaker,Electronics,300.68,Timothy Perez,Europe,2022-03-15


In [None]:
from datetime import datetime


# Parse the date columns so they can be compared and subtracted.
full_data['TransactionDate'] = pd.to_datetime(full_data['TransactionDate'])
full_data['SignupDate'] = pd.to_datetime(full_data['SignupDate'])

# Tenure is measured against the latest date present in the data rather than
# the wall clock, so re-running the notebook on another day yields the same
# feature table. Taking the max over both columns keeps tenure non-negative.
reference_date = full_data[['TransactionDate', 'SignupDate']].max().max()

# Aggregate features for each customer
customer_features = full_data.groupby('CustomerID').agg(
    avg_transaction_value=('TotalValue', 'mean'),
    total_quantity=('Quantity', 'sum'),
    num_transactions=('TransactionID', 'count'),
    categories_purchased=('Category', 'nunique'),
    signup_date=('SignupDate', 'min'),
).reset_index()

# Replace the helper signup_date column with whole days elapsed up to the
# reference date; the new column lands last, matching the original layout.
customer_features['days_since_signup'] = (
    reference_date - customer_features.pop('signup_date')
).dt.days

# One-hot encode the category of every transaction, then sum per customer to
# get the number of purchases made in each category.
category_features = pd.get_dummies(full_data[['CustomerID', 'Category']], columns=['Category'])
category_features = category_features.groupby('CustomerID').sum().reset_index()

# Merge customer features with category features
customer_data = customer_features.merge(category_features, on='CustomerID', how='left')


customer_data.head()


Unnamed: 0,CustomerID,avg_transaction_value,total_quantity,num_transactions,categories_purchased,days_since_signup,Category_Books,Category_Clothing,Category_Electronics,Category_Home Decor
0,C0001,670.904,12,5,3,932,1,0,3,1
1,C0002,465.685,10,4,2,1079,0,2,0,2
2,C0003,681.345,14,4,3,326,0,1,1,2
3,C0004,669.36,23,8,3,841,3,0,2,3
4,C0005,678.08,7,3,2,896,0,0,2,1


In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity


# Standardise every feature column (everything except the leading CustomerID)
# so that no single feature dominates the similarity because of its scale.
scaler = StandardScaler()
customer_features_scaled = scaler.fit_transform(customer_data.iloc[:, 1:])

# Pairwise cosine similarity between customers, labelled by CustomerID on
# both axes so rows/columns can be looked up by ID.
similarity_matrix = cosine_similarity(customer_features_scaled)
similarity_df = pd.DataFrame(
    similarity_matrix,
    index=customer_data['CustomerID'],
    columns=customer_data['CustomerID'],
)

# Lookalikes are produced for the first 20 customers in customer_data order
# (the name notwithstanding, no ranking is involved); each one gets its
# 3 most similar peers.
top_20_customers = customer_data['CustomerID'][:20]
lookalike_results = {}

for customer_id in top_20_customers:
    # Remove the customer's own entry by label before ranking. Skipping the
    # first sorted row instead is unsafe: when another customer ties for the
    # top score, the customer itself may not sort first.
    similar_customers = (
        similarity_df[customer_id]
        .drop(customer_id)
        .sort_values(ascending=False)
        .head(3)
    )
    # Series.items() is a one-shot iterator; store a list so the results can
    # be inspected or reused after the DataFrame below has been built.
    lookalike_results[customer_id] = list(similar_customers.items())

# One row per customer; 'Lookalikes' holds a list of {CustomerID, Score} dicts.
lookalike_df = pd.DataFrame([
    {
        'CustomerID': cust_id,
        'Lookalikes': [
            {'CustomerID': other_id, 'Score': score}
            for other_id, score in lookalikes
        ],
    }
    for cust_id, lookalikes in lookalike_results.items()
])


output_path = 'Lookalike.csv'
lookalike_df.to_csv(output_path, index=False)

output_path


'Lookalike.csv'

In [None]:
import pandas as pd
from datetime import datetime
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity




# End-to-end rebuild of the lookalike table for every customer.
#
# The raw tables are deliberately NOT filled with 0: doing so in place would
# mutate frames loaded by an earlier cell and would put 0 into ID and date
# columns. Missing values are handled on the derived feature tables instead.

# Attach product attributes, then customer attributes, to every transaction.
transactions_products = transactions.merge(products, on='ProductID', how='left')

# Merge the result with Customers
full_data = transactions_products.merge(customers, on='CustomerID', how='left')

# Convert dates
full_data['TransactionDate'] = pd.to_datetime(full_data['TransactionDate'])
full_data['SignupDate'] = pd.to_datetime(full_data['SignupDate'])

# Tenure is measured against the latest date present in the data rather than
# the wall clock, so the feature table is identical on every re-run.
reference_date = full_data[['TransactionDate', 'SignupDate']].max().max()

# Per-customer behavioural features.
customer_features = full_data.groupby('CustomerID').agg(
    avg_transaction_value=('TotalValue', 'mean'),
    total_quantity=('Quantity', 'sum'),
    num_transactions=('TransactionID', 'count'),
    categories_purchased=('Category', 'nunique'),
    signup_date=('SignupDate', 'min'),
).reset_index()

# Replace the helper signup_date column with whole days elapsed up to the
# reference date; the new column lands last, matching the original layout.
customer_features['days_since_signup'] = (
    reference_date - customer_features.pop('signup_date')
).dt.days

# Any feature that could not be computed is treated as 0.
customer_features = customer_features.fillna(0)


# One-hot encode the category of every transaction, then sum per customer to
# get the number of purchases made in each category.
category_features = pd.get_dummies(full_data[['CustomerID', 'Category']], columns=['Category'])
category_features = category_features.groupby('CustomerID').sum().reset_index()


# Combine behavioural and per-category features into one row per customer.
customer_data = customer_features.merge(category_features, on='CustomerID', how='left')
customer_data = customer_data.fillna(0)


# Standardise every feature column (everything except the leading CustomerID).
scaler = StandardScaler()
customer_features_scaled = scaler.fit_transform(customer_data.iloc[:, 1:])

# Calculating cosine similarity
similarity_matrix = cosine_similarity(customer_features_scaled)

similarity_df = pd.DataFrame(
    similarity_matrix,
    index=customer_data['CustomerID'],
    columns=customer_data['CustomerID'],
)

lookalike_results = {}
for customer_id in customer_data['CustomerID']:
    # Exclude the customer itself by label, then keep the 3 highest scores.
    similar_customers = (
        similarity_df[customer_id]
        .drop(customer_id)
        .sort_values(ascending=False)
        .head(3)
    )
    # Series.items() is a one-shot iterator; store a list so the results can
    # be inspected or reused after the DataFrame below has been built.
    lookalike_results[customer_id] = list(similar_customers.items())

# One row per customer; 'Lookalikes' holds a list of {CustomerID, Score} dicts.
lookalike_df = pd.DataFrame([
    {
        'CustomerID': cust_id,
        'Lookalikes': [
            {'CustomerID': other_id, 'Score': score}
            for other_id, score in lookalikes
        ],
    }
    for cust_id, lookalikes in lookalike_results.items()
])


output_path = 'Lookalike.csv'
lookalike_df.to_csv(output_path, index=False)

print(f"Lookalike customer data saved to: {output_path}")

Lookalike customer data saved to: Lookalike.csv
