In [28]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import NearestNeighbors
from sklearn.cluster import KMeans
from sklearn.metrics import davies_bouldin_score
import matplotlib.pyplot as plt

In [30]:
# Load the datasets
customers = pd.read_csv('Customers.csv')
transactions = pd.read_csv('Transactions.csv')
products = pd.read_csv('Products.csv')


In [32]:
# Merge the datasets
customer_transactions = pd.merge(customers, transactions, on='CustomerID')
all_data = pd.merge(customer_transactions, products, on='ProductID')

In [34]:
# Feature engineering for lookalike model
features_lookalike = all_data.pivot_table(index='CustomerID', columns='Category', values='TotalValue', aggfunc='sum').fillna(0)

# Standardize the features for lookalike model
scaler = StandardScaler()
scaled_features_lookalike = scaler.fit_transform(features_lookalike)

In [36]:
# Train the KNN model for lookalike model
knn = NearestNeighbors(n_neighbors=4, metric='cosine')
knn.fit(scaled_features_lookalike)


In [38]:
# Get the top 3 lookalikes for the first 20 customers
lookalike_results = {}
for i in range(20):
    cust_id = customers['CustomerID'][i]
    distances, indices = knn.kneighbors(scaled_features_lookalike[i].reshape(1, -1))
    similar_cust_ids = [features_lookalike.index[j] for j in indices[0][1:]]
    similarity_scores = 1 - distances.flatten()[1:]
    lookalike_results[cust_id] = list(zip(similar_cust_ids, similarity_scores))

# Print the results for lookalike model
for cust_id, lookalikes in lookalike_results.items():
    print(f"CustomerID: {cust_id}")
    for similar_id, score in lookalikes:
        print(f"  - Lookalike: {similar_id}, Similarity Score: {score:.4f}")

# Save the results to a CSV file for lookalike model
lookalike_df = pd.DataFrame.from_dict(lookalike_results, orient='index')
lookalike_df.to_csv('Utkarsh_Roy_Lookalike.csv', header=False)

# Feature engineering for customer segmentation
features_segmentation = all_data.groupby('CustomerID').agg(
    total_transactions=('TransactionID', 'count'),
    total_quantity=('Quantity', 'sum'),
    total_value=('TotalValue', 'sum'),
    distinct_categories=('Category', 'nunique')
)

# Standardize the features for customer segmentation
scaler = StandardScaler()
scaled_features_segmentation = scaler.fit_transform(features_segmentation)

# Perform customer segmentation using KMeans
kmeans = KMeans(n_clusters=5, random_state=42)
customers['cluster'] = kmeans.fit_predict(scaled_features_segmentation)

# Calculate DB Index
db_index = davies_bouldin_score(scaled_features_segmentation, customers['cluster'])
print(f"DB Index: {db_index:.4f}")

# Visualize the clusters
plt.scatter(scaled_features_segmentation[:, 0], scaled_features_segmentation[:, 1], c=customers['cluster'])
plt.title('Customer Segmentation')
plt.xlabel('Total Transactions')
plt.ylabel('Total Quantity')
plt.show()

CustomerID: C0001
  - Lookalike: C0091, Similarity Score: 0.9888
  - Lookalike: C0069, Similarity Score: 0.9843
  - Lookalike: C0184, Similarity Score: 0.9786
CustomerID: C0002
  - Lookalike: C0159, Similarity Score: 0.9795
  - Lookalike: C0036, Similarity Score: 0.9568
  - Lookalike: C0134, Similarity Score: 0.9079
CustomerID: C0003
  - Lookalike: C0007, Similarity Score: 0.9969
  - Lookalike: C0085, Similarity Score: 0.9640
  - Lookalike: C0166, Similarity Score: 0.9604
CustomerID: C0004
  - Lookalike: C0075, Similarity Score: 0.9832
  - Lookalike: C0090, Similarity Score: 0.9206
  - Lookalike: C0065, Similarity Score: 0.8849
CustomerID: C0005
  - Lookalike: C0197, Similarity Score: 0.9680
  - Lookalike: C0085, Similarity Score: 0.9638
  - Lookalike: C0166, Similarity Score: 0.9498
CustomerID: C0006
  - Lookalike: C0169, Similarity Score: 0.9704
  - Lookalike: C0185, Similarity Score: 0.9294
  - Lookalike: C0081, Similarity Score: 0.9274
CustomerID: C0007
  - Lookalike: C0003, Simila



ValueError: Length of values (199) does not match length of index (200)