In [20]:
import io

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
from sklearn.cluster import KMeans
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import PCA
from sklearn.metrics import davies_bouldin_score, silhouette_score
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder


In [21]:
import matplotlib.pyplot as plt
def fetch_csv(file_id, timeout=30):
    """Download a CSV file from Google Drive and load it into a DataFrame.

    Parameters
    ----------
    file_id : str
        Google Drive file identifier, interpolated into the download URL.
    timeout : float, optional
        Seconds to wait for the server before giving up. Without a timeout
        ``requests.get`` may block indefinitely.

    Returns
    -------
    pandas.DataFrame or None
        The parsed CSV when the server answers with HTTP 200; ``None`` when
        the request fails or any other status code is returned (a message is
        printed in that case).
    """
    url = f'https://drive.google.com/uc?id={file_id}&export=download'
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as exc:
        # Network-level failures follow the same "print and return None"
        # contract as a non-200 response instead of escaping as a traceback.
        print(f"Error fetching the file with ID: {file_id} ({exc})")
        return None

    if response.status_code != 200:
        print(f"Error fetching the file with ID: {file_id}")
        return None

    # The payload is decoded as UTF-8 text and parsed from memory.
    return pd.read_csv(io.StringIO(response.content.decode('utf-8')))

# Google Drive file IDs of the three source datasets.
customers_id = '1bu_--mo79VdUG9oin4ybfFGRUSXAe-WE'
products_id = '1IKuDizVapw-hyktwfpoAoaGtHtTNHfd0'
transactions_id = '1saEqdbBB-vuk2hxoAf4TzDEsykdKlzbF'

customers_df = fetch_csv(customers_id)
products_df = fetch_csv(products_id)
transactions_df = fetch_csv(transactions_id)

# fetch_csv returns None when a download fails. Stop here with a clear
# message instead of letting a later cell fail on a None frame.
_missing = [
    name
    for name, frame in (
        ('customers', customers_df),
        ('products', products_df),
        ('transactions', transactions_df),
    )
    if frame is None
]
if _missing:
    raise RuntimeError(f"Failed to download dataset(s): {', '.join(_missing)}")

In [23]:
# Data Preparation
# Convert the date columns from text to datetime values (in place).
transactions_df['TransactionDate'] = pd.to_datetime(transactions_df['TransactionDate'])
customers_df['SignupDate'] = pd.to_datetime(customers_df['SignupDate'])

# Merge datasets
# Every transaction row is kept (left joins); customer attributes are attached
# first, then product attributes. The product 'Price' column is renamed to
# 'ProductPrice' before the join.
merged_data = pd.merge(
    pd.merge(transactions_df, customers_df, how='left', on='CustomerID'),
    products_df.rename(columns={'Price': 'ProductPrice'}),
    how='left',
    on='ProductID',
)

# Feature Engineering
def create_customer_features(df):
    """Build one feature row per customer from the merged transaction table.

    Parameters
    ----------
    df : pandas.DataFrame
        Transaction-level rows with the columns ``CustomerID``, ``Region``,
        ``SignupDate``, ``TransactionID``, ``TotalValue``, ``Category`` and
        ``TransactionDate`` (the two date columns as datetimes).

    Returns
    -------
    pandas.DataFrame
        One row per ``CustomerID`` with:

        * ``Region`` - first region seen for the customer,
        * ``Tenure`` - days between the latest ``TransactionDate`` in ``df``
          and the customer's latest ``SignupDate``,
        * ``TotalTransactions`` - number of distinct ``TransactionID`` values,
        * ``TotalSpend`` / ``AvgSpend`` - sum / mean of ``TotalValue``,
        * ``FavoriteCategory`` - most frequent category (NaN when the
          customer has no non-null category),
        * ``Category_<name>`` - transaction count per category.
    """
    # Profile Features
    latest_date = df['TransactionDate'].max()

    def _favorite_category(categories):
        # mode() is empty when every value is missing; indexing it with [0]
        # would raise, so fall back to NaN for such customers.
        modes = categories.mode()
        return modes.iloc[0] if not modes.empty else np.nan

    customer_features = df.groupby('CustomerID').agg(
        Region=('Region', 'first'),
        Tenure=('SignupDate', lambda x: (latest_date - x.max()).days),
        TotalTransactions=('TransactionID', 'nunique'),
        TotalSpend=('TotalValue', 'sum'),
        AvgSpend=('TotalValue', 'mean'),
        FavoriteCategory=('Category', _favorite_category)
    ).reset_index()

    # Product Interaction Features
    category_counts = pd.pivot_table(
        df,
        index='CustomerID',
        columns='Category',
        values='TransactionID',
        aggfunc='count',
        fill_value=0
    ).add_prefix('Category_')

    # Customers whose categories are all missing do not appear in the pivot
    # table. Give them explicit zero counts so the merge below keeps them.
    category_counts = category_counts.reindex(
        customer_features['CustomerID'], fill_value=0
    ).rename_axis('CustomerID')

    return customer_features.merge(category_counts, on='CustomerID')

feature_matrix = create_customer_features(merged_data)

# Preprocessing
# Region is one-hot encoded; the encoder output is converted to a dense
# array below so it can be stacked with the other feature groups.
encoder = OneHotEncoder()
region_encoded = encoder.fit_transform(feature_matrix[['Region']])

# The four numeric profile features are standardized.
scaler = StandardScaler()
numeric_features = feature_matrix[['Tenure', 'TotalTransactions', 'TotalSpend', 'AvgSpend']]
scaled_features = scaler.fit_transform(numeric_features)

# Everything left after dropping the ID, profile and FavoriteCategory columns
# (i.e. the Category_* count columns) is used as-is.
# NOTE(review): these counts are not standardized, unlike the numeric profile
# features - confirm that the resulting weighting is intended.
category_features = feature_matrix.drop(
    columns=['CustomerID', 'Region', 'Tenure', 'TotalTransactions', 'TotalSpend', 'AvgSpend', 'FavoriteCategory']
).values

processed_features = np.hstack([
    region_encoded.toarray(),
    scaled_features,
    category_features
])

# Similarity Calculation
# Pairwise cosine similarity between customer feature rows
# (cosine_similarity comes from sklearn.metrics.pairwise).
similarity_matrix = cosine_similarity(processed_features)
customer_ids = feature_matrix['CustomerID'].values

# Generate Recommendations
# Targets are the IDs C0001..C0020; each gets its three most similar customers.
target_customers = [f"C{str(i+1).zfill(4)}" for i in range(20)]
lookalike_mapping = {}

for target in target_customers:
    if target not in customer_ids:
        # A target without any feature row gets an empty recommendation list.
        lookalike_mapping[target] = []
        continue

    idx = np.where(customer_ids == target)[0][0]
    # Exclude the target by its row index rather than by assuming it is
    # ranked first: a customer with an identical score and a lower index
    # would otherwise push the target itself into its own lookalike list.
    scores = [
        (i, score)
        for i, score in enumerate(similarity_matrix[idx])
        if i != idx
    ]
    sorted_scores = sorted(scores, key=lambda x: x[1], reverse=True)[:3]

    lookalike_mapping[target] = [
        (customer_ids[i], float(round(score, 4)))
        for i, score in sorted_scores
    ]

# Save to CSV
# One row per target customer; 'Lookalikes' holds [customer_id, score] pairs.
output_data = [
    {
        'CustomerID': cust_id,
        'Lookalikes': [[match[0], match[1]] for match in matches]
    }
    for cust_id, matches in lookalike_mapping.items()
]

pd.DataFrame(output_data).to_csv('Lookalike.csv', index=False)