In [3]:
import gdown
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity

# Define Google Drive links (file IDs)
customers_url = 'https://drive.google.com/uc?export=download&id=1bu_--mo79VdUG9oin4ybfFGRUSXAe-WE'
products_url = 'https://drive.google.com/uc?export=download&id=1IKuDizVapw-hyktwfpoAoaGtHtTNHfd0'
transactions_url = 'https://drive.google.com/uc?export=download&id=1saEqdbBB-vuk2hxoAf4TzDEsykdKlzbF'

# Download files
gdown.download(customers_url, 'customers.csv', quiet=False)
gdown.download(products_url, 'products.csv', quiet=False)
gdown.download(transactions_url, 'transactions.csv', quiet=False)

# Load the customer and transaction data
customers = pd.read_csv('customers.csv')
transactions = pd.read_csv('transactions.csv')

# Inspect the columns in the customers and transactions dataset
print(customers.columns)  # Customer data columns
print(transactions.columns)  # Transaction data columns

# Feature Engineering
# Merge customer data with transaction data based on 'CustomerID'
customer_transactions = pd.merge(transactions, customers, on='CustomerID', how='left')

# Generate customer features (e.g., total spend, transaction frequency)
customer_features = customer_transactions.groupby('CustomerID').agg(
    total_spend=('TotalValue', 'sum'),  # Use 'TotalValue' for total spending
    transaction_count=('TransactionID', 'count')  # Count number of transactions
).reset_index()

# Merge the generated features with the original customer data
customers = pd.merge(customers, customer_features, on='CustomerID', how='left')

# Handle missing values by filling NaN with 0 (you can also use the mean or other strategies)
customers.fillna(0, inplace=True)

# Handle categorical columns (e.g., 'Region') using encoding
customers['Region'] = customers['Region'].astype('category').cat.codes  # Encoding categorical 'Region' as numeric

# Select features for scaling (numeric columns)
features = customers[['Region', 'total_spend', 'transaction_count']]  # Adjust columns based on available data

# Standardize the data
scaler = StandardScaler()
features_scaled = scaler.fit_transform(features)

# Calculate similarity (cosine similarity in this case)
similarity_matrix = cosine_similarity(features_scaled)

# Get the top 3 lookalike recommendations for the first 20 customers
recommendations = []
for i in range(20):
    similar_customers = similarity_matrix[i].argsort()[::-1][1:4]  # Top 3 similar customers
    similarity_scores = similarity_matrix[i][similar_customers]
    recommendations.append(list(zip(similar_customers, similarity_scores)))

# Prepare the final result (Lookalike map)
lookalike_map = {}
for i, rec in enumerate(recommendations):
    customer_id = customers.iloc[i]['CustomerID']
    lookalike_map[customer_id] = [{'CustomerID': customers.iloc[rec[0]]['CustomerID'], 'Score': rec[1]} for rec in rec]

# Prepare a list to store the rows for the CSV
lookalike_data = []

# Loop through the lookalike map and extract their lookalikes and similarity scores
for customer_id, lookalikes in lookalike_map.items():
    row = [customer_id]
    for i in range(3):  # Top 3 lookalikes
        lookalike = lookalikes[i]  # Get the ith lookalike
        row.append(lookalike['CustomerID'])
        row.append(lookalike['Score'])
    lookalike_data.append(row)

# Convert the list to a DataFrame
lookalike_df = pd.DataFrame(lookalike_data, columns=[
    'CustomerID',
    'Lookalike1_CustomerID', 'Lookalike1_Score',
    'Lookalike2_CustomerID', 'Lookalike2_Score',
    'Lookalike3_CustomerID', 'Lookalike3_Score'
])

# Save the DataFrame to a CSV file
lookalike_df.to_csv('Lookalike.csv', index=False)

# Print the final recommendations for the first 20 customers
print(lookalike_df.head(20))


Downloading...
From: https://drive.google.com/uc?export=download&id=1bu_--mo79VdUG9oin4ybfFGRUSXAe-WE
To: /content/customers.csv
100%|██████████| 8.54k/8.54k [00:00<00:00, 13.3MB/s]
Downloading...
From: https://drive.google.com/uc?export=download&id=1IKuDizVapw-hyktwfpoAoaGtHtTNHfd0
To: /content/products.csv
100%|██████████| 4.25k/4.25k [00:00<00:00, 6.96MB/s]
Downloading...
From: https://drive.google.com/uc?export=download&id=1saEqdbBB-vuk2hxoAf4TzDEsykdKlzbF
To: /content/transactions.csv
100%|██████████| 54.7k/54.7k [00:00<00:00, 53.0MB/s]

Index(['CustomerID', 'CustomerName', 'Region', 'SignupDate'], dtype='object')
Index(['TransactionID', 'CustomerID', 'ProductID', 'TransactionDate',
       'Quantity', 'TotalValue', 'Price'],
      dtype='object')
   CustomerID Lookalike1_CustomerID  Lookalike1_Score Lookalike2_CustomerID  \
0       C0001                 C0137          0.999954                 C0152   
1       C0002                 C0142          0.994031                 C0177   
2       C0003                 C0133          0.998135                 C0052   
3       C0004                 C0067          0.997616                 C0113   
4       C0005                 C0159          0.999947                 C0186   
5       C0006                 C0158          0.979980                 C0168   
6       C0007                 C0070          0.990569                 C0135   
7       C0008                 C0139          0.990511                 C0109   
8       C0009                 C0062          0.985576                 C0010 


