In [15]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer

# Load the pre-merged transactions/products/customers table from the working directory.
data = pd.read_csv('merged_data.csv')

# Preview the first five rows to confirm the columns loaded as expected.
data.head(n=5)

Unnamed: 0,TransactionID,CustomerID,ProductID,TransactionDate,Quantity,TotalValue,Price_x,Month,ProductName,Category,Price_y,CustomerName,Region,SignupDate
0,T00001,C0199,P067,2024-08-25 12:38:23,1,300.68,300.68,2024-08,ComfortLiving Bluetooth Speaker,Electronics,300.68,Andrea Jenkins,Europe,2022-12-03
1,T00112,C0146,P067,2024-05-27 22:23:54,1,300.68,300.68,2024-05,ComfortLiving Bluetooth Speaker,Electronics,300.68,Brittany Harvey,Asia,2024-09-04
2,T00166,C0127,P067,2024-04-25 07:38:55,1,300.68,300.68,2024-04,ComfortLiving Bluetooth Speaker,Electronics,300.68,Kathryn Stevens,Europe,2024-04-04
3,T00272,C0087,P067,2024-03-26 22:55:37,2,601.36,300.68,2024-03,ComfortLiving Bluetooth Speaker,Electronics,300.68,Travis Campbell,South America,2024-04-11
4,T00363,C0070,P067,2024-03-21 15:10:10,3,902.04,300.68,2024-03,ComfortLiving Bluetooth Speaker,Electronics,300.68,Timothy Perez,Europe,2022-03-15


In [4]:
# Total spend per customer: sum TotalValue over each customer's transactions
# and expose the result as a two-column frame (CustomerID, TotalSpending).
total_spending = (
    data.groupby('CustomerID')['TotalValue']
    .sum()
    .rename('TotalSpending')
    .reset_index()
)


In [5]:

# Transaction count per customer: count non-null TransactionID values in each
# customer's group and expose them as (CustomerID, NumTransactions).
num_transactions = (
    data.groupby('CustomerID')['TransactionID']
    .count()
    .rename('NumTransactions')
    .reset_index()
)


In [6]:

# Mean transaction value per customer, as (CustomerID, AvgTransactionValue).
avg_transaction_value = (
    data.groupby('CustomerID')['TotalValue']
    .mean()
    .rename('AvgTransactionValue')
    .reset_index()
)

# Category preferences: one row per customer, one column per product category,
# each cell holding how many transactions the customer made in that category
# (0 where the customer never bought from it).
category_counts = data.groupby(['CustomerID', 'Category']).size()
preferred_categories = category_counts.unstack(level='Category', fill_value=0)


In [7]:

# Assemble the per-customer feature table by joining every aggregate on
# CustomerID (default inner joins, so only customers present in all inputs remain).
customer_features = (
    total_spending
    .merge(num_transactions, on='CustomerID')
    .merge(avg_transaction_value, on='CustomerID')
    .merge(preferred_categories, on='CustomerID')
)


In [8]:

# Customer-level attributes: one row per distinct (CustomerID, Region, SignupDate).
customer_info = data[['CustomerID', 'Region', 'SignupDate']].drop_duplicates()

# Tenure is measured in whole days between the signup date and this fixed reference date.
reference_date = pd.to_datetime('2024-12-31')

# Attach the attributes, derive Tenure from SignupDate, then discard the raw
# date column since only the tenure is used as a feature.
customer_features = (
    customer_features
    .merge(customer_info, on='CustomerID')
    .assign(
        Tenure=lambda frame: (reference_date - pd.to_datetime(frame['SignupDate'])).dt.days
    )
    .drop(columns=['SignupDate'])
)


In [9]:
# Preview the assembled feature table (first five customers).
customer_features.head(n=5)

Unnamed: 0,CustomerID,TotalSpending,NumTransactions,AvgTransactionValue,Books,Clothing,Electronics,Home Decor,Region,Tenure
0,C0001,3354.52,5,670.904,1,0,3,1,South America,905
1,C0002,1862.74,4,465.685,0,2,0,2,Asia,1052
2,C0003,2725.38,4,681.345,0,1,1,2,South America,299
3,C0004,5354.88,8,669.36,3,0,2,3,South America,814
4,C0005,2034.24,3,678.08,0,0,2,1,Asia,869


In [17]:

# Build the customer-by-customer similarity matrix.
#
# Numeric features are standardised so that large-magnitude columns (e.g. total
# spending) do not dominate the cosine similarity; categorical (object-dtype)
# features are one-hot encoded.
numerical_columns = [
    col
    for col in customer_features.select_dtypes(include=['int64', 'float64']).columns
    if col != 'CustomerID'
]
text_columns = [
    col
    for col in customer_features.select_dtypes(include=['object']).columns
    if col != 'CustomerID'
]

scaler = StandardScaler()
scaled_numerical = scaler.fit_transform(customer_features[numerical_columns])

# One-hot encode each categorical column. These columns hold category labels,
# not free text: a word-level TF-IDF vectoriser would split a label such as
# "South America" into the tokens "south" and "america", making it look
# partially similar to any other label that shares a word. With get_dummies each
# distinct label gets exactly one indicator column; a missing value yields an
# all-zero row, so it contributes nothing to the similarity.
text_features = [
    pd.get_dummies(customer_features[column], dtype=float).to_numpy()
    for column in text_columns
]

# Stack numeric and categorical features side by side: one row per customer.
all_features = np.hstack([scaled_numerical] + text_features)

similarity_matrix = cosine_similarity(all_features)

# Label both axes with CustomerID so similarities can be looked up by customer.
similarity_df = pd.DataFrame(
    similarity_matrix,
    index=customer_features['CustomerID'],
    columns=customer_features['CustomerID']
)



In [19]:
# Find the three most similar customers for each of the first 20 customers and
# export the result as a long-format table (one row per customer/lookalike pair).

# Maps each customer ID to a list of (lookalike ID, similarity score) tuples.
lookalike_map = {}

# The first 20 distinct customer IDs, in the order they appear in the feature table.
first_20_customers = customer_features['CustomerID'].unique()[:20]

for customer in first_20_customers:
    # Remove the customer's own entry by label before ranking. Relying on it
    # being at position 0 after sorting is unsafe: if another customer ties at
    # the top score, the customer could be reported as their own lookalike
    # while a genuine match is skipped.
    candidates = similarity_df[customer].drop(labels=customer)

    # Highest-scoring three of the remaining customers.
    top_3_similar = candidates.sort_values(ascending=False).head(3)

    lookalike_map[customer] = list(zip(top_3_similar.index, top_3_similar.values))

# Flatten the mapping straight into one row per (customer, lookalike) pair.
lookalike_df = pd.DataFrame(
    [
        (customer, lookalike_id, score)
        for customer, matches in lookalike_map.items()
        for lookalike_id, score in matches
    ],
    columns=['CustomerID', 'LookalikeID', 'SimilarityScore'],
)

# Display the final lookalike DataFrame
print(lookalike_df.head())

# Save the results to a CSV file
lookalike_df.to_csv('Vishnu_Gudipati_Lookalike.csv', index=False)

  CustomerID LookalikeID  SimilarityScore
0      C0001       C0120         0.788527
1      C0001       C0091         0.783322
2      C0001       C0112         0.763010
3      C0002       C0134         0.933248
4      C0002       C0106         0.900106
