In [6]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler, OneHotEncoder
import numpy as np

In [7]:
#Load Data
customers = pd.read_csv('Customers.csv')
products = pd.read_csv('Products.csv')
transactions = pd.read_csv('Transactions.csv')
print(customers.columns)  
print(transactions.columns) 
print(products.columns) 

Index(['CustomerID', 'CustomerName', 'Region', 'SignupDate'], dtype='object')
Index(['TransactionID', 'CustomerID', 'ProductID', 'TransactionDate',
       'Quantity', 'TotalValue', 'Price'],
      dtype='object')
Index(['ProductID', 'ProductName', 'Category', 'Price'], dtype='object')


In [8]:
#Preprocess Data
data = pd.merge(transactions, customers, on='CustomerID')
data.fillna(0, inplace=True)
customer_features = data.groupby('CustomerID').agg(
    total_spent=('TotalValue', 'sum'),  # Total value spent by the customer
    purchase_frequency=('TransactionID', 'count')  # Number of transactions by the customer
).reset_index()

In [None]:
customer_features = pd.merge(customer_features, customers[['CustomerID', 'Region']], on='CustomerID')
encoder = OneHotEncoder(sparse_output=False)
region_encoded = encoder.fit_transform(customer_features[['Region']])
region_df = pd.DataFrame(region_encoded, columns=encoder.get_feature_names_out(['Region']))
region_df['CustomerID'] = customer_features['CustomerID']
customer_features = pd.merge(customer_features, region_df, on='CustomerID')

In [13]:
numeric_columns = ['total_spent', 'purchase_frequency'] + list(region_df.columns)
numeric_data = customer_features[numeric_columns]

In [18]:
print(numeric_data.dtypes)

total_spent             float64
purchase_frequency        int64
Region_Asia             float64
Region_Europe           float64
Region_North America    float64
Region_South America    float64
CustomerID               object
dtype: object


In [19]:
print(numeric_data.isnull().sum())

total_spent             0
purchase_frequency      0
Region_Asia             0
Region_Europe           0
Region_North America    0
Region_South America    0
CustomerID              0
dtype: int64


In [20]:
numeric_data.fillna(0, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  numeric_data.fillna(0, inplace=True)


In [21]:
scaler = StandardScaler()

In [22]:
customer_features_scaled = scaler.fit_transform(numeric_data)

ValueError: could not convert string to float: 'C0001'

In [28]:
#Standardize Features
scaler = StandardScaler()
numeric_data = numeric_data.apply(pd.to_numeric, errors='coerce')  # Convert to numeric, setting invalid parsing to NaN
numeric_data.fillna(0, inplace=True)  # Replace NaN with 0 for safe scaling
customer_features_scaled = scaler.fit_transform(numeric_data)
similarity_matrix = cosine_similarity(customer_features_scaled)
recommendations = {}


In [30]:
for cust_id in customer_features['CustomerID'].iloc[:20]:
    index = customer_features[customer_features['CustomerID'] == cust_id].index[0]
    similarity_scores = similarity_matrix[index]
    similar_customer_indices = np.argsort(similarity_scores)[::-1][1:4]
    similar_customers = customer_features.iloc[similar_customer_indices]
    similarity_scores_top3 = similarity_scores[similar_customer_indices]
    recommendations[cust_id] = [(similar_customers['CustomerID'].iloc[i], similarity_scores_top3[i]) 
                                for i in range(3)]


In [31]:
lookalike_data = []

for cust_id, similar in recommendations.items():
    for similar_cust, score in similar:
        lookalike_data.append([cust_id, similar_cust, score])

In [33]:
#convert Dataframe and save to csv
lookalike_df = pd.DataFrame(lookalike_data, columns=['CustomerID', 'LookalikeCustomerID', 'SimilarityScore'])
lookalike_df.to_csv('Meganathan_S_Lookalike.csv',index=False)

print("Lookalike.csv has been created.")

Lookalike.csv has been created.
