### Data Science Internship Task 2: Lookalike Model

Name: Rushikesh Borade <br>
Gmail: rishipb19@gmail.com <br>
Contact: 7038235686 <br>

In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.neighbors import NearestNeighbors
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [2]:
# Read the three raw CSV inputs, in this fixed order, and bind each resulting
# DataFrame to its own name.
customers, products, transactions = (
    pd.read_csv(csv_path)
    for csv_path in (
        "datasets/Customers.csv",
        "datasets/Products.csv",
        "datasets/Transactions.csv",
    )
)

In [14]:
# Feature engineering: build one wide table with a row per transaction.
# Transactions are joined to customer attributes on "CustomerID" and then to
# product attributes on "ProductID". Neither call passes `how`, so pandas'
# default join type applies. The suffixes disambiguate column names that exist
# on both sides of the second join.
merged_data = (
    transactions
    .merge(customers, on="CustomerID")
    .merge(products, on="ProductID", suffixes=("_transaction", "_product"))
)
merged_data.head()


Unnamed: 0,TransactionID,CustomerID,ProductID,TransactionDate,Quantity,TotalValue,Price_transaction,CustomerName,Region,SignupDate,ProductName,Category,Price_product
0,T00001,C0199,P067,2024-08-25 12:38:23,1,300.68,300.68,Andrea Jenkins,Europe,2022-12-03,ComfortLiving Bluetooth Speaker,Electronics,300.68
1,T00112,C0146,P067,2024-05-27 22:23:54,1,300.68,300.68,Brittany Harvey,Asia,2024-09-04,ComfortLiving Bluetooth Speaker,Electronics,300.68
2,T00166,C0127,P067,2024-04-25 07:38:55,1,300.68,300.68,Kathryn Stevens,Europe,2024-04-04,ComfortLiving Bluetooth Speaker,Electronics,300.68
3,T00272,C0087,P067,2024-03-26 22:55:37,2,601.36,300.68,Travis Campbell,South America,2024-04-11,ComfortLiving Bluetooth Speaker,Electronics,300.68
4,T00363,C0070,P067,2024-03-21 15:10:10,3,902.04,300.68,Timothy Perez,Europe,2022-03-15,ComfortLiving Bluetooth Speaker,Electronics,300.68


In [15]:
# Collapse the transaction-level table to one row per customer.
customer_features = (
    merged_data
    .groupby("CustomerID")
    .agg(
        TotalValue=("TotalValue", "sum"),         # total spend
        Quantity=("Quantity", "sum"),             # total quantity purchased
        Price_product=("Price_product", "mean"),  # average product price
        Region=("Region", "first"),               # region (categorical)
    )
    .reset_index()  # turn CustomerID back into a regular column
)

In [17]:
# Preprocessing pipeline
# Numeric columns are standardised; the categorical column is one-hot encoded.
num_features = ["TotalValue", "Quantity", "Price_product"]
cat_features = ["Region"]

# OneHotEncoder produces a sparse matrix by default, and ColumnTransformer
# then chooses between a sparse and a dense result based on the overall
# density of the stacked output. sparse_threshold=0 pins the result to a dense
# array, so downstream code can always treat X as a plain 2-D NumPy array no
# matter how many categories the data contains.
preprocessor = ColumnTransformer(
    [
        ("num", StandardScaler(), num_features),
        ("cat", OneHotEncoder(), cat_features),
    ],
    sparse_threshold=0,
)
X = preprocessor.fit_transform(customer_features)


In [18]:
# Fit a nearest-neighbour index on the preprocessed customer matrix using
# cosine distance. n_neighbors=4 is the default neighbour count for queries:
# when a fitted row is passed back in as the query it typically comes back as
# its own closest match, which leaves three other customers.
nn_model = NearestNeighbors(metric="cosine", n_neighbors=4).fit(X)
nn_model

In [19]:
# Map each of the first 20 customers to its three most similar customers as
# (CustomerID, similarity) pairs, where similarity = 1 - cosine distance,
# rounded to two decimals.
top_lookalikes = {}

# Positional lookup table: row i of the fitted matrix belongs to customer_ids[i].
customer_ids = customer_features["CustomerID"].to_numpy()

# Calling kneighbors() without a query matrix returns the neighbours of the
# fitted points themselves and never reports a point as its own neighbour.
# Dropping "the first match" by position is not safe: when two customers have
# identical (or collinear) feature vectors, the other customer can take
# position 0, so the customer ends up listed as its own lookalike while a
# genuine match is discarded.
distances, indices = nn_model.kneighbors(n_neighbors=3)

for idx, customer_id in enumerate(customer_ids[:20]):
    top_lookalikes[customer_id] = [
        # float() converts the NumPy scalar to a built-in float so that the
        # string form of the list reads "0.98" rather than "np.float64(0.98)".
        (customer_ids[neighbor_pos], round(float(1 - dist), 2))
        for neighbor_pos, dist in zip(indices[idx], distances[idx])
    ]

In [20]:
# One output row per customer: the customer ID and the string form of its
# lookalike list. The table is then written to CSV without the index column.
lookalike_df = pd.DataFrame(
    [(cust_id, str(matches)) for cust_id, matches in top_lookalikes.items()],
    columns=["CustomerID", "Lookalikes"],
)
lookalike_df.to_csv("Rushikesh_Borade_Lookalike.csv", index=False)