In [7]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
import warnings
warnings.filterwarnings('ignore')

# Read the order-level CSV into `df`, then print a three-part structural
# summary: frame dimensions, the column names, and per-column null counts.
df = pd.read_csv('final test1.csv')
print(
    f"Dataset shape: {df.shape}",
    f"Columns: {df.columns.tolist()}",
    f"Missing values:\n{df.isnull().sum()}",
    sep="\n",  # one summary item per line, same layout as separate print calls
)

Dataset shape: (25000, 29)
Columns: ['Customer ID', 'Product ID', 'Order ID', 'Customer Age', 'Gender', 'Product Name', 'MRP', 'Discount Price', 'Category', 'State', 'City', 'Subscription', 'Bill Number', 'Time Spent on Website', 'Rating', 'Marketing/Advertisement', 'Ship Mode', 'Order Status', 'Order Date', 'Delivery Date', 'Cancellation Date', 'Payment Method', 'Pin Code', 'Total Order Value', 'Payment Status', 'No of Clicks', 'Year', 'Month', 'Shipping Charges']
Missing values:
Customer ID                    0
Product ID                     0
Order ID                       0
Customer Age                   0
Gender                         0
Product Name                   0
MRP                            0
Discount Price                 0
Category                       0
State                          0
City                           0
Subscription                   0
Bill Number                    0
Time Spent on Website          0
Rating                         0
Marketing/Advertise

In [8]:
# Preview the first five rows of `df` (5 is the default row count for head()).
df.head(n=5)

Unnamed: 0,Customer ID,Product ID,Order ID,Customer Age,Gender,Product Name,MRP,Discount Price,Category,State,...,Delivery Date,Cancellation Date,Payment Method,Pin Code,Total Order Value,Payment Status,No of Clicks,Year,Month,Shipping Charges
0,YSB75,BW653,479577309,28,Female,Ariel Matic Top Load Liquid Detergent,977.44,909.02,Imported,Andhra Pradesh,...,13-05-2022,,Credit,318324,909.02,Paid,29,2022,May,0
1,FUS93,XV061,634865221,47,Male,VSR Channa Dal,834.09,450.41,Branded,Telangana,...,20-08-2021,,Debit,730162,450.41,Paid,95,2021,August,0
2,AJP28,GF695,113166210,63,Female,Tenali Double Horse Chana Dal,1095.2,1007.58,Branded,Gujarat,...,26-10-2021,,Debit,694091,1007.58,Paid,51,2021,October,0
3,URC55,VM478,740539230,41,Male,Tata Tea,748.16,389.04,Imported,Maharashtra,...,27-08-2021,,Credit,211807,489.04,Pending,12,2021,August,100
4,ZOP23,XD230,156544145,22,Female,VSR Channa Dal,1249.04,911.8,Local,Gujarat,...,10-12-2023,,Credit,78304,1011.8,Paid,26,2023,December,100


In [9]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric
# columns of `df`; the quartiles listed are describe()'s defaults, made explicit.
df.describe(percentiles=[0.25, 0.5, 0.75])

Unnamed: 0,Order ID,Customer Age,MRP,Discount Price,Bill Number,Time Spent on Website,Rating,Pin Code,No of Clicks,Year
count,25000.0,25000.0,25000.0,25000.0,25000.0,25000.0,25000.0,25000.0,25000.0,25000.0
mean,498713000.0,43.8566,1013.746646,735.621509,501669900000.0,10.150932,3.008728,496731.07948,50.42304,2022.0022
std,288189500.0,15.285775,571.079711,441.362817,288719900000.0,10.965989,1.159525,287726.921162,28.867925,0.816714
min,19128.0,18.0,20.04,10.91,23494740.0,1.0,1.0,5.0,1.0,2021.0
25%,250374600.0,31.0,523.995,367.665,254023800000.0,2.98,2.0,247984.5,26.0,2021.0
50%,499731100.0,44.0,1014.415,713.14,501972500000.0,5.0,3.0,495215.5,50.0,2022.0
75%,747102100.0,57.0,1506.8125,1061.3475,751932200000.0,13.43,4.0,745846.25,76.0,2023.0
max,999913100.0,70.0,1999.92,1876.22,999916000000.0,59.97,5.0,999997.0,100.0,2023.0


In [10]:
# Cast 'Total Order Value' to a numeric dtype. With errors='coerce', any entry
# that cannot be parsed as a number becomes NaN instead of raising.
# NOTE(review): coerced values are not counted or reported here — confirm that
# later steps handle NaN in this column.
df['Total Order Value'] = df['Total Order Value'].pipe(pd.to_numeric, errors='coerce')

# Show the resulting dtype of the converted column.
print(df.dtypes['Total Order Value'])

float64


Unnamed: 0,Customer ID,Product ID,Order ID,Customer Age,Gender,Product Name,MRP,Discount Price,Category,State,...,DNA_Dimension_5,DNA_Cluster,is_premium,rating_consistency,time_pattern_consistency,price_pattern_consistency,too_perfect_ratings,suspiciously_fast,extreme_clicker,authenticity_score
0,YSB75,BW653,479577309,28,Female,Ariel Matic Top Load Liquid Detergent,977.44,909.02,Imported,Andhra Pradesh,...,1.398439,3,1,0.0,0.0,0.0,0,0,0,100.0
1,FUS93,XV061,634865221,47,Male,VSR Channa Dal,834.09,450.41,Branded,Telangana,...,-0.16425,0,0,0.0,0.0,0.0,0,0,0,100.0
2,AJP28,GF695,113166210,63,Female,Tenali Double Horse Chana Dal,1095.2,1007.58,Branded,Gujarat,...,-0.158946,3,0,0.0,0.0,0.0,0,0,0,100.0
3,URC55,VM478,740539230,41,Male,Tata Tea,748.16,389.04,Imported,Maharashtra,...,-1.980624,4,0,0.0,0.0,0.0,0,0,0,80.0
4,ZOP23,XD230,156544145,22,Female,VSR Channa Dal,1249.04,911.8,Local,Gujarat,...,-0.14536,4,0,0.0,0.0,0.0,0,0,0,100.0


**Create Core DNA Features**


In [14]:
# Preview the first five rows of `df` in its current state (5 is head()'s default).
df.head(n=5)

Unnamed: 0,Customer ID,Product ID,Order ID,Customer Age,Gender,Product Name,MRP,Discount Price,Category,State,...,city_Solapur,city_Surat,city_Suryapet,city_Thane,city_Tirupati,city_Vadodara,city_Vijayawada,city_Visakhapatnam,city_Vizianagaram,city_Warangal
0,YSB75,BW653,479577309,28,Female,Ariel Matic Top Load Liquid Detergent,977.44,909.02,Imported,Andhra Pradesh,...,False,False,False,False,False,False,False,False,False,False
1,FUS93,XV061,634865221,47,Male,VSR Channa Dal,834.09,450.41,Branded,Telangana,...,False,False,False,False,False,False,False,False,False,False
2,AJP28,GF695,113166210,63,Female,Tenali Double Horse Chana Dal,1095.2,1007.58,Branded,Gujarat,...,False,False,False,False,False,False,False,False,False,False
3,URC55,VM478,740539230,41,Male,Tata Tea,748.16,389.04,Imported,Maharashtra,...,False,False,False,False,False,False,False,False,False,False
4,ZOP23,XD230,156544145,22,Female,VSR Channa Dal,1249.04,911.8,Local,Gujarat,...,False,False,False,False,False,True,False,False,False,False
