PART 1 : Based on Feature Selection, Cleaning, and Preprocessing to Construct an Input from Data Source

In [2]:
import pandas as pd
from datetime import datetime
from sklearn.preprocessing import MaxAbsScaler

import sys
!{sys.executable} -m pip install kagglehub
import kagglehub

# Download Latest Version
path = kagglehub.dataset_download("jahias/microsoft-adventure-works-cycles-customer-data")

print("Path to dataset files:", path)

# Load datasets
customers = pd.read_csv(f"{path}/AWCustomers.csv")
sales = pd.read_csv(f"{path}/AWSales.csv")

# Merge on CustomerID
merged_df = pd.merge(customers, sales, on="CustomerID")

# Convert BirthDate
merged_df["BirthDate"] = pd.to_datetime(merged_df["BirthDate"], errors="coerce")

# Calculate Age
today = datetime.now()
merged_df["Age"] = merged_df["BirthDate"].apply(
    lambda x: today.year - x.year - ((today.month, today.day) < (x.month, x.day))
)

# Select relevant columns
new_columns = [
    "Gender",
    "MaritalStatus",
    "Age",
    "Education",
    "HomeOwnerFlag",
    "NumberCarsOwned",
    "NumberChildrenAtHome",
    "TotalChildren",
    "YearlyIncome",
    "BikeBuyer"
]
new_df = merged_df[new_columns]

print(new_df.head())


Using Colab cache for faster access to the 'microsoft-adventure-works-cycles-customer-data' dataset.
Path to dataset files: /kaggle/input/microsoft-adventure-works-cycles-customer-data
  Gender MaritalStatus  Age        Education  HomeOwnerFlag  NumberCarsOwned  \
0      M             M   37        Bachelors              1                3   
1      M             M   53  Partial College              1                2   
2      F             S   39        Bachelors              0                3   
3      M             M   47  Partial College              1                2   
4      M             S   50  Partial College              1                1   

   NumberChildrenAtHome  TotalChildren  YearlyIncome  BikeBuyer  
0                     0              1         81916          1  
1                     1              2         81076          1  
2                     0              0         86387          1  
3                     1              2         61481          1  
4   

Part 2 : Data Preprocessing and Transformation

In [3]:
print(new_df.isnull().sum())
new_df=new_df.dropna()

## Normalisation
cols=['Age','YearlyIncome','TotalChildren']
scaler=MaxAbsScaler()
scaled=scaler.fit_transform(new_df[cols])
new_df[cols]=pd.DataFrame(scaled,columns=cols,index=new_df.index)

##Discretization
new_df['AgeGroup']=pd.cut(
    new_df['Age'],
    bins=[0,0.3,0.6,1],
    labels=['Young','Middle-Aged','Senior']
)

# print(new_df['AgeGroup'].value_counts())

##Standardisation
from sklearn.preprocessing import StandardScaler
col=['NumberCarsOwned','NumberChildrenAtHome']
scaler2=StandardScaler()
new_df[col]=scaler2.fit_transform(new_df[col])

##Binarization
final_df=pd.get_dummies(new_df,columns=['Gender','MaritalStatus','Education','AgeGroup'])
print(final_df.head())

Gender                  0
MaritalStatus           0
Age                     0
Education               0
HomeOwnerFlag           0
NumberCarsOwned         0
NumberChildrenAtHome    0
TotalChildren           0
YearlyIncome            0
BikeBuyer               0
dtype: int64
        Age  HomeOwnerFlag  NumberCarsOwned  NumberChildrenAtHome  \
0  0.389474              1         1.892524             -0.594371   
1  0.557895              1         0.798389              1.163279   
2  0.410526              0         1.892524             -0.594371   
3  0.494737              1         0.798389              1.163279   
4  0.526316              1        -0.295746             -0.594371   

   TotalChildren  YearlyIncome  BikeBuyer  Gender_F  Gender_M  \
0       0.333333      0.588837          1     False      True   
1       0.666667      0.582798          1     False      True   
2       0.000000      0.620975          1      True     False   
3       0.666667      0.441944          1     False 

Part 3 : Calculating Proximity /Correlation Analysis of two features

In [4]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
obj1=final_df.iloc[0]
obj2=final_df.iloc[1]
A=obj1.values
B=obj2.values

# Simple Matching Coefficient
match=np.sum(A==B)
size=len(A)
smc=match/size
print(smc)

# Jaccard
match=np.sum((A==1) & (B==1))
denominator=np.sum((A==1) | (B==1))
jaccard=match/denominator
print(jaccard)

#cosine
cosine=cosine_similarity([A],[B])
print(cosine)

0.631578947368421
0.7142857142857143
[[0.67451652]]
