In [None]:
import pandas as pd
import numpy as np


# Load the raw customer table. The path is relative to the notebook's
# working directory.
df = pd.read_csv("AWCustomers.csv")

print("Columns in dataset:\n", df.columns.tolist())

# Features requested for the analysis; 'BikeBuyer' is the intended target.
requested_features = [
    'Age', 'Occupation', 'Gender', 'MaritalStatus', 'HomeOwnerFlag',
    'NumberCarsOwned', 'NumberChildrenAtHome', 'TotalChildren',
    'YearlyIncome', 'Education', 'CommuteDistance', 'Region', 'BikeBuyer'
]

# Keep only the features that exist in this file, but report the ones that
# were dropped instead of discarding them silently: a missing target or
# predictor changes what every later cell can compute.
selected_features = [col for col in requested_features if col in df.columns]
missing_features = [col for col in requested_features if col not in df.columns]
if missing_features:
    print("\nWarning: requested features not found in dataset:", missing_features)
    # NOTE(review): when 'Age' is missing but 'BirthDate' is present, Age could
    # be derived from it; the reference date to use is not defined here.

# Work on a copy so later in-place edits never touch the raw frame.
df_selected = df[selected_features].copy()

print("\nSelected Features:\n", df_selected.head())

# Measurement scale of every requested feature (documentation only; the
# mapping also lists features that may be absent from the loaded file).
feature_types = {
    'Age': 'Continuous (Ratio)',
    'Occupation': 'Nominal',
    'Gender': 'Nominal',
    'MaritalStatus': 'Nominal',
    'HomeOwnerFlag': 'Binary',
    'NumberCarsOwned': 'Discrete',
    'NumberChildrenAtHome': 'Discrete',
    'TotalChildren': 'Discrete',
    'YearlyIncome': 'Continuous',
    'Education': 'Ordinal',
    'CommuteDistance': 'Ordinal',
    'Region': 'Nominal',
    'BikeBuyer': 'Binary (Target)'
}
print("\nFeature Types:\n", feature_types)


Columns in dataset:
 ['CustomerID', 'Title', 'FirstName', 'MiddleName', 'LastName', 'Suffix', 'AddressLine1', 'AddressLine2', 'City', 'StateProvinceName', 'CountryRegionName', 'PostalCode', 'PhoneNumber', 'BirthDate', 'Education', 'Occupation', 'Gender', 'MaritalStatus', 'HomeOwnerFlag', 'NumberCarsOwned', 'NumberChildrenAtHome', 'TotalChildren', 'YearlyIncome', 'LastUpdated']

Selected Features:
        Occupation Gender MaritalStatus  HomeOwnerFlag  NumberCarsOwned  \
0        Clerical      M             M              1                3   
1        Clerical      M             M              1                2   
2        Clerical      F             S              0                3   
3  Skilled Manual      M             M              1                2   
4  Skilled Manual      M             S              1                1   

   NumberChildrenAtHome  TotalChildren  YearlyIncome        Education  
0                     0              1         81916        Bachelors  
1         

In [31]:
# Preprocessing pipeline: impute -> min-max scale -> discretize Age ->
# standardize -> one-hot encode. The final frames are `df_selected`
# (imputed + standardized + AgeGroup) and `df_encoded` (its one-hot version);
# the min-max scaled values are kept separately in `df_minmax`.

# (a) Handle missing values: median for numeric columns, mode otherwise.
# Dispatching on "is numeric" (rather than dtype == 'object') keeps string or
# categorical columns away from median(), which would raise on them.
for col in df_selected.columns:
    column = df_selected[col]
    if not column.isna().any():
        continue
    if pd.api.types.is_numeric_dtype(column):
        fill_value = column.median()
    else:
        modes = column.mode()
        if modes.empty:
            # Entirely missing column: there is no mode to impute with.
            continue
        fill_value = modes[0]
    df_selected[col] = column.fillna(fill_value)

# (b) Normalize numeric columns (Min-Max scaling)
# The result is stored in its own frame: writing it back into df_selected
# would be overwritten by the z-score step below, and z-scoring is invariant
# to a prior min-max rescale, so the scaled values would simply be lost.
numeric_cols = ['Age','NumberCarsOwned','NumberChildrenAtHome','TotalChildren','YearlyIncome']
present_numeric_cols = [col for col in numeric_cols if col in df_selected.columns]

df_minmax = pd.DataFrame(index=df_selected.index)
for col in present_numeric_cols:
    values = df_selected[col].astype(float)
    value_range = values.max() - values.min()
    # A constant column has zero range; map it to 0.0 instead of dividing 0/0.
    df_minmax[col] = (values - values.min()) / value_range if value_range else 0.0

# (c) Discretize Age (using original df values, not normalized)
# Missing ages are median-imputed first, and the last bin is open-ended so
# ages above 70 are labelled 'Senior' rather than silently becoming NaN.
if 'Age' in df.columns:
    age_raw = df['Age'].fillna(df['Age'].median())
    df_selected['AgeGroup'] = pd.cut(
        age_raw,
        bins=[0, 25, 35, 50, np.inf],
        labels=['Young', 'Adult', 'Middle-Aged', 'Senior'],
    )

# (d) Standardization (Z-score)
# Computed from the imputed values; pandas' std() is the sample standard
# deviation (ddof=1).
for col in present_numeric_cols:
    values = df_selected[col].astype(float)
    spread = values.std()
    # A constant column has zero spread; map it to 0.0 instead of dividing 0/0.
    df_selected[col] = (values - values.mean()) / spread if spread else 0.0

# (e) One-hot encode categorical attributes
# drop_first=True removes one reference level per attribute.
cat_cols = ['Occupation','Gender','MaritalStatus','Education','CommuteDistance','Region','AgeGroup']
cat_cols = [col for col in cat_cols if col in df_selected.columns]

df_encoded = pd.get_dummies(df_selected, columns=cat_cols, drop_first=True)

print("\nTransformed Data:\n", df_encoded.head())



Transformed Data:
    HomeOwnerFlag  NumberCarsOwned  NumberChildrenAtHome  TotalChildren  \
0              1         1.892473             -0.594355       0.161337   
1              1         0.798367              1.163248       1.239719   
2              0         1.892473             -0.594355      -0.917044   
3              1         0.798367              1.163248       1.239719   
4              1        -0.295738             -0.594355      -0.917044   

   YearlyIncome  Occupation_Management  Occupation_Manual  \
0      0.298547                  False              False   
1      0.271173                  False              False   
2      0.444249                  False              False   
3     -0.367391                  False              False   
4     -0.682747                  False              False   

   Occupation_Professional  Occupation_Skilled Manual  Gender_M  \
0                    False                      False      True   
1                    False        

In [None]:
# Pick two rows
# Convert explicitly to float: a row of a mixed bool/float frame is otherwise
# an object-dtype array.
obj1 = df_encoded.iloc[0].to_numpy(dtype=float)
obj2 = df_encoded.iloc[1].to_numpy(dtype=float)

# (a) Similarities
# Binary attributes are the columns whose values are all 0/1 (True/False
# compare equal to 1/0, so boolean dummy columns are included).
binary_cols = [c for c in df_encoded.columns if set(df_encoded[c].unique()).issubset({0,1})]
bin1 = df_encoded[binary_cols].iloc[0].to_numpy(dtype=bool)
bin2 = df_encoded[binary_cols].iloc[1].to_numpy(dtype=bool)

# Simple Matching: share of binary attributes on which both rows agree
# (0-0 matches count). Undefined (NaN) when there are no binary attributes.
simple_matching = np.sum(bin1 == bin2) / len(bin1) if len(bin1) else np.nan

# Jaccard: 1-1 matches over attributes set in at least one row (0-0 matches
# are ignored). Undefined (NaN) when neither row has any attribute set.
both_set = np.sum(np.logical_and(bin1, bin2))
either_set = np.sum(np.logical_or(bin1, bin2))
jaccard = both_set / either_set if either_set else np.nan

# Cosine: computed over the full encoded vectors. Undefined (NaN) when
# either vector has zero length.
norm_product = np.linalg.norm(obj1) * np.linalg.norm(obj2)
cosine_sim = np.dot(obj1, obj2) / norm_product if norm_product else np.nan

print("\nSimilarity between Row 0 & Row 1:")
print("Simple Matching:", simple_matching)
print("Jaccard:", jaccard)
print("Cosine:", cosine_sim)

# (b) Correlation between CommuteDistance and YearlyIncome
# NOTE(review): only the first CommuteDistance dummy column is used as a proxy
# for the attribute; an ordinal encoding may be more appropriate - confirm.
commute_cols = [c for c in df_encoded.columns if "CommuteDistance" in c]
if commute_cols:
    commute_feature = df_encoded[commute_cols[0]].astype(float)
    income = df_encoded['YearlyIncome'].astype(float)

    # Pearson correlation. Covariance and standard deviations must use the same
    # normalisation: np.mean gives the population covariance (ddof=0), so the
    # standard deviations are taken with ddof=0 too (pandas defaults to ddof=1).
    cov = np.mean((commute_feature - commute_feature.mean()) * (income - income.mean()))
    spread_product = commute_feature.std(ddof=0) * income.std(ddof=0)
    # Undefined (NaN) when either variable is constant.
    corr = cov / spread_product if spread_product else np.nan

    print("\nCorrelation between Commute Distance and Yearly Income:", corr)



Similarity between Row 0 & Row 1:
Simple Matching: 0.9090909090909091
Jaccard: 0.6666666666666666
Cosine: 0.49062704799182394
