In [None]:
import pandas as pd

In [None]:
# Load the customer table from CSV into `customers`.
# NOTE(review): the absolute '/content/...' path looks like a Colab upload location; confirm before running elsewhere.
customers = pd.read_csv('/content/AWCustomers_ml_lab_2.csv')

In [None]:
# Load the sales table from CSV into `sales`.
# NOTE(review): the absolute '/content/...' path looks like a Colab upload location; confirm before running elsewhere.
sales = pd.read_csv('/content/AWSales_ml_lab_2.csv')

In [None]:
# Preview the first rows of the customers table.
customers.head()

In [None]:
# Preview the first rows of the sales table.
sales.head()

### **Part I: Based on Feature Selection, Cleaning, and Preprocessing to Construct an Input from Data Source**

(a) Examine the values of each attribute and select only those attributes that are likely to help
predict future bike buyers; these become the input for the data mining algorithms. Remove all
unnecessary attributes. (Select features by analysis only.)

In [None]:
# Identifier, name, street-address and bookkeeping columns that are removed
# from the customers table before feature selection.
columns_to_remove = [
    'CustomerID',
    'Title',
    'FirstName',
    'MiddleName',
    'LastName',
    'Suffix',
    'AddressLine1',
    'AddressLine2',
    'LastUpdated',
]
customers = customers.drop(columns=columns_to_remove)

In [None]:
# Preview the customers table after the column drop above.
customers.head()

(b) Create a new Data Frame with the selected attributes only.


In [None]:
# Attributes kept as model input (demographics, household, income, location).
selected_features = [
    'Education', 'Occupation', 'Gender', 'MaritalStatus',
    'HomeOwnerFlag', 'NumberCarsOwned', 'NumberChildrenAtHome',
    'TotalChildren', 'YearlyIncome', 'City', 'StateProvinceName'
]

# Build the new DataFrame as an explicit copy so later edits can never touch
# (or warn about touching) the original `customers` frame.
df_selected = customers.loc[:, selected_features].copy()

# Last expression of the cell: rendered with the notebook's rich table display
# instead of the plain-text output that print() produces.
df_selected.head()


(c) Determine the data value type (discrete or continuous; then nominal, ordinal, interval, or ratio) of
each selected attribute in order to identify the preprocessing tasks needed to create the input for data mining.

In [None]:
def determine_data_type(df, feature, max_discrete_levels=15):
    """Classify one column as discrete/continuous and by measurement scale.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that contains ``feature``.
    feature : str
        Name of the column to classify.
    max_discrete_levels : int, default 15
        A numeric column with fewer distinct values than this is reported
        as 'Discrete'; otherwise it is 'Continuous'.

    Returns
    -------
    tuple of (str, str)
        ``(value_type, value_subtype)`` where ``value_type`` is 'Discrete' or
        'Continuous' and ``value_subtype`` is 'Nominal', 'Ordinal' or 'Ratio'.
    """
    column = df[feature]
    # Test for "not numeric" rather than "dtype == object" so that string and
    # category dtypes are also recognised as non-numeric (hence discrete).
    is_numeric = pd.api.types.is_numeric_dtype(column.dtype)

    # Discrete vs Continuous
    if not is_numeric or column.nunique() < max_discrete_levels:
        value_type = 'Discrete'
    else:
        value_type = 'Continuous'

    # Nominal, Ordinal, Interval, Ratio
    # HomeOwnerFlag is a yes/no flag: its codes label categories, not amounts.
    nominal_features = {
        'Gender', 'Occupation', 'MaritalStatus', 'City', 'StateProvinceName',
        'HomeOwnerFlag',
    }
    ordinal_features = {'Education'}

    if feature in nominal_features:
        value_subtype = 'Nominal'
    elif feature in ordinal_features:
        value_subtype = 'Ordinal'
    elif not is_numeric:
        # An unlisted text column cannot be on a ratio scale.
        value_subtype = 'Nominal'
    else:
        # Remaining numeric columns (counts, income) have a true zero.
        value_subtype = 'Ratio'

    return value_type, value_subtype

In [None]:
# One record per selected feature: its name plus the (value type, subtype)
# pair reported by determine_data_type.
record_keys = ('Feature', 'Value Type', 'Subtype')
data_types = [
    dict(zip(record_keys, (feature, *determine_data_type(df_selected, feature))))
    for feature in selected_features
]

df_data_types = pd.DataFrame(data_types)

# Last expression of the cell: shown as a rich table rather than print() text.
df_data_types

### **Part II: Data Preprocessing and Transformation**

Depending on the data type of each attribute, transform each object in your preprocessed data.  
Use all the data rows (approximately 18,000) with the selected features as input for all the tasks below; do
not perform these tasks on the smaller data set obtained from random sampling.  

In [None]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder, KBinsDiscretizer
from sklearn.impute import SimpleImputer

# Work on a copy so that df_selected keeps the untransformed values.
df = df_selected.copy()

(a) Handling Null values

In [None]:
# Column groups; both lists are reused by the later preprocessing cells.
categorical_features = [
    'Education',
    'Occupation',
    'Gender',
    'MaritalStatus',
    'City',
    'StateProvinceName',
]
numerical_features = [
    'HomeOwnerFlag',
    'NumberCarsOwned',
    'NumberChildrenAtHome',
    'TotalChildren',
    'YearlyIncome',
]

# Mode fills gaps in the categorical columns, mean fills gaps in the numerical ones.
categorical_imputer = SimpleImputer(strategy='most_frequent')
numerical_imputer = SimpleImputer(strategy='mean')

# Apply each imputer to its own column group, categorical first.
for imputer, columns in (
    (categorical_imputer, categorical_features),
    (numerical_imputer, numerical_features),
):
    df[columns] = imputer.fit_transform(df[columns])

In [None]:
# Preview df after the imputation step.
df.head()

(b) Normalization

In [None]:
# Rescale every numerical column with MinMaxScaler (default output range [0, 1]).
# NOTE(review): section (d) below overwrites these same columns with StandardScaler,
# so this rescaling does not survive to the final frame — confirm that is intended.
scaler = MinMaxScaler()
df[numerical_features] = scaler.fit_transform(df[numerical_features])

(c) Discretization (Binning) on Continuous attributes or Categorical Attributes with too many different
values  

In [None]:
# Split YearlyIncome into 5 equal-width bins ('uniform') and store the bin
# index (encode='ordinal') in a new column next to the original one.
discretizer = KBinsDiscretizer(n_bins=5, encode='ordinal', strategy='uniform')
df['YearlyIncome_Binned'] = discretizer.fit_transform(df[['YearlyIncome']])

In [None]:
# Turn City into integer codes with pd.factorize, then cut the code range into
# 10 equal-width bins and store the bin index in 'City_Binned'.
# NOTE(review): factorize numbers values in order of first appearance by default,
# so a bin groups cities that happened to appear near each other in the file,
# not cities that are similar — confirm this grouping is what is wanted.
city_discretizer = KBinsDiscretizer(n_bins=10, encode='ordinal', strategy='uniform')
df['City_Binned'] = city_discretizer.fit_transform(df[['City']].apply(lambda x: pd.factorize(x)[0]))

(d) Standardization/Normalization

In [None]:
# Standardize the numerical columns with StandardScaler (zero mean, unit
# variance per column), overwriting the values written by the min-max step in (b).
standard_scaler = StandardScaler()
df[numerical_features] = standard_scaler.fit_transform(df[numerical_features])

(e) Binarization (One Hot Encoding)

In [None]:
# Replace each categorical column with one indicator column per category value.
# NOTE(review): after this cell the original categorical column names no longer
# exist in df, yet the Part III cell refers to those names again — keep that in
# mind when running the notebook top to bottom.
df = pd.get_dummies(df, columns=categorical_features)

In [None]:
# Preview df after one-hot encoding.
df.head()

In [None]:
# df.drop(['YearlyIncome', 'City'], axis=1, inplace=True)

### **Part III: Calculating Proximity /Correlation Analysis of two features**

Make sure that every numeric attribute is transformed to the same scale, that every nominal attribute is
binarized, and that every discretized numeric attribute is standardized. Make sure to apply the correct
similarity measure to nominal (one-hot encoded)/binary attributes and to numeric attributes, respectively.

(a) Calculate the Simple Matching, Jaccard, and Cosine similarity between the following two
objects of your transformed input data.

In [None]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics.pairwise import cosine_similarity
from scipy.spatial.distance import jaccard

# df is the DataFrame produced by the Part II preprocessing cells.

# Ensure all numeric attributes are standardized
numeric_features = ['HomeOwnerFlag', 'NumberCarsOwned', 'NumberChildrenAtHome', 'TotalChildren', 'YearlyIncome']

# Apply Standardization (refitting on already standardized columns leaves them
# essentially unchanged, so this is safe whether or not Part II (d) has run).
scaler = StandardScaler()
df[numeric_features] = scaler.fit_transform(df[numeric_features])

# Binarize nominal attributes
nominal_features = ['Education', 'Occupation', 'Gender', 'MaritalStatus', 'City', 'StateProvinceName']

# Apply One-Hot Encoding only to nominal columns that are still present.
# Part II (e) already replaces these columns with indicator columns, and asking
# pd.get_dummies for columns that no longer exist raises KeyError; the filter
# keeps Run All working and makes this cell safe to re-run.
pending_nominal = [column for column in nominal_features if column in df.columns]
if pending_nominal:
    df = pd.get_dummies(df, columns=pending_nominal)

# Ensure discretized numeric attributes are standardized
if 'YearlyIncome_Binned' in df.columns:
    # fit_transform returns an (n, 1) array; flatten it for the single column.
    df['YearlyIncome_Binned'] = (
        StandardScaler().fit_transform(df[['YearlyIncome_Binned']]).ravel()
    )

# Select two sample objects (rows) for similarity calculation
obj1 = df.iloc[0]
obj2 = df.iloc[1]


In [None]:
import numpy as np

# Simple Matching Coefficient (SMC)
def simple_matching_coefficient(a, b):
    """Return the fraction of positions at which ``a`` and ``b`` are equal.

    Parameters
    ----------
    a, b : array-like
        Two vectors of the same shape (lists, NumPy arrays or pandas Series).
        Values are compared position by position; Series labels are ignored.

    Returns
    -------
    float
        Number of matching positions divided by the number of positions, or
        ``nan`` when the vectors are empty (the ratio is undefined).

    Raises
    ------
    ValueError
        If ``a`` and ``b`` do not have the same shape.
    """
    # Convert first: for plain lists `a == b` is a single bool, not a
    # per-position comparison.
    a_values = np.asarray(a)
    b_values = np.asarray(b)
    if a_values.shape != b_values.shape:
        raise ValueError(
            f"inputs must have the same shape, got {a_values.shape} and {b_values.shape}"
        )
    if a_values.size == 0:
        return float('nan')
    matches = np.count_nonzero(a_values == b_values)
    return matches / a_values.size

# Jaccard Similarity
def jaccard_similarity(a, b):
    """Return the Jaccard similarity of two vectors treated as binary.

    Every non-zero entry counts as "present". The result is the number of
    positions present in both vectors divided by the number of positions
    present in at least one of them.

    Parameters
    ----------
    a, b : array-like
        Two vectors of the same shape (lists, NumPy arrays or pandas Series).

    Returns
    -------
    float
        Similarity in [0, 1]. When neither vector has a non-zero entry the
        result is defined as 1.0 (both describe the same empty set), which
        mirrors SciPy's convention of a Jaccard distance of 0 in that case.

    Raises
    ------
    ValueError
        If ``a`` and ``b`` do not have the same shape.
    """
    # Convert first: for plain lists `a != 0` is a single bool, not a
    # per-position test.
    a_present = np.asarray(np.asarray(a) != 0, dtype=bool)
    b_present = np.asarray(np.asarray(b) != 0, dtype=bool)
    if a_present.shape != b_present.shape:
        raise ValueError(
            f"inputs must have the same shape, got {a_present.shape} and {b_present.shape}"
        )
    union = np.count_nonzero(a_present | b_present)
    if union == 0:
        # Avoid 0/0 when both vectors are entirely zero.
        return 1.0
    intersection = np.count_nonzero(a_present & b_present)
    return intersection / union

# Cosine Similarity
def cosine_sim(a, b):
    """Return the cosine similarity between two vectors.

    Parameters
    ----------
    a, b : array-like
        Two vectors of equal length (lists, NumPy arrays or pandas Series)
        whose entries can be converted to float.

    Returns
    -------
    float
        The single value produced by scikit-learn's ``cosine_similarity``
        for the two vectors.
    """
    # np.asarray accepts Series, arrays and lists alike (``.values`` exists only
    # on pandas objects), and dtype=float turns an object-dtype row that mixes
    # bools and floats into a plain numeric vector. Each input becomes one row.
    a_row = np.asarray(a, dtype=float).reshape(1, -1)
    b_row = np.asarray(b, dtype=float).reshape(1, -1)
    return cosine_similarity(a_row, b_row)[0][0]

# Compute the similarity measures, each on the attribute type it is meant for:
# Simple Matching and Jaccard on the one-hot (binary) columns, cosine on the
# numeric columns. Mixing standardized numeric values into the binary measures
# distorts them (nearly every standardized value is non-zero and unequal).

# pd.get_dummies creates bool indicator columns (uint8 on older pandas); every
# other column of df is a scaled or binned numeric attribute.
binary_columns = df.select_dtypes(include=['bool', 'uint8']).columns
numeric_columns = df.columns.difference(binary_columns, sort=False)
if binary_columns.empty or numeric_columns.empty:
    raise ValueError(
        "expected both one-hot encoded and numeric columns in df; "
        "run the preprocessing cells first"
    )

# Cast explicitly: a row taken across mixed column dtypes is object-typed.
obj1_binary = obj1[binary_columns].astype(int)
obj2_binary = obj2[binary_columns].astype(int)
obj1_numeric = obj1[numeric_columns].astype(float)
obj2_numeric = obj2[numeric_columns].astype(float)

smc = simple_matching_coefficient(obj1_binary, obj2_binary)
jaccard_sim = jaccard_similarity(obj1_binary, obj2_binary)
cos_sim = cosine_sim(obj1_numeric, obj2_numeric)

print(f"Simple Matching Coefficient: {smc}")
print(f"Jaccard Similarity: {jaccard_sim}")
print(f"Cosine Similarity: {cos_sim}")


(b) Calculate Correlation between two features Commute Distance and Yearly Income

In [None]:
# 'CommuteDistance' is not among the selected features, so it is absent from
# the preprocessed `df`. Read it from `customers` instead; `df` was derived
# from `customers` without reordering rows, so both share the same row index.
if 'CommuteDistance' not in customers.columns:
    raise KeyError("'CommuteDistance' column not found in the customers data")

commute_distance = customers['CommuteDistance']
if not pd.api.types.is_numeric_dtype(commute_distance):
    # NOTE(review): assumes text labels that contain a distance such as
    # "0-1 Miles"; the first number in each label is used as its value.
    # Confirm against the actual column contents.
    commute_distance = pd.to_numeric(
        commute_distance.astype(str).str.extract(r'(\d+(?:\.\d+)?)', expand=False),
        errors='coerce',
    )

# Calculate Correlation (rows are matched on index; rows with a missing value
# on either side are left out by Series.corr).
correlation = commute_distance.corr(df['YearlyIncome'])
print(f"Correlation between Commute Distance and Yearly Income: {correlation}")
