In [44]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler

In [45]:
# Load the raw dataset; the first CSV column is used as the row index.
df = pd.read_csv('./data/Original.csv', index_col=0)

In [46]:
import numpy as np
import pandas as pd
from scipy.stats import norm

def ordinal_quantification(df, columns):
    """
    Replace ordinal columns with normal-quantile scores.

    For every column, the sorted distinct values are treated as ordered
    categories with empirical probabilities p_i = n_i / N. With Phi^-1 the
    standard normal quantile function (``norm.ppf``):

    * interior threshold  t_i = Phi^-1(p_1 + ... + p_i)
    * quantified value    q_i = Phi^-1(p_1 + ... + p_(i-1) + p_i / 2)

    Missing values are excluded when estimating the probabilities and stay
    missing in the output; they are not treated as an extra category.

    Args:
        df (pd.DataFrame): Data to quantify. The listed columns are
            overwritten IN PLACE, so the caller's frame changes.
        columns (list): Names of the ordinal columns to quantify.

    Returns:
        pd.DataFrame: The same ``df`` object that was passed in.
        dict: Column name -> 1-D array of the interior thresholds
            (one fewer than the number of categories; -inf/+inf omitted).
    """
    thresholds_dict = {}

    for col in columns:
        ordinal_data = df[col]

        # np.unique would count NaN as one more (highest) category, which
        # inflates N and shifts every threshold, so estimate the category
        # probabilities from the observed values only.
        observed = ordinal_data.dropna()

        # Sorted distinct categories and how often each occurs.
        unique_vals, counts = np.unique(observed, return_counts=True)
        probabilities = counts / len(observed)

        # upper_cum[i] is the probability mass up to and including category i.
        upper_cum = np.cumsum(probabilities)

        # Interior thresholds sit at the cumulative probability after every
        # category except the last (whose upper bound would be +inf).
        thresholds_dict[col] = norm.ppf(upper_cum[:-1])

        # Each category is scored at the midpoint of its probability interval.
        quant_values = norm.ppf(upper_cum - probabilities / 2)

        # Write the scores back into the caller's frame; values without a
        # mapping (i.e. missing ones) become NaN.
        quant_map = dict(zip(unique_vals, quant_values))
        df[col] = ordinal_data.map(quant_map)

    return df, thresholds_dict


In [47]:
# Quantify every ordinal feature (Age, Education, the personality scores,
# SS and Impulsivity). ordinal_quantification overwrites these columns of
# `df` in place and also returns the per-column interior thresholds.
columns = [
    'Age', 'Education',
    'Nscore', 'Escore', 'Oscore', 'Ascore', 'Cscore',
    'SS', 'Impulsivity',
]
quantified_df, thresholds_dict = ordinal_quantification(df, columns)
print(thresholds_dict)

{'Age': array([-0.40942456,  0.24374663,  0.78969104,  1.56419456,  2.34360974]), 'Education': array([-2.17395889, -1.49563802, -1.38328254, -1.09691387, -0.2410078 ,
        0.12063672,  0.85113393,  1.67248   ]), 'Nscore': array([-3.273829  , -3.07260159, -2.5917443 , -2.46260694, -2.38722437,
       -2.30404426, -2.14653601, -1.97027933, -1.78494304, -1.61108222,
       -1.49563802, -1.38675251, -1.27403075, -1.1215144 , -0.9892386 ,
       -0.85687868, -0.72935834, -0.62885655, -0.53281095, -0.40364723,
       -0.29338123, -0.20012949, -0.09789469, -0.00598403,  0.09121601,
        0.18117092,  0.26710292,  0.35930973,  0.47542685,  0.56840312,
        0.69339187,  0.77883949,  0.87428537,  0.94883297,  1.09934402,
        1.16759338,  1.30768129,  1.44470376,  1.5419863 ,  1.67248   ,
        1.77201195,  1.91751597,  2.06147974,  2.20312121,  2.38722437,
        2.55528529,  2.6770635 ,  3.07260159]), 'Escore': array([-3.07260159, -2.94948659, -2.5917443 , -2.49117622, -2.4108562

In [48]:
# Preview the first rows. The ordinal columns already hold quantified scores
# here because ordinal_quantification assigns into `df` itself.
df.head()

Unnamed: 0_level_0,Age,Gender,Education,Country,Ethnicity,Nscore,Escore,Oscore,Ascore,Cscore,...,Ecstasy,Heroin,Ketamine,Legalh,LSD,Meth,Mushrooms,Nicotine,Semer,VSA
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.497881,F,-0.05921,UK,WA,0.312874,-0.57545,-0.583314,-0.916984,-0.006649,...,CL0,CL0,CL0,CL0,CL0,CL0,CL0,CL2,CL0,CL0
2,-0.078537,M,1.984368,UK,WH,-0.678251,1.938848,1.435328,0.760954,-0.14277,...,CL4,CL0,CL2,CL0,CL2,CL3,CL0,CL4,CL0,CL0
3,0.497881,M,-0.05921,UK,WH,-0.467254,0.805225,-0.84732,-1.620897,-1.014499,...,CL0,CL0,CL0,CL0,CL0,CL0,CL1,CL0,CL0,CL0
4,-0.951966,F,1.163659,UK,WH,-0.148818,-0.806145,-0.019283,0.590423,0.584891,...,CL0,CL0,CL2,CL0,CL0,CL0,CL0,CL2,CL0,CL0
5,0.497881,F,1.984368,UK,WH,0.735444,-1.633389,-0.451736,-0.301721,1.306119,...,CL1,CL0,CL0,CL1,CL0,CL0,CL2,CL2,CL0,CL0


In [49]:
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA

def nominal_quantification(df, nominal_columns, non_nominal_columns):
    """
    Quantify nominal features using PCA and category centroids.

    A PCA is fitted on ``non_nominal_columns``. For each nominal column the
    centroid (mean PCA score vector) of every category is computed, each
    centroid is reduced to a single number, and the column is replaced by
    that number per category.

    Args:
        df (pd.DataFrame): The DataFrame containing the data. The nominal
            columns are overwritten IN PLACE, so the caller's frame changes.
        nominal_columns (list): List of nominal columns to be quantified.
        non_nominal_columns (list): List of non-nominal columns used for PCA.

    Returns:
        pd.DataFrame: The same ``df`` object, with nominal features quantified.
        dict: Column name -> array of shape (n_categories, n_components)
            holding the centroid of each category that has at least one
            matching row, in order of first appearance.

    Notes:
        * Rows whose nominal value is missing match no category and are left
          as NaN in the quantified column.
        * If no component has an eigenvalue above 1, centroids are still
          returned but the nominal columns are left unchanged.
    """
    # Step 1: PCA on the non-nominal features only (all components kept).
    retained_features = df[non_nominal_columns]
    pca = PCA()
    pca_retained = pca.fit_transform(retained_features)

    # Kaiser's rule: components with eigenvalue > 1 count as informative.
    # NOTE(review): this threshold presumes roughly unit-variance inputs -
    # confirm the retained features are standardized before this call.
    eigenvalues = pca.explained_variance_
    informative_components = np.where(eigenvalues > 1)[0]

    centroids_dict = {}

    # Step 2: centroids of every category, per nominal feature.
    for col in nominal_columns:
        # Labels and centroids are collected together so they stay aligned
        # even when a category is skipped. A NaN category never matches `==`
        # and is skipped; previously that shifted every later label onto the
        # wrong projection and dropped the last ones.
        kept_categories = []
        centroids = []

        for category in df[col].unique():
            member_mask = (df[col] == category).to_numpy(dtype=bool, na_value=False)
            if not member_mask.any():  # Skip if no data for category
                continue

            kept_categories.append(category)
            # Mean PCA score vector of the rows in this category.
            centroids.append(pca_retained[member_mask].mean(axis=0))

        if not centroids:
            continue

        centroids = np.array(centroids)  # Shape (n_categories, n_components)
        centroids_dict[col] = centroids

        if informative_components.size == 0:
            continue

        # Step 3: direction used to collapse a centroid to a scalar. This is
        # the loading vector of the first informative component of the
        # feature PCA; no separate PCA is fitted on the centroids.
        # NOTE(review): centroids live in PCA-score space while this vector
        # lives in feature space; the product is only shape-compatible
        # because all components are kept. Confirm this is the intended
        # projection rather than the first principal component of the
        # centroids themselves.
        first_pc = pca.components_[informative_components[0]].reshape(-1, 1)

        # Step 4: one scalar per kept category.
        projections = centroids @ first_pc

        # Step 5: replace the nominal feature with its quantified version.
        quant_map = dict(zip(kept_categories, projections.flatten()))
        df[col] = df[col].map(quant_map)

    return df, centroids_dict

# Usage example



In [50]:
# Quantify the nominal features using a PCA fitted on the (already
# quantified) ordinal features. nominal_quantification overwrites the
# nominal columns of `df` in place and returns the per-category centroids.
nominal_columns = ['Gender', 'Country', 'Ethnicity']
non_nominal_columns = [
    'Age', 'Education',
    'Nscore', 'Escore', 'Oscore', 'Ascore', 'Cscore',
    'SS', 'Impulsivity',
]
quantified_df, centroids_dict = nominal_quantification(
    df, nominal_columns, non_nominal_columns
)
print(centroids_dict)

{'Gender': array([[-0.40540129,  0.07116953,  0.16601383, -0.05103606,  0.04036753,
        -0.09551151,  0.10582806,  0.10449228, -0.02378647],
       [ 0.40497139, -0.07109406, -0.16583778,  0.05098194, -0.04032473,
         0.09541022, -0.10571584, -0.10438148,  0.02376125]]), 'Country': array([[-0.57345682,  0.10001225, -0.05531235, -0.11129554,  0.14308995,
        -0.08245525,  0.02714553,  0.08492196, -0.00753269],
       [ 0.17388188,  0.25387784, -0.27131481,  0.03248144, -0.06529933,
         0.20201995,  0.18107971, -0.07339528,  0.00896023],
       [ 0.86652298, -0.16392726,  0.15335041,  0.16524871, -0.21252235,
         0.09212734, -0.0053193 , -0.11937831, -0.01230422],
       [ 0.59526382, -0.12971857,  0.00539646,  0.01017082, -0.21015231,
         0.15858254, -0.31554634, -0.11631088,  0.02502483],
       [ 0.45929539, -0.33273178, -0.06920693,  0.18386412,  0.04254336,
        -0.03764248, -0.02531976, -0.09690138,  0.08039684],
       [ 0.1515467 , -0.18451979, -0.1

In [None]:
# do not touch this code