# ALPHA-ANONYMOUS K-ANONYMITY 

In [7]:
import pandas as pd
from collections import defaultdict

# Read the raw financial sample from disk and show its first five rows as text.
df = pd.read_csv(filepath_or_buffer='C:\\Users\\Smile\\Downloads\\Financial Sample.csv')
print(df.head(n=5))

def _merge_undersized(partitions, alpha, beta):
    """Fold each partition smaller than ``alpha`` into the closest-sized other
    partition, provided the combined size does not exceed ``beta``."""
    merged = [list(part) for part in partitions]
    alive = [True] * len(merged)
    for i, part in enumerate(merged):
        if not alive[i] or len(part) >= alpha:
            continue
        # Only partners that keep the merged partition within beta are eligible;
        # the partition itself is never its own partner.
        candidates = [
            j for j in range(len(merged))
            if j != i and alive[j] and len(part) + len(merged[j]) <= beta
        ]
        if not candidates:
            # Nothing fits: leave the partition as it is.
            continue
        target = min(candidates, key=lambda j: abs(len(merged[j]) - len(part)))
        merged[target].extend(part)
        alive[i] = False
    return [part for part, keep in zip(merged, alive) if keep]


def _split_oversized(partition, alpha, beta):
    """Cut a partition larger than ``beta`` into contiguous, near-equal chunks
    without letting any chunk fall below ``alpha`` rows."""
    size = len(partition)
    if size <= beta:
        return [partition]
    wanted = -(-size // beta)                    # ceil(size / beta)
    affordable = max(1, size // max(alpha, 1))   # most chunks that stay >= alpha
    pieces = min(wanted, affordable)
    base, extra = divmod(size, pieces)
    chunks = []
    start = 0
    for k in range(pieces):
        stop = start + base + (1 if k < extra else 0)
        chunks.append(partition[start:stop])
        start = stop
    return chunks


def alpha_anonymize(data, alpha, beta, sensitive_attributes):
    """
    Applies Alpha-Anonymous K-Anonymity to a dataset.

    Rows are grouped into equivalence classes by their values in
    ``sensitive_attributes``. Classes smaller than ``alpha`` are merged into
    the closest-sized class when the merged size stays within ``beta``;
    classes larger than ``beta`` are split into near-equal chunks (never
    smaller than ``alpha``). Finally, within every resulting partition, rows
    whose sensitive-attribute combination occurs fewer than ``alpha`` times
    are suppressed.

    NOTE(review): because the last step re-groups by the sensitive attributes,
    a class with fewer than ``alpha`` rows is suppressed even after it has
    been merged. Confirm whether merged partitions were meant to be kept whole.

    Parameters:
    - data: Pandas DataFrame containing the dataset to be anonymized (not modified)
    - alpha: Integer minimum group size for each group in the dataset
    - beta: Integer maximum group size for each group in the dataset
    - sensitive_attributes: List of strings containing the names of the sensitive attributes

    Returns:
    - Pandas DataFrame containing the retained rows, each at most once, with
      their original index labels; empty (same columns) when nothing survives

    Raises:
    - ValueError: if ``beta`` is smaller than 1
    """
    if beta < 1:
        raise ValueError("beta must be a positive integer")

    # Group row positions (not labels, so duplicate index labels are safe)
    # by their sensitive-attribute tuple, in order of first appearance.
    classes = {}
    keys = data[sensitive_attributes].itertuples(index=False, name=None)
    for position, key in enumerate(keys):
        classes.setdefault(key, []).append(position)

    # Merge undersized classes; this works on copies, so nothing is mutated
    # while it is being iterated.
    partitions = _merge_undersized(list(classes.values()), alpha, beta)

    # Split oversized partitions and apply the per-partition suppression filter.
    kept = []
    for partition in partitions:
        for chunk in _split_oversized(partition, alpha, beta):
            chunk_frame = data.iloc[chunk]
            filtered = chunk_frame.groupby(sensitive_attributes).filter(
                lambda rows: len(rows) >= alpha
            )
            if not filtered.empty:
                kept.append(filtered)

    # pd.concat rejects an empty list, so return an empty slice instead.
    if not kept:
        return data.iloc[0:0]
    return pd.concat(kept)


      Segment  Country     Product   Discount Band   Units Sold   \
0  Government   Canada   Carretera            None    $1,618.50    
1  Government  Germany   Carretera            None    $1,321.00    
2   Midmarket   France   Carretera            None    $2,178.00    
3   Midmarket  Germany   Carretera            None      $888.00    
4   Midmarket   Mexico   Carretera            None    $2,470.00    

   Manufacturing Price   Sale Price   Gross Sales   Discounts         Sales   \
0                $3.00       $20.00    $32,370.00        $-      $32,370.00    
1                $3.00       $20.00    $26,420.00        $-      $26,420.00    
2                $3.00       $15.00    $32,670.00        $-      $32,670.00    
3                $3.00       $15.00    $13,320.00        $-      $13,320.00    
4                $3.00       $15.00    $37,050.00        $-      $37,050.00    

          COGS        Profit       Date  Month Number  Month Name   Year  
0   $16,185.00    $16,185.00   1/1/

In [8]:
# Re-read the source file and anonymize it, treating the segment and the
# month number as the sensitive attributes.
data = pd.read_csv(filepath_or_buffer='C:\\Users\\Smile\\Downloads\\Financial Sample.csv')
anon_data = alpha_anonymize(
    data,
    alpha=5,
    beta=10,
    sensitive_attributes=['Segment', 'Month Number'],
)

# Plain-text preview: first ten rows of the result
print(anon_data.head(n=10))

# Rich notebook rendering of the whole anonymized frame
display(anon_data)


        Segment                   Country     Product   Discount Band   \
0    Government                    Canada   Carretera            None    
1    Government                   Germany   Carretera            None    
31   Government                    Mexico        Velo            None    
53   Government                    France       Paseo             Low    
188  Government                    Canada       Paseo             Low    
214  Government                   Germany       Paseo          Medium    
231  Government                    France     Montana          Medium    
273  Government  United States of America   Carretera          Medium    
414  Government                    Mexico       Paseo          Medium    
509  Government                   Germany     Montana            High    

     Units Sold   Manufacturing Price   Sale Price    Gross Sales   \
0     $1,618.50                 $3.00       $20.00     $32,370.00    
1     $1,321.00                 $3.00       $

Unnamed: 0,Segment,Country,Product,Discount Band,Units Sold,Manufacturing Price,Sale Price,Gross Sales,Discounts,Sales,COGS,Profit,Date,Month Number,Month Name,Year
0,Government,Canada,Carretera,,"$1,618.50",$3.00,$20.00,"$32,370.00",$-,"$32,370.00","$16,185.00","$16,185.00",1/1/2014,1,January,2014
1,Government,Germany,Carretera,,"$1,321.00",$3.00,$20.00,"$26,420.00",$-,"$26,420.00","$13,210.00","$13,210.00",1/1/2014,1,January,2014
31,Government,Mexico,Velo,,"$1,493.00",$120.00,$7.00,"$10,451.00",$-,"$10,451.00","$7,465.00","$2,986.00",1/1/2014,1,January,2014
53,Government,France,Paseo,Low,"$3,945.00",$10.00,$7.00,"$27,615.00",$276.15,"$27,338.85","$19,725.00","$7,613.85",1/1/2014,1,January,2014
188,Government,Canada,Paseo,Low,"$4,251.00",$10.00,$7.00,"$29,757.00","$1,190.28","$28,566.72","$21,255.00","$7,311.72",1/1/2014,1,January,2014
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
453,Midmarket,Canada,Amarilla,Medium,"$1,630.50",$260.00,$15.00,"$24,457.50","$2,201.18","$22,256.33","$16,305.00","$5,951.33",7/1/2014,7,July,2014
478,Midmarket,Mexico,VTT,High,$641.00,$250.00,$15.00,"$9,615.00",$961.50,"$8,653.50","$6,410.00","$2,243.50",7/1/2014,7,July,2014
501,Midmarket,United States of America,Amarilla,High,"$3,199.50",$260.00,$15.00,"$47,992.50","$5,279.18","$42,713.33","$31,995.00","$10,718.33",7/1/2014,7,July,2014
561,Midmarket,France,VTT,High,"$3,874.50",$250.00,$15.00,"$58,117.50","$6,974.10","$51,143.40","$38,745.00","$12,398.40",7/1/2014,7,July,2014


# EFFICIENCY OF ALPHA-ANONYMOUS K-ANONYMITY

In [9]:
import time

# Load the input dataset
data = pd.read_csv('C:\\Users\\Smile\\Downloads\\Financial Sample.csv')

# Measure the time it takes to anonymize the dataset.
# perf_counter() is the clock meant for timing intervals: unlike time.time()
# it is monotonic and has the highest available resolution.
start_time = time.perf_counter()
anon_data = alpha_anonymize(data, alpha=5, beta=10, sensitive_attributes=['Segment', 'Country'])
end_time = time.perf_counter()

# Print the time it took to anonymize the dataset
print(f"Anonymization took {end_time - start_time:.2f} seconds")


Anonymization took 0.22 seconds


In [15]:
import pandas as pd
import math

def entropy(df, col):
    """Calculate the Shannon entropy, in bits, of a column in a DataFrame.

    Parameters:
    - df: Pandas DataFrame holding the column
    - col: column name (anything accepted by ``DataFrame.groupby``)

    Returns:
    - Non-negative float; 0.0 when the column has at most one distinct value
      or no rows at all
    """
    counts = df.groupby(col).size()
    # A categorical column can report unobserved categories with a count of 0;
    # they contribute nothing to the entropy and log2(0) would raise.
    counts = counts[counts > 0]
    total = counts.sum()
    if total == 0:
        return 0.0
    probs = counts / total
    # Adding 0.0 turns the -0.0 produced by a single-valued column into 0.0.
    return float(-(probs * probs.map(math.log2)).sum()) + 0.0

def generalize(df, qi_cols, hierarchy, alpha):
    """Apply entropy-based generalization to achieve alpha anonymity.

    For each quasi-identifier column, successive hierarchy levels are applied
    while the column has more than ``alpha`` distinct values and its entropy
    exceeds ``log2(alpha)``.

    Parameters:
    - df: Pandas DataFrame to generalize; it is copied, never modified
    - qi_cols: iterable of quasi-identifier column names
    - hierarchy: mapping from column name to a sequence of levels; each level
      is passed to ``Series.map``, so it should be a dict, Series or callable
    - alpha: threshold on the number of distinct values per column

    Returns:
    - A new DataFrame with the generalized columns

    Raises:
    - KeyError: if a column needs generalizing but has no entry in ``hierarchy``
    """
    # Work on a copy so the caller's frame is not silently rewritten.
    result = df.copy()
    for col in qi_cols:
        level = 0
        while len(result.groupby(col).size()) > alpha:
            # Stop once the column's entropy no longer exceeds log2(alpha).
            if entropy(result, col) <= math.log2(alpha):
                break
            levels = hierarchy[col]
            if level >= len(levels):
                # Hierarchy exhausted: keep the most general form reached
                # instead of failing with an IndexError.
                break
            # Replace each value with the more general value in the hierarchy
            result[col] = result[col].map(levels[level])
            level += 1
    return result

# Example usage:
df = pd.read_csv('C:\\Users\\Smile\\Downloads\\Financial Sample.csv')
qi_cols = ['Segment','Country']
# NOTE(review): generalize() hands hierarchy[col][level] to Series.map, which
# presumably needs a dict or callable per level rather than a plain string.
# These lists only go unnoticed while no column has more than `alpha`
# distinct values. Confirm the intended hierarchy (and the 'Goverment' spelling).
hierarchy = {
    'Segment': ['Goverment', 'mid-market'],
    'Country': ['Canada', 'Germany', 'France', 'Mexico'],
}
alpha = 10

# perf_counter() is the clock meant for timing short intervals; time.time()
# is a wall clock whose resolution can be too coarse for a run this fast.
start_time = time.perf_counter()
anon_df = generalize(df, qi_cols, hierarchy, alpha)
end_time = time.perf_counter()

print(f"Time taken: {end_time - start_time:.2f} seconds")


Time taken: 0.00 seconds
