In [2]:
import pandas as pd
import numpy as np

In [3]:
# Labels for the CSV columns, in file order; they are handed to read_csv
# via `names=` rather than being read from the file.
column_names = [
    'Age',
    'Workclass',
    'Fnlwgt',
    'Education',
    'Education-num',
    'Marital-status',
    'Occupation',
    'Relationship',
    'Race',
    'Sex',
    'Capital-gain',
    'Capital-loss',
    'Hours-per-week',
    'Native-country',
    'Income',
]
df = pd.read_csv(filepath_or_buffer="../Data/adult.csv", names=column_names)
df.head()

Unnamed: 0,Age,Workclass,Fnlwgt,Education,Education-num,Marital-status,Occupation,Relationship,Race,Sex,Capital-gain,Capital-loss,Hours-per-week,Native-country,Income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [22]:
# Keep only the rows whose native country is the United States
# (the stored value carries a leading space).
filtered_df = df.loc[df["Native-country"].eq(" United-States")]

In [23]:
# Restrict to respondents whose native country is the United States.
# The filter is applied in this cell so the summary does not depend on
# which other cell last assigned `filtered_df`.
filtered_df = df.loc[df["Native-country"] == " United-States"]

# Count the occurrences of each income value
income_counts = filtered_df["Income"].value_counts()

# Share of the filtered rows that each income value represents, in percent
income_percentages = (income_counts / len(filtered_df)) * 100

# Combine counts and percentages into a new DataFrame for clarity
income_summary = pd.DataFrame({
    "Count": income_counts,
    "Percentage": income_percentages
})

print(income_summary)

        Count  Percentage
Income                   
<=50K   21998   75.415681
>50K     7171   24.584319


In [27]:
# Keep only the rows whose native country is anything other than the
# United States (the stored value carries a leading space).
filtered_df = df.loc[df["Native-country"].ne(" United-States")]

In [29]:
# Exclude rows where "Native-country" is the United States.
# The filter is applied in this cell so the summary does not depend on
# which other cell last assigned `filtered_df`.
filtered_df = df.loc[df["Native-country"] != " United-States"]

# Count the occurrences of each income value
income_counts = filtered_df["Income"].value_counts()

# Share of the filtered rows that each income value represents, in percent
income_percentages = (income_counts / len(filtered_df)) * 100

# Combine counts and percentages into a new DataFrame for clarity
income_summary = pd.DataFrame({
    "Count": income_counts,
    "Percentage": income_percentages
})

print(income_summary)

        Count  Percentage
Income                   
<=50K    2721   80.241817
>50K      670   19.758183


Apriori and FP-Growth association rule mining


In [None]:
import pandas as pd
import numpy as np
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, association_rules
from mlxtend.frequent_patterns import fpgrowth


def prepare_data(df):
    """
    Prepare the dataframe for association rule mining.

    Works on a copy of `df`; the input dataframe is not modified. The copy
    gains these columns:
    1. 'high_income': 1 where Income is '>50K' (ignoring surrounding
       whitespace), otherwise 0
    2. '<column>_cat': an unchanged copy of each categorical column
       (Workclass, Education, Marital-status, Occupation, Relationship,
       Race, Sex, Native-country)
    3. 'Age_cat', 'Education-num_cat', 'Hours-per-week_cat': the numeric
       columns binned into labelled ranges with pd.cut

    Parameters:
    - df: dataframe holding the columns listed above plus 'Income', 'Age',
      'Education-num' and 'Hours-per-week'

    Returns:
    - A new dataframe with the original columns and the added ones. Numeric
      values outside the bin edges get NaN in the corresponding '_cat' column.
    """
    # Create a copy of the dataframe
    df_processed = df.copy()

    # Flag high earners. Fields in the raw file may carry surrounding
    # whitespace (e.g. ' United-States'), so compare on stripped text
    # instead of requiring an exact '>50K' match.
    income_text = df_processed['Income'].astype(str).str.strip()
    df_processed['high_income'] = (income_text == '>50K').astype(int)

    # Columns that are already categorical
    categorical_cols = [
        'Workclass', 'Education', 'Marital-status', 'Occupation',
        'Relationship', 'Race', 'Sex', 'Native-country'
    ]

    # Mirror each categorical column under a '_cat' name so that downstream
    # code can select every rule column by suffix
    for col in categorical_cols:
        df_processed[col + '_cat'] = df_processed[col]

    # Categorize numeric columns (intervals are right-closed)
    df_processed['Age_cat'] = pd.cut(df_processed['Age'],
                                     bins=[0, 25, 35, 45, 55, 100],
                                     labels=['Young', 'Early-Career', 'Mid-Career', 'Late-Career', 'Senior'])

    # Education-num is an ordinal code; the sample rows show 11th=7,
    # HS-grad=9 and Bachelors=13. Assuming the usual UCI Adult coding
    # (TODO confirm against the dataset documentation), the edges give:
    #   1-4   Basic      (through 8th grade)
    #   5-8   Some-HS    (9th-12th grade, no diploma)
    #   9-12  HS-Grad    (diploma up to associate degree)
    #   13    Bachelors
    #   14+   Advanced   (master's, professional school, doctorate)
    df_processed['Education-num_cat'] = pd.cut(df_processed['Education-num'],
                                               bins=[0, 4, 8, 12, 13, 100],
                                               labels=['Basic', 'Some-HS', 'HS-Grad', 'Bachelors', 'Advanced'])

    df_processed['Hours-per-week_cat'] = pd.cut(df_processed['Hours-per-week'],
                                                bins=[0, 20, 40, 60, 100],
                                                labels=['Part-time', 'Standard', 'Overtime', 'Extensive'])

    return df_processed


def mine_association_rules(df_processed, algorithm='apriori', min_support=0.1, min_confidence=0.7):
    """
    Mine association rules using specified algorithm

    Every column whose name ends in '_cat', plus 'high_income', contributes
    one item per row. Items are labelled '<column>=<value>' (with the '_cat'
    suffix dropped and surrounding whitespace stripped from the value) so
    that equal values coming from different columns remain distinct items
    and the 0/1 income flag is readable in the resulting rules. Missing
    values (NaN) contribute no item.

    Parameters:
    - df_processed: Preprocessed dataframe
    - algorithm: 'apriori' or 'fpgrowth'
    - min_support: Minimum support threshold
    - min_confidence: Minimum confidence threshold

    Returns:
    - DataFrame of association rules, sorted by lift in descending order

    Raises:
    - ValueError: if `algorithm` is neither 'apriori' nor 'fpgrowth'
    """
    # Fail fast on an unrecognised name instead of silently running FP-Growth
    miners = {'apriori': apriori, 'fpgrowth': fpgrowth}
    if algorithm not in miners:
        raise ValueError(
            f"algorithm must be 'apriori' or 'fpgrowth', got {algorithm!r}")

    # Select relevant categorical columns
    rule_columns = [
        col for col in df_processed.columns
        if col.endswith('_cat') or col == 'high_income'
    ]

    # Item prefix for each column: the column name without the '_cat' suffix
    item_prefixes = [
        col[:-len('_cat')] if col.endswith('_cat') else col
        for col in rule_columns
    ]

    # Prepare transactions: one list of 'column=value' items per row
    transactions = [
        [
            f"{prefix}={str(val).strip()}"
            for prefix, val in zip(item_prefixes, row)
            if pd.notna(val)
        ]
        for row in df_processed[rule_columns].itertuples(index=False, name=None)
    ]

    # One-hot encode transactions
    te = TransactionEncoder()
    te_ary = te.fit(transactions).transform(transactions)
    te_df = pd.DataFrame(te_ary, columns=te.columns_)

    # Apply frequent itemset mining with the selected algorithm
    frequent_itemsets = miners[algorithm](
        te_df, min_support=min_support, use_colnames=True)

    # Generate association rules
    rules = association_rules(
        frequent_itemsets, metric="confidence", min_threshold=min_confidence)

    # Sort rules by lift, strongest first
    return rules.sort_values('lift', ascending=False)


# Build the categorical view of the census data once; both miners share it.
df_processed = prepare_data(df)

# Run Apriori first and show its rules.
print("Apriori Association Rules:")
apriori_rules = mine_association_rules(df_processed, 'apriori')
print(apriori_rules)

# Then run FP-Growth on the same prepared frame.
print("\nFP-Growth Association Rules:")
fpgrowth_rules = mine_association_rules(df_processed, 'fpgrowth')
print(fpgrowth_rules)