In [10]:
from collections import Counter
import pandas as pd
import import_ipynb
from cluster_factors.cluster_binary_factors import cluster_factors_voting, cluster_factors_rand_un, cluster_factors_corr, cluster_factors_rand
import yaml


def get_df(path='data/telco.csv'):
    """Load the telco CSV and return it one-hot encoded with the outcome name.

    Processing steps:
      1. Read the CSV at ``path``.
      2. Discard every ``int64`` / ``float64`` column.
      3. Discard ``TotalCharges`` and ``customerID`` if they are still present.
      4. Map ``'Yes'`` / ``'No'`` to ``True`` / ``False`` in the binary columns.
         ``Series.map`` leaves any other value as missing (NaN).
      5. One-hot encode whatever is still non-boolean with ``pd.get_dummies``.

    Args:
        path: Location of the CSV file. Defaults to the path that was
            previously hard-coded, so existing ``get_df()`` calls are unchanged.

    Returns:
        tuple: ``(df_encoded, 'Churn')`` - the encoded DataFrame and the name
        of the outcome column.

    Raises:
        FileNotFoundError: if ``path`` does not exist (from ``pd.read_csv``).
        KeyError: if one of the expected binary columns is missing.
    """
    raw = pd.read_csv(path)

    # Drop numerical columns
    categorical = raw.select_dtypes(exclude=['int64', 'float64'])

    # errors='ignore': when TotalCharges parses cleanly as float it has already
    # been removed by the dtype filter above, and a plain drop would raise
    # KeyError for a column we wanted gone anyway.
    categorical = categorical.drop(columns=['TotalCharges', 'customerID'], errors='ignore')

    # Replace "Yes" and "No" with True and False respectively for binary columns
    binary_columns = ['Partner', 'Dependents', 'PhoneService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection',
                      'TechSupport', 'StreamingTV', 'StreamingMovies', 'PaperlessBilling', 'Churn']
    yes_no = {'Yes': True, 'No': False}
    for col in binary_columns:
        categorical[col] = categorical[col].map(yes_no)

    # Columns that are not purely boolean after the mapping are expanded into
    # one indicator column per distinct value.
    df_encoded = pd.get_dummies(categorical)

    return df_encoded, 'Churn'


def convert_to_dict(test_df, y_name, factors):
    """Turn every row of ``test_df`` into a case dictionary.

    Each case has the keys:
      * ``"pi"``: names of the truthy columns whose factor type is ``"pi"``
      * ``"delta"``: names of the truthy columns whose factor type is ``"delta"``
      * ``"reason"``: a copy of the ``"pi"`` list when the outcome is truthy,
        otherwise a copy of the ``"delta"`` list
      * ``"decision"``: ``"pi"`` when the outcome is truthy, otherwise ``"delta"``

    Columns with any other factor type are ignored (neutral/undecided).

    Args:
        test_df: DataFrame whose rows are the cases; iterated with ``iterrows``.
        y_name: Name of the outcome column; it is never added as a factor.
        factors: Mapping from column name to factor type.

    Returns:
        list[dict]: One case dictionary per row, in row order.

    Raises:
        KeyError: if a truthy non-outcome column has no entry in ``factors``.
    """
    cases = []
    for _, row in test_df.iterrows():
        case = {
            "pi": [],
            "delta": [],
            "reason": [],
            "decision": None
        }

        for col, value in row.items():  # iterate through all factor candidates in case
            # only factors that are present (truthy) in this case are recorded
            if col == y_name or not value:
                continue
            side = factors[col]
            if side in ("pi", "delta"):
                case[side].append(col)
            # todo: neutral/undecided factors are currently ignored

        # the reason is taken to be the full factor list of the winning side
        decision = "pi" if row[y_name] else "delta"
        case["decision"] = decision
        # Store a copy: sharing the list object would make later mutations of
        # "reason" leak into "pi"/"delta", and yaml.dump would serialise the
        # shared object as an anchor/alias pair instead of a plain list.
        case["reason"] = list(case[decision])

        cases.append(case)

    return cases

def display_facotrs(factors):
    """Print how many factors fall into each type, together with their names.

    Args:
        factors: Mapping from factor name to factor type. ``'pi'`` and
            ``'delta'`` are listed under their own headings; every other type
            is listed under "UN".

    Returns:
        None. The summary is written to standard output.
    """
    # Bucket the factor names by type, keeping the mapping's iteration order
    pis = [name for name, kind in factors.items() if kind == 'pi']
    deltas = [name for name, kind in factors.items() if not kind == 'pi' and kind == 'delta']
    uns = [name for name, kind in factors.items() if not (kind == 'pi' or kind == 'delta')]

    # Report the size and members of each bucket
    print("Number of factors by type:")
    print(f"PI factors: {len(pis)} - {pis}")
    print(f"Delta factors: {len(deltas)} - {deltas}")
    print(f"UN factors: {len(uns)} - {uns}")
    
def dataset_to_yaml(cluster_type="corr", test_split=0.5):
    """Cluster the factors, convert the test rows to cases and save them as YAML.

    The encoded dataset from ``get_df`` is shuffled with a fixed seed and split
    in two: the first ``test_split`` share of the rows becomes the test cases,
    the remainder is used to cluster the factors.

    Args:
        cluster_type: Clustering method. ``"corr"``, ``"vote"`` and
            ``"rand_un"`` select the matching ``cluster_factors_*`` function;
            any other value falls back to ``cluster_factors_rand``.
        test_split: Fraction of the shuffled rows used as test cases.

    Returns:
        list[dict]: The cases produced by ``convert_to_dict``. The same list is
        written to ``data/telco-{cluster_type}-test-{test_split}.yaml``.
    """
    # encoded data and the name of the outcome variable
    encoded, y_name = get_df()

    # Shuffle every row; the fixed random_state keeps the split reproducible
    shuffled = encoded.sample(frac=1, random_state=42)

    # Rows before the split point are test cases, rows after it are used
    # for the factor clustering
    n_test = int(test_split * len(shuffled))
    test_df, train_df = shuffled.iloc[:n_test], shuffled.iloc[n_test:]

    # pick the clustering method; unknown names use the random clustering
    cluster_methods = {
        "corr": cluster_factors_corr,
        "vote": cluster_factors_voting,
        "rand_un": cluster_factors_rand_un,
    }
    cluster_fn = cluster_methods.get(cluster_type, cluster_factors_rand)
    factors = cluster_fn(train_df, y_name)

    display_facotrs(factors)

    # get dict format of cases
    cases_dict = convert_to_dict(test_df, y_name, factors)

    # Write the cases to a new YAML file
    filename = f"data/telco-{cluster_type}-test-{test_split}.yaml"
    with open(filename, "w") as yaml_file:
        yaml.dump(cases_dict, yaml_file, default_flow_style=False)

    return cases_dict

def get_existing_test_data(cluster="corr", test_split="0.25"):
    """Load previously generated test cases, creating them if the file is missing.

    Args:
        cluster: Clustering method name used in the file name and, when the
            file has to be created, passed on to ``dataset_to_yaml``.
        test_split: Test fraction as it appears in the file name. It is
            converted with ``float`` before being passed to ``dataset_to_yaml``.

    Returns:
        The content of ``data/telco-{cluster}-test-{test_split}.yaml`` as parsed
        by ``yaml.safe_load``, or the freshly generated cases when that file
        does not exist.

    Raises:
        Exception: any error other than ``FileNotFoundError`` raised while
            opening or parsing the file is reported and then re-raised.
            Previously such an error was printed and then masked by an
            ``UnboundLocalError`` from returning an unassigned variable.
    """
    filename = f"data/telco-{cluster}-test-{test_split}.yaml"

    try:
        with open(filename, 'r') as file:
            data = yaml.safe_load(file)
    except FileNotFoundError:
        print(f"File '{filename}' not found.")
        print("Creating new file")
        return dataset_to_yaml(cluster, float(test_split))
    except Exception as e:
        print("An error occurred:", str(e))
        # There is nothing meaningful to return here; surface the real error
        # instead of failing later on an unbound local.
        raise

    print(f"Loaded data successfully from '{filename}'")
    return data

In [20]:
# Read the raw telco CSV into `df` and show it, so the original columns
# (before get_df's filtering and encoding) can be inspected.
df = pd.read_csv('data/telco.csv')
df

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.30,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.70,151.65,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,6840-RESVB,Male,0,Yes,Yes,24,Yes,Yes,DSL,Yes,...,Yes,Yes,Yes,Yes,One year,Yes,Mailed check,84.80,1990.5,No
7039,2234-XADUH,Female,0,Yes,Yes,72,Yes,Yes,Fiber optic,No,...,Yes,No,Yes,Yes,One year,Yes,Credit card (automatic),103.20,7362.9,No
7040,4801-JZAZL,Female,0,Yes,Yes,11,No,No phone service,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.60,346.45,No
7041,8361-LTMKD,Male,1,Yes,No,4,Yes,Yes,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Mailed check,74.40,306.6,Yes


In [2]:
# Run the preprocessing and show the encoded frame; the second return value
# (the outcome column name) is not needed here and is discarded.
df_encoded, _ = get_df()
df_encoded

Unnamed: 0,Partner,Dependents,PhoneService,PaperlessBilling,Churn,gender_Female,gender_Male,MultipleLines_No,MultipleLines_No phone service,MultipleLines_Yes,...,StreamingTV_True,StreamingMovies_False,StreamingMovies_True,Contract_Month-to-month,Contract_One year,Contract_Two year,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check
0,True,False,False,True,False,True,False,False,True,False,...,False,True,False,True,False,False,False,False,True,False
1,False,False,True,False,False,False,True,True,False,False,...,False,True,False,False,True,False,False,False,False,True
2,False,False,True,True,True,False,True,True,False,False,...,False,True,False,True,False,False,False,False,False,True
3,False,False,False,False,False,False,True,False,True,False,...,False,True,False,False,True,False,True,False,False,False
4,False,False,True,True,True,True,False,True,False,False,...,False,True,False,True,False,False,False,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,True,True,True,True,False,False,True,False,False,True,...,True,False,True,False,True,False,False,False,False,True
7039,True,True,True,True,False,True,False,False,False,True,...,True,False,True,False,True,False,False,True,False,False
7040,True,True,False,True,False,True,False,False,True,False,...,False,True,False,True,False,False,False,False,True,False
7041,True,False,True,True,True,False,True,False,False,True,...,False,True,False,True,False,False,False,False,False,True


In [12]:
# Generate the cases with the "corr" clustering and a 0.25 test split.
# dataset_to_yaml also prints the factor summary and writes the YAML file.
cases_dict = dataset_to_yaml("corr", 0.25)
# Show the list of case dictionaries as a table.
pd.DataFrame(cases_dict)

Number of factors by type:
PI factors: 10 - ['PaperlessBilling', 'InternetService_Fiber optic', 'OnlineSecurity_False', 'OnlineBackup_False', 'DeviceProtection_False', 'TechSupport_False', 'StreamingTV_False', 'StreamingMovies_False', 'Contract_Month-to-month', 'PaymentMethod_Electronic check']
Delta factors: 10 - ['Partner', 'Dependents', 'InternetService_DSL', 'InternetService_No', 'OnlineSecurity_True', 'TechSupport_True', 'Contract_One year', 'Contract_Two year', 'PaymentMethod_Bank transfer (automatic)', 'PaymentMethod_Credit card (automatic)']
UN factors: 11 - ['PhoneService', 'gender_Female', 'gender_Male', 'MultipleLines_No', 'MultipleLines_No phone service', 'MultipleLines_Yes', 'OnlineBackup_True', 'DeviceProtection_True', 'StreamingTV_True', 'StreamingMovies_True', 'PaymentMethod_Mailed check']


Unnamed: 0,pi,delta,reason,decision
0,"[PaperlessBilling, OnlineSecurity_False, Onlin...","[Partner, InternetService_DSL]","[PaperlessBilling, OnlineSecurity_False, Onlin...",pi
1,"[PaperlessBilling, Contract_Month-to-month]","[InternetService_No, PaymentMethod_Bank transf...","[InternetService_No, PaymentMethod_Bank transf...",delta
2,[],"[Partner, Dependents, InternetService_No, Cont...","[Partner, Dependents, InternetService_No, Cont...",delta
3,"[InternetService_Fiber optic, OnlineSecurity_F...",[],"[InternetService_Fiber optic, OnlineSecurity_F...",pi
4,"[OnlineSecurity_False, OnlineBackup_False, Dev...","[InternetService_DSL, TechSupport_True, Contra...","[InternetService_DSL, TechSupport_True, Contra...",delta
...,...,...,...,...
1755,"[InternetService_Fiber optic, OnlineSecurity_F...",[],[],delta
1756,"[PaperlessBilling, InternetService_Fiber optic...",[],"[PaperlessBilling, InternetService_Fiber optic...",pi
1757,"[PaperlessBilling, InternetService_Fiber optic...","[Partner, TechSupport_True]","[Partner, TechSupport_True]",delta
1758,"[OnlineSecurity_False, TechSupport_False, Stre...","[InternetService_DSL, Contract_One year, Payme...","[InternetService_DSL, Contract_One year, Payme...",delta
