In this file we implement the four CBA prediction algorithms discussed in Chapter 7. 

In [1]:
import pandas as pd
import import_ipynb
from sklearn.linear_model import LogisticRegression

# the data sets are prepared in another file
import Data_preparation as d

# the objects Case, Feature and CB are defined in another file
from Classes import Case, Feature, CB, W_comparison

importing Jupyter notebook from Data_preparation.ipynb
importing Jupyter notebook from Classes.ipynb


In [2]:
# Hand-picked feature subsets, one list per data set.
# data_to_cb() keeps only the columns named in the list it is given.
important_mushroom = [
    'odor_a',
    'odor_c',
    'odor_f',
    'odor_l',
    'odor_n',
    'odor_p',
    'gill-size_b',
    'gill-size_n',
    'gill-color_b',
    'stalk-surface-above-ring_k',
    'stalk-surface-below-ring_y',
    'ring-type_f',
    'spore-print-color_k',
    'spore-print-color_n',
    'spore-print-color_r',
    'spore-print-color_u',
    'population_c',
]

# NOTE(review): 'Internet_sevice' looks like a misspelling of "service";
# confirm against the column names produced by Data_preparation.
# NOTE(review): 'Churn' is presumably the outcome column itself -- confirm.
important_churn = [
    'tenure',
    'OnlineSecurity',
    'TotalCharges',
    'Churn',
    'Internet_sevice',
    'InternetService_Fiber optic',
    'InternetService_No',
    'Contract_Month-to-month',
    'Contract_Two year',
]

# The trailing space in 'LOR ' is part of the string and is kept as is.
important_admission = [
    'GRE Score',
    'TOEFL Score',
    'LOR ',
    'CGPA',
]


In [3]:
# fits a logistic regression on the training data and returns its coefficients
def log_regression(train, y_name):
    """Fit a logistic regression and return the fitted coefficients.

    Parameters
    ----------
    train : DataFrame holding the label column and the predictor columns.
    y_name : name of the label column; it is cast to int before fitting.

    Returns
    -------
    The `coef_` attribute of the fitted LogisticRegression model.
    """
    labels = train[y_name].astype('int')
    predictors = train.drop([y_name], axis=1)

    classifier = LogisticRegression(solver = 'lbfgs')
    classifier.fit(predictors, labels)
    return classifier.coef_

In [4]:
# receives a dataset, the name of the outcome variable and the names of the important features
# returns a test and a train case base and the normalized feature weights
def data_to_cb(data, y_name, important):
    """Turn a data set into a test and a training case base plus feature weights.

    Parameters
    ----------
    data : pandas.DataFrame
        Data set containing the outcome column and the feature columns.
        The frame passed in is not modified.
    y_name : str
        Name of the outcome column.
    important : collection of str
        Names of the feature columns to keep; all other feature columns are
        dropped before anything else happens.

    Returns
    -------
    tuple (CB, CB, dict)
        The test case base (first 20% of the rows), the training case base
        (remaining rows) and a dict mapping every kept feature name to its
        logistic-regression coefficient, divided by the largest absolute
        coefficient and rounded to two decimals.

    Raises
    ------
    ValueError
        If `y_name` is not a column of `data`.
    """
    if y_name not in data.columns:
        raise ValueError("outcome column %r not found in data" % (y_name,))

    # Select the columns to keep in a single pass. Removing entries from a
    # list while iterating over it skips the element after each removal, which
    # would leave some unimportant features in place. Selecting (instead of
    # deleting) also leaves the caller's frame untouched.
    kept = [col for col in data.columns if col == y_name or col in important]
    selected = data[kept]
    columns = [col for col in kept if col != y_name]

    # split the data into 20% test and 80% training data
    test_size = round(0.2 * len(selected))
    test = selected.iloc[0:test_size, :]
    train = selected.iloc[test_size:, :]

    # weights of the logistic regression, in the column order of `columns`
    importance = log_regression(train, y_name)[0]
    # normalize the importances to the range [-1, 1]
    max_weight = max(abs(w) for w in importance)
    if max_weight == 0:
        # all coefficients are zero: avoid dividing by zero, keep them at 0
        max_weight = 1
    importance = [round((w / max_weight), 2) for w in importance]
    importance_dict = dict(zip(columns, importance))

    # tendencies are established based on the weights:
    # second Feature argument is 1 for a positive weight and 0 otherwise
    features = [
        Feature(col, 1 if importance_dict[col] > 0 else 0, False)
        for col in columns
    ]

    # the test and train set make up two different case bases
    test_cb, train_cb = [], []

    # creating cases, triples Case(name, fact situation, outcome)
    for frame, cases in ((test, test_cb), (train, train_cb)):
        for i, row in frame.iterrows():
            fact_situation = {f: row[f.name] for f in features}
            cases.append(Case(i, fact_situation, row[y_name]))

    return CB(test_cb), CB(train_cb), importance_dict
                

Algorithm 1) priority + balance

1. If there are precedents with no negative differences with the focus case
    (a) Predict the most common outcome among these precedents
2. Else
    (a) Select all precedents with a maximal weighted balance (positive
    differences - negative differences) with the focus case
    (b) Predict the most common outcome among these precedents

In [5]:
def test_priority(con_cb, test_cb, importance_dict, print_scores = False):
    
    n_correct = 0
    
    for focus in test_cb.cases:
        comparisons, balances, best_comp, priority = [], [], [], []
        for case in con_cb.cases:
            com = focus.find_differences(case, importance_dict)
            comparisons.append(com)
            balances.append(com.balance)
            if com.w_dif == 0:
                priority.append(com)
            
        if len(priority) != 0:
            best_comp = priority
        else:
            best_balance = sorted(balances)[-1]

            for com in comparisons:
                if com.balance == best_balance:
                    best_comp.append(com)
                
        outcomes = []
        for com in best_comp:
            outcomes.append(com.case.outcome)
        
        prediction = max(outcomes, key = outcomes.count)
        
        if prediction == focus.outcome:
            n_correct += 1
    
    return n_correct / test_cb.length

Algorithm 2) balance
1. Select all precedents with a maximal weighted balance (positive differences - negative differences) with the focus case
2. Predict the most common outcome among these precedents

In [6]:
def test_balance(con_cb, test_cb, importance_dict, print_scores = False):
    
    n_correct = 0

    for focus in test_cb.cases:
        comparisons, balances, best_comp = [], [], []
        for case in con_cb.cases:
            com = focus.find_differences(case, importance_dict)
            comparisons.append(com)
            balances.append(com.balance)
            
        best_balance = sorted(balances)[-1]
            
        for com in comparisons:
            if com.balance == best_balance:
                best_comp.append(com)
                
        outcomes = []
        for com in best_comp:
            outcomes.append(com.case.outcome)
        
        prediction = max(outcomes, key = outcomes.count)
        
        if prediction == focus.outcome:
            n_correct += 1

    return n_correct / test_cb.length

Algorithm 3) minimize negative
1. Select all precedents with minimal weighted negative differences with the focus case
2. Predict the most common outcome among these precedents

In [7]:
def test_min_negative(con_cb, test_cb, importance_dict, max_positive = 1, print_scores = False):
    
    n_correct = 0

    for focus in test_cb.cases:
        comparisons, negative, best_comp = [], [], []
        for case in con_cb.cases:
            com = focus.find_differences(case, importance_dict)
            comparisons.append(com)
            negative.append(com.w_dif)

        best_worse = sorted(negative)[0]
            
        for com in comparisons:
            if com.w_dif == best_worse:
                best_comp.append(com)
                
        outcomes = []
        for com in best_comp:
            outcomes.append(com.case.outcome)
        
        prediction = max(outcomes, key = outcomes.count)
        
        if prediction == focus.outcome:
            n_correct += 1

    
    return n_correct / test_cb.length

Algorithm 4) nearest neighbor
1. Select all precedents with minimal weighted differences (negative + positive) with the focus case
2. Predict the most common outcome among these precedents

In [8]:
def test_nn(con_cb, test_cb, importance_dict, print_scores = False):
    
    n_correct = 0

    for focus in test_cb.cases:
        differences, comparisons, best_comp = [], [], []
        for case in con_cb.cases:
            com = focus.find_differences(case, importance_dict)
            comparisons.append(com)
            differences.append(com.w_dif + com.b_dif)

        best_dif = sorted(differences)[0]
            
        for com in comparisons:
            if com.w_dif + com.b_dif == best_dif:
                best_comp.append(com)
                
        outcomes = []
        for com in best_comp:
            outcomes.append(com.case.outcome)
        
        prediction = max(outcomes, key = outcomes.count)
        
        if prediction == focus.outcome:
            n_correct += 1
    
    return n_correct / test_cb.length

In [9]:
# example code to run a test
def run_CBA_algorithms():
    """Print the score of all four algorithms on a small part of the churn data.

    The four scores are printed twice, in the order balance, minimize
    negative, nearest neighbor, priority: first with the training case base
    as is, then with the result of `train.make_consistent(1)`.
    """
    data, y_name, _, _ = d.get_churn()

    # uses a small part of the data for illustration
    sample = data.iloc[0:100,]
    test, train, importance_dict = data_to_cb(sample, y_name, important_churn)

    algorithms = (test_balance, test_min_negative, test_nn, test_priority)

    for algorithm in algorithms:
        print(algorithm(train, test, importance_dict))

    # turn the training case base into a consistent one
    con_train = train.make_consistent(1)

    for algorithm in algorithms:
        print(algorithm(con_train, test, importance_dict))