In [1]:
import numpy as np
import pandas as pd
from dfply import *

In [2]:
def compute_probabilities(df, target):
    """Tabulate how the target classes are distributed for every feature value.

    For each feature column (every column except ``target``) and each distinct
    value of that column, the rows holding that value are selected and the
    share of each target class among them is computed.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data containing the feature columns and the ``target`` column.
    target : str
        Name of the column holding the class labels.

    Returns
    -------
    pandas.DataFrame
        One row per (feature, value, class) combination that occurs in ``df``,
        with columns ``target`` (class label), ``variable`` (feature name),
        ``value`` (feature value) and ``probability`` (fraction of the rows
        with that feature value that belong to the class). Rows are ordered by
        feature column, then by first appearance of the value, then by sorted
        class label, and the index is a fresh 0..n-1 range.
    """
    columns = ['target', 'variable', 'value', 'probability']
    feature_matrix = df.drop(target, axis=1)

    # Collect plain records and build the frame once: growing a DataFrame row
    # by row is quadratic, and DataFrame.append no longer exists in pandas 2.x.
    records = []
    for feature_variable in feature_matrix.columns:
        feature_column = feature_matrix[feature_variable]
        for unique_value in feature_column.unique():
            # A NaN value never compares equal, so it selects no rows and
            # therefore contributes no records.
            selected = (feature_column == unique_value).to_numpy()
            matching_targets = df.loc[selected, target]

            # Grouping sorts the class labels and ignores missing labels.
            class_counts = matching_targets.groupby(matching_targets).size()
            # Guard against categorical targets reporting unobserved classes.
            class_counts = class_counts[class_counts > 0]
            total = class_counts.sum()

            for target_value, count in class_counts.items():
                records.append({
                    'target': target_value,
                    'variable': feature_variable,
                    'value': unique_value,
                    'probability': count / total,
                })

    return pd.DataFrame(records, columns=columns)

def return_best_rule(prob_table):
    """Return the row of ``prob_table`` with the highest probability.

    Parameters
    ----------
    prob_table : pandas.DataFrame
        Table with a ``probability`` column.

    Returns
    -------
    pandas.Series
        The first row (in table order) whose ``probability`` equals the
        column maximum; its ``name`` is that row's index label.

    Raises
    ------
    IndexError
        If ``prob_table`` has no rows.
    """
    probabilities = prob_table['probability']
    # Plain boolean indexing keeps the original index labels and row order,
    # so ties are resolved in favour of the earliest row.
    best_rows = prob_table[probabilities == probabilities.max()]
    return best_rows.iloc[0, :]

def update_table(prob_table, rule):
    """Return a copy of ``prob_table`` without the row(s) equal to ``rule``.

    Parameters
    ----------
    prob_table : pandas.DataFrame
        Table of candidate rules.
    rule : pandas.Series
        A row of ``prob_table`` (for example the result of
        ``return_best_rule``). Rows are compared with ``Series.equals``.

    Returns
    -------
    pandas.DataFrame
        ``prob_table`` minus every row equal to ``rule``, with the index
        renumbered from 0.

    Raises
    ------
    ValueError
        If no row of ``prob_table`` equals ``rule``.
    """
    # Walk the rows by position: the index labels are not guaranteed to be
    # 0..n-1, so they must not be fed to .iloc.
    keep = [
        not prob_table.iloc[position, :].equals(rule)
        for position in range(len(prob_table))
    ]

    if all(keep):
        # Nothing matched; fail loudly instead of leaving the result undefined.
        raise ValueError('rule not found in prob_table')

    return prob_table.loc[keep].reset_index(drop=True)

def PRISM(df, target):
    """Extract the rules of ``df`` ordered from highest to lowest probability.

    The probability table is computed with ``compute_probabilities`` and
    printed. Then, until the table is empty, the best remaining row is taken
    with ``return_best_rule``, stored, and removed with ``update_table``.

    Parameters
    ----------
    df : pandas.DataFrame
        Data containing the feature columns and the target column.
    target : str
        Name of the target (class) column.

    Returns
    -------
    list
        The rows returned by ``return_best_rule``, in the order they were
        selected.
    """
    remaining = compute_probabilities(df, target)
    print(remaining)

    ordered_rules = []
    while not remaining.empty:
        top_rule = return_best_rule(remaining)
        ordered_rules.append(top_rule)
        # Take the chosen rule out so the next pass picks the runner-up.
        remaining = update_table(remaining, top_rule)

    return ordered_rules

## Usage example ##

# Toy dataset: one key per column. The 'teste' column holds the class labels
# that the rules are extracted for.
data = {
    'height': ['short', 'short', 'tall', 'tall', 'tall', 'tall', 'tall', 'short'],
    'hair': ['blond', 'blond', 'red', 'dark', 'dark', 'blond', 'dark', 'blond'],
    'eyes': ['blue', 'brown', 'blue', 'blue', 'blue', 'blue', 'brown', 'brown'],
    'teste': ['C1', 'C2', 'C1', 'C2', 'C2', 'C1', 'C2', 'C2'],
}

# Turn the dictionary into a DataFrame.
df = pd.DataFrame(data)

# Run PRISM on the DataFrame, naming the column that holds the target classes.
rules = PRISM(df, target='teste')

# The returned list holds the rules with the highest probability first.
# To inspect them, loop over the list and print each entry:
#     for rule in rules:
#         print(rule)

   target variable  value  probability
0      C1   height  short     0.333333
1      C2   height  short     0.666667
2      C1   height   tall     0.400000
3      C2   height   tall     0.600000
4      C1     hair  blond     0.500000
5      C2     hair  blond     0.500000
6      C1     hair    red     1.000000
7      C2     hair   dark     1.000000
8      C1     eyes   blue     0.600000
9      C2     eyes   blue     0.400000
10     C2     eyes  brown     1.000000


In [3]:
### Test on the breast cancer dataset (cancer.csv)
from sklearn import datasets
from sklearn.decomposition import PCA

# Age: age of the patient at the time of diagnosis;
# Menopause: whether the patient is pre- or postmenopausal at time of diagnosis;
# Tumor size: the greatest diameter (in mm) of the excised tumor;
# Inv-nodes: the number (range 0 - 39) of axillary lymph nodes that contain metastatic breast cancer visible on histological examination;
# Node caps: if the cancer does metastasise to a lymph node, although outside the original site of the tumor it may remain “contained” by the capsule of the lymph node. However, over time, and with more aggressive disease, the tumor may replace the lymph node and then penetrate the capsule, allowing it to invade the surrounding tissues;
# Degree of malignancy: the histological grade (range 1-3) of the tumor. Tumors that are grade 1 predominantly consist of cells that, while neoplastic, retain many of their usual characteristics. Grade 3 tumors predominately consist of cells that are highly abnormal;
# Breast: breast cancer may obviously occur in either breast;
# Breast quadrant: the breast may be divided into four quadrants, using the nipple as a central point;
# Irradiation: radiation therapy is a treatment that uses high-energy x-rays to destroy cancer cells. 

# Load the records from the CSV file sitting next to the notebook.
df_cancer = pd.read_csv('cancer.csv')

# Extract the rules, using the 'Class' column as the target.
rules = PRISM(df_cancer, target='Class')

# Show every rule in the order PRISM returned them.
for rule in rules:
    print(rule)

                  target       variable  value  probability
0   no-recurrence-events            age      2     0.696629
1      recurrence-events            age      2     0.303371
2   no-recurrence-events            age      3     0.758242
3      recurrence-events            age      3     0.241758
4   no-recurrence-events            age      4     0.690909
..                   ...            ...    ...          ...
74     recurrence-events  breast.quad.y      1     0.299213
75  no-recurrence-events  breast.quad.y      0     0.809524
76     recurrence-events  breast.quad.y      0     0.190476
77  no-recurrence-events  breast.quad.y     -1     0.697674
78     recurrence-events  breast.quad.y     -1     0.302326

[79 rows x 4 columns]
target         no-recurrence-events
variable                        age
value                             5
probability                       1
Name: 8, dtype: object
target         no-recurrence-events
variable                        age
value             

In [4]:
# The conclusion is reached with the following certainty-factor (CF) calculation:
# Rule 1 (CF = 0.8), R2 (CF = 0.3), R3 (CF = -0.2), R4 (CF = 0.7)
# For R1 combined with R2 -> 0.8 + 0.3(1 - 0.8) = 0.86
# Considering Rule 3 (its CF is negative, so it adds nothing to the positive belief):
# 0.86 + 0(1 - 0.86) = 0.86
# and it goes into the measure of disbelief instead: MD = 0 + 0.2(1 - 0) = 0.2
# Considering Rule 4
# 0.86 + 0.7(1 - 0.86) = 0.958
# The final confidence factor is then 0.958 - MD = 0.958 - 0.2 = 0.758
# Our data only yields positive factors, so it is enough to compute CF + CFi*(1 - CF)
# A comparison threshold can also be set, for example: if the CF is > 75%, the user can be said to be
# diagnosed with cancer. Anything below that is uncertain.

In [10]:
#Caso de teste: age 2 menopause 2 tumor.size 3 inv.nodes 0 node.caps 1 deg.malig 3
def conclusion(case, rules=None, target_class='recurrence-events'):
    """Combine the confidence factors of the rules that match ``case``.

    For every (variable, value) pair in ``case`` the first rule with the same
    variable and value whose target is ``target_class`` contributes its
    probability as a confidence factor. The factors are combined one after
    another with ``result = result + fc * (1 - result)``; each combination
    step prints the factor and the running result.

    Parameters
    ----------
    case : iterable
        Items whose element 0 is a feature name and element 1 its value.
    rules : iterable, optional
        Rules to match against, each indexable by ``'variable'``, ``'value'``,
        ``'target'`` and ``'probability'``. When omitted, the rules are
        computed with ``PRISM(df_cancer, 'Class')``.
    target_class : str, optional
        Class label whose rules are used as evidence.

    Returns
    -------
    float
        The combined confidence factor, or ``0.0`` when no rule matches any
        item of ``case`` (there is no evidence to combine).
    """
    if rules is None:
        rules = PRISM(df_cancer, 'Class')

    fc_list = []
    for item in case:
        for rule in rules:
            if (item[0] == rule['variable'] and item[1] == rule['value']
                    and rule['target'] == target_class):
                fc_list.append(rule['probability'])
                # Only the first matching rule counts for each case item.
                break

    # Without any matching rule there is nothing to seed the combination with.
    if not fc_list:
        return 0.0

    result = fc_list[0]
    for fc in fc_list[1:]:
        result = result + (fc * (1 - result))
        print('Fc', fc)
        print('Result', result)

    return result
                
                
# Alternative cases, each followed by the confidence factor noted for it:
# test_case = [('age', 2), ('menopause', 2), ('tumor.size', 3), ('inv.nodes', 0), ('node.caps', 1), ('deg.malig', 3)]  # 93%
# test_case = [('age', 2), ('menopause', 1), ('tumor.size', 4)]  # 63%
# test_case = [('age', 2), ('menopause', 1), ('tumor.size', 3), ('inv.nodes', 0), ('node.caps', 5), ('deg.malig', 3)]  # 85%

# Active case: a 12.5% chance of having it, hence 87.5% of not having it, which can be taken as the stronger conclusion.
# It must be assigned here so the cell runs on a fresh kernel.
test_case = [('age', 0), ('menopause', 0), ('tumor.size', 0)]
print('Fator de confiança de que o usuário está com cancer', conclusion(test_case)*100, '%')

                  target       variable  value  probability
0   no-recurrence-events            age      2     0.696629
1      recurrence-events            age      2     0.303371
2   no-recurrence-events            age      3     0.758242
3      recurrence-events            age      3     0.241758
4   no-recurrence-events            age      4     0.690909
..                   ...            ...    ...          ...
74     recurrence-events  breast.quad.y      1     0.299213
75  no-recurrence-events  breast.quad.y      0     0.809524
76     recurrence-events  breast.quad.y      0     0.190476
77  no-recurrence-events  breast.quad.y     -1     0.697674
78     recurrence-events  breast.quad.y     -1     0.302326

[79 rows x 4 columns]
Fator de confiança de que o usuário está com cancer 12.5 %
