In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from mlxtend.frequent_patterns import apriori, association_rules
from sklearn import cluster
from sklearn.preprocessing import LabelBinarizer
import warnings
warnings.filterwarnings('ignore')
from orangecontrib.associate.fpgrowth import *
import Orange
import csv
from io import StringIO
from collections import OrderedDict
from Orange.data import Table, Domain, ContinuousVariable, DiscreteVariable


def pandas_to_orange(df):
    domain, attributes, metas = construct_domain(df)
    orange_table = Orange.data.Table.from_numpy(domain = domain, X = df[attributes].values, Y = None, metas = df[metas].values, W = None)
    return orange_table

def construct_domain(df):
    columns = OrderedDict(df.dtypes)
    attributes = OrderedDict()
    metas = OrderedDict()
    for name, dtype in columns.items():

        if issubclass(dtype.type, np.number):
            if len(df[name].unique()) >= 13 or issubclass(dtype.type, np.inexact) or (df[name].max() > len(df[name].unique())):
                attributes[name] = Orange.data.ContinuousVariable(name)
            else:
                df[name] = df[name].astype(str)
                attributes[name] = Orange.data.DiscreteVariable(name, values = sorted(df[name].unique().tolist()))
        else:
            metas[name] = Orange.data.StringVariable(name)

    domain = Orange.data.Domain(attributes = attributes.values(), metas = metas.values())

    return domain, list(attributes.keys()), list(metas.keys())

pd.options.display.max_columns = None

In [2]:
green_data = pd.read_csv('green_consensus.csv')
hinselmann_data = pd.read_csv('hinselmann_consensus.csv')
schiller_data = pd.read_csv('schiller_consensus.csv')

target_attrs = ['experts::{}'.format(i) for i in range(5)]
target_attrs.append('consensus')

green = green_data.drop(columns=target_attrs)
hin = hinselmann_data.drop(columns=target_attrs)
sch = schiller_data.drop(columns=target_attrs)

green.head()

Unnamed: 0,cervix_area,os_area,walls_area,speculum_area,artifacts_area,cervix_artifacts_area,os_artifacts_area,walls_artifacts_area,speculum_artifacts_area,cervix_specularities_area,os_specularities_area,walls_specularities_area,speculum_specularities_area,specularities_area,area_h_max_diff,rgb_cervix_r_mean,rgb_cervix_r_std,rgb_cervix_r_mean_minus_std,rgb_cervix_r_mean_plus_std,rgb_cervix_g_mean,rgb_cervix_g_std,rgb_cervix_g_mean_minus_std,rgb_cervix_g_mean_plus_std,rgb_cervix_b_mean,rgb_cervix_b_std,rgb_cervix_b_mean_minus_std,rgb_cervix_b_mean_plus_std,rgb_total_r_mean,rgb_total_r_std,rgb_total_r_mean_minus_std,rgb_total_r_mean_plus_std,rgb_total_g_mean,rgb_total_g_std,rgb_total_g_mean_minus_std,rgb_total_g_mean_plus_std,rgb_total_b_mean,rgb_total_b_std,rgb_total_b_mean_minus_std,rgb_total_b_mean_plus_std,hsv_cervix_h_mean,hsv_cervix_h_std,hsv_cervix_s_mean,hsv_cervix_s_std,hsv_cervix_v_mean,hsv_cervix_v_std,hsv_total_h_mean,hsv_total_h_std,hsv_total_s_mean,hsv_total_s_std,hsv_total_v_mean,hsv_total_v_std,fit_cervix_hull_rate,fit_cervix_hull_total,fit_cervix_bbox_rate,fit_cervix_bbox_total,fit_circle_rate,fit_circle_total,fit_ellipse_rate,fit_ellipse_total,fit_ellipse_goodness,dist_to_center_cervix,dist_to_center_os
0,0.344647,0.00308,0.047522,0.288216,0.178585,0.016564,0.0,0.0435,0.010149,0.000133,0.0,0.0,0.085833,0.024907,0.26356,37.594458,15.785021,21.809437,53.379479,109.918445,38.735421,71.183024,148.653865,55.029618,22.16033,32.869287,77.189948,38.561367,38.119059,0.442308,76.680426,95.109755,51.565052,43.544702,146.674807,48.808474,40.765228,8.043247,89.573702,5.014628,2.991944,167.95278,25.813163,109.919447,38.733741,5.090801,2.93665,159.486916,38.437294,95.123889,51.583029,0.923067,0.373371,0.844454,0.40813,0.603399,0.571175,0.962995,0.35789,85.474311,0.265933,0.346294
1,0.165329,0.0,0.048236,0.504736,0.502783,0.007012,0.0,0.097405,0.973837,0.004055,0.0,0.0,0.054999,0.028431,0.0,59.505882,24.361877,35.144005,83.86776,122.366075,44.742407,77.623669,167.108482,78.058434,30.818729,47.239706,108.877163,54.932467,39.447415,15.485052,94.379883,101.680459,46.028852,55.651607,147.709311,63.218931,43.925912,19.293019,107.144843,4.944382,2.965108,130.260492,24.143867,122.366647,44.743932,5.080063,2.894163,128.251978,33.000693,101.725519,46.09351,0.850861,0.194308,0.646645,0.255673,0.497315,0.332444,0.894625,0.184803,124.794129,1.0,0.283059
2,0.45701,0.001681,0.242888,0.212859,0.0,0.0,0.0,0.0,0.0,0.001756,0.0,0.0,0.083055,0.018591,0.269798,39.353851,19.417332,19.936519,58.771183,109.543386,49.75349,59.789896,159.296876,54.642888,27.781965,26.860923,82.424853,41.24223,34.196356,7.045875,75.438586,109.592342,57.57664,52.015702,167.168982,53.470241,38.344391,15.12585,91.814632,5.049946,2.983966,163.576979,24.973042,109.544201,49.753659,5.078936,2.968023,162.268659,33.590792,109.597127,57.584515,0.918514,0.497554,0.747443,0.611432,0.633925,0.720923,0.920287,0.496596,94.948697,0.51874,0.419375
3,0.513244,0.005711,0.213781,0.251819,0.079795,0.0,0.0,0.017594,0.007208,0.001288,0.0,0.000315,0.0,0.000729,0.107022,46.322391,17.711957,28.610434,64.034349,116.075087,43.593124,72.481962,159.668211,51.430923,18.573016,32.857907,70.00394,40.365565,17.259087,23.106478,57.624652,102.641859,38.606995,64.034863,141.248854,50.805205,18.072101,32.733104,68.877306,5.177654,2.969214,156.242754,25.499379,116.08177,43.582671,5.071879,2.909002,158.343946,28.273928,102.648278,38.5983,0.95171,0.539286,0.855409,0.599998,0.61814,0.830304,0.964611,0.532073,74.22167,0.347202,0.361672
4,0.390319,0.009454,0.272884,0.373487,0.0,0.0,0.0,0.0,0.0,0.000196,0.0,0.000304,0.071442,0.026759,0.442831,37.552979,14.454975,23.098004,52.007953,101.044906,37.171973,63.872932,138.216879,53.971671,17.591669,36.380003,71.56334,40.717229,38.810699,1.90653,79.527927,97.446185,48.106253,49.339932,145.552438,51.733004,39.866956,11.866047,91.59996,4.978534,2.964685,158.160578,30.308891,101.050711,37.163383,5.051587,2.92354,157.131407,39.439642,97.457304,48.125747,0.955996,0.408286,0.88299,0.442043,0.623938,0.625574,0.957604,0.4076,61.546536,0.437852,0.673196


In [3]:
def discretize(df, bins=3):
    for col in list(df):
        intervals = pd.cut(df[col], bins)
        df[col] = pd.cut(df[col], bins, labels=[str(i+1) for i in range(bins)])

        attrs = []
        values = intervals.unique().tolist()
        values.sort()

        for val in values:
            low, high = val.left, val.right
            attrs.append('{}:[{}, {}]'.format(col, low, high))

        lb = LabelBinarizer().fit_transform(df[col])
        if (len(attrs) == 2):
            v = list(map(lambda x: 1 - x, lb))
            lb = np.concatenate((lb, v), 1)

        df2 = pd.DataFrame(data=lb, columns=attrs)
        df = df.drop(columns=[col])
        df = pd.concat([df, df2], axis=1, join='inner')
    
    return df

green_dis = discretize(green)
hin_dis = discretize(hin)
sch_dis = discretize(sch)
    

In [4]:
print(green_dis.shape[1])
green_dis.head()

181


Unnamed: 0,"cervix_area:[-0.001, 0.333]","cervix_area:[0.333, 0.667]","cervix_area:[0.667, 1.0]","os_area:[-2.26e-05, 0.00754]","os_area:[0.00754, 0.0151]","os_area:[0.0151, 0.0226]","walls_area:[-0.001, 0.333]","walls_area:[0.333, 0.667]","walls_area:[0.667, 1.0]","speculum_area:[-0.000603, 0.201]","speculum_area:[0.201, 0.402]","speculum_area:[0.402, 0.603]","artifacts_area:[-0.000503, 0.168]","artifacts_area:[0.168, 0.335]","artifacts_area:[0.335, 0.503]","cervix_artifacts_area:[-0.000296, 0.0987]","cervix_artifacts_area:[0.0987, 0.197]","cervix_artifacts_area:[0.197, 0.296]","os_artifacts_area:[-0.00065, 0.217]","os_artifacts_area:[0.217, 0.433]","os_artifacts_area:[0.433, 0.65]","walls_artifacts_area:[-0.000765, 0.255]","walls_artifacts_area:[0.255, 0.51]","walls_artifacts_area:[0.51, 0.765]","speculum_artifacts_area:[-0.000974, 0.325]","speculum_artifacts_area:[0.649, 0.974]","cervix_specularities_area:[-8.31e-05, 0.0277]","cervix_specularities_area:[0.0277, 0.0554]","cervix_specularities_area:[0.0554, 0.0831]","os_specularities_area:[-0.000154, 0.0515]","os_specularities_area:[0.103, 0.154]","walls_specularities_area:[-4.14e-05, 0.0138]","walls_specularities_area:[0.0138, 0.0276]","walls_specularities_area:[0.0276, 0.0414]","speculum_specularities_area:[-0.000368, 0.123]","speculum_specularities_area:[0.123, 0.245]","speculum_specularities_area:[0.245, 0.368]","specularities_area:[-8.41e-05, 0.028]","specularities_area:[0.028, 0.0561]","specularities_area:[0.0561, 0.0841]","area_h_max_diff:[-0.000628, 0.209]","area_h_max_diff:[0.209, 0.419]","area_h_max_diff:[0.419, 0.628]","rgb_cervix_r_mean:[-0.166, 55.379]","rgb_cervix_r_mean:[55.379, 110.757]","rgb_cervix_r_mean:[110.757, 166.136]","rgb_cervix_r_std:[-0.0587, 19.567]","rgb_cervix_r_std:[19.567, 39.133]","rgb_cervix_r_std:[39.133, 58.7]","rgb_cervix_r_mean_minus_std:[-11.514, 33.698]","rgb_cervix_r_mean_minus_std:[33.698, 78.776]","rgb_cervix_r_mean_minus_std:[78.776, 123.854]","rgb_cervix_r_mean_plus_std:[-0.21, 70.146]","rgb_cervix_r_mean_plus_std:[70.146, 140.292]","rgb_cervix_r_mean_plus_std:[140.292, 210.438]","rgb_cervix_g_mean:[-0.228, 75.868]","rgb_cervix_g_mean:[75.868, 151.735]","rgb_cervix_g_mean:[151.735, 227.603]","rgb_cervix_g_std:[-0.0811, 27.017]","rgb_cervix_g_std:[27.017, 54.034]","rgb_cervix_g_std:[54.034, 81.051]","rgb_cervix_g_mean_minus_std:[-4.129, 60.956]","rgb_cervix_g_mean_minus_std:[60.956, 125.847]","rgb_cervix_g_mean_minus_std:[125.847, 190.738]","rgb_cervix_g_mean_plus_std:[-0.281, 93.76]","rgb_cervix_g_mean_plus_std:[93.76, 187.521]","rgb_cervix_g_mean_plus_std:[187.521, 281.281]","rgb_cervix_b_mean:[-0.0822, 27.41]","rgb_cervix_b_mean:[27.41, 54.82]","rgb_cervix_b_mean:[54.82, 82.231]","rgb_cervix_b_std:[-0.0421, 14.046]","rgb_cervix_b_std:[14.046, 28.092]","rgb_cervix_b_std:[28.092, 42.138]","rgb_cervix_b_mean_minus_std:[-9.843, 11.236]","rgb_cervix_b_mean_minus_std:[11.236, 32.252]","rgb_cervix_b_mean_minus_std:[32.252, 53.267]","rgb_cervix_b_mean_plus_std:[-0.112, 37.275]","rgb_cervix_b_mean_plus_std:[37.275, 74.55]","rgb_cervix_b_mean_plus_std:[74.55, 111.825]","rgb_total_r_mean:[0.0498, 39.778]","rgb_total_r_mean:[39.778, 79.387]","rgb_total_r_mean:[79.387, 118.996]","rgb_total_r_std:[2.719, 25.815]","rgb_total_r_std:[25.815, 48.842]","rgb_total_r_std:[48.842, 71.87]","rgb_total_r_mean_minus_std:[-16.281, 14.265]","rgb_total_r_mean_minus_std:[14.265, 44.718]","rgb_total_r_mean_minus_std:[44.718, 75.172]","rgb_total_r_mean_plus_std:[2.788, 58.948]","rgb_total_r_mean_plus_std:[58.948, 114.941]","rgb_total_r_mean_plus_std:[114.941, 170.933]","rgb_total_g_mean:[0.163, 59.968]","rgb_total_g_mean:[59.968, 119.594]","rgb_total_g_mean:[119.594, 179.22]","rgb_total_g_std:[4.181, 36.041]","rgb_total_g_std:[36.041, 67.805]","rgb_total_g_std:[67.805, 99.57]","rgb_total_g_mean_minus_std:[-7.124, 36.725]","rgb_total_g_mean_minus_std:[36.725, 80.443]","rgb_total_g_mean_minus_std:[80.443, 124.161]","rgb_total_g_mean_plus_std:[4.379, 84.456]","rgb_total_g_mean_plus_std:[84.456, 164.294]","rgb_total_g_mean_plus_std:[164.294, 244.131]","rgb_total_b_mean:[0.105, 25.604]","rgb_total_b_mean:[25.604, 51.027]","rgb_total_b_mean:[51.027, 76.45]","rgb_total_b_std:[2.924, 19.554]","rgb_total_b_std:[19.554, 36.135]","rgb_total_b_std:[36.135, 52.715]","rgb_total_b_mean_minus_std:[-25.351, -3.113]","rgb_total_b_mean_minus_std:[-3.113, 19.06]","rgb_total_b_mean_minus_std:[19.06, 41.232]","rgb_total_b_mean_plus_std:[3.045, 39.729]","rgb_total_b_mean_plus_std:[39.729, 76.304]","rgb_total_b_mean_plus_std:[76.304, 112.879]","hsv_cervix_h_mean:[3.139, 4.072]","hsv_cervix_h_mean:[4.072, 5.001]","hsv_cervix_h_mean:[5.001, 5.931]","hsv_cervix_h_std:[-0.00311, 1.036]","hsv_cervix_h_std:[2.072, 3.107]","hsv_cervix_s_mean:[-0.244, 81.343]","hsv_cervix_s_mean:[81.343, 162.686]","hsv_cervix_s_mean:[162.686, 244.029]","hsv_cervix_s_std:[-0.0761, 25.375]","hsv_cervix_s_std:[25.375, 50.75]","hsv_cervix_s_std:[50.75, 76.126]","hsv_cervix_v_mean:[-0.228, 75.869]","hsv_cervix_v_mean:[75.869, 151.738]","hsv_cervix_v_mean:[151.738, 227.607]","hsv_cervix_v_std:[-0.081, 27.011]","hsv_cervix_v_std:[27.011, 54.022]","hsv_cervix_v_std:[54.022, 81.033]","hsv_total_h_mean:[3.153, 4.052]","hsv_total_h_mean:[4.052, 4.95]","hsv_total_h_mean:[4.95, 5.847]","hsv_total_h_std:[2.316, 2.573]","hsv_total_h_std:[2.573, 2.83]","hsv_total_h_std:[2.83, 3.086]","hsv_total_s_mean:[7.775, 87.328]","hsv_total_s_mean:[87.328, 166.643]","hsv_total_s_mean:[166.643, 245.958]","hsv_total_s_std:[17.132, 47.16]","hsv_total_s_std:[47.16, 77.097]","hsv_total_s_std:[77.097, 107.035]","hsv_total_v_mean:[0.199, 59.994]","hsv_total_v_mean:[59.994, 119.609]","hsv_total_v_mean:[119.609, 179.225]","hsv_total_v_std:[4.228, 36.068]","hsv_total_v_std:[36.068, 67.814]","hsv_total_v_std:[67.814, 99.559]","fit_cervix_hull_rate:[-0.00102, 0.339]","fit_cervix_hull_rate:[0.677, 1.016]","fit_cervix_hull_total:[-0.000993, 0.331]","fit_cervix_hull_total:[0.331, 0.662]","fit_cervix_hull_total:[0.662, 0.993]","fit_cervix_bbox_rate:[-0.00101, 0.336]","fit_cervix_bbox_rate:[0.336, 0.672]","fit_cervix_bbox_rate:[0.672, 1.008]","fit_cervix_bbox_total:[-0.000996, 0.332]","fit_cervix_bbox_total:[0.332, 0.664]","fit_cervix_bbox_total:[0.664, 0.996]","fit_circle_rate:[-0.000791, 0.264]","fit_circle_rate:[0.264, 0.527]","fit_circle_rate:[0.527, 0.791]","fit_circle_total:[-0.0018, 0.599]","fit_circle_total:[0.599, 1.197]","fit_circle_total:[1.197, 1.796]","fit_ellipse_rate:[-0.00107, 0.356]","fit_ellipse_rate:[0.712, 1.067]","fit_ellipse_total:[-0.000941, 0.314]","fit_ellipse_total:[0.314, 0.627]","fit_ellipse_total:[0.627, 0.941]","fit_ellipse_goodness:[-0.499, 166.222]","fit_ellipse_goodness:[166.222, 332.443]","fit_ellipse_goodness:[332.443, 498.665]","dist_to_center_cervix:[0.11, 0.407]","dist_to_center_cervix:[0.407, 0.704]","dist_to_center_cervix:[0.704, 1.0]","dist_to_center_os:[0.144, 0.43]","dist_to_center_os:[0.43, 0.715]","dist_to_center_os:[0.715, 1.0]"
0,0,1,0,1,0,0,1,0,0,0,1,0,0,1,0,1,0,0,1,0,0,1,0,0,0,1,1,0,0,0,1,1,0,0,1,0,0,1,0,0,0,1,0,1,0,0,1,0,0,1,0,0,1,0,0,0,1,0,0,1,0,0,1,0,0,1,0,0,0,1,0,1,0,0,0,1,0,0,1,1,0,0,0,1,0,1,0,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,0,1,0,1,0,0,0,1,0,0,1,1,0,0,0,1,0,1,0,0,1,0,0,1,0,0,0,1,0,0,1,0,1,0,1,0,0,0,1,0,0,1,0,1,0,0,1,0,0,0,1,0,1,0,0,0,1,1,0,0,1,0,0,1,0,1,0,0,1,0,0,1,0,0
1,1,0,0,1,0,0,1,0,0,0,0,1,0,0,1,1,0,0,1,0,0,1,0,0,1,0,1,0,0,0,1,1,0,0,1,0,0,0,1,0,1,0,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,0,1,0,0,1,0,0,1,0,0,1,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,0,1,0,0,1,0,0,1,0,0,1,0,1,0,1,0,0,1,0,1,0,0,0,1,0,0,1,0,0,0,1,0,0,1,0,1,0,1,0,0,0,1,0,0,1,0,1,0,1,0,0,0,1,0,1,0,0,0,1,0,1,0,0,1,0,1,0,0,1,0,0,0,0,1,1,0,0
2,0,1,0,1,0,0,1,0,0,0,1,0,1,0,0,1,0,0,1,0,0,1,0,0,0,1,1,0,0,0,1,1,0,0,1,0,0,1,0,0,0,1,0,1,0,0,1,0,0,1,0,0,1,0,0,0,1,0,0,1,0,1,0,0,0,1,0,0,1,0,0,1,0,0,1,0,0,0,1,0,1,0,0,1,0,1,0,0,0,1,0,0,1,0,0,1,0,0,1,0,0,0,1,0,0,1,0,0,1,0,1,0,0,0,1,0,0,1,1,0,0,0,1,1,0,0,0,1,0,0,1,0,0,0,1,0,0,1,0,1,0,1,0,0,0,1,0,0,1,0,1,0,0,1,0,0,0,1,0,1,0,0,0,1,0,1,0,1,0,0,1,0,1,0,0,0,1,0,1,0,0
3,0,1,0,1,0,0,1,0,0,0,1,0,1,0,0,1,0,0,1,0,0,1,0,0,0,1,1,0,0,0,1,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,0,1,0,1,0,0,1,0,1,0,0,0,1,0,1,0,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,1,0,0,0,0,1,0,1,0,0,0,1,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,0,1,0,0,1,0,1,0,1,0,0,0,1,0,0,1,0,1,0,0,1,0,0,0,1,0,1,0,0,0,1,0,1,0,1,0,0,1,0,1,0,0,1,0,0,1,0,0
4,0,1,0,0,1,0,1,0,0,0,1,0,1,0,0,1,0,0,1,0,0,1,0,0,0,1,1,0,0,0,1,1,0,0,1,0,0,1,0,0,0,0,1,1,0,0,1,0,0,1,0,0,1,0,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,0,1,0,1,0,0,1,0,0,1,0,1,0,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,0,1,0,0,1,0,1,0,0,0,1,0,1,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,0,1,0,0,1,0,1,0,1,0,0,0,1,0,0,1,0,1,0,0,1,0,0,0,1,0,1,0,0,0,1,0,1,0,1,0,0,1,0,1,0,0,0,1,0,0,1,0


In [5]:
green = pandas_to_orange(green_dis)
green
X, mapping = OneHot.encode(green)

In [None]:
itemsets = frequent_itemsets(X, min_support=0.2)
list(itemsets)