In [48]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import sys
sys.path.append('..')
from utils_cd import (
        split_dataset,
        standard_deviation,
        plot_comparison_results,
        impute_values,
        plot_results,
        plot_param_improv
)
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.preprocessing import Normalizer, StandardScaler
from imblearn.over_sampling import SMOTE

# Name of the target column (the consensus label).
CLASS = 'consensus'

# Do not truncate the column list when displaying wide DataFrames.
pd.set_option('display.max_columns', None)
# Use seaborn's 'darkgrid' style for the plots.
sns.set(style='darkgrid')

In [19]:
# Load the three colposcopy modalities, one CSV file per modality.
green_data, hinselmann_data, schiller_data = (
    pd.read_csv(csv_path)
    for csv_path in ('green.csv', 'hinselmann.csv', 'schiller.csv')
)

In [21]:
# Column names of the six individual expert labels: experts::0 .. experts::5.
expert_columns = list(map('experts::{}'.format, range(6)))

# Keep only the features and the consensus target for each modality.
green, hinselmann, schiller = (
    modality.drop(columns=expert_columns)
    for modality in (green_data, hinselmann_data, schiller_data)
)

# Stack the three modalities into a single frame.
# NOTE: row labels are not reset here, so index values repeat across modalities.
coloscopies = pd.concat([green, hinselmann, schiller])

The dataset is divided into 3 modalities: Green, Hinselmann and Schiller. There are 7 target variables: the labels of 6 individual experts (`experts::0` to `experts::5`) and a consensus. Each target variable takes one of 2 values: 0 for bad and 1 for good.
We should analyze how each expert evaluates each colposcopy and whether they usually agree with the consensus. First, we are going to analyze the data without the expert attributes.

In [22]:
# Preview the first five rows of the combined dataset.
coloscopies.head(n=5)

Unnamed: 0,cervix_area,os_area,walls_area,speculum_area,artifacts_area,cervix_artifacts_area,os_artifacts_area,walls_artifacts_area,speculum_artifacts_area,cervix_specularities_area,os_specularities_area,walls_specularities_area,speculum_specularities_area,specularities_area,area_h_max_diff,rgb_cervix_r_mean,rgb_cervix_r_std,rgb_cervix_r_mean_minus_std,rgb_cervix_r_mean_plus_std,rgb_cervix_g_mean,rgb_cervix_g_std,rgb_cervix_g_mean_minus_std,rgb_cervix_g_mean_plus_std,rgb_cervix_b_mean,rgb_cervix_b_std,rgb_cervix_b_mean_minus_std,rgb_cervix_b_mean_plus_std,rgb_total_r_mean,rgb_total_r_std,rgb_total_r_mean_minus_std,rgb_total_r_mean_plus_std,rgb_total_g_mean,rgb_total_g_std,rgb_total_g_mean_minus_std,rgb_total_g_mean_plus_std,rgb_total_b_mean,rgb_total_b_std,rgb_total_b_mean_minus_std,rgb_total_b_mean_plus_std,hsv_cervix_h_mean,hsv_cervix_h_std,hsv_cervix_s_mean,hsv_cervix_s_std,hsv_cervix_v_mean,hsv_cervix_v_std,hsv_total_h_mean,hsv_total_h_std,hsv_total_s_mean,hsv_total_s_std,hsv_total_v_mean,hsv_total_v_std,fit_cervix_hull_rate,fit_cervix_hull_total,fit_cervix_bbox_rate,fit_cervix_bbox_total,fit_circle_rate,fit_circle_total,fit_ellipse_rate,fit_ellipse_total,fit_ellipse_goodness,dist_to_center_cervix,dist_to_center_os,consensus
0,0.344647,0.00308,0.047522,0.288216,0.178585,0.016564,0.0,0.0435,0.010149,0.000133,0.0,0.0,0.085833,0.024907,0.26356,37.594458,15.785021,21.809437,53.379479,109.918445,38.735421,71.183024,148.653865,55.029618,22.16033,32.869287,77.189948,38.561367,38.119059,0.442308,76.680426,95.109755,51.565052,43.544702,146.674807,48.808474,40.765228,8.043247,89.573702,5.014628,2.991944,167.95278,25.813163,109.919447,38.733741,5.090801,2.93665,159.486916,38.437294,95.123889,51.583029,0.923067,0.373371,0.844454,0.40813,0.603399,0.571175,0.962995,0.35789,85.474311,0.265933,0.346294,1.0
1,0.165329,0.0,0.048236,0.504736,0.502783,0.007012,0.0,0.097405,0.973837,0.004055,0.0,0.0,0.054999,0.028431,0.0,59.505882,24.361877,35.144005,83.86776,122.366075,44.742407,77.623669,167.108482,78.058434,30.818729,47.239706,108.877163,54.932467,39.447415,15.485052,94.379883,101.680459,46.028852,55.651607,147.709311,63.218931,43.925912,19.293019,107.144843,4.944382,2.965108,130.260492,24.143867,122.366647,44.743932,5.080063,2.894163,128.251978,33.000693,101.725519,46.09351,0.850861,0.194308,0.646645,0.255673,0.497315,0.332444,0.894625,0.184803,124.794129,1.0,0.283059,0.0
2,0.45701,0.001681,0.242888,0.212859,0.0,0.0,0.0,0.0,0.0,0.001756,0.0,0.0,0.083055,0.018591,0.269798,39.353851,19.417332,19.936519,58.771183,109.543386,49.75349,59.789896,159.296876,54.642888,27.781965,26.860923,82.424853,41.24223,34.196356,7.045875,75.438586,109.592342,57.57664,52.015702,167.168982,53.470241,38.344391,15.12585,91.814632,5.049946,2.983966,163.576979,24.973042,109.544201,49.753659,5.078936,2.968023,162.268659,33.590792,109.597127,57.584515,0.918514,0.497554,0.747443,0.611432,0.633925,0.720923,0.920287,0.496596,94.948697,0.51874,0.419375,0.0
3,0.513244,0.005711,0.213781,0.251819,0.079795,0.0,0.0,0.017594,0.007208,0.001288,0.0,0.000315,0.0,0.000729,0.107022,46.322391,17.711957,28.610434,64.034349,116.075087,43.593124,72.481962,159.668211,51.430923,18.573016,32.857907,70.00394,40.365565,17.259087,23.106478,57.624652,102.641859,38.606995,64.034863,141.248854,50.805205,18.072101,32.733104,68.877306,5.177654,2.969214,156.242754,25.499379,116.08177,43.582671,5.071879,2.909002,158.343946,28.273928,102.648278,38.5983,0.95171,0.539286,0.855409,0.599998,0.61814,0.830304,0.964611,0.532073,74.22167,0.347202,0.361672,1.0
4,0.390319,0.009454,0.272884,0.373487,0.0,0.0,0.0,0.0,0.0,0.000196,0.0,0.000304,0.071442,0.026759,0.442831,37.552979,14.454975,23.098004,52.007953,101.044906,37.171973,63.872932,138.216879,53.971671,17.591669,36.380003,71.56334,40.717229,38.810699,1.90653,79.527927,97.446185,48.106253,49.339932,145.552438,51.733004,39.866956,11.866047,91.59996,4.978534,2.964685,158.160578,30.308891,101.050711,37.163383,5.051587,2.92354,157.131407,39.439642,97.457304,48.125747,0.955996,0.408286,0.88299,0.442043,0.623938,0.625574,0.957604,0.4076,61.546536,0.437852,0.673196,1.0


In [23]:
# Summary statistics (count, mean, std, min, quartiles, max) per numeric column.
coloscopies.describe(percentiles=[0.25, 0.5, 0.75])

Unnamed: 0,cervix_area,os_area,walls_area,speculum_area,artifacts_area,cervix_artifacts_area,os_artifacts_area,walls_artifacts_area,speculum_artifacts_area,cervix_specularities_area,os_specularities_area,walls_specularities_area,speculum_specularities_area,specularities_area,area_h_max_diff,rgb_cervix_r_mean,rgb_cervix_r_std,rgb_cervix_r_mean_minus_std,rgb_cervix_r_mean_plus_std,rgb_cervix_g_mean,rgb_cervix_g_std,rgb_cervix_g_mean_minus_std,rgb_cervix_g_mean_plus_std,rgb_cervix_b_mean,rgb_cervix_b_std,rgb_cervix_b_mean_minus_std,rgb_cervix_b_mean_plus_std,rgb_total_r_mean,rgb_total_r_std,rgb_total_r_mean_minus_std,rgb_total_r_mean_plus_std,rgb_total_g_mean,rgb_total_g_std,rgb_total_g_mean_minus_std,rgb_total_g_mean_plus_std,rgb_total_b_mean,rgb_total_b_std,rgb_total_b_mean_minus_std,rgb_total_b_mean_plus_std,hsv_cervix_h_mean,hsv_cervix_h_std,hsv_cervix_s_mean,hsv_cervix_s_std,hsv_cervix_v_mean,hsv_cervix_v_std,hsv_total_h_mean,hsv_total_h_std,hsv_total_s_mean,hsv_total_s_std,hsv_total_v_mean,hsv_total_v_std,fit_cervix_hull_rate,fit_cervix_hull_total,fit_cervix_bbox_rate,fit_cervix_bbox_total,fit_circle_rate,fit_circle_total,fit_ellipse_rate,fit_ellipse_total,fit_ellipse_goodness,dist_to_center_cervix,dist_to_center_os,consensus
count,287.0,287.0,287.0,287.0,287.0,287.0,287.0,287.0,287.0,287.0,287.0,287.0,287.0,287.0,287.0,287.0,287.0,287.0,287.0,287.0,287.0,287.0,287.0,287.0,287.0,287.0,287.0,287.0,287.0,287.0,287.0,287.0,287.0,287.0,287.0,287.0,287.0,287.0,287.0,287.0,287.0,287.0,287.0,287.0,287.0,287.0,287.0,287.0,287.0,287.0,287.0,287.0,287.0,287.0,287.0,287.0,287.0,287.0,287.0,287.0,287.0,287.0,287.0
mean,0.477013,0.007368,0.172563,0.235195,0.047354,0.031741,0.051262,0.041739,0.018311,0.0144,0.012181,0.008781,0.083601,0.031248,0.196742,75.198745,33.046061,42.152684,108.244806,79.45988,34.565994,44.893886,114.025873,97.929804,39.466866,58.462938,137.39667,72.469339,47.439715,25.029624,119.909054,77.472621,48.758443,28.714178,126.231064,88.271212,53.15313,35.118081,141.424342,4.532769,2.746939,145.503441,35.048319,124.136695,46.442824,4.534511,2.707917,138.736802,46.294892,112.927822,61.643427,0.915829,0.500141,0.786685,0.577768,0.587217,0.800333,0.942705,0.484914,133.914046,0.479419,0.435682,0.752613
std,0.223491,0.005986,0.188248,0.174551,0.065196,0.061481,0.158501,0.110074,0.09199,0.029968,0.052387,0.034504,0.114312,0.037532,0.136503,48.16604,15.70788,40.186336,59.316677,51.018551,16.07323,42.030691,62.895974,64.683745,21.149424,51.958875,81.01139,34.338149,19.901635,25.194989,50.155554,34.172702,20.539788,25.09497,50.493121,49.441108,22.976548,34.889056,68.756407,0.696035,0.551488,54.708605,19.041504,58.188115,18.734886,0.645581,0.244458,54.326386,19.428641,37.683655,18.814282,0.17182,0.220632,0.165678,0.232471,0.131334,0.402581,0.17678,0.211091,90.492877,0.238634,0.16261,0.432247
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-24.017678,0.0,0.0,0.0,-21.587384,0.0,0.0,0.0,-24.33497,0.0,0.16863,2.787604,-16.189196,2.956234,0.341837,4.276427,-15.341175,4.618264,0.181047,2.973292,-25.284722,3.154339,3.141593,0.0,0.0,0.0,0.0,0.0,3.15526,1.795977,8.012986,16.080708,0.378291,4.323202,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.091654,0.084065,0.0
25%,0.340607,0.003323,0.053151,0.07978,0.0,0.0,0.0,0.0,0.0,0.001825,0.0,0.0,0.0,0.003184,0.092591,34.496634,22.36914,8.413342,58.667212,41.497873,23.13421,13.934702,70.741515,41.944269,22.041881,16.262651,68.219836,41.501214,31.098427,5.559045,77.369191,54.35696,31.584063,10.056994,87.79077,42.266989,32.879118,7.934501,70.585556,4.074556,2.621009,105.265026,23.75816,82.780563,35.881129,4.025967,2.489841,90.661939,32.403382,91.634507,50.282307,0.922583,0.368366,0.754746,0.414595,0.556931,0.549645,0.949209,0.353102,73.789175,0.313253,0.329278,1.0
50%,0.45129,0.006522,0.124431,0.231214,0.021029,0.008172,0.0,0.0,0.0,0.005318,0.0,0.0,0.043305,0.017661,0.18218,72.030423,32.668921,35.597245,107.322686,69.370696,33.531397,38.647922,101.021314,86.776822,37.502717,39.746574,125.555498,77.598957,51.281012,18.401777,136.18508,77.600047,47.910148,27.18855,129.198686,95.768628,57.111269,30.531833,163.862828,4.47703,2.93286,158.160578,30.09851,128.891419,46.116476,4.373245,2.725266,146.171618,40.300184,114.819244,63.939001,0.953806,0.489469,0.813545,0.576178,0.59677,0.730047,0.978936,0.465894,109.093948,0.437852,0.403812,1.0
75%,0.583656,0.00978,0.22036,0.353565,0.0652,0.034768,0.0,0.020056,0.005444,0.013496,0.002466,0.002685,0.128408,0.044163,0.295105,114.137249,42.891179,72.836229,156.930159,110.356499,44.219358,70.769231,154.577659,157.864529,54.625038,104.216058,213.531888,99.714899,63.140595,44.803574,157.739156,97.414052,64.993691,42.585618,160.235909,127.729985,72.63239,61.193622,198.42416,5.028624,3.020969,187.975041,41.294304,172.847477,58.143456,5.077348,2.929614,181.975327,57.706254,140.250158,75.409329,0.97831,0.611083,0.856936,0.714037,0.651683,1.011504,0.997162,0.597069,170.912792,0.587447,0.512937,1.0
max,1.0,0.042066,1.0,0.65478,0.502783,0.458985,1.0,0.765429,0.973837,0.26336,0.55652,0.436639,0.654463,0.21695,0.688795,194.878145,81.499913,156.718535,240.798414,227.60276,81.051207,190.738056,281.280876,233.388948,90.798902,192.511554,275.344128,136.309472,84.600448,85.403988,199.569888,179.220369,99.570024,124.161002,244.131283,197.363143,96.305186,150.849879,250.444743,5.931405,3.107393,244.19562,122.284452,233.480223,88.276424,5.846898,3.086482,245.957748,118.877785,197.370763,99.558681,1.015691,0.992861,1.007799,0.996427,0.801348,1.795553,1.067302,0.940996,575.369328,1.0,1.0,1.0


Let's see how many rows and attributes we have for each dataset.

In [47]:
# Report the number of feature columns (every column except the target) and,
# for each modality plus the combined frame, the row count and class counts.
print('Number of attributes: {}'.format(coloscopies.shape[1] - 1))
for df, name in [(green, 'Green'), (hinselmann, 'Hinselmann'), (schiller, 'Schiller'), (coloscopies, 'Overall')]:
    n_rows = df.shape[0]
    # Count the classes once; .get() yields 0 instead of raising KeyError
    # when one of the classes does not occur in a dataset.
    class_counts = df[CLASS].value_counts()
    n_pos = class_counts.get(1.0, 0)
    n_neg = class_counts.get(0.0, 0)
    print('{} -- Number of rows: {}, Number of class good: {}, Number of class bad: {}'.format(name, n_rows, n_pos, n_neg))

Number of attributes: 62
Green -- Number of rows: 98, Number of class good: 67, Number of class bad: 31
Hinselmann -- Number of rows: 97, Number of class good: 82, Number of class bad: 15
Schiller -- Number of rows: 92, Number of class good: 67, Number of class bad: 25
Overall -- Number of rows: 287, Number of class good: 216, Number of class bad: 71


Let's first check the class balance across datasets.

In [42]:
def plot_balancing():
    """Plot and save the percentage of good/bad colposcopies per dataset.

    For each modality (Green, Hinselmann, Schiller) and for the combined
    frame, computes the share of rows whose ``CLASS`` value is 1.0 ("Good")
    and 0.0 ("Bad"), rounded to two decimals. The resulting table is written
    to ``plot_data/balancing_overall.csv`` and a grouped bar chart, annotated
    with each bar's height, is saved to ``plots/balancing_overall.pdf``.

    Reads the module-level ``green``, ``hinselmann``, ``schiller``,
    ``coloscopies`` and ``CLASS``. Returns nothing.
    """
    import os  # local import: only needed to create the output directories

    datasets = [(green, 'Green'), (hinselmann, 'Hinselmann'), (schiller, 'Schiller'), (coloscopies, 'Overall')]

    rows = []
    for df, name in datasets:
        n_rows = df.shape[0]
        # Count the classes once; .get() yields 0 instead of raising KeyError
        # when one of the classes does not occur in a dataset.
        class_counts = df[CLASS].value_counts()
        perc_pos = class_counts.get(1.0, 0) / n_rows * 100
        perc_neg = class_counts.get(0.0, 0) / n_rows * 100
        rows.append({'Percentage': round(perc_pos, 2), 'Class': 'Good', 'Dataset': name})
        rows.append({'Percentage': round(perc_neg, 2), 'Class': 'Bad', 'Dataset': name})

    # Keyed by running row number, exactly like the table built previously.
    df_balancing = pd.DataFrame.from_dict(dict(enumerate(rows)), "index")

    # Make sure the output directory exists before writing into it.
    os.makedirs('plot_data', exist_ok=True)
    df_balancing.to_csv('plot_data/balancing_overall.csv')

    fig, ax = plt.subplots(figsize=(12, 6))
    sns.barplot(x='Dataset', y='Percentage', hue='Class', data=df_balancing, ax=ax)

    # Write each bar's height just above the bar, horizontally centred.
    for p in ax.patches:
        ax.text(p.get_x() + p.get_width()/2., p.get_height(), '{0:.3f}'.format(float(p.get_height())),
            fontsize=12, color='black', ha='center', va='bottom')

    os.makedirs('plots', exist_ok=True)
    fig.savefig('plots/balancing_overall.pdf')
    # Close (not just clear) the figure so it is released and the notebook
    # does not render an empty leftover figure.
    plt.close(fig)

<Figure size 864x432 with 0 Axes>

As we can see, all the datasets are fairly unbalanced. The Green dataset is the only one that could perhaps be considered balanced, with a class ratio slightly more even than 70% / 30% (about 68% good / 32% bad).

Preprocessing:
- Missing values
- Normalization
- Outliers
- Feature Selection
- Balancing

Let's check for missing values first.

In [43]:
# Count the rows that contain at least one missing value.
print('Missing values: {}'.format(
    coloscopies.isnull().any(axis=1).sum()
))

Missing values: 0


As we can see, there are no missing values in any dataset, so there is no need to treat them.

Before preprocessing, let's measure the accuracy obtained with each classifier so that we can define a baseline. For now we will use a train/test split, but in the final evaluation we will use cross-validation, because it is better suited to problems like this one with only a few hundred observations (10-fold? — still to be decided).

Each dataset has its own distribution of values and differs in the number of good- and bad-quality colposcopies. This may indicate that an image that is good using the green filter may be bad using the Hinselmann technique, so we think it is a good idea to analyze the datasets separately.