# Tabular data visualization

In [31]:
import matplotlib.pyplot as plt
import pandas as pd
import torch
import json
import os
import zipfile
import torchvision.transforms as transforms
from tqdm import tqdm

In [3]:
# Unpack the training archive into the current working directory.
with zipfile.ZipFile('data_train.zip', 'r') as zip_obj:
    zip_obj.extractall()
# Read the JSON in binary mode: json.load then detects the encoding from the
# bytes themselves instead of relying on the platform's default text encoding.
with open("data_train/data_train.json", "rb") as fin:
    data_train = json.load(fin)

# The first CSV column is used as the DataFrame index.
targets_train = pd.read_csv("data_train/targets_train.csv", index_col=0)


# Same procedure for the test archive (no targets file is read for it here).
with zipfile.ZipFile('data_test.zip', 'r') as zip_obj:
    zip_obj.extractall()
with open("data_test/data_test.json", "rb") as fin:
    data_test = json.load(fin)

The first step is to extract the features from the JSON file into a pandas DataFrame.

The data describes different objects mainly by bounding box coordinates and probability of belonging to a particular type.

Since the data is not ground truth but the noisy predictions of 3 different models, it can be useful to filter the detections with a probability threshold.

In [6]:
def extract_basic_features(breast, thresh=0.35):
    """Flatten one breast record into per-view / per-type / per-model detections.

    Parameters
    ----------
    breast : dict
        Record providing the scalar entries ``tissue_density_predicted``,
        ``cancer_probability_predicted`` and ``laterality`` and, under the
        keys ``"CC"`` and ``"MLO"``, an iterable of detected objects. Each
        object is read through its ``object_type``, ``model_number``,
        ``probability`` and ``coordinates`` entries.
    thresh : float, optional
        Minimum probability (inclusive) a detection needs to be kept.
        Defaults to 0.35.

    Returns
    -------
    predictors : dict
        The three scalar entries followed, for every view / object type /
        model combination, by a ``..._crd`` list of coordinates and a
        ``..._prob`` list of probabilities (both empty when nothing matched).
    max_x, max_y
        Largest ``coordinates[2]`` and ``coordinates[3]`` among the kept
        detections; 0 when nothing was kept.
    """
    predictors = {}
    for key in ["tissue_density_predicted", "cancer_probability_predicted", "laterality"]:
        predictors[key] = breast[key]

    view = ['CC', 'MLO']
    object_type = ['nipple', 'fibrocystic_breast_changes', 'mass_benign', 'artifact',
                   'lymphonodus', 'calcinates_benign', 'other', 'calcified vessels',
                   'mass_malignant', 'pectoral muscle', 'skin_thickening', 'papilloma',
                   'calcified cyst', 'calcinates_malignant']
    model_number = [1, 2, 3]
    max_x, max_y = 0, 0

    for v in view:
        for ob in object_type:
            for m in model_number:
                coords, probs = [], []
                for obj in breast[v]:
                    # The whole condition is parenthesised so it can legally
                    # span several lines.
                    if (ob in obj["object_type"]
                            and obj["model_number"] == m
                            and obj['probability'] >= thresh):
                        coords.append(obj['coordinates'])
                        probs.append(obj['probability'])
                        max_x = max(max_x, obj['coordinates'][2])
                        max_y = max(max_y, obj['coordinates'][3])
                # Keys are inserted as crd then prob for every combination,
                # whether or not anything matched, so every record yields the
                # same key order.
                predictors['{}_{}_model_{}_crd'.format(v, ob, m)] = coords
                predictors['{}_{}_model_{}_prob'.format(v, ob, m)] = probs
    return predictors, max_x, max_y

In [8]:
# Flatten every training record and track the largest coordinates[2] /
# coordinates[3] values returned by extract_basic_features across all records.
predictors = {}
w = h = 0
for sample_id, breast in data_train.items():
    features, frame_w, frame_h = extract_basic_features(breast)
    predictors[sample_id] = features
    w = max(w, frame_w)
    h = max(h, frame_h)

print('Max width of the frame is {}, max height of the frame is {}.'.format(w, h))

Max width of the frame is 1216, max height of the frame is 2208.


In [10]:
# One row per record id, joined with the targets on the index (pandas' default
# inner join); laterality becomes 1 for "L" and 0 for any other value.
df_train = (
    pd.DataFrame.from_dict(predictors, orient="index")
    .merge(targets_train, left_index=True, right_index=True)
    .assign(laterality=lambda frame: frame['laterality'].map(lambda lat: int(lat == "L")))
)

In [33]:
# Preview the first five rows of the assembled training frame.
df_train.head(n=5)

Unnamed: 0,tissue_density_predicted,cancer_probability_predicted,laterality,CC_nipple_model_1_crd,CC_nipple_model_1_prob,CC_fibrocystic_breast_changes_model_1_crd,CC_fibrocystic_breast_changes_model_1_prob,CC_mass_benign_model_1_crd,CC_mass_benign_model_1_prob,CC_mass_benign_model_2_crd,...,MLO_papilloma_model_1_prob,MLO_calcified cyst_model_1_crd,MLO_calcified cyst_model_1_prob,MLO_calcinates_malignant_model_1_crd,MLO_calcinates_malignant_model_1_prob,MLO_calcinates_malignant_model_2_crd,MLO_calcinates_malignant_model_2_prob,MLO_calcinates_malignant_model_3_crd,MLO_calcinates_malignant_model_3_prob,BiRads
f55c7e1c-9c91-4509-8724-4309341db0a5,1,0.045,1,"[[949, 1232, 1018, 1391]]",[0.9894999861717224],[],[],"[[484, 600, 518, 628]]",[0.5989999771118164],[],...,[],[],[],[],[],[],[],[],[],1
7ce278f3-df17-4b91-b17a-d038005a044f,2,0.35,0,"[[889, 1300, 935, 1430]]",[0.4311999976634979],"[[605, 1166, 771, 1358]]",[0.751800000667572],[],[],"[[608, 1159, 780, 1337]]",...,[],[],[],[],[],[],[],[],[],1
8e462f38-8063-46c0-bdf5-15be8c3bebbb,2,0.993,1,[],[],[],[],"[[837, 1026, 877, 1077]]",[0.44600000977516174],[],...,[],[],[],"[[417, 1444, 493, 1546]]",[0.7111999988555908],[],[],[],[],1
3dbd7019-8e7b-4e2d-8343-e976b0f9d581,2,0.963,0,[],[],[],[],[],[],[],...,[],[],[],"[[441, 1417, 516, 1514], [604, 1490, 669, 1602]]","[0.9528999924659729, 0.7172999978065491]",[],[],[],[],1
71981fe9-8f8e-4ee1-834e-832ae0f9ab39,2,0.974,1,"[[1111, 1205, 1150, 1326]]",[0.9546999931335449],[],[],"[[376, 426, 466, 482]]",[0.9559000134468079],"[[372, 422, 477, 486]]",...,[],[],[],[],[],[],[],[],[],2


Removing columns that are empty for every sample (no detection passed the probability threshold):

In [None]:
# A column is dropped only when every one of its cells is an empty list.
# Cells are inspected one by one rather than through `Series.sum() == []`:
# summing concatenates all lists of a column, and on numeric columns it compares
# a NumPy scalar with a list, which yields an empty array whose truth value is
# deprecated (an error in recent NumPy releases).
# The length guard keeps a frame with no rows untouched.
empty_columns = [
    elem for elem in df_train.columns
    if len(df_train) > 0
    and all(isinstance(cell, list) and not cell for cell in df_train[elem])
]
df_train = df_train.drop(columns=empty_columns)

In [12]:
# Features are every column except the last one; the target is the "BiRads" column.
# NOTE(review): this assumes "BiRads" is the last column of df_train - confirm.
X = df_train.iloc[:, :-1].copy()
y = df_train.loc[:, "BiRads"].copy()

In [34]:
def make_tensor(df_raw, max_h, max_w):
    """Rasterize one feature row into a multi-channel tensor.

    The first three entries of ``df_raw`` each fill an entire channel. Every
    following (coordinates, probabilities) pair of entries becomes one channel
    in which each box ``[x_min, y_min, x_max, y_max]`` is painted with its
    probability; where boxes overlap the larger value is kept.

    Parameters
    ----------
    df_raw
        Row whose entries are read positionally through ``.iloc``.
    max_h, max_w
        Frame size; the tensor is allocated as ``max_h + 2`` by ``max_w + 4``.

    Returns
    -------
    torch.Tensor
        Tensor of shape ``(C, max_h + 2, max_w + 4)`` with
        ``C = int((len(row) - 3) / 2 + 3)``.
    """
    n_scalar = 3
    # every feature after the scalar ones occupies two entries: boxes and probabilities
    channels = int((df_raw.shape[0] - n_scalar) / 2 + n_scalar)
    pic = torch.zeros(channels, max_h + 2, max_w + 4)

    # the three scalar entries are broadcast over their whole channel
    first, second, third = list(df_raw.iloc[0:3])
    pic[0, :, :] = first
    pic[1, :, :] = second
    pic[2, :, :] = third

    for channel in range(n_scalar, channels):
        boxes = df_raw.iloc[2 * channel - 3]
        probs = df_raw.iloc[2 * channel - 2]
        for idx, box in enumerate(boxes):
            x_min, y_min, x_max, y_max = box
            prob = probs[idx]

            # keep the most confident prediction for every pixel of the box
            region = pic[channel, y_min:y_max, x_min:x_max]
            pic[channel, y_min:y_max, x_min:x_max] = torch.where(
                region > prob, region, torch.full_like(region, prob))

    return pic

Creating a folder with train tensors for further classification.

In [None]:
!mkdir img_tensors
for row in tqdm(range(len(X))):
    img_tensor = make_tensor(X.iloc[row], h, w)
    out = transforms.functional.resize(img_tensor, (330, 180))
    torch.save(out, 'img_tensors/{}.pt'.format(X.iloc[row].name))
!zip -r /img_tensors.zip img_tensors

Creating test tensors in the same way as for train.

In [23]:
# Flatten every test record and track the largest coordinates[2] /
# coordinates[3] values returned by extract_basic_features.
# NOTE(review): this overwrites the `w` and `h` computed from the training data.
test_predictors = {}
w = h = 0
for sample_id, breast in data_test.items():
    features, frame_w, frame_h = extract_basic_features(breast)
    test_predictors[sample_id] = features
    w = max(w, frame_w)
    h = max(h, frame_h)
print('Max width of the frame is {}, max height of the frame is {}.'.format(w, h))

Max width of the frame is 1216, max height of the frame is 2208.


In [24]:
# One row per test record id.
df_test = pd.DataFrame.from_dict(test_predictors, orient="index")
# Encode laterality exactly as the training frame does ("L" -> 1, anything
# else -> 0) so the laterality feature means the same thing in both sets.
df_test['laterality'] = df_test['laterality'].map(lambda lat: int(lat == "L"))

In [25]:
# Keep only the columns that are also present in the training features X;
# the remaining columns stay in their existing order.
extra_columns = [elem for elem in df_test.columns if elem not in X.columns]
df_test = df_test.drop(columns=extra_columns)

In [None]:
!mkdir test_img_tensors
for row in tqdm(range(len(df_test))):
    img_tensor = make_tensor(df_test.iloc[row], h, w)
    out = transforms.functional.resize(img_tensor, (330, 180))
    torch.save(out, 'test_img_tensors/{}.pt'.format(df_test.iloc[row].name))
!zip -r /test_img_tensors.zip test_img_tensors