In [34]:
import numpy as np
import pandas as pd

In [35]:
# Load the label names from labels.tok: one name per line, with the file's
# first line skipped.
with open('labels.tok', 'r') as labels_file:
    labels = labels_file.read().splitlines()[1:]

In [36]:
def read_hyp_file(path):
    """Parse a text file of bracketed, whitespace-separated float vectors.

    Each record opens with '[' and closes with ']'. A record may sit on a
    single line or wrap over several consecutive lines.

    Args:
        path: Path of the text file to read.

    Returns:
        A list containing one list of floats per record, in file order.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If a token inside a record cannot be converted to float.
    """
    data = []
    sublist = []
    with open(path, 'r') as file:
        for line in file:
            line = line.strip()

            if line.startswith('['):
                # A new record begins. Remove only the opening bracket so the
                # last character of a wrapped first line is kept intact.
                sublist = []
                line = line[1:]

            record_closed = line.endswith(']')
            if record_closed:
                line = line[:-1]

            sublist.extend(float(value) for value in line.split())

            if record_closed:
                # Reached for multi-line records and for records that open and
                # close on the same line, e.g. "[1.0 2.0]".
                data.append(sublist)
                sublist = []
    return data

In [37]:
# Parse hyps.txt into a list of float lists, one inner list per bracketed record.
data = read_hyp_file('hyps.txt')

In [38]:
# Number of records parsed from hyps.txt.
len(data)

36769

In [39]:
import csv

def count_lines(csv_file):
    """Return how many rows csv.reader yields for the file at csv_file.

    Every parsed row is counted, so a header row (if present) is included.
    """
    total = 0
    with open(csv_file, 'r') as handle:
        for _row in csv.reader(handle):
            total += 1
    return total

# Row count of development.csv as seen by csv.reader; a header row, if any, is included.
# NOTE(review): presumably a sanity check against len(data) above — confirm.
csv_file = 'development.csv'
num_lines = count_lines(csv_file)
print(f"The CSV file contains {num_lines} lines.")

The CSV file contains 36770 lines.


In [40]:
# One row per record parsed from hyps.txt, one column per label name.
df_predictions = pd.DataFrame(data, columns=labels)

In [41]:
from sklearn.preprocessing import MinMaxScaler

# Rescale each label column independently with MinMaxScaler (default output
# range [0, 1]), using that column's own min and max over df_predictions.
scaler = MinMaxScaler()
normalized_data = scaler.fit_transform(df_predictions.values)
# Rebuild the frame so the scaled values keep the original column names.
df_predictions = pd.DataFrame(normalized_data, columns=df_predictions.columns)

Thresholding

In [42]:
# Build the validation-set frames, each with one column per label:
# scores parsed from validation_hyps.txt and reference values parsed from
# validation_refs.txt (used as ground truth in the threshold search).

df_validation_pred = pd.DataFrame(read_hyp_file('validation_hyps.txt'), columns=labels)
df_validation_gt = pd.DataFrame(read_hyp_file('validation_refs.txt'), columns=labels)

# Min-max scale the validation scores column by column.
# NOTE(review): this fits a new MinMaxScaler on the validation scores, separate
# from the one fitted on df_predictions in the earlier cell. The two sets are
# therefore scaled with different min/max values, so a threshold tuned on these
# scaled scores may not correspond to the same raw score in df_predictions —
# confirm this is intended.
scaler = MinMaxScaler()
normalized_data = scaler.fit_transform(df_validation_pred.values)
df_validation_pred = pd.DataFrame(normalized_data, columns=df_validation_pred.columns)

In [43]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Reference values of the 'No positive label' column on the validation set.
ground_truth = df_validation_gt['No positive label'].values

best_threshold = None
best_metric_score = 0.0

# Sweep 100 evenly spaced cut-offs over [0, 1]. A row is predicted positive
# when its scaled score is >= the cut-off. The first cut-off that reaches the
# highest accuracy is kept (later ties do not replace it).
for threshold in np.linspace(0, 1, num=100):
    above_cutoff = df_validation_pred['No positive label'] >= threshold
    predicted = above_cutoff.to_numpy().astype(int)
    metric_score = accuracy_score(ground_truth, predicted)
    if metric_score > best_metric_score:
        best_threshold, best_metric_score = threshold, metric_score

print("Best Threshold:", best_threshold)
print("Best Metric Score:", best_metric_score)

Best Threshold: 0.98989898989899
Best Metric Score: 0.9637523834695766


In [44]:
# Use the accuracy-maximising cut-off from the sweep for the filtering step below.
threshold = best_threshold

In [45]:
# Zero out every score in rows flagged as having no positive label, then drop
# the helper column. The comparison is `>=` so it matches the rule used when
# the threshold was tuned (predicted positive when score >= threshold); with
# `>` a score exactly equal to the threshold would be treated differently.
no_label_rows = df_predictions['No positive label'] >= threshold
df_predictions.loc[no_label_rows, :] = 0
df_predictions = df_predictions.drop('No positive label', axis=1)

In [46]:
# Add dicom ids
# Each non-blank line of test_dicoms.tok is treated as a '/'-separated path;
# the id is the last path component up to its first '.'.
numbers = []
with open('test_dicoms.tok', 'r') as file:
    for line in file:
        # Strip the line ending first so an entry without a '.' does not keep
        # a trailing newline in its id.
        line = line.strip()
        if not line:
            # Blank lines (e.g. at the end of the file) carry no id and would
            # otherwise make the id list longer than the frame.
            continue
        number = line.split('/')[-1].split('.')[0]
        numbers.append(number)

# Ids are matched to rows by position; insert raises ValueError if the number
# of ids differs from the number of rows.
df_predictions.insert(0, 'dicom_id', numbers)

In [47]:
# Preview the first rows of the frame that is written to submission.csv below.
df_predictions.head()

Unnamed: 0,dicom_id,Fracture,Pneumonia,Cardiomegaly,Pleural Other,Infiltration,Pneumomediastinum,Edema,Lung Opacity,Pneumothorax,...,Emphysema,Fibrosis,Atelectasis,Nodule,Subcutaneous Emphysema,Pneumoperitoneum,Lung Lesion,Enlarged Cardiomediastinum,Support Devices,Consolidation
0,2833b85f-3bb4273f-cffd3794-2bf2cd57-7ddb3f5f,0.18225,0.611718,0.343113,0.285934,0.589168,0.221845,0.332137,0.225033,0.18784,...,0.229469,0.257269,0.126203,0.388263,0.109379,0.11355,0.265342,0.307235,0.118208,0.277249
1,7fdad032-90608fe2-c1f5a700-bf95f6f9-e9b0dbc7,0.188554,0.733192,0.176213,0.309677,0.594983,0.190225,0.27398,0.474208,0.162861,...,0.336326,0.346683,0.237759,0.494115,0.102404,0.125139,0.347526,0.268361,0.143047,0.436949
2,8b88f03a-2004113d-4118def5-35820db5-d0913df8,0.256748,0.616698,0.222315,0.332335,0.551194,0.159743,0.200137,0.29401,0.087097,...,0.32022,0.261468,0.182192,0.438836,0.049597,0.084579,0.219485,0.305169,0.071268,0.204907
3,e4ee2428-245e9222-399ef211-62b44204-35580e8d,0.35531,0.6019,0.402589,0.4263,0.5962,0.115664,0.298513,0.289182,0.081456,...,0.398963,0.328726,0.279989,0.484472,0.044968,0.127768,0.300531,0.293487,0.168122,0.241923
4,2630b0b8-2d5af3a6-1a02c2ca-952e9535-b44c35ee,0.218914,0.782368,0.317242,0.331897,0.699162,0.235713,0.417185,0.540468,0.182036,...,0.314578,0.421755,0.300189,0.496728,0.144357,0.190964,0.387878,0.38714,0.176693,0.526943


In [48]:
# Write the final frame to submission.csv; index=False omits the row index column.
df_predictions.to_csv('submission.csv', index=False)