In [12]:
import os
from scipy.io import loadmat
import numpy as np
import pandas as pd
import wandb
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm.notebook import tqdm
from joblib import Parallel, delayed
import plotly.express as px

In [None]:
# Describe the local ./Training_WFDB directory as a W&B artifact of type "dataset".
artifact = wandb.Artifact(name="PhysioNet_Dataset", type="dataset")
artifact.add_dir("./Training_WFDB")

In [None]:
# Upload the dataset artifact from a dedicated "LogDataset" run, then close the run.
run = wandb.init(project="PhysioNet_Challenge", name="LogDataset")
run.log_artifact(artifact)
run.finish()

In [13]:
import plotly.graph_objects as go

In [14]:
def make_plot(signal, filename):
    """
    Build a Plotly figure with one line trace per row of ``signal``.

    Parameters
    ----------
    signal : 2-D array
        Indexed as ``signal[channel]``; ``signal.shape[1]`` is the number of
        samples and becomes the x axis (0 .. n_samples - 1).
    filename : str
        Record name, shown as the figure title.

    Returns
    -------
    plotly.graph_objects.Figure
        Every trace is created with ``visible="legendonly"``, so it stays
        hidden until it is toggled from the legend.
    """
    figure = go.Figure(layout=dict(title=dict(text=filename)))
    n_channels, n_samples = signal.shape[0], signal.shape[1]
    x = list(range(n_samples))
    # Use the real channel count instead of assuming 12 rows, so signals with
    # fewer channels do not raise IndexError and extra channels are not dropped.
    for i in range(n_channels):
        figure.add_trace(go.Scatter(x=x, y=signal[i], name=f"Channel {i + 1}", visible="legendonly"))

    return figure

In [15]:
# Lookup from SNOMED CT code to diagnosis name, read from the Dx map CSV.
df = pd.read_csv("./Dx_map.csv")
mapping = dict(zip(df['SNOMED CT Code'].tolist(), df['Dx'].tolist()))

In [16]:
def get_data(file_name, data_dir="./Training_WFDB", dx_map=None):
    """
    Load the metadata and the signal length of one record.

    Reads ``<data_dir>/<record>.hea`` for the ``#Age``, ``#Sex`` and ``#Dx``
    lines and ``<data_dir>/<record>.mat`` (key ``"val"``) for the signal.

    Parameters
    ----------
    file_name : str
        Record name with or without an extension; everything from the first
        "." onwards is discarded.
    data_dir : str, optional
        Directory holding the ``.hea``/``.mat`` pair.
    dx_map : dict, optional
        Maps integer diagnosis codes to diagnosis names. When omitted, the
        notebook-level ``mapping`` is used.

    Returns
    -------
    dict
        ``id`` (record name), ``length`` (``signal.shape[1]``), ``age``
        (float, NaN when absent or unparsable), ``sex`` (str, None when the
        line is absent) and ``diagnosis`` (list of names, empty when the
        line is absent). All five keys are always present.

    Raises
    ------
    KeyError
        If a diagnosis code is not a key of the code map.
    """
    if dx_map is None:
        dx_map = mapping  # notebook-level lookup

    file_name = file_name.split(".")[0]
    with open(os.path.join(data_dir, f"{file_name}.hea")) as f:
        header_lines = f.readlines()

    signal = loadmat(os.path.join(data_dir, f"{file_name}.mat"))["val"]

    # Defaults keep every record's key set identical, so callers that build
    # column lists never end up with ragged columns or a late KeyError.
    reqd = {
        "id": file_name,
        "length": signal.shape[1],
        "age": np.nan,
        "sex": None,
        "diagnosis": [],
    }
    for line in header_lines:
        # Everything after the first ":" is the field value.
        value = line.partition(":")[2].strip()

        if line.startswith("#Age"):
            try:
                reqd["age"] = float(value)
            except ValueError:
                # Non-numeric or empty age: treat as missing.
                reqd["age"] = np.nan

        if line.startswith("#Sex"):
            reqd["sex"] = value

        if line.startswith("#Dx"):
            # Comma-separated codes; skip empty pieces so a blank or
            # trailing-comma entry does not crash int().
            codes = [code.strip() for code in value.split(",") if code.strip()]
            reqd["diagnosis"] = [dx_map[int(code)] for code in codes]
    return reqd

In [17]:
# Every entry in the dataset directory; later cells keep only the ".hea" names.
files = os.listdir("./Training_WFDB")

In [18]:
# Keep only the header files; each one names a record.
files = [entry for entry in files if entry.endswith(".hea")]

# Parse every record once.
data = [get_data(header) for header in tqdm(files)]

# Pivot the per-record dicts into one list per column.
df = {column: [] for column in ("id", "age", "sex", "diagnosis", "length")}
for record in tqdm(data):
    for column, values in df.items():
        values.append(record[column])

  0%|          | 0/6877 [00:00<?, ?it/s]

  0%|          | 0/6877 [00:00<?, ?it/s]

In [19]:
# Wrap the metadata columns in a W&B Table; a later cell logs it.
table = wandb.Table(dataframe=pd.DataFrame(df))

In [20]:
# Preview the per-record metadata as a DataFrame.
pd.DataFrame(df)

Unnamed: 0,id,age,sex,diagnosis,length
0,A4137,51.0,Male,[atrial fibrillation],5000
1,A6092,67.0,Female,"[atrial fibrillation, right bundle branch block]",5000
2,A0201,22.0,Female,[st depression],7500
3,A6551,84.0,Male,[atrial fibrillation],7651
4,A3493,90.0,Male,"[right bundle branch block, premature atrial c...",5213
...,...,...,...,...,...
6872,A1652,65.0,Male,[st depression],21000
6873,A4024,14.0,Male,[sinus rhythm],7000
6874,A3329,83.0,Female,[1st degree av block],5000
6875,A0979,71.0,Female,"[atrial fibrillation, right bundle branch block]",10500


In [22]:
# Log the metadata table from an "EDA" run under the "timeseriesbois" entity.
run = wandb.init(project="PhysioNet_Challenge", name="EDA", entity="timeseriesbois")
# Declare a previously logged run table as an input of this run.
run.use_artifact("timeseriesbois/PhysioNet_Challenge/run-2ibagv9z-data_table:v0", type="run_table")
run.log({"eda_table": table})
run.finish()




VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…




VBox(children=(Label(value='0.391 MB of 0.391 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

In [None]:
from sklearn.model_selection import train_test_split

# Fixed seed so the train/validation/test membership is identical on every run.
SPLIT_SEED = 42

# os.listdir returns entries in arbitrary order, so sort before splitting;
# otherwise the seed alone would not pin which record lands in which split.
train_files, test_files = train_test_split(sorted(files), test_size=0.1, random_state=SPLIT_SEED)
# Validation is 10% of the remaining (non-test) records.
train_files, val_files = train_test_split(train_files, test_size=0.1, random_state=SPLIT_SEED)

In [None]:
# Show the split sizes in the order (train, test, validation).
len(train_files), len(test_files), len(val_files)

In [None]:
# Parse the training records and pivot them into one list per column.
data = [get_data(header) for header in tqdm(train_files)]
df = {column: [] for column in ("id", "age", "sex", "diagnosis")}
for record in tqdm(data):
    for column, values in df.items():
        values.append(record[column])

# Table of training-set metadata, logged by a later cell.
train_table = wandb.Table(dataframe=pd.DataFrame(df))

In [None]:
# Parse the validation records and pivot them into one list per column.
data = [get_data(header) for header in tqdm(val_files)]
df = {column: [] for column in ("id", "age", "sex", "diagnosis")}
for record in tqdm(data):
    for column, values in df.items():
        values.append(record[column])

# Table of validation-set metadata, logged by a later cell.
val_table = wandb.Table(dataframe=pd.DataFrame(df))

In [None]:
# Parse the test records and pivot them into one list per column.
data = [get_data(header) for header in tqdm(test_files)]
df = {column: [] for column in ("id", "age", "sex", "diagnosis")}
for record in tqdm(data):
    for column, values in df.items():
        values.append(record[column])

# Table of test-set metadata, logged by a later cell.
test_table = wandb.Table(dataframe=pd.DataFrame(df))

In [None]:
# Start the "DataSplit" run that the next cell logs the split tables to.
run = wandb.init(project="PhysioNet_Challenge", name="DataSplit")
# NOTE(review): this artifact path uses the 'manan-goel' entity, whereas the "EDA" run
# cell uses 'timeseriesbois' — confirm which entity is intended.
run.use_artifact('manan-goel/PhysioNet_Challenge/run-19xtri8j-eda_table:v0', type='run_table')

In [None]:
# Log the three split tables in a single call, then close the run.
split_tables = dict(
    TrainingData=train_table,
    ValidationData=val_table,
    TestingData=test_table,
)
run.log(split_tables)
run.finish()

In [None]:
# Empty W&B Table; the next cell adds one row per record, with the plot in "signals".
columns = ["id", "age", "sex", "diagnosis", "signals"]
table = wandb.Table(columns=columns)

In [None]:
# One table row per record: its metadata plus the interactive plot as HTML.
files = [entry for entry in files if entry.endswith(".hea")]
for header in tqdm(files):
    name = header.split(".")[0]
    signal = loadmat(f"./Training_WFDB/{name}.mat")["val"]
    plot_html = make_plot(signal, name).to_html()
    meta = get_data(name)

    table.add_data(name, meta["age"], meta["sex"], meta["diagnosis"], wandb.Html(plot_html))

In [None]:
# Re-list the dataset directory, keeping only the ".hea" header files.
files = [i for i in os.listdir("./Training_WFDB") if i.endswith(".hea")]

In [None]:
# Column lists for the table of metadata plus one image per record.
df = {
    "id": [],
    "age": [],
    "sex": [],
    "diagnosis": [],
    "signals": [],
    "length": [],
}
files = [i for i in files if i.endswith(".hea")]

data = [get_data(file) for file in tqdm(files)]
for row in tqdm(data):
    # Drive the loop from the table's columns, not the record's keys, so every
    # column gains exactly one entry per record. A field missing from a record
    # becomes None instead of silently leaving that column one entry short.
    for k in df.keys():
        if k == "signals":
            continue  # filled below from the image file, not from the record
        df[k].append(row.get(k))
    # NOTE(review): the PNG files are not created anywhere in this notebook —
    # confirm they exist in ./Training_WFDB before running.
    df['signals'].append(wandb.Image(f"./Training_WFDB/{row['id']}.png"))

In [None]:
# Preview the metadata-plus-image columns as a DataFrame.
pd.DataFrame(df)

In [None]:
# Rebind `table` to a W&B Table built from the current column lists.
table = wandb.Table(dataframe=pd.DataFrame(df))

In [None]:
# Start the "EDA_Signal" run and declare the earlier EDA run table as its input.
run = wandb.init(project="PhysioNet_Challenge", name="EDA_Signal")
run.use_artifact('manan-goel/PhysioNet_Challenge/run-19xtri8j-eda_table:v0', type='run_table')

In [None]:
# Log the table under the key "Signals_updated", then close the run.
run.log({"Signals_updated": table})
run.finish()

In [None]:
from sklearn.preprocessing import MultiLabelBinarizer

In [None]:
# Learn the set of diagnosis classes from the per-record diagnosis lists.
diagnosis_lists = pd.DataFrame(df)["diagnosis"]
mlb = MultiLabelBinarizer()
mlb.fit(diagnosis_lists)

In [None]:
import pickle

# Persist the fitted binarizer so later cells can reload it from disk.
serialized = pickle.dumps(mlb)
with open("mlb.pickle", "wb") as handle:
    handle.write(serialized)

In [None]:
# Inspect the shape of one record's signal matrix.
# NOTE: `name` is not assigned in this cell; it holds whatever an earlier loop last set.
signal = loadmat(f"./Training_WFDB/{name}.mat")["val"]
signal.shape

In [None]:
# Common length (in samples) that every signal is right-padded to with zeros.
MAX_SIGNAL_LENGTH = 72000

files = [i for i in files if i.endswith(".hea")]
signals = []
for f in tqdm(files):
    name = f.split(".")[0]
    signal = loadmat(f"./Training_WFDB/{name}.mat")["val"]

    pad_width = MAX_SIGNAL_LENGTH - signal.shape[1]
    if pad_width < 0:
        # np.pad would also raise ValueError here, but without naming the record.
        raise ValueError(
            f"{name}: signal has {signal.shape[1]} samples, "
            f"more than MAX_SIGNAL_LENGTH={MAX_SIGNAL_LENGTH}"
        )
    # Pad only the end of the sample axis; the channel axis is left untouched.
    s = np.pad(signal, ((0, 0), (0, pad_width)), "constant", constant_values=0)
    signals.append(s)

In [None]:
# Convert the list of per-record matrices into one array and show its shape.
signals = np.array(signals)
signals.shape

In [None]:
# Write the signal array to signals.npy in the working directory.
np.save("signals.npy", signals)

In [None]:
# Histogram of the "length" column (100 bins).
pd.DataFrame(df)["length"].hist(bins=100)

In [None]:
# `resample` was never imported anywhere in the notebook, so this cell raised
# NameError on a fresh kernel; import it where it is used.
from scipy.signal import resample

# Number of samples every channel is resampled to.
TARGET_LENGTH = 5000

signals = []
diagnoses = []
for file in tqdm(files):
    name = file.split(".")[0]
    signal = loadmat(f"./Training_WFDB/{name}.mat")["val"]
    data = get_data(name)

    # Resample each row (channel) along the sample axis in one call.
    signals.append(resample(signal, TARGET_LENGTH, axis=1))
    diagnoses.append(data["diagnosis"])
signals = np.array(signals)

In [None]:
import pickle

# Reload the binarizer fitted earlier in the notebook.
# NOTE: pickle.load executes arbitrary code; only load files this notebook wrote.
with open("mlb.pickle", "rb") as f:
    binarizer = pickle.load(f)

# Use transform, not fit_transform: refitting would discard the loaded classes
# and could change the label columns if `diagnoses` differs from the data the
# encoder was fitted on.
labels = binarizer.transform(diagnoses)

In [None]:
# Write the resampled signal array to downsampled_signals.npy.
np.save("downsampled_signals.npy", signals)

In [None]:
# One empty list per class known to the binarizer. This rebinds `df`, which
# earlier cells used for metadata columns.
df = {
    k: [] for k in binarizer.classes_
}

In [None]:
# For every record, append 1 to the columns of its diagnoses and 0 to the rest.
for record_diagnoses in diagnoses:
    for label, column in df.items():
        column.append(int(label in record_diagnoses))

In [None]:
# Write the 0/1 label columns to labels.csv (to_csv also writes the index by default).
pd.DataFrame(df).to_csv("labels.csv")

In [None]:
# Bundle the resampled signals and the label file into one dataset artifact.
artifact = wandb.Artifact(name="preprocessed_dataset", type="dataset")
for output_path in ("downsampled_signals.npy", "labels.csv"):
    artifact.add_file(output_path)

# Upload it from its own run, with the EDA run table declared as an input.
run = wandb.init(project="PhysioNet_Challenge", name="Preprocessed_Dataset")
run.use_artifact('manan-goel/PhysioNet_Challenge/run-19xtri8j-eda_table:v0', type='run_table')
run.log_artifact(artifact)
run.finish()