In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# import Tableone
from tableone import TableOne

from ydata_profiling import ProfileReport


In [2]:
df = pd.read_csv("PET_Diszitis final.tsv", sep="\t")

# Remove rows with age not convertible to int
df = df[pd.to_numeric(df['age'], errors='coerce').notnull()]

# Remove rows with numbers in "name"
df = df[~df['name'].str.contains(r"[0-9]", na=False)]

# Remove rows with nan in "name"
df = df[df['name'].notna()]

df

Unnamed: 0.1,Unnamed: 0,name,DOB,age,Fokus abgeklärt,unspez gewertet,unspez Fokus abgeklärt 0nein 1ja+neg 2ja+pos,weitere,Thrombus,"sex (1F, 2M)",...,"1 = Fokussuche, 2 = Ausschluss/Nachweis Diszitis weil MRT unklar, 3 = MRT nicht möglich / Ersatz für MRT, 4 = VK, 5 = Materialschaden, Frage nach Infekt, 6 = Infekt im Labor ohne Fokus",Risikofaktoren,RevisionsOP 2 =kein Infekt,ASA,ausgeheilt 2=NA 3=dead,"Neurologie 1 = Paresen, 2 = vorbestehend, 3 = Tetraparese","Besserung 1 = komplett 2 =ja, aber nicht auf normal 3 tot 4 neues Defizit","(1= DM, 2=iv Drogen, 3=PAVK, 4=Cortisontherapie, 5=Zahnbehandlung, 6=Immunschwäche, 7=zn CTX, 8=Malignom, 9=Infiltrationen, 10=C2, 11=Parkinson, 12 = Niereninsuffizienz)",Unnamed: 45,Unnamed: 46
0,1069341,"Rankel, Christine",7/11/1965,56,-,-,-,-,,1,...,1,1,1.0,3.0,1,,,,,
1,1121310,"Mentzel, Frank",17/9/1954,67,-,-,-,"Ulcus, bekannt",,2,...,1,4,2.0,3.0,1,,,,ein PET Onko zusätzlich davor,
3,1187375,"Vial, Renee",10/3/1950,72,-,-,-,BronchialCA,,1,...,1,7.8,0.0,2.0,1,,,,ein PET Onko zusätzlich davor,
4,1202379,"Mayr, Daniela",8/5/1984,38,-,-,-,Zn Pankreatitis,,1,...,2,2,0.0,3.0,x,,,,,
5,1227623,"Zellner, Leonhard",12/9/1947,75,-,-,-,-,1,2,...,1,0,0.0,2.0,1,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
354,2622568,"Kett, Margarete",9/2/1931,87,-,-,-,-,,1,...,,,,,,,,,,
359,2592895,"Wittmann, Peter",13/2/1959,59,-,-,-,-,,2,...,1,1,0.0,3.0,2,2.0,x,,,
360,2604561,"Gruenwald, Konrad",22/6/1934,84,-,-,-,-,,2,...,,,,,,,,,,
363,1559145,"Meier, Franz",17/7/1934,81,1,-,-,-,,2,...,1,1,0.0,3.0,2,0.0,x,,,


In [3]:
categorical = [
    'name',
    'sex (1F, 2M)',
    'Fokus abgeklärt',
    'unspez gewertet',
    'unspez Fokus abgeklärt 0nein 1ja+neg 2ja+pos',
    'weitere',
    'Diagnose',
    'HWS',
    'BWS',
    'LWS',
    'intraspinal',
    'biopsy',
    'OP',
    '1 = lowgrade 2 = highgrade',
    'histo surgery 3 intermediär 0 neg',
    'mibi other',
    'discitis in MRT = TE, 2 Frage diszitis b MRT unklar, 0 = n übereinstimmend, 3 kein MRT, 4 Ausschluss Diszitis, 5 Diszitis im MRT n erkannt, 6 neuer Nachweis Diszitis',
    'other spinal TE',
    'spinal: overall, 0 = nicht übereinstimmend, 1= übereinstimmend, 2 = MRT unklar, 3 kein MRT',
    'initialer Fokus',
    'neuer Fokus nach PET',
    'TE at sus focus 2 = vorOP spinal, 0 Fokus nicht dargestellt, 3 kein Fokus, 4 Fokus weg/saniert, 5 Fokus nicht gefunden',
    'reason for PET',
    '1 = Fokussuche, 2 = Ausschluss/Nachweis Diszitis weil MRT unklar, 3 = MRT nicht möglich / Ersatz für MRT, 4 = VK, 5 = Materialschaden, Frage nach Infekt, 6 = Infekt im Labor ohne Fokus',
    'Risikofaktoren',
    'Neurologie 1 = Paresen, 2 = vorbestehend, 3 = Tetraparese',
]

continuous = [
    'age',
    # 'CRP initial',
]

columns = categorical[:] + continuous[:]

table1 = TableOne(
    df, 
    columns=columns, 
    categorical=categorical, 
    continuous=continuous,
    groupby=None, 
    pval=False
)
# Save table 1 to tsv
# table1.to_csv("table1.tsv", sep="\t")

table1

Unnamed: 0,Unnamed: 1,Missing,Overall
n,,,193
"name, n (%)","Achatz, Heidi",,1 (0.5)
"name, n (%)","Alicic, Adnan",,1 (0.5)
"name, n (%)","Alqahtani, Melahi Mutrak",,1 (0.5)
"name, n (%)","Amort, Hermann",,1 (0.5)
...,...,...,...
"Neurologie 1 = Paresen, 2 = vorbestehend, 3 = Tetraparese, n (%)",1.0,,1 (0.5)
"Neurologie 1 = Paresen, 2 = vorbestehend, 3 = Tetraparese, n (%)",2.0,,5 (2.6)
"Neurologie 1 = Paresen, 2 = vorbestehend, 3 = Tetraparese, n (%)",3.0,,1 (0.5)
"Neurologie 1 = Paresen, 2 = vorbestehend, 3 = Tetraparese, n (%)",,,162 (83.9)


In [5]:
profile = ProfileReport(df, title="Data Report", explorative=True)

# Export to PDF
profile.to_file("report.pdf")
# profile.to_file("report.html")

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

100%|██████████| 47/47 [00:00<00:00, 163.46it/s]
  annotation = ("{:" + self.fmt + "}").format(val)
        (using `df.profile_report(missing_diagrams={"Heatmap": False}`)
        If this is problematic for your use case, please report this as an issue:
        https://github.com/ydataai/ydata-profiling/issues
        (include the error message: 'could not convert string to float: '--'')


Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]



Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]