### Joining Patients.csv

In [468]:
import pandas as pd
import sqlite3

# Define file paths
csv_file = "patients.csv"  # Change this to your actual CSV file path
db_file = "CKD_train_unlocked.db"  # Change this to your actual database file

# Load CSV into Pandas DataFrame
# Drop any 'Unnamed' columns (index-related issues)
df_patients = pd.read_csv(csv_file)
df_patients = df_patients.loc[:, ~df_patients.columns.str.contains('^Unnamed')]

# Connect to SQLite database
conn = sqlite3.connect(db_file)
cursor = conn.cursor()

# Define table name
table_name = "Patients"  # Change this to match your table

# Save to SQLite without the unwanted column
df_patients.to_sql(table_name, conn, if_exists="append", index=False)  # Use 'replace' to overwrite

# Commit and close
conn.commit()
conn.close()

print(f"Data from {csv_file} successfully added to {table_name} in {db_file} without 'Unnamed' columns.")

Data from patients.csv successfully added to Patients in CKD_train_unlocked.db without 'Unnamed' columns.


In [469]:
df_patients

Unnamed: 0,Patient,DateOfBirth,Sex
0,30,1932-01-10,M
1,60,1967-09-20,M
2,80,1946-05-15,M
3,100,1946-06-05,M
4,120,1943-11-06,F
...,...,...,...
16092,1522810,1979-09-10,M
16093,1522890,1973-03-16,F
16094,1522980,1947-01-17,F
16095,1523010,1946-02-01,F


In [470]:
df_patients['DateOfBirth'].isna().sum()

np.int64(2)

In [471]:
df_patients_cleaned = df_patients.dropna(subset=['DateOfBirth'])
print(df_patients_cleaned)

       Patient DateOfBirth Sex
0           30  1932-01-10   M
1           60  1967-09-20   M
2           80  1946-05-15   M
3          100  1946-06-05   M
4          120  1943-11-06   F
...        ...         ...  ..
16092  1522810  1979-09-10   M
16093  1522890  1973-03-16   F
16094  1522980  1947-01-17   F
16095  1523010  1946-02-01   F
16096  1523060  1938-03-05   F

[16095 rows x 3 columns]


### Reading Labs table for DB

In [472]:
import sqlite3
import pandas as pd

# Connect to the SQLite database
conn = sqlite3.connect("CKD_train_unlocked.db")

# Read the "labs" table into a pandas DataFrame
df_labs = pd.read_sql("SELECT * FROM labs", conn)

# Show the first few rows
print(df_labs)

# Close the connection
conn.close()


        Patient   EntryDate      Analyte  ValueNumber ValueText  \
0       1175090  2020-05-26      CKD-EPI        129.6      None   
1       1175090  2024-05-07      CKD-EPI        113.4      None   
2       1175090  2020-05-26  s_kreatinin         73.1      None   
3       1175090  2024-05-07  s_kreatinin         84.3      None   
4       1175090  2024-05-07         UACR          0.4      None   
...         ...         ...          ...          ...       ...   
222089  1522890  2025-03-06      CKD-EPI          4.2      None   
222090  1522890  2025-03-03  s_kreatinin       1077.7      None   
222091  1522890  2025-03-06  s_kreatinin        890.3      None   
222092  1487910  2025-03-04      CKD-EPI          9.0      None   
222093  1487910  2025-03-04  s_kreatinin        526.9      None   

                  Unit  
0       mL/min/1.73 m²  
1       mL/min/1.73 m²  
2               µmol/L  
3               µmol/L  
4                g/mol  
...                ...  
222089  mL/min/1.73 

In [473]:
df_labs_cleaned = df_labs.dropna(subset=['Unit'])
print(df_labs_cleaned)

        Patient   EntryDate      Analyte  ValueNumber ValueText  \
0       1175090  2020-05-26      CKD-EPI        129.6      None   
1       1175090  2024-05-07      CKD-EPI        113.4      None   
2       1175090  2020-05-26  s_kreatinin         73.1      None   
3       1175090  2024-05-07  s_kreatinin         84.3      None   
4       1175090  2024-05-07         UACR          0.4      None   
...         ...         ...          ...          ...       ...   
222089  1522890  2025-03-06      CKD-EPI          4.2      None   
222090  1522890  2025-03-03  s_kreatinin       1077.7      None   
222091  1522890  2025-03-06  s_kreatinin        890.3      None   
222092  1487910  2025-03-04      CKD-EPI          9.0      None   
222093  1487910  2025-03-04  s_kreatinin        526.9      None   

                  Unit  
0       mL/min/1.73 m²  
1       mL/min/1.73 m²  
2               µmol/L  
3               µmol/L  
4                g/mol  
...                ...  
222089  mL/min/1.73 

In [474]:
df_labs_cleaned['Unit'].unique()

array(['mL/min/1.73 m²', 'µmol/L', 'g/mol', 'g/l'], dtype=object)

In [475]:
df_labs_cleaned

Unnamed: 0,Patient,EntryDate,Analyte,ValueNumber,ValueText,Unit
0,1175090,2020-05-26,CKD-EPI,129.6,,mL/min/1.73 m²
1,1175090,2024-05-07,CKD-EPI,113.4,,mL/min/1.73 m²
2,1175090,2020-05-26,s_kreatinin,73.1,,µmol/L
3,1175090,2024-05-07,s_kreatinin,84.3,,µmol/L
4,1175090,2024-05-07,UACR,0.4,,g/mol
...,...,...,...,...,...,...
222089,1522890,2025-03-06,CKD-EPI,4.2,,mL/min/1.73 m²
222090,1522890,2025-03-03,s_kreatinin,1077.7,,µmol/L
222091,1522890,2025-03-06,s_kreatinin,890.3,,µmol/L
222092,1487910,2025-03-04,CKD-EPI,9.0,,mL/min/1.73 m²


In [476]:
df_labs_cleaned['Analyte'].unique()

array(['CKD-EPI', 's_kreatinin', 'UACR', 'UPCR', 'PU'], dtype=object)

### Serum Creatinine

In [477]:
df_s_kreatinin = df_labs_cleaned[df_labs_cleaned['Analyte'] == 's_kreatinin']
print(df_s_kreatinin)

        Patient   EntryDate      Analyte  ValueNumber ValueText    Unit
2       1175090  2020-05-26  s_kreatinin         73.1      None  µmol/L
3       1175090  2024-05-07  s_kreatinin         84.3      None  µmol/L
11        97360  2016-08-01  s_kreatinin         75.9      None  µmol/L
12        97360  2016-08-06  s_kreatinin         63.7      None  µmol/L
13        97360  2016-08-13  s_kreatinin         65.9      None  µmol/L
...         ...         ...          ...          ...       ...     ...
222085  1521890  2025-02-27  s_kreatinin        125.2      None  µmol/L
222087  1419930  2025-03-03  s_kreatinin        167.2      None  µmol/L
222090  1522890  2025-03-03  s_kreatinin       1077.7      None  µmol/L
222091  1522890  2025-03-06  s_kreatinin        890.3      None  µmol/L
222093  1487910  2025-03-04  s_kreatinin        526.9      None  µmol/L

[131803 rows x 6 columns]


In [478]:
# Convert 'EntryDate' to datetime format
df_s_kreatinin['EntryDate'] = pd.to_datetime(df_s_kreatinin['EntryDate'])

# Sort the DataFrame by 'Patient' and 'EntryDate'
df_s_kreatinin_sorted = df_s_kreatinin.sort_values(by=['Patient', 'EntryDate'])

# Drop duplicates, keeping only the latest entry for each patient
df_s_kreatinin_latest = df_s_kreatinin_sorted.drop_duplicates(subset='Patient', keep='last')

# Display the resulting DataFrame
print(df_s_kreatinin_latest)

        Patient  EntryDate      Analyte  ValueNumber ValueText    Unit
69901        60 2025-02-19  s_kreatinin        135.7      None  µmol/L
61067        80 2017-05-22  s_kreatinin        232.4      None  µmol/L
58734       120 2014-05-12  s_kreatinin        242.8      None  µmol/L
70553       160 2025-02-07  s_kreatinin         66.0      None  µmol/L
95208       180 2025-02-13  s_kreatinin         73.2      None  µmol/L
...         ...        ...          ...          ...       ...     ...
222037  1522810 2025-03-07  s_kreatinin         84.0      None  µmol/L
222091  1522890 2025-03-06  s_kreatinin        890.3      None  µmol/L
222051  1522980 2025-02-27  s_kreatinin        161.1      None  µmol/L
222055  1523010 2025-02-26  s_kreatinin        112.1      None  µmol/L
222068  1523060 2025-03-04  s_kreatinin         75.0      None  µmol/L

[11536 rows x 6 columns]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_s_kreatinin['EntryDate'] = pd.to_datetime(df_s_kreatinin['EntryDate'])


In [479]:
# Molecular weight of creatinine in g/mol
MW_CREATININE = 113.12  

# Conversion function
def convert_mumol_to_mgdl(row):
    if row["Unit"] == "µmol/L":
        return row["ValueNumber"] * 0.011312  # Conversion formula
    return row["ValueNumber"]  # Keep unchanged if not in µmol/L

# Apply conversion
df_s_kreatinin_latest["ValueNumber_converted"] = df_s_kreatinin_latest.apply(convert_mumol_to_mgdl, axis=1)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_s_kreatinin_latest["ValueNumber_converted"] = df_s_kreatinin_latest.apply(convert_mumol_to_mgdl, axis=1)


In [480]:
df_s_kreatinin_latest

Unnamed: 0,Patient,EntryDate,Analyte,ValueNumber,ValueText,Unit,ValueNumber_converted
69901,60,2025-02-19,s_kreatinin,135.7,,µmol/L,1.535038
61067,80,2017-05-22,s_kreatinin,232.4,,µmol/L,2.628909
58734,120,2014-05-12,s_kreatinin,242.8,,µmol/L,2.746554
70553,160,2025-02-07,s_kreatinin,66.0,,µmol/L,0.746592
95208,180,2025-02-13,s_kreatinin,73.2,,µmol/L,0.828038
...,...,...,...,...,...,...,...
222037,1522810,2025-03-07,s_kreatinin,84.0,,µmol/L,0.950208
222091,1522890,2025-03-06,s_kreatinin,890.3,,µmol/L,10.071074
222051,1522980,2025-02-27,s_kreatinin,161.1,,µmol/L,1.822363
222055,1523010,2025-02-26,s_kreatinin,112.1,,µmol/L,1.268075


In [481]:
count_high_creatinine = df_s_kreatinin_latest[df_s_kreatinin_latest['ValueNumber_converted'] > 3].shape[0]
print(f"Number of people with s_kreatinin ValueNumber_converted greater than 2: {count_high_creatinine}")

Number of people with s_kreatinin ValueNumber_converted greater than 2: 377


In [482]:
df_s_kreatinin_final = df_s_kreatinin_latest[['Patient', 'EntryDate', 'ValueNumber_converted']]

In [483]:
df_s_kreatinin_final

Unnamed: 0,Patient,EntryDate,ValueNumber_converted
69901,60,2025-02-19,1.535038
61067,80,2017-05-22,2.628909
58734,120,2014-05-12,2.746554
70553,160,2025-02-07,0.746592
95208,180,2025-02-13,0.828038
...,...,...,...
222037,1522810,2025-03-07,0.950208
222091,1522890,2025-03-06,10.071074
222051,1522980,2025-02-27,1.822363
222055,1523010,2025-02-26,1.268075


#### Calculating eGFR from serum creatinin, age, and gender


In [484]:
df_patients_cleaned

Unnamed: 0,Patient,DateOfBirth,Sex
0,30,1932-01-10,M
1,60,1967-09-20,M
2,80,1946-05-15,M
3,100,1946-06-05,M
4,120,1943-11-06,F
...,...,...,...
16092,1522810,1979-09-10,M
16093,1522890,1973-03-16,F
16094,1522980,1947-01-17,F
16095,1523010,1946-02-01,F


In [485]:
df_merged_skr_age_sex = pd.merge(df_s_kreatinin_final, df_patients_cleaned, on="Patient", how="inner")
print(df_merged_skr_age_sex)

       Patient  EntryDate  ValueNumber_converted DateOfBirth Sex
0           60 2025-02-19               1.535038  1967-09-20   M
1           80 2017-05-22               2.628909  1946-05-15   M
2          120 2014-05-12               2.746554  1943-11-06   F
3          160 2025-02-07               0.746592  1945-09-10   F
4          180 2025-02-13               0.828038  1963-05-13   F
...        ...        ...                    ...         ...  ..
11530  1522810 2025-03-07               0.950208  1979-09-10   M
11531  1522890 2025-03-06              10.071074  1973-03-16   F
11532  1522980 2025-02-27               1.822363  1947-01-17   F
11533  1523010 2025-02-26               1.268075  1946-02-01   F
11534  1523060 2025-03-04               0.848400  1938-03-05   F

[11535 rows x 5 columns]


#### Calculating the age of the patient at the moment of the test

In [486]:
df_merged_skr_age_sex["EntryDate"] = pd.to_datetime(df_merged_skr_age_sex["EntryDate"])
df_merged_skr_age_sex["DateOfBirth"] = pd.to_datetime(df_merged_skr_age_sex["DateOfBirth"])


In [487]:
df_merged_skr_age_sex["AgeAtTest"] = (df_merged_skr_age_sex["EntryDate"] - df_merged_skr_age_sex["DateOfBirth"]).dt.days // 365


In [488]:
df_merged_skr_age_sex

Unnamed: 0,Patient,EntryDate,ValueNumber_converted,DateOfBirth,Sex,AgeAtTest
0,60,2025-02-19,1.535038,1967-09-20,M,57
1,80,2017-05-22,2.628909,1946-05-15,M,71
2,120,2014-05-12,2.746554,1943-11-06,F,70
3,160,2025-02-07,0.746592,1945-09-10,F,79
4,180,2025-02-13,0.828038,1963-05-13,F,61
...,...,...,...,...,...,...
11530,1522810,2025-03-07,0.950208,1979-09-10,M,45
11531,1522890,2025-03-06,10.071074,1973-03-16,F,52
11532,1522980,2025-02-27,1.822363,1947-01-17,F,78
11533,1523010,2025-02-26,1.268075,1946-02-01,F,79


In [489]:
import numpy as np

# Define the CKD-EPI formula function
def calculate_egfr(row):
    Scr = row["ValueNumber_converted"]  # Serum Creatinine in mg/dL
    age = row["AgeAtTest"]
    sex = row["Sex"]
    
    if sex == "F":
        if Scr <= 0.7:
            eGFR = 144 * (Scr / 0.7) ** -0.329 * (0.993 ** age)
        else:
            eGFR = 144 * (Scr / 0.7) ** -1.209 * (0.993 ** age)
    elif sex == "M":
        if Scr <= 0.9:
            eGFR = 141 * (Scr / 0.9) ** -0.411 * (0.993 ** age)
        else:
            eGFR = 141 * (Scr / 0.9) ** -1.209 * (0.993 ** age)
    else:
        eGFR = np.nan  # Handle unexpected values
    
    return eGFR

# Apply the function to the dataframe
df_merged_skr_age_sex["eGFR"] = df_merged_skr_age_sex.apply(calculate_egfr, axis=1)


In [490]:
df_merged_skr_age_sex

Unnamed: 0,Patient,EntryDate,ValueNumber_converted,DateOfBirth,Sex,AgeAtTest,eGFR
0,60,2025-02-19,1.535038,1967-09-20,M,57,49.543571
1,80,2017-05-22,2.628909,1946-05-15,M,71,23.430758
2,120,2014-05-12,2.746554,1943-11-06,F,70,16.867030
3,160,2025-02-07,0.746592,1945-09-10,F,79,76.474898
4,180,2025-02-13,0.828038,1963-05-13,F,61,76.571500
...,...,...,...,...,...,...,...
11530,1522810,2025-03-07,0.950208,1979-09-10,M,45,96.256862
11531,1522890,2025-03-06,10.071074,1973-03-16,F,52,3.978582
11532,1522980,2025-02-27,1.822363,1947-01-17,F,78,26.183022
11533,1523010,2025-02-26,1.268075,1946-02-01,F,79,40.306430


In [491]:
import pandas as pd

# Define a function to categorize the patients
def categorize_gfr(value):
    if value < 15:
        return 'G5'
    elif 15 <= value <= 29:
        return 'G4'
    elif 30 <= value <= 44:
        return 'G3b'    
    elif 45 <= value <= 59:
        return 'G3a'
    elif 60 <= value <= 89:
        return 'G2'
    else:
        return 'G1'

# Apply the function to the ValueNumber column to create the new column
df_merged_skr_age_sex['GFR Category'] = df_merged_skr_age_sex['eGFR'].apply(categorize_gfr)

# Display the updated DataFrame
print(df_merged_skr_age_sex)

       Patient  EntryDate  ValueNumber_converted DateOfBirth Sex  AgeAtTest  \
0           60 2025-02-19               1.535038  1967-09-20   M         57   
1           80 2017-05-22               2.628909  1946-05-15   M         71   
2          120 2014-05-12               2.746554  1943-11-06   F         70   
3          160 2025-02-07               0.746592  1945-09-10   F         79   
4          180 2025-02-13               0.828038  1963-05-13   F         61   
...        ...        ...                    ...         ...  ..        ...   
11530  1522810 2025-03-07               0.950208  1979-09-10   M         45   
11531  1522890 2025-03-06              10.071074  1973-03-16   F         52   
11532  1522980 2025-02-27               1.822363  1947-01-17   F         78   
11533  1523010 2025-02-26               1.268075  1946-02-01   F         79   
11534  1523060 2025-03-04               0.848400  1938-03-05   F         87   

            eGFR GFR Category  
0      49.543571   

In [492]:
df_merged_skr_age_sex = df_merged_skr_age_sex[['Patient', 'EntryDate', 'eGFR', 'GFR Category']]

In [493]:
df_merged_skr_age_sex

Unnamed: 0,Patient,EntryDate,eGFR,GFR Category
0,60,2025-02-19,49.543571,G3a
1,80,2017-05-22,23.430758,G4
2,120,2014-05-12,16.867030,G4
3,160,2025-02-07,76.474898,G2
4,180,2025-02-13,76.571500,G2
...,...,...,...,...
11530,1522810,2025-03-07,96.256862,G1
11531,1522890,2025-03-06,3.978582,G5
11532,1522980,2025-02-27,26.183022,G4
11533,1523010,2025-02-26,40.306430,G3b


### CKD-EPI or equivalently eGFR

In [494]:
df_egfr = df_labs_cleaned[df_labs_cleaned['Analyte'] == 'CKD-EPI']
print(df_egfr)

        Patient   EntryDate  Analyte  ValueNumber ValueText            Unit
0       1175090  2020-05-26  CKD-EPI        129.6      None  mL/min/1.73 m²
1       1175090  2024-05-07  CKD-EPI        113.4      None  mL/min/1.73 m²
5         97360  2020-01-16  CKD-EPI         40.8      None  mL/min/1.73 m²
6         97360  2020-09-02  CKD-EPI         42.0      None  mL/min/1.73 m²
7         97360  2021-01-15  CKD-EPI         27.0      None  mL/min/1.73 m²
...         ...         ...      ...          ...       ...             ...
222078  1521890  2025-03-06  CKD-EPI         61.8      None  mL/min/1.73 m²
222086  1419930  2025-03-03  CKD-EPI         25.2      None  mL/min/1.73 m²
222088  1522890  2025-03-03  CKD-EPI          3.0      None  mL/min/1.73 m²
222089  1522890  2025-03-06  CKD-EPI          4.2      None  mL/min/1.73 m²
222092  1487910  2025-03-04  CKD-EPI          9.0      None  mL/min/1.73 m²

[66983 rows x 6 columns]


In [495]:
import pandas as pd

# Define a function to categorize the patients
def categorize_gfr(value):
    if value < 15:
        return 'G5'
    elif 15 <= value <= 29:
        return 'G4'
    elif 30 <= value <= 44:
        return 'G3b'    
    elif 45 <= value <= 59:
        return 'G3a'
    elif 60 <= value <= 89:
        return 'G2'
    else:
        return 'G1'

# Apply the function to the ValueNumber column to create the new column
df_egfr['GFR Category'] = df_egfr['ValueNumber'].apply(categorize_gfr)

# Display the updated DataFrame
print(df_egfr)

        Patient   EntryDate  Analyte  ValueNumber ValueText            Unit  \
0       1175090  2020-05-26  CKD-EPI        129.6      None  mL/min/1.73 m²   
1       1175090  2024-05-07  CKD-EPI        113.4      None  mL/min/1.73 m²   
5         97360  2020-01-16  CKD-EPI         40.8      None  mL/min/1.73 m²   
6         97360  2020-09-02  CKD-EPI         42.0      None  mL/min/1.73 m²   
7         97360  2021-01-15  CKD-EPI         27.0      None  mL/min/1.73 m²   
...         ...         ...      ...          ...       ...             ...   
222078  1521890  2025-03-06  CKD-EPI         61.8      None  mL/min/1.73 m²   
222086  1419930  2025-03-03  CKD-EPI         25.2      None  mL/min/1.73 m²   
222088  1522890  2025-03-03  CKD-EPI          3.0      None  mL/min/1.73 m²   
222089  1522890  2025-03-06  CKD-EPI          4.2      None  mL/min/1.73 m²   
222092  1487910  2025-03-04  CKD-EPI          9.0      None  mL/min/1.73 m²   

       GFR Category  
0                G1  
1      

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_egfr['GFR Category'] = df_egfr['ValueNumber'].apply(categorize_gfr)


In [496]:
df_egfr

Unnamed: 0,Patient,EntryDate,Analyte,ValueNumber,ValueText,Unit,GFR Category
0,1175090,2020-05-26,CKD-EPI,129.6,,mL/min/1.73 m²,G1
1,1175090,2024-05-07,CKD-EPI,113.4,,mL/min/1.73 m²,G1
5,97360,2020-01-16,CKD-EPI,40.8,,mL/min/1.73 m²,G3b
6,97360,2020-09-02,CKD-EPI,42.0,,mL/min/1.73 m²,G3b
7,97360,2021-01-15,CKD-EPI,27.0,,mL/min/1.73 m²,G4
...,...,...,...,...,...,...,...
222078,1521890,2025-03-06,CKD-EPI,61.8,,mL/min/1.73 m²,G2
222086,1419930,2025-03-03,CKD-EPI,25.2,,mL/min/1.73 m²,G4
222088,1522890,2025-03-03,CKD-EPI,3.0,,mL/min/1.73 m²,G5
222089,1522890,2025-03-06,CKD-EPI,4.2,,mL/min/1.73 m²,G5


In [497]:
# Convert 'EntryDate' to datetime format
df_egfr['EntryDate'] = pd.to_datetime(df_egfr['EntryDate'])

# Sort the DataFrame by 'Patient' and 'EntryDate'
df_egfr_sorted = df_egfr.sort_values(by=['Patient', 'EntryDate'])

# Drop duplicates, keeping only the latest entry for each patient
df_egfr_latest = df_egfr_sorted.drop_duplicates(subset='Patient', keep='last')

# Display the resulting DataFrame
print(df_egfr_latest)

        Patient  EntryDate  Analyte  ValueNumber ValueText            Unit  \
69768        60 2025-02-19  CKD-EPI         49.2      None  mL/min/1.73 m²   
70539       160 2025-02-07  CKD-EPI         76.2      None  mL/min/1.73 m²   
95196       180 2025-02-13  CKD-EPI         76.2      None  mL/min/1.73 m²   
65900       190 2022-12-07  CKD-EPI         30.6      None  mL/min/1.73 m²   
102755      270 2024-04-23  CKD-EPI         61.8      None  mL/min/1.73 m²   
...         ...        ...      ...          ...       ...             ...   
222027  1522810 2025-03-07  CKD-EPI         96.0      None  mL/min/1.73 m²   
222089  1522890 2025-03-06  CKD-EPI          4.2      None  mL/min/1.73 m²   
222048  1522980 2025-02-27  CKD-EPI         26.4      None  mL/min/1.73 m²   
222054  1523010 2025-02-26  CKD-EPI         40.2      None  mL/min/1.73 m²   
222062  1523060 2025-03-04  CKD-EPI         61.8      None  mL/min/1.73 m²   

       GFR Category  
69768           G3a  
70539            G2

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_egfr['EntryDate'] = pd.to_datetime(df_egfr['EntryDate'])


In [498]:
df_egfr_latest

Unnamed: 0,Patient,EntryDate,Analyte,ValueNumber,ValueText,Unit,GFR Category
69768,60,2025-02-19,CKD-EPI,49.2,,mL/min/1.73 m²,G3a
70539,160,2025-02-07,CKD-EPI,76.2,,mL/min/1.73 m²,G2
95196,180,2025-02-13,CKD-EPI,76.2,,mL/min/1.73 m²,G2
65900,190,2022-12-07,CKD-EPI,30.6,,mL/min/1.73 m²,G3b
102755,270,2024-04-23,CKD-EPI,61.8,,mL/min/1.73 m²,G2
...,...,...,...,...,...,...,...
222027,1522810,2025-03-07,CKD-EPI,96.0,,mL/min/1.73 m²,G1
222089,1522890,2025-03-06,CKD-EPI,4.2,,mL/min/1.73 m²,G5
222048,1522980,2025-02-27,CKD-EPI,26.4,,mL/min/1.73 m²,G4
222054,1523010,2025-02-26,CKD-EPI,40.2,,mL/min/1.73 m²,G3b


In [499]:
df_egfr_final = df_egfr_latest[['Patient', 'EntryDate', 'ValueNumber', 'GFR Category']]

In [500]:
df_egfr_final

Unnamed: 0,Patient,EntryDate,ValueNumber,GFR Category
69768,60,2025-02-19,49.2,G3a
70539,160,2025-02-07,76.2,G2
95196,180,2025-02-13,76.2,G2
65900,190,2022-12-07,30.6,G3b
102755,270,2024-04-23,61.8,G2
...,...,...,...,...
222027,1522810,2025-03-07,96.0,G1
222089,1522890,2025-03-06,4.2,G5
222048,1522980,2025-02-27,26.4,G4
222054,1523010,2025-02-26,40.2,G3b


In [501]:
df_egfr_final = df_egfr_final.rename(columns={"ValueNumber": "eGFR"})

### UACR test

In [502]:
df_uacr = df_labs_cleaned.loc[df_labs_cleaned['Analyte'] == 'UACR']
print(df_uacr)

        Patient   EntryDate Analyte  ValueNumber ValueText   Unit
4       1175090  2024-05-07    UACR          0.4      None  g/mol
42        97360  2014-06-17    UACR          2.6      None  g/mol
44        97360  2016-06-28    UACR          4.8      None  g/mol
45        97360  2017-05-12    UACR         14.0      None  g/mol
49        97360  2022-12-08    UACR        718.0      None  g/mol
...         ...         ...     ...          ...       ...    ...
220376  1500950  2024-01-10    UACR          1.0      None  g/mol
220948  1511630  2024-07-16    UACR          1.0      None  g/mol
221602  1516530  2024-12-10    UACR          1.4      None  g/mol
221715  1519520  2024-12-20    UACR          3.6      None  g/mol
221951  1521710  2025-02-14    UACR         55.9      None  g/mol

[7906 rows x 6 columns]


In [503]:
df_uacr['Unit'].unique()

array(['g/mol'], dtype=object)

In [504]:
none_count = df_uacr['Unit'].isnull().sum()
print(f"Number of 'None' values in 'Unit' column: {none_count}")

Number of 'None' values in 'Unit' column: 0


In [505]:
df_uacr['Unit'].value_counts()

Unit
g/mol    7906
Name: count, dtype: int64

In [506]:
df_uacr['Unit'] = df_uacr['Unit'].replace({'mg/mmolKr': 'g/mol', 'mg/mmol': 'g/mol', 'g/mol kre': 'g/mol', 'g/molkre': 'g/mol'})
print(df_uacr['Unit'].value_counts())

Unit
g/mol    7906
Name: count, dtype: int64


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_uacr['Unit'] = df_uacr['Unit'].replace({'mg/mmolKr': 'g/mol', 'mg/mmol': 'g/mol', 'g/mol kre': 'g/mol', 'g/molkre': 'g/mol'})


In [507]:
df_uacr

Unnamed: 0,Patient,EntryDate,Analyte,ValueNumber,ValueText,Unit
4,1175090,2024-05-07,UACR,0.4,,g/mol
42,97360,2014-06-17,UACR,2.6,,g/mol
44,97360,2016-06-28,UACR,4.8,,g/mol
45,97360,2017-05-12,UACR,14.0,,g/mol
49,97360,2022-12-08,UACR,718.0,,g/mol
...,...,...,...,...,...,...
220376,1500950,2024-01-10,UACR,1.0,,g/mol
220948,1511630,2024-07-16,UACR,1.0,,g/mol
221602,1516530,2024-12-10,UACR,1.4,,g/mol
221715,1519520,2024-12-20,UACR,3.6,,g/mol


In [508]:
#count_greater_than_50 = df_uacr[df_uacr['ValueNumber'] > 100].shape[0]
#print(f"Number of 'ValueNumber' values greater than 50: {count_greater_than_50}")

In [509]:
import pandas as pd

# Define a function to categorize the patients
def categorize_albuminaria(value):
    if value < 3:
        return 'A1'
    elif 3 <= value <= 30:
        return 'A2'
    else:
        return 'A3'

# Apply the function to the ValueNumber column to create the new column
df_uacr['Albuminaria Category'] = df_uacr['ValueNumber'].apply(categorize_albuminaria)

# Display the updated DataFrame
print(df_uacr)

        Patient   EntryDate Analyte  ValueNumber ValueText   Unit  \
4       1175090  2024-05-07    UACR          0.4      None  g/mol   
42        97360  2014-06-17    UACR          2.6      None  g/mol   
44        97360  2016-06-28    UACR          4.8      None  g/mol   
45        97360  2017-05-12    UACR         14.0      None  g/mol   
49        97360  2022-12-08    UACR        718.0      None  g/mol   
...         ...         ...     ...          ...       ...    ...   
220376  1500950  2024-01-10    UACR          1.0      None  g/mol   
220948  1511630  2024-07-16    UACR          1.0      None  g/mol   
221602  1516530  2024-12-10    UACR          1.4      None  g/mol   
221715  1519520  2024-12-20    UACR          3.6      None  g/mol   
221951  1521710  2025-02-14    UACR         55.9      None  g/mol   

       Albuminaria Category  
4                        A1  
42                       A1  
44                       A2  
45                       A2  
49                   

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_uacr['Albuminaria Category'] = df_uacr['ValueNumber'].apply(categorize_albuminaria)


In [510]:
df_uacr

Unnamed: 0,Patient,EntryDate,Analyte,ValueNumber,ValueText,Unit,Albuminaria Category
4,1175090,2024-05-07,UACR,0.4,,g/mol,A1
42,97360,2014-06-17,UACR,2.6,,g/mol,A1
44,97360,2016-06-28,UACR,4.8,,g/mol,A2
45,97360,2017-05-12,UACR,14.0,,g/mol,A2
49,97360,2022-12-08,UACR,718.0,,g/mol,A3
...,...,...,...,...,...,...,...
220376,1500950,2024-01-10,UACR,1.0,,g/mol,A1
220948,1511630,2024-07-16,UACR,1.0,,g/mol,A1
221602,1516530,2024-12-10,UACR,1.4,,g/mol,A1
221715,1519520,2024-12-20,UACR,3.6,,g/mol,A2


In [511]:
# Convert 'EntryDate' to datetime format
df_uacr['EntryDate'] = pd.to_datetime(df_uacr['EntryDate'])

# Sort the DataFrame by 'Patient' and 'EntryDate'
df_uacr_sorted = df_uacr.sort_values(by=['Patient', 'EntryDate'])

# Drop duplicates, keeping only the latest entry for each patient
df_uacr_latest = df_uacr_sorted.drop_duplicates(subset='Patient', keep='last')

# Display the resulting DataFrame
print(df_uacr_latest)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_uacr['EntryDate'] = pd.to_datetime(df_uacr['EntryDate'])


        Patient  EntryDate Analyte  ValueNumber ValueText   Unit  \
69908        60 2024-05-15    UACR          1.6      None  g/mol   
65914       190 2016-09-01    UACR          2.3      None  g/mol   
62100       280 2014-02-04    UACR          1.7      None  g/mol   
97455       420 2018-12-03    UACR         29.7      None  g/mol   
11958       520 2018-03-07    UACR         56.0      None  g/mol   
...         ...        ...     ...          ...       ...    ...   
212495  1519790 2025-02-26    UACR         12.0      None  g/mol   
212506  1519870 2025-01-29    UACR          0.7      None  g/mol   
212418  1520930 2025-01-24    UACR          1.6      None  g/mol   
221951  1521710 2025-02-14    UACR         55.9      None  g/mol   
212694  1521790 2025-02-24    UACR         12.0      None  g/mol   

       Albuminaria Category  
69908                    A1  
65914                    A1  
62100                    A1  
97455                    A2  
11958                    A3  
...

In [512]:
df_uacr_final = df_uacr_latest[['Patient', 'EntryDate', 'ValueNumber', 'Albuminaria Category']]

In [513]:
df_uacr_final

Unnamed: 0,Patient,EntryDate,ValueNumber,Albuminaria Category
69908,60,2024-05-15,1.6,A1
65914,190,2016-09-01,2.3,A1
62100,280,2014-02-04,1.7,A1
97455,420,2018-12-03,29.7,A2
11958,520,2018-03-07,56.0,A3
...,...,...,...,...
212495,1519790,2025-02-26,12.0,A2
212506,1519870,2025-01-29,0.7,A1
212418,1520930,2025-01-24,1.6,A1
221951,1521710,2025-02-14,55.9,A3


In [514]:
df_uacr_final[df_uacr_final['Patient'] == 60]

Unnamed: 0,Patient,EntryDate,ValueNumber,Albuminaria Category
69908,60,2024-05-15,1.6,A1


In [515]:
df_uacr_final = df_uacr_final.rename(columns={"ValueNumber": "UACR"})

### UACR and GFR glues together

In [516]:
df_merged = pd.merge(df_egfr_final, df_uacr_final, on="Patient", how="outer", suffixes=("_eGFR", "_UACR"))
df_merged = df_merged.fillna(value='None')


In [517]:
df_merged

Unnamed: 0,Patient,EntryDate_eGFR,eGFR,GFR Category,EntryDate_UACR,UACR,Albuminaria Category
0,60,2025-02-19 00:00:00,49.2,G3a,2024-05-15 00:00:00,1.6,A1
1,160,2025-02-07 00:00:00,76.2,G2,,,
2,180,2025-02-13 00:00:00,76.2,G2,,,
3,190,2022-12-07 00:00:00,30.6,G3b,2016-09-01 00:00:00,2.3,A1
4,270,2024-04-23 00:00:00,61.8,G2,,,
...,...,...,...,...,...,...,...
7889,1522810,2025-03-07 00:00:00,96.0,G1,,,
7890,1522890,2025-03-06 00:00:00,4.2,G5,,,
7891,1522980,2025-02-27 00:00:00,26.4,G4,,,
7892,1523010,2025-02-26 00:00:00,40.2,G3b,,,


In [518]:
df_merged["EntryDate_eGFR"] = df_merged["EntryDate_eGFR"].astype(str).str.split(" ").str[0]
df_merged["EntryDate_UACR"] = df_merged["EntryDate_UACR"].astype(str).str.split(" ").str[0]


In [519]:
df_merged.head(50)


Unnamed: 0,Patient,EntryDate_eGFR,eGFR,GFR Category,EntryDate_UACR,UACR,Albuminaria Category
0,60,2025-02-19,49.2,G3a,2024-05-15,1.6,A1
1,160,2025-02-07,76.2,G2,,,
2,180,2025-02-13,76.2,G2,,,
3,190,2022-12-07,30.6,G3b,2016-09-01,2.3,A1
4,270,2024-04-23,61.8,G2,,,
5,280,,,,2014-02-04,1.7,A1
6,290,2020-02-26,80.4,G2,,,
7,310,2021-02-08,46.8,G3a,,,
8,370,2024-06-10,49.8,G3a,,,
9,420,2024-12-10,63.6,G2,2018-12-03,29.7,A2


### Adding UACR_GFR results to the database

In [520]:
import sqlite3

# Connect to the SQLite database
conn = sqlite3.connect(db_file)

# Define the table name
merged_table_name = "GFR_UACR_results"

# Save the dataframe to the SQLite database
df_merged.to_sql(merged_table_name, conn, if_exists="replace", index=False)

# Commit and close the connection
conn.commit()
conn.close()

print(f"Dataframe successfully added to the {merged_table_name} table in {db_file}.")

Dataframe successfully added to the GFR_UACR_results table in CKD_train_unlocked.db.


### Computing eGFR manually from the skreatinin, sex and age of the patient

In [521]:
df_s_kreatinin

Unnamed: 0,Patient,EntryDate,Analyte,ValueNumber,ValueText,Unit
2,1175090,2020-05-26,s_kreatinin,73.1,,µmol/L
3,1175090,2024-05-07,s_kreatinin,84.3,,µmol/L
11,97360,2016-08-01,s_kreatinin,75.9,,µmol/L
12,97360,2016-08-06,s_kreatinin,63.7,,µmol/L
13,97360,2016-08-13,s_kreatinin,65.9,,µmol/L
...,...,...,...,...,...,...
222085,1521890,2025-02-27,s_kreatinin,125.2,,µmol/L
222087,1419930,2025-03-03,s_kreatinin,167.2,,µmol/L
222090,1522890,2025-03-03,s_kreatinin,1077.7,,µmol/L
222091,1522890,2025-03-06,s_kreatinin,890.3,,µmol/L


In [522]:
df_patients

Unnamed: 0,Patient,DateOfBirth,Sex
0,30,1932-01-10,M
1,60,1967-09-20,M
2,80,1946-05-15,M
3,100,1946-06-05,M
4,120,1943-11-06,F
...,...,...,...
16092,1522810,1979-09-10,M
16093,1522890,1973-03-16,F
16094,1522980,1947-01-17,F
16095,1523010,1946-02-01,F


In [523]:
df_egfr_final   #taken directly from CKD-EPI test

Unnamed: 0,Patient,EntryDate,eGFR,GFR Category
69768,60,2025-02-19,49.2,G3a
70539,160,2025-02-07,76.2,G2
95196,180,2025-02-13,76.2,G2
65900,190,2022-12-07,30.6,G3b
102755,270,2024-04-23,61.8,G2
...,...,...,...,...
222027,1522810,2025-03-07,96.0,G1
222089,1522890,2025-03-06,4.2,G5
222048,1522980,2025-02-27,26.4,G4
222054,1523010,2025-02-26,40.2,G3b


In [524]:
df_merged_skr_age_sex   #calculated manually from s_kreatinin

Unnamed: 0,Patient,EntryDate,eGFR,GFR Category
0,60,2025-02-19,49.543571,G3a
1,80,2017-05-22,23.430758,G4
2,120,2014-05-12,16.867030,G4
3,160,2025-02-07,76.474898,G2
4,180,2025-02-13,76.571500,G2
...,...,...,...,...
11530,1522810,2025-03-07,96.256862,G1
11531,1522890,2025-03-06,3.978582,G5
11532,1522980,2025-02-27,26.183022,G4
11533,1523010,2025-02-26,40.306430,G3b


In [525]:
df_uacr_final

Unnamed: 0,Patient,EntryDate,UACR,Albuminaria Category
69908,60,2024-05-15,1.6,A1
65914,190,2016-09-01,2.3,A1
62100,280,2014-02-04,1.7,A1
97455,420,2018-12-03,29.7,A2
11958,520,2018-03-07,56.0,A3
...,...,...,...,...
212495,1519790,2025-02-26,12.0,A2
212506,1519870,2025-01-29,0.7,A1
212418,1520930,2025-01-24,1.6,A1
221951,1521710,2025-02-14,55.9,A3


In [526]:
import pandas as pd

# Merge df_egfr_final and df_uacr_final on 'Patient'
df_combined = pd.merge(df_egfr_final, df_uacr_final, on="Patient", how="outer", suffixes=("_eGFR", "_UACR"))

# Merge the result with df_merged_skr_age_sex on 'Patient'
df_combined = pd.merge(df_combined, df_merged_skr_age_sex, on="Patient", how="outer", suffixes=("", "_sKreatinin"))

# Fill NaN values with 'None' for better readability
df_combined = df_combined.fillna(value='None')

# Display the combined DataFrame
print(df_combined)

       Patient       EntryDate_eGFR  eGFR GFR Category       EntryDate_UACR  \
0           60  2025-02-19 00:00:00  49.2          G3a  2024-05-15 00:00:00   
1           80                 None  None         None                 None   
2          120                 None  None         None                 None   
3          160  2025-02-07 00:00:00  76.2           G2                 None   
4          180  2025-02-13 00:00:00  76.2           G2                 None   
...        ...                  ...   ...          ...                  ...   
11534  1522810  2025-03-07 00:00:00  96.0           G1                 None   
11535  1522890  2025-03-06 00:00:00   4.2           G5                 None   
11536  1522980  2025-02-27 00:00:00  26.4           G4                 None   
11537  1523010  2025-02-26 00:00:00  40.2          G3b                 None   
11538  1523060  2025-03-04 00:00:00  61.8           G2                 None   

       UACR Albuminaria Category            EntryDa

In [527]:
df_combined

Unnamed: 0,Patient,EntryDate_eGFR,eGFR,GFR Category,EntryDate_UACR,UACR,Albuminaria Category,EntryDate,eGFR_sKreatinin,GFR Category_sKreatinin
0,60,2025-02-19 00:00:00,49.2,G3a,2024-05-15 00:00:00,1.6,A1,2025-02-19 00:00:00,49.543571,G3a
1,80,,,,,,,2017-05-22 00:00:00,23.430758,G4
2,120,,,,,,,2014-05-12 00:00:00,16.86703,G4
3,160,2025-02-07 00:00:00,76.2,G2,,,,2025-02-07 00:00:00,76.474898,G2
4,180,2025-02-13 00:00:00,76.2,G2,,,,2025-02-13 00:00:00,76.5715,G2
...,...,...,...,...,...,...,...,...,...,...
11534,1522810,2025-03-07 00:00:00,96.0,G1,,,,2025-03-07 00:00:00,96.256862,G1
11535,1522890,2025-03-06 00:00:00,4.2,G5,,,,2025-03-06 00:00:00,3.978582,G5
11536,1522980,2025-02-27 00:00:00,26.4,G4,,,,2025-02-27 00:00:00,26.183022,G4
11537,1523010,2025-02-26 00:00:00,40.2,G3b,,,,2025-02-26 00:00:00,40.30643,G3b


In [528]:
import pandas as pd
import numpy as np

# Ensure NaN handling
df_combined["eGFR"] = df_combined["eGFR"].combine_first(df_combined["eGFR_sKreatinin"])
df_combined["GFR Category"] = df_combined["GFR Category"].combine_first(df_combined["GFR Category_sKreatinin"])
df_combined["EntryDate_eGFR"] = df_combined["EntryDate_eGFR"].combine_first(df_combined["EntryDate"])

# Drop the redundant columns after substitution
#df_combined.drop(columns=["eGFR_sKreatinin", "GFR Category_sKreatinin", "EntryDate"], inplace=True)


In [529]:
df_combined

Unnamed: 0,Patient,EntryDate_eGFR,eGFR,GFR Category,EntryDate_UACR,UACR,Albuminaria Category,EntryDate,eGFR_sKreatinin,GFR Category_sKreatinin
0,60,2025-02-19 00:00:00,49.2,G3a,2024-05-15 00:00:00,1.6,A1,2025-02-19 00:00:00,49.543571,G3a
1,80,,,,,,,2017-05-22 00:00:00,23.430758,G4
2,120,,,,,,,2014-05-12 00:00:00,16.86703,G4
3,160,2025-02-07 00:00:00,76.2,G2,,,,2025-02-07 00:00:00,76.474898,G2
4,180,2025-02-13 00:00:00,76.2,G2,,,,2025-02-13 00:00:00,76.5715,G2
...,...,...,...,...,...,...,...,...,...,...
11534,1522810,2025-03-07 00:00:00,96.0,G1,,,,2025-03-07 00:00:00,96.256862,G1
11535,1522890,2025-03-06 00:00:00,4.2,G5,,,,2025-03-06 00:00:00,3.978582,G5
11536,1522980,2025-02-27 00:00:00,26.4,G4,,,,2025-02-27 00:00:00,26.183022,G4
11537,1523010,2025-02-26 00:00:00,40.2,G3b,,,,2025-02-26 00:00:00,40.30643,G3b


In [530]:
print(df_combined[df_combined["eGFR"] == 'None'])

       Patient EntryDate_eGFR  eGFR GFR Category       EntryDate_UACR  UACR  \
1           80           None  None         None                 None  None   
2          120           None  None         None                 None  None   
6          200           None  None         None                 None  None   
7          230           None  None         None                 None  None   
9          280           None  None         None  2014-02-04 00:00:00   1.7   
...        ...            ...   ...          ...                  ...   ...   
10279  1482810           None  None         None                 None  None   
10599  1491710           None  None         None                 None  None   
10700  1494270           None  None         None  2023-09-06 00:00:00  7.42   
11425  1518070           None  None         None                 None  None   
11476  1519980           None  None         None                 None  None   

      Albuminaria Category            EntryDate eGF

In [531]:
# Loop through each row in the DataFrame
for index, row in df_combined.iterrows():
    if row["eGFR"] == "None" and not pd.isna(row["eGFR_sKreatinin"]):  # If eGFR is missing but eGFR_sKreatinin exists
        df_combined.at[index, "eGFR"] = row["eGFR_sKreatinin"]         # Replace eGFR with eGFR_sKreatinin
        df_combined.at[index, "EntryDate_eGFR"] = row["EntryDate"]         # Replace eGFR with eGFR_sKreatinin
        df_combined.at[index, "GFR Category"] = row["GFR Category_sKreatinin"]  # Also replace the category

# Drop the redundant columns after substitution
df_combined.drop(columns=["eGFR_sKreatinin", "GFR Category_sKreatinin", "EntryDate"], inplace=True)

In [532]:
df_combined["EntryDate_eGFR"] = df_combined["EntryDate_eGFR"].astype(str).str.split(" ").str[0]
df_combined["EntryDate_UACR"] = df_combined["EntryDate_UACR"].astype(str).str.split(" ").str[0]


In [533]:
df_combined

Unnamed: 0,Patient,EntryDate_eGFR,eGFR,GFR Category,EntryDate_UACR,UACR,Albuminaria Category
0,60,2025-02-19,49.2,G3a,2024-05-15,1.6,A1
1,80,2017-05-22,23.430758,G4,,,
2,120,2014-05-12,16.86703,G4,,,
3,160,2025-02-07,76.2,G2,,,
4,180,2025-02-13,76.2,G2,,,
...,...,...,...,...,...,...,...
11534,1522810,2025-03-07,96.0,G1,,,
11535,1522890,2025-03-06,4.2,G5,,,
11536,1522980,2025-02-27,26.4,G4,,,
11537,1523010,2025-02-26,40.2,G3b,,,


In [537]:
df_combined[df_combined['UACR'] != 'None']

Unnamed: 0,Patient,EntryDate_eGFR,eGFR,GFR Category,EntryDate_UACR,UACR,Albuminaria Category
0,60,2025-02-19,49.2,G3a,2024-05-15,1.6,A1
5,190,2022-12-07,30.6,G3b,2016-09-01,2.3,A1
9,280,2015-03-09,53.855314,G3a,2014-02-04,1.7,A1
13,420,2024-12-10,63.6,G2,2018-12-03,29.7,A2
17,520,2020-07-14,10.2,G5,2018-03-07,56.0,A3
...,...,...,...,...,...,...,...
11467,1519790,2025-02-26,36.0,G3b,2025-02-26,12.0,A2
11470,1519870,2025-01-29,95.4,G1,2025-01-29,0.7,A1
11500,1520930,2025-01-24,108.6,G1,2025-01-24,1.6,A1
11519,1521710,2025-02-19,42.6,G3b,2025-02-14,55.9,A3


In [535]:
import sqlite3

# Connect to the SQLite database
conn = sqlite3.connect(db_file)

# Define the table name
combined_table_name = "uACR_eGFR_latest"

# Save the dataframe to the SQLite database
df_combined.to_sql(combined_table_name, conn, if_exists="replace", index=False)

# Commit and close the connection
conn.commit()
conn.close()

print(f"Dataframe successfully added to the {combined_table_name} table in {db_file}.")

Dataframe successfully added to the uACR_eGFR_latest table in CKD_train_unlocked.db.


## Machine Learning Part

In [538]:
df_combined

Unnamed: 0,Patient,EntryDate_eGFR,eGFR,GFR Category,EntryDate_UACR,UACR,Albuminaria Category
0,60,2025-02-19,49.2,G3a,2024-05-15,1.6,A1
1,80,2017-05-22,23.430758,G4,,,
2,120,2014-05-12,16.86703,G4,,,
3,160,2025-02-07,76.2,G2,,,
4,180,2025-02-13,76.2,G2,,,
...,...,...,...,...,...,...,...
11534,1522810,2025-03-07,96.0,G1,,,
11535,1522890,2025-03-06,4.2,G5,,,
11536,1522980,2025-02-27,26.4,G4,,,
11537,1523010,2025-02-26,40.2,G3b,,,
