### Step 1 data preprocessing

part (a): tabular data

The zero-shot ability of our model comes from embedding a natural-language description of each feature.

First, we extract the name of each column.

In [6]:
# `!export VAR=...` runs in a throwaway subshell, so the variable disappears as
# soon as that shell exits and never reaches the kernel process. Set the proxy
# on the kernel's own environment so HTTP clients started from this notebook
# actually see it.
import os

os.environ["http_proxy"] = "http://localhost:1080"
os.environ["https_proxy"] = "http://localhost:1080"

In [None]:
from icecream import ic
ic.enable()
import numpy as np
import pandas as pd
import pickle
from tqdm import tqdm
import json

from langchain import LLMChain, PromptTemplate
from langchain.chat_models import init_chat_model
from langchain_openai import OpenAIEmbeddings

import os
# Keep a key that is already exported in the environment; only fall back to the
# placeholder (which must be replaced with a real key) when none is set.
# An unconditional assignment would overwrite a valid key with the placeholder.
os.environ.setdefault("OPENAI_API_KEY", "your-openai-key")

In [None]:
### NACC dataset
# Active dataset configuration (the ADNI alternative is kept commented out further down).

# Table to describe. Later cells choose the NACC or ADNI descriptor file by
# checking whether this path contains 'nacc' or 'adni'.
tabular_data_path = "/openbayes/home/LMTDE/data/datasets/nacc_new/naccImg_validation_normed.csv" # loaded with pd.read_csv defaults, so a non-comma delimiter (e.g. TSV) needs a `sep=` argument at the read call
# Base path of the metadata pickle; save_pkl/load_pkl derive the per-step file names from it.
pkl_path = "/openbayes/home/LMTDE/data/datasets/nacc_new/meta/column_info.pkl"
# Dataset blurb that is substituted into the `{background}` slot of the description-rewriting prompt.
background = "Funded by NIA, the National Alzheimer's Coordinating Center (NACC) has developed and maintains a large relational database of standardized clinical and neuropathological research data. NACC provides a valuable resource for both exploratory and explanatory Alzheimer's disease research. NACC data are freely available to all researchers."

###

### ADNI dataset

# tabular_data_path = "/openbayes/home/LMTDE/data/datasets/adni/adni_final.csv"
# pkl_path = "/openbayes/home/LMTDE/data/datasets/adni/meta/column_info.pkl"
# background = "The ADNI data set is a comprehensive and widely used collection of longitudinal clinical, imaging, genetic, and other biomarker data. It encompasses various data types, including structural, functional, and molecular brain imaging, biofluid biomarkers, cognitive assessments, genetic data, and demographic information.ADNI participants are assigned to a Schedule of Events (SOE) based on various factors, such as clinical diagnosis (unimpaired, MCI, AD/dementia). During each visit, the SOE dictates what data is collected. This data is made available to approved researchers."

###



# Read the whole table into a pandas DataFrame using read_csv's default options.
# `tabular_data` is a notebook-wide global: the type classification, the category
# enumeration and the inspection cells all read from it.
tabular_data = pd.read_csv(tabular_data_path)



def _step_pkl_path(pkl_path, step):
    """Return ``pkl_path`` with ``_step{step}`` inserted before the file extension.

    ``step <= 0`` returns the path unchanged. Only the final extension is
    touched, so a ".pkl" occurring elsewhere in the path (e.g. in a directory
    name) is left alone, and paths with a different or missing extension still
    get a distinct file per step instead of silently sharing one.
    """
    if step > 0:
        root, ext = os.path.splitext(pkl_path)
        return f"{root}_step{step}{ext}"
    return pkl_path


def save_pkl(column_info, pkl_path=pkl_path, step=0):
    """Pickle ``column_info`` to disk and print where it was written.

    Args:
        column_info: object to serialize (the per-column metadata dict).
        pkl_path: base path of the pickle; defaults to the module-level
            ``pkl_path`` as it was when this function was defined.
        step: pipeline step; when > 0 the file name gets a ``_step{step}`` suffix.
    """
    pkl_path = _step_pkl_path(pkl_path, step)
    # Create the target directory on first use so a fresh checkout does not fail here.
    parent_dir = os.path.dirname(pkl_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)
    with open(pkl_path, "wb") as f:
        pickle.dump(column_info, f)
    print(f"column info saved to {pkl_path}")


def load_pkl(pkl_path=pkl_path, step=0):
    """Load the pickle written by ``save_pkl`` for the given step.

    Args:
        pkl_path: base path of the pickle; defaults to the module-level
            ``pkl_path`` as it was when this function was defined.
        step: pipeline step; when > 0 the ``_step{step}`` file is read.

    Returns:
        The unpickled object, or ``None`` (after printing a message) when the
        file does not exist.
    """
    pkl_path = _step_pkl_path(pkl_path, step)
    if not os.path.exists(pkl_path):
        print(f"column info not found at {pkl_path}")
        return None
    # NOTE: pickle.load can execute arbitrary code; only load files this notebook produced.
    with open(pkl_path, "rb") as f:
        column_info = pickle.load(f)
    return column_info

# Partial-run switches read by the per-column processing loops:
# - focus_on_this_column: when not None, the loops skip every other column.
# - columns_that_need_to_update: when non-empty, the loops process only these
#   columns and copy their entries into the previously saved pickle of that step.
focus_on_this_column = None
columns_that_need_to_update = []

  tabular_data = pd.read_csv(tabular_data_path)


Second, classify each column as "img_emb", "numerical", or "categorical".

In [None]:
def what_type_of_data(tabular_data, column_name, max_categories=10):
    """Classify one column as ``"img_emb"``, ``"numerical"`` or ``"categorical"``.

    Rules, applied in order:
      1. ``"img_emb"``: the column name contains ``"img_"``.
      2. ``"numerical"``: the column has a numeric, non-boolean dtype and either
         more than ``max_categories`` distinct non-null values or at least one
         value that changes when rounded to 0 decimal places.
      3. ``"categorical"``: everything else.

    Args:
        tabular_data: DataFrame holding the column.
        column_name: name of the column to classify.
        max_categories: largest number of distinct values an integer-valued
            numeric column may have and still be treated as categorical.

    Returns:
        One of ``"img_emb"``, ``"numerical"``, ``"categorical"``.
    """
    if "img_" in column_name:
        return "img_emb"
    column = tabular_data[column_name]
    # Checking `dtype in [int, float]` only matches the platform-default widths
    # (int64/float64), so float32/int32/uint8 columns were misclassified as
    # categorical. Booleans stay categorical, as before.
    is_numeric = pd.api.types.is_numeric_dtype(column) and not pd.api.types.is_bool_dtype(column)
    if is_numeric:
        if column.nunique() > max_categories:
            return "numerical"
        # Any value with a fractional part makes the column numerical.
        # NaNs in matching positions compare equal in Series.equals.
        if not column.round(0).equals(column):
            return "numerical"
    return "categorical"

# Step 1 metadata: for every column record the inferred type and the number of
# distinct non-null values, then persist it as the step-1 pickle.
column_info = {}
for column_name in tabular_data.columns:
    column_info[column_name] = dict(
        type=what_type_of_data(tabular_data, column_name),
        num_categories=tabular_data[column_name].nunique(),
    )
# Manual overrides, kept for reference:
# column_info["MOCA"].update({"type": "numerical"})
# column_info["PTEDUCAT"].update({"type": "numerical"})
save_pkl(column_info, step=1)

column info saved to /openbayes/home/LMTDE/data/datasets/nacc_new/meta/column_info_step1.pkl
483


Third, we rewrite each column description with an LLM and, for categorical columns, combine it with every category value into one sentence per category.

In [None]:
def rewrite_column_description(background, description, rewrite_count=1, max_retries=3):
    """Ask the chat model to rewrite a raw column description as plain sentences.

    Args:
        background: dataset background text inserted into the prompt.
        description: raw text to rewrite (a column name or a JSON string of
            descriptor fields).
        rewrite_count: number of alternative rewrites requested.
        max_retries: how many times the model is queried before giving up when
            it returns fewer than ``rewrite_count`` non-empty lines.

    Returns:
        A list of exactly ``rewrite_count`` rewritten descriptions, each
        stripped of surrounding whitespace.

    Raises:
        ValueError: if no attempt produced at least ``rewrite_count`` lines.
    """
    prompt_template = PromptTemplate(
        input_variables=["background", "description", "rewrite_count"],
        template="""
        [Background]
        Background information: {background}

        [Instruction]
        Please comprehend and then rewrite the following raw text (which may be a JSON-style string including detailed explanation, or contain only uppercase letters representing abbreviations, in which case you need to infer their meanings based on background knowledge) into a declarative sentence that is easy for humans to understand. Ensure to include the meaning of every possible data value (if provided in the raw text). Rewrite it in {rewrite_count} different ways or styles. (Separate the results with a newline character, no empty lines, and no serial numbers before the results.) Do not include the original column name (the string of uppercase letters) in your rewritten results; express the meaning using normal language instead.

        [Raw text]
        {description}
        """
    )
    # ic(background, description, prompt_template)

    llm = init_chat_model("gpt-4o-mini", model_provider="openai")
    chain = LLMChain(llm=llm, prompt=prompt_template)
    attempts = max(1, max_retries)
    for attempt in range(1, attempts + 1):
        result = chain.run(background=background, description=description, rewrite_count=rewrite_count)
        # Strip every line and drop blank / whitespace-only ones so stray
        # padding from the model neither counts as a result nor gets stored.
        rewritten_descriptions = [line.strip() for line in result.split("\n") if line.strip()]
        if len(rewritten_descriptions) >= rewrite_count:
            return rewritten_descriptions[:rewrite_count]
        print(f"rewrite attempt {attempt}/{attempts} returned {len(rewritten_descriptions)} lines, expected {rewrite_count}")
    # An explicit error (instead of a bare assert) survives `python -O` and says what went wrong.
    raise ValueError(
        f"model returned fewer than {rewrite_count} rewritten descriptions after {attempts} attempts"
    )

# Load the dataset-specific data dictionary ("addfile") and index it by variable name.
# Produces two globals used by the description lookup:
#   addfile_column_name: array of variable names found in the descriptor file
#   addfile_info:        {variable name: {field: value}}
if 'nacc' in tabular_data_path:
    # use additional information to rewrite the column descriptions
    addfile_path = "/openbayes/home/LMTDE/data/datasets/nacc_new/uds3_rdd_origin.xlsx"
    # Columns 0..5 of the sheet are: VariableName, Form, VariableType,
    # ShortDescriptor, DataType, AllowableCodes.
    # NOTE(review): header=None keeps every row as data, so a header row (if the
    # sheet has one) becomes a regular entry - confirm this is intended.
    addfile_data = pd.read_excel(addfile_path, sheet_name="uds3-rdd", header=None)
    addfile_column_name = addfile_data.iloc[:, 0].values
    nacc_fields = (
        "origin_column_name",
        "form",
        "variable_type",
        "description",
        "data_type",
        "allowable_codes",
    )
    addfile_info = {}
    for i in range(addfile_data.shape[0]):
        column_name = addfile_data.iloc[i, 0]
        # A later row with the same variable name replaces the earlier entry.
        addfile_info[column_name] = {
            field: addfile_data.iloc[i, j] for j, field in enumerate(nacc_fields)
        }

elif 'adni' in tabular_data_path:
    addfile_path = "/openbayes/home/LMTDE/data/datasets/adni/meta/adni_all_descriptor.csv"
    addfile_data = pd.read_csv(addfile_path)
    # Field names come from the CSV header row.
    field_names = addfile_data.columns.tolist()
    # The variable name is read from the fourth column (index 3).
    addfile_column_name = addfile_data.iloc[:, 3].values
    addfile_info = {}
    for i in range(addfile_data.shape[0]):
        column_name = addfile_data.iloc[i, 3]
        merged = addfile_info.setdefault(column_name, {})
        for j, field in enumerate(field_names):
            # Several rows may describe the same variable: keep the first
            # non-missing, non-empty value seen for each field.
            if field not in merged or pd.isna(merged[field]) or merged[field] == "":
                merged[field] = addfile_data.iloc[i, j]

else:
    # Raise explicitly: `assert False` is removed under `python -O`, which would
    # leave addfile_info undefined and fail much later with a NameError.
    raise ValueError("tabular_data_path should contain 'nacc' or 'adni'")

    
def find_the_most_related_origin_column(column_name, addfile_column_name, info=None, min_common_length=5):
    """Return the descriptor entry that best matches ``column_name`` as a JSON string.

    An exact name match wins. Otherwise the descriptor name sharing the longest
    common substring with ``column_name`` (at least ``min_common_length``
    characters) is used; on ties the first such name in ``addfile_column_name``
    is kept. When nothing matches, ``column_name`` itself is returned.

    Args:
        column_name: column name from the tabular data.
        addfile_column_name: iterable of variable names from the descriptor file.
        info: mapping ``{variable name: descriptor dict}``; defaults to the
            module-level ``addfile_info``.
        min_common_length: shortest common substring that counts as a match.

    Returns:
        ``json.dumps`` of the matched descriptor dict, or ``column_name``.
    """
    if info is None:
        info = addfile_info

    def _to_builtin(value):
        # json cannot encode NumPy scalars such as int64; unwrap them, and fall
        # back to str() for anything else json does not know.
        return value.item() if hasattr(value, "item") else str(value)

    if column_name in addfile_column_name:
        return json.dumps(info[column_name], default=_to_builtin)

    best_length = 0
    most_related_column = None
    name_length = len(column_name)
    for origin_column_name in addfile_column_name:
        # Missing names load as NaN (a float); skip them instead of failing on len().
        if not isinstance(origin_column_name, str) or len(origin_column_name) < min_common_length:
            continue
        for start in range(name_length):
            # Only a window strictly longer than the current best can improve the match.
            end = start + max(min_common_length, best_length + 1)
            # If a window is not contained, no longer window from the same start
            # can be, so stop at the first miss.
            while end <= name_length and column_name[start:end] in origin_column_name:
                best_length = end - start
                most_related_column = origin_column_name
                end += 1
    # ic(column_name, most_related_column, best_length)
    if most_related_column is not None:
        return json.dumps(info[most_related_column], default=_to_builtin)
    return column_name


# Step 2: attach an LLM-rewritten description to every selected column.
column_info = load_pkl(step=1)
column_info_next = load_pkl(step=2)
if column_info is None:
    raise FileNotFoundError("step-1 column info not found; run the type-classification cell first")
if columns_that_need_to_update and column_info_next is None:
    # A partial update merges into the existing step-2 file, so it must exist.
    raise FileNotFoundError("step-2 column info not found; run a full pass before a partial update")


for column_name in tqdm(column_info):
    if focus_on_this_column is not None and column_name != focus_on_this_column:
        continue
    if columns_that_need_to_update and column_name not in columns_that_need_to_update:
        continue
    # description = column_name # you can use other detailed description
    description = find_the_most_related_origin_column(column_name, addfile_column_name)

    if 'adni' in tabular_data_path:
        # The descriptor's "CODE" field can override the inferred type: a range
        # ("..") or a "Step=" marker is treated as numerical.
        code_str = addfile_info.get(column_name, {}).get("CODE")
        # Missing codes load as NaN (a float); the isinstance check skips those
        # and any other non-text value instead of failing on the `in` test.
        if isinstance(code_str, str) and (".." in code_str or "Step=" in code_str):
            column_info[column_name]["type"] = "numerical"

    rewritten_descriptions = rewrite_column_description(background, description)
    column_info[column_name]["rewritten_descriptions"] = rewritten_descriptions
    # ic(rewritten_descriptions)
    # ic(column_info_next[column_name])
    if columns_that_need_to_update:
        column_info_next[column_name] = column_info[column_name]


# Save exactly once:
# - partial update -> the merged step-2 data
# - full run       -> the freshly built data
# - focus mode     -> nothing (inspection only)
# The former unconditional extra save overwrote the merged file with
# `column_info`, which lacks descriptions for every column that was skipped.
if columns_that_need_to_update:
    save_pkl(column_info_next, step=2)
elif focus_on_this_column is None:
    save_pkl(column_info, step=2)

  0%|          | 0/483 [00:00<?, ?it/s]

100%|██████████| 483/483 [58:38<00:00,  7.29s/it] 

column info saved to /openbayes/home/LMTDE/data/datasets/nacc_new/meta/column_info_step2.pkl





In [7]:
def concatenate_and_rewrite(column_description, category_description, expected_lines=None):
    """Ask the chat model for one sentence per category of a categorical column.

    Args:
        column_description: rewritten description of the column.
        category_description: text listing the categories present in the data.
        expected_lines: number of categories, i.e. the number of lines the
            model is told to output.

    Returns:
        The non-empty lines of the model's answer, each stripped of surrounding
        whitespace. The caller must check the count against ``expected_lines``;
        it is not enforced here. When ``category_description`` exceeds 2000
        characters the model is not called and ``[column_description]`` is
        returned.
    """
    if len(category_description) > 2000:
        ic("category_description is too long, rejected")
        ic(column_description)
        return [column_description]
    prompt_template = PromptTemplate(
        input_variables=["column_description", "category_description", "expected_lines"],
        template="""
        [Instruction]
        The following text describes one of the columns in the tabular data (categorical data) and includes the category descriptions that actually exist in the real data (some data mentioned in the column description may not exist in the real data). Please concatenate the column description with each of the category descriptions and rewrite it as a declarative sentence (utilizing the description of the category it belongs to). 
        1. Because the real data contains {expected_lines} categories, you should output exactly {expected_lines} lines. Do not output these results in only one line. (Separate the results with a line break, no empty lines, and no serial numbers before the results.) 
        2. Do not include the original column name (the string of uppercase letters) in your rewritten results; express the meaning using normal language instead.

        [Column description]
        {column_description}

        [Category description]
        {category_description}
        """
    )
    # Prompt variants tried earlier, kept for reference:
    #  Limit each response to 30 words or fewer. If there are particularly many categories (more than 10), limit each response to 15 words or fewer.
    #  Do not include the original column name (the string of uppercase letters) in your rewritten results; express the meaning using normal language instead.
    #  Later this rewrite result will be used to concatenate with each category data, so you should preserve enough important information.
    llm = init_chat_model("gpt-4o-mini", model_provider="openai")
    chain = LLMChain(llm=llm, prompt=prompt_template)
    result = chain.run(column_description=column_description, category_description=category_description, expected_lines=expected_lines)
    # Strip each line and drop blank / whitespace-only ones: otherwise padding
    # is stored (and later embedded) with the text, and a whitespace-only line
    # inflates the count and makes a correct answer look like a mismatch.
    rewritten_descriptions = [line.strip() for line in result.split("\n") if line.strip()]
    return rewritten_descriptions


# Step 3: for categorical columns, turn every rewritten description into a
# {category value: sentence} mapping.
column_info = load_pkl(step=2)
column_info_next = load_pkl(step=3)
# NOTE: this restricts the run to the listed columns; set it to [] for a full pass.
columns_that_need_to_update = ['exam_CORTDEF']
give_up_columns = []
if column_info is None:
    raise FileNotFoundError("step-2 column info not found; run the description-rewriting cell first")
if columns_that_need_to_update and column_info_next is None:
    # A partial update merges into the existing step-3 file, so it must exist.
    raise FileNotFoundError("step-3 column info not found; run a full pass before a partial update")

for column_name in tqdm(column_info):
    if focus_on_this_column is not None and column_name != focus_on_this_column:
        continue
    if columns_that_need_to_update and column_name not in columns_that_need_to_update:
        continue
    if column_info[column_name]["type"] == "categorical":
        all_possible_data = tabular_data[column_name].dropna().unique()
        all_possible_data.sort()
        all_possible_data = [str(data) for data in all_possible_data]
        category_description = "The column contains the following categories: " + ", ".join(all_possible_data)
        assert(len(all_possible_data) == column_info[column_name]["num_categories"])
        # Snapshot the text descriptions before the retry loop: the loop stores
        # its result back under the same key, so iterating the live value on a
        # retry would feed the previous attempt's dicts to the model.
        source_descriptions = list(column_info[column_name]["rewritten_descriptions"])
        # Columns with more than 40 categories keep the plain description
        # instead of getting one sentence per category.
        too_many_categories = len(all_possible_data) > 40
        retry_count = 0
        while True:
            all_concate_result_for_different_rewrite = []
            all_matched = True
            for description in source_descriptions:
                if too_many_categories:
                    rewritten_descriptions = [description]
                else:
                    rewritten_descriptions = concatenate_and_rewrite(description, category_description, len(all_possible_data))
                if len(all_possible_data) != len(rewritten_descriptions):
                    all_matched = False
                    # Count mismatch: keep only the first line, keyed by the first
                    # category (the slice keeps this safe when there are no categories).
                    map_data_to_description = dict(zip(all_possible_data[:1], rewritten_descriptions))
                else:
                    map_data_to_description = dict(zip(all_possible_data, rewritten_descriptions))
                all_concate_result_for_different_rewrite.append(map_data_to_description)
            column_info[column_name]["rewritten_descriptions"] = all_concate_result_for_different_rewrite
            # Every rewrite must line up with the categories, not just the last one.
            if all_matched:
                break
            if too_many_categories:
                # The fallback is deterministic, so retrying cannot change the outcome.
                give_up_columns.append(column_name)
                break
            retry_count += 1
            print(f"retry {retry_count} times for {column_name}")
            if retry_count > 10:
                give_up_columns.append(column_name)
                break
    if columns_that_need_to_update:
        column_info_next[column_name] = column_info[column_name]
        # ic(column_name, column_info_next[column_name])

# Partial update -> save the merged data; full run -> save the new data;
# focus mode -> save nothing.
if columns_that_need_to_update:
    save_pkl(column_info_next, step=3)
elif focus_on_this_column is None:
    save_pkl(column_info, step=3)

print("give up columns: ", give_up_columns)
print("giveup number: ", len(give_up_columns))

# Reload the step-3 pickle that was just written so the result can be inspected.
column_info_updated = load_pkl(step=3)
# Placeholder loop for manual inspection: it does nothing while the ic(...) line stays commented out.
for column_name in columns_that_need_to_update:
    # ic(column_name, column_info_updated[column_name]["rewritten_descriptions"][0].keys())
    pass

# columns_that_need_to_update = ['his_ARTHTYPX', 'updrs_HANDMVRX', 'NACCID', 'updrs_SPEECHX', 'his_PSYCDISX', 'his_CVOTHRX', 'updrs_TRESTLFX', 'his_ABUSX', 'cvd_CVDIMAGX', 'his_RACESECX', 'updrs_TAPSLFX', 'his_NACCOMX', 'updrs_HANDMVLX', 'updrs_LEGRTX', 'updrs_POSTUREX', 'updrs_HANDATRX', 'ODE', 'his_PRIMLANX', 'his_NCOTHRX', 'updrs_RIGDLORX', 'updrs_HANDATLX', 'updrs_RIGDUPRX', 'his_RACEX', 'bat_NPSYLANX', 'updrs_BRADYKIX', 'exam_CORTDEF', 'updrs_TRACTLHX', 'updrs_POSSTABX', 'updrs_ARISINGX', 'exam_OTHNEURX', 'gds_ENERGY', 'updrs_GAITX', 'updrs_TAPSRTX', 'his_OTHSLEEX', 'exam_APRAXL', 'updrs_RIGDUPLX', 'updrs_RIGDNEX', 'npiq_NPIQINFX', 'his_HISPORX', 'updrs_LEGLFX', 'updrs_TRACTRHX', 'updrs_RIGDLOLX', 'VISITDATE']
# still_need_to_update = []
# for column_name in columns_that_need_to_update:
#     # if num_categories matches the number of rewritten descriptions, then it is already updated
#     if column_info_updated[column_name]["num_categories"] == len(column_info_updated[column_name]["rewritten_descriptions"][0].keys()):
#         continue
#     still_need_to_update.append(column_name)
# columns_that_need_to_update = still_need_to_update
# print(len(columns_that_need_to_update))
# print(columns_that_need_to_update)
# ic(column_info_updated["GDWORTH"])
# ic(tabular_data["GDWORTH"].dropna().unique())

  0%|          | 0/483 [00:00<?, ?it/s]

retry 1 times for exam_CORTDEF
retry 2 times for exam_CORTDEF
retry 3 times for exam_CORTDEF
retry 4 times for exam_CORTDEF


100%|██████████| 483/483 [00:33<00:00, 14.57it/s]

column info saved to /openbayes/home/LMTDE/data/datasets/nacc_new/meta/column_info_step3.pkl
give up columns:  []
giveup number:  0





Fourth, we generate an embedding for every description (any LLM or embedding model can be used).

In [12]:
def generate_embeddings(text):
    """Embed ``text`` with OpenAI's ``text-embedding-3-large`` model.

    A new ``OpenAIEmbeddings`` client is created on every call and the result
    of its ``embed_query`` is returned unchanged.
    """
    return OpenAIEmbeddings(model="text-embedding-3-large").embed_query(text)

# Step 4: embed every stored description.
# Both run switches are reset here, so this cell processes all columns by default.
focus_on_this_column = None
columns_that_need_to_update = []
column_info = load_pkl(step=3)
column_info_next = load_pkl(step=4)
for column_name in tqdm(column_info):
    if focus_on_this_column is not None and column_name != focus_on_this_column:
        continue
    if columns_that_need_to_update and column_name not in columns_that_need_to_update:
        continue
    entry = column_info[column_name]
    descriptions = entry["rewritten_descriptions"]
    if entry["type"] == "categorical":
        # Categorical columns hold one {category: sentence} dict per rewrite.
        assert isinstance(descriptions[0], dict)
        # Values that are not text are skipped, so their categories get no embedding.
        entry["embeddings"] = [
            {
                key: generate_embeddings(text)
                for key, text in mapping.items()
                if isinstance(text, str)
            }
            for mapping in descriptions
        ]
    else:
        # Other columns hold a plain list of description strings.
        assert isinstance(descriptions[0], str)
        entry["embeddings"] = [generate_embeddings(text) for text in descriptions]
    if columns_that_need_to_update:
        column_info_next[column_name] = entry

# Partial update -> save the merged data; full run -> save the new data;
# focus mode -> save nothing.
if columns_that_need_to_update:
    save_pkl(column_info_next, step=4)
elif focus_on_this_column is None:
    save_pkl(column_info, step=4)

  0%|          | 0/483 [00:00<?, ?it/s]

100%|██████████| 483/483 [2:08:52<00:00, 16.01s/it]  


column info saved to /openbayes/home/LMTDE/data/datasets/nacc_new/meta/column_info_step4.pkl


Finally, filter out unwanted columns according to your own rules.

In [14]:
column_info = load_pkl(step=4)

# Example rule: keep only the features listed in the TOML config, dropping
# labels and every other column.
import toml
with open("/openbayes/home/LMTDE/data/datasets/nacc_new/meta/conf_mri.toml", "r") as f:
# with open("/openbayes/home/LMTDE/data/datasets/adni/meta/adni.toml", "r") as f:
    column_type = toml.load(f)

expected_features = column_type['feature']

# Report features that are missing from the data or whose type disagrees with the config.
for column_name in expected_features:
    if column_name not in column_info:
        print(f"column {column_name} not found in the data")
        continue
    expected_type = expected_features[column_name]['type']
    actual_type = column_info[column_name]['type']
    if actual_type == expected_type:
        continue
    if actual_type == "categorical":
        # The config splits categorical features into "binary" (exactly 2
        # categories) and "multiple" (more than 2); both count as a match.
        category_count = column_info[column_name]['num_categories']
        if expected_type == "binary" and category_count == 2:
            continue
        if expected_type == "multiple" and category_count > 2:
            continue
    print(f"column {column_name} type not match, expect {expected_type} but got {actual_type}")

# Drop columns the config does not list, then save to the base pickle path (no step suffix).
column_info = {name: meta for name, meta in column_info.items() if name in expected_features}
save_pkl(column_info)

column his_PDOTHR not found in the data
column npiq_DEL not found in the data
column npiq_HALL not found in the data
column npiq_AGIT not found in the data
column npiq_DEPD not found in the data
column npiq_ANX not found in the data
column npiq_ELAT not found in the data
column npiq_APA not found in the data
column npiq_DISN not found in the data
column npiq_IRR not found in the data
column npiq_MOT not found in the data
column npiq_NITE not found in the data
column npiq_APP not found in the data
column his_BIRTHMO type not match, expect categorical but got numerical
column bat_PENTAGON not found in the data
column bat_LOGIMO type not match, expect categorical but got numerical
column bat_LOGIDAY type not match, expect categorical but got numerical
column bat_DIGBACCT not found in the data
column bat_DIGBACLS not found in the data
column bat_REY1REC not found in the data
column bat_REY1INT not found in the data
column bat_REY2REC not found in the data
column bat_REY2INT not found in th

In [None]:
# column_info = load_pkl(step=4)
# # dump to toml, if want to update the toml
# with open("/openbayes/home/LMTDE/data/datasets/adni/meta/adni.toml", "w") as f:
#     # dump like this
#     # [feature.TRAILS]
#     # type = "binary"
#     # num_categories = 2
#     for column_name in column_info:
#         f.write(f"\t[feature.{column_name}]\n")
#         f.write(f"\ttype = \"{column_info[column_name]['type']}\"\n")
#         if column_info[column_name]['type'] == "categorical":
#             f.write(f"\tnum_categories = {column_info[column_name]['num_categories']}\n")
#         else:
#             f.write("\tshape = [1]\n")
#         f.write("\n")

In [18]:
# Inspection cell: print the step-3 metadata of one column.
# NOTE: this reassigns the global focus_on_this_column, which the processing
# loops also read. Reset it to None before re-running the description-rewriting
# or category-concatenation cells, otherwise they process only this column and
# skip their final save.
focus_on_this_column = "his_RACETER"
# Load step 3 rather than step 4 so the large embedding vectors are not printed.
column_info = load_pkl(step=3)
print(len(column_info.keys()))
print(column_info.keys())
print(column_info[focus_on_this_column].keys())
print(column_info[focus_on_this_column]['type'])
print(column_info[focus_on_this_column]['num_categories'])
# ic echoes the argument expression along with its value.
ic(column_info[focus_on_this_column]['rewritten_descriptions'])
# Distinct non-null raw values, to compare against the keys of the description mapping.
print(tabular_data[focus_on_this_column].dropna().unique())

# update_these_columns = []
# diff = []
# for column_name in column_info:
#     if column_info[column_name]['type'] == "categorical":
#         keys = column_info[column_name]['rewritten_descriptions'][0].keys()
#         if 'num_categories' not in column_info[column_name]:
#             update_these_columns.append(column_name)
#             diff.append(len(keys))
#         elif column_info[column_name]['num_categories'] != len(keys):
#             # ic(column_name, column_info[column_name]['num_categories'], len(keys))
#             update_these_columns.append(column_name)
#             # diff.append(len(keys) - column_info[column_name]['num_categories'])
# print(update_these_columns)
# print(len(update_these_columns))
# print(diff)
# print(len(diff))
# print(sum(diff))
# print(len(column_info.keys()))


ic| column_info[focus_on_this_column]['rewritten_descriptions']: [{'0.0': 'The data indicates that the respondent identifies as White.  ',
                                                                   '1.0': 'The data indicates that the respondent identifies as Black or '
                                                                          'African American.  ',
                                                                   '2.0': 'The data indicates that the respondent identifies as American Indian '
                                                                          'or Alaska Native.  ',
                                                                   '3.0': 'The data indicates that the respondent identifies as Native Hawaiian '
                                                                          'or Other Pacific Islander.  ',
                                                                   '4.0': 'The data indicates that the respondent identifies as As

483
dict_keys(['npiq_APPSEV', 'bat_RAY5REC', 'his_ARTHTYPX', 'his_PDOTHRYR', 'cvd_EMOT', 'exam_SLOWINGL', 'updrs_HANDMVRX', 'updrs_BRADYKIN', 'faq_GAMES', 'bat_RAY6INT', 'bat_MOCAORPL', 'bat_RAY2REC', 'cvd_SOMATIC', 'his_NACCFADM', 'bat_DIGFORSL', 'his_HANDED', 'med_NACCAMD', 'bat_MOCAORDY', 'bat_UDSVERTN', 'med_NACCAHTN', 'updrs_HANDALTR', 'exam_RESTTRL', 'his_HISPOR', 'his_ALCFREQ', 'exam_PARKSIGN', 'cvd_CVDIMAG3', 'bat_RAYDINT', 'updrs_RIGDUPLF', 'bat_MOCACUBE', 'ph_BPDIAS', 'NACCID', 'updrs_SPEECHX', 'ALCDEM', 'his_NACCFMS', 'his_PDYR', 'bat_MENTAGON', 'bat_CRAFTDVR', 'his_NACCAMS', 'ANXIET', 'bat_NACCMOCB', 'his_RESIDENC', 'bat_MINTTOTS', 'gds_WRTHLESS', 'his_TOBAC30', 'bat_DIGIBLEN', 'bat_ANIMALS', 'exam_DYSTONR', 'bat_RESPHEAR', 'bat_MOCAORYR', 'his_TIAMULT', 'cvd_CVDIMAG', 'med_ANYMEDS', 'med_NACCAANX', 'bat_UDSVERLC', 'updrs_FACEXP', 'his_STROKMUL', 'npiq_NITESEV', 'bat_TRAILBLI', 'faq_REMDATES', 'npiq_HALLSEV', 'his_PSYCDISX', 'bat_MOCAREGI', 'cdr_CDRSUM', 'bat_MINTTOTW', 'ex

In [11]:
# Show the non-null values of the focused column and its distinct-value count.
# NOTE(review): the saved output below is for his_HISPOR and the execution
# counter is lower than the previous cell's, so this cell was apparently run
# with a different focus_on_this_column - re-run top to bottom to refresh it.
data_without_nan = tabular_data[focus_on_this_column].dropna()
ic(data_without_nan)
# Number of distinct non-null values (nunique ignores NaN by default).
ic(tabular_data[focus_on_this_column].nunique())

ic| data_without_nan: 0        0.0
                      1        0.0
                      2        0.0
                      3        0.0
                      4        0.0
                              ... 
                      36770    0.0
                      36771    0.0
                      36772    0.0
                      36773    0.0
                      36774    0.0
                      Name: his_HISPOR, Length: 36697, dtype: float64
ic| tabular_data[focus_on_this_column].nunique(): 8


8