In [1]:
from datasets import load_dataset, DownloadConfig
import pandas as pd
import ssl
import os

In [None]:
# Option 1: Set SSL verification globally before using DownloadConfig:
os.environ['REQUESTS_CA_BUNDLE'] = "/proj/sas/hsar/midsize_projects/python_data/ssl_certificates/cacert.pem"

download_config = DownloadConfig(
    proxies=None,
    user_agent="my-user-agent"
)
# load SDOH-NLI from Hugging Face
ds = load_dataset("tasksource/SDOH-NLI", download_config=download_config)

In [None]:
# print splits
print(ds)

DatasetDict({
    train: Dataset({
        features: ['premise', 'hypothesis', 'label'],
        num_rows: 21090
    })
    validation: Dataset({
        features: ['premise', 'hypothesis', 'label'],
        num_rows: 4033
    })
    test: Dataset({
        features: ['premise', 'hypothesis', 'label'],
        num_rows: 4212
    })
})


In [4]:
sample = ds['train']#.select(range(100))   # first 100 for quick look
df = sample.to_pandas()
print("Columns:", df.columns.tolist())
print(df.head(10))
# show label distribution (if labels exist)
if 'label' in df.columns:
    print("Label counts:\n", df['label'].value_counts())
else:
    print("No 'label' column found; inspect fields manually.")


Columns: ['premise', 'hypothesis', 'label']
                            premise  \
0  1-2 packs of cigarettes per day.   
1  1-2 packs of cigarettes per day.   
2  1-2 packs of cigarettes per day.   
3  1-2 packs of cigarettes per day.   
4  1-2 packs of cigarettes per day.   
5  1-2 packs of cigarettes per day.   
6  1-2 packs of cigarettes per day.   
7  1-2 packs of cigarettes per day.   
8  1-2 packs of cigarettes per day.   
9  1-2 packs of cigarettes per day.   

                                          hypothesis  label  
0           The person has access to transportation.  False  
1               The person was a smoker in the past.  False  
2  The person is able to obtain food on a consist...  False  
3                  The person is currently a smoker.   True  
4               The person currently drinks alcohol.  False  
5              The person drank alcohol in the past.  False  
6                  The person is employed part time.  False  
7                            T

In [40]:
for premise in df['premise'].unique():
    print(premise) 

1-2 packs of cigarettes per day.
1-2 ppd Cigarettes
10+pack-year h/o Tobacco use; quit 2 years ago
2-4 Beers/day
2ppd smoker since his teens; quit 2 years ago
50+pack-yr cigarette use.
6-pack beer plus 2 drinks per day for many years: now claims he has been dry for 2 years
A 76-year-old who used to smoke a pack a day and quit in 1985
A pack of cigarettes would last for more than a week
According to the chart, the patient also drinks wine everyday for the last 50 years, usually one to two drinks per day.
Active smoker
Admits to heroin use, alcohol abuse as well
Admits to no history of using alcohol
Alcohol drinking history, quit smoking about 30 years ago
Alcohol one to two drinks monthly.
Also admits today using cocaine.
At home also live 2 dogs and 2 outside cats
At home live mom, dad, and 18-, 16-, 14-, 12-year-old brothers, and a 3-year-old sister
At home lives mom, dad, a 2-1/2-year-old brother, and a 5-1/2-year-old maternal stepbrother
At present, he is on independent studies, whi

In [None]:
# Data exploration
# Look at the actual 'F' value more closely
for premise in df['premise'].unique():
    if 'F' in str(premise):
        print(f"Found: '{premise}' (length: {len(str(premise))})")
        print(f"Repr: {repr(premise)}")
# See if there are any exact 'F' matches
exact_f = df['premise'].str.strip() == 'F'
print(f"Exact F matches: {exact_f.sum()}")
len(df['premise'].value_counts())
df['hypothesis'].value_counts()
df.query("label==False").iloc[range(378, 560),:]

570

In [5]:
# premises repeat to accommadate a set of 36 hypothese templates. The actual premises are only of 570 records. the cartesian product of 576 
# premises x 36 hypothesis templates with True False judgements. 
# Pivot: rows=premises, columns=hypotheses, values=label
pivot_df = df.pivot_table(
    index="premise",
    columns="hypothesis",
    values="label",
    aggfunc='first' # each premise-hyposis pair is unique
)
pivot_df=pivot_df.reset_index()

In [6]:
print(pivot_df.head())
print("Shape:", pivot_df.shape)

hypothesis                                         premise  \
0                         1-2 packs of cigarettes per day.   
1                                       1-2 ppd Cigarettes   
2           10+pack-year h/o Tobacco use; quit 2 years ago   
3                                            2-4 Beers/day   
4            2ppd smoker since his teens; quit 2 years ago   

hypothesis  The person currently does not drink alcohol.  \
0                                                  False   
1                                                  False   
2                                                  False   
3                                                  False   
4                                                  False   

hypothesis  The person currently drinks alcohol.  \
0                                          False   
1                                          False   
2                                          False   
3                                           True   
4     

*Mapping 36 hypotheses → categories*

In [10]:
# Mapping hypotheses → broader SDoH categories
hypothesis_to_category = {
    # --- Employment / Occupation ---
    "The person is employed.": "employment",
    "The person is employed part time.": "employment",
    "The person is a homemaker.": "employment",
    "The person is not employed.": "employment",
    "The person is retired due to age or preference.": "employment",
    "The person is retired due to disability.": "employment",
    "The person is retired due to an unknown reason.": "employment",
    "The person is a student.": "employment",

    # --- Housing ---
    "The person lives in their own or their family's home.": "housing",
    "The person lives in a housing facility.": "housing",

    # --- Transportation ---
    "The person has access to transportation.": "transportation",

    # --- Food security ---
    "The person is able to obtain food on a consistent basis.": "food",

    # --- Smoking / Tobacco ---
    "The person is currently a smoker.": "smoking",
    "The person was a smoker in the past.": "smoking",
    "The person wasn't a smoker in the past.": "smoking",
    "The person is currently not a smoker.": "smoking",

    # --- Alcohol use ---
    "The person currently drinks alcohol.": "alcohol",
    "The person drank alcohol in the past.": "alcohol",
    "The person currently does not drink alcohol.": "alcohol",
    "The person did not drink alcohol in the past.": "alcohol",

    # --- Drug use: Opioids ---
    "The person uses opioids.": "opioids",
    "The person used opioids in the past.": "opioids",
    "The person did not use opioids in the past.": "opioids",
    "The person does not use opioids.": "opioids",

    # --- Drug use: Marijuana ---
    "The person uses marijuana.": "marijuana",
    "The person used marijuana in the past.": "marijuana",
    "The person did not use marijuana in the past.": "marijuana",
    "The person does not use marijuana.": "marijuana",

    # --- Drug use: Cocaine ---
    "The person uses cocaine.": "cocaine",
    "The person used cocaine in the past.": "cocaine",
    "The person did not use cocaine in the past.": "cocaine",
    "The person does not use cocaine.": "cocaine",

    # --- Drug use: General ---
    "The person is a drug user.": "drug_use",
    "The person was a drug user in the past.": "drug_use",
    "The person wasn't a drug user in the past.": "drug_use",
    "The person is not a drug user.": "drug_use",
}


In [7]:
# Collapse hypotheses into categories
category_df = pivot_df.copy()

In [8]:
# For each category, take OR across relevant hypotheses
final_df = pd.DataFrame()
final_df["premise"] = category_df["premise"]

In [11]:
categories = set(hypothesis_to_category.values())
categories

{'alcohol',
 'cocaine',
 'drug_use',
 'employment',
 'food',
 'housing',
 'marijuana',
 'opioids',
 'smoking',
 'transportation'}

In [12]:
for cat in categories:
    cols=[h for h, c in hypothesis_to_category.items() if c==cat]
    final_df[cat]=category_df[cols].max(axis=1)

In [13]:
final_df.head()

Unnamed: 0,premise,employment,transportation,smoking,marijuana,opioids,cocaine,housing,drug_use,food,alcohol
0,1-2 packs of cigarettes per day.,False,False,True,False,False,False,False,False,False,False
1,1-2 ppd Cigarettes,False,False,True,False,False,False,False,False,False,False
2,10+pack-year h/o Tobacco use; quit 2 years ago,False,False,True,False,False,False,False,False,False,False
3,2-4 Beers/day,False,False,False,False,False,False,False,False,False,True
4,2ppd smoker since his teens; quit 2 years ago,False,False,True,False,False,False,False,False,False,False


In [14]:
final_df.shape

(570, 11)