In [1]:
import sys
import os

# Put the parent directory on the import path (presumably so the
# `health_causenet` package imported further down resolves -- confirm layout).
# The membership check keeps the cell idempotent: re-running it does not add
# the same entry to sys.path a second time.
parent_path = ".."
parent_abspath = os.path.abspath(parent_path)
if parent_abspath not in sys.path:
    sys.path.append(parent_abspath)

In [2]:
import pandas as pd
import pathlib
from health_causenet import constants
from tqdm.autonotebook import tqdm
import numpy as np

  from tqdm.autonotebook import tqdm


In [15]:
# Load every `causenet_*.parquet` shard and stack them into one frame with a
# fresh 0..n-1 index.
paths = sorted(pathlib.Path(constants.CAUSENET_PARQUET_PATH).glob("causenet_*.parquet"))
# Read all shards first and concatenate once: calling pd.concat inside the
# loop copies the rows accumulated so far again for every additional shard.
shards = [
    pd.read_parquet(
        path, columns=["cause", "effect", "support", "reference", "sentence"]
    )
    for path in tqdm(paths)
]
# pd.concat raises on an empty list; keep the empty-frame result when no shard
# matched the glob.
full_causenet = pd.concat(shards, ignore_index=True) if shards else pd.DataFrame()
full_causenet

  0%|          | 0/12 [00:00<?, ?it/s]

Unnamed: 0,cause,effect,support,reference,sentence
0,accident,death,38,Forensic science,"For example, the book also described how to di..."
1,accident,death,38,"Goodsprings, Nevada",The accident resulted in her death.
2,accident,death,38,Léon Gambetta,It seems she had just consented to become his ...
3,accident,death,38,Accidental death and dismemberment insurance,"In insurance, accidental death and dismemberme..."
4,accident,death,38,Plaster,Plaster is used by many morticians and funeral...
...,...,...,...,...,...
24423078,treatment,changes in fat composition,1,http://www.the-funneled-web.com/N&V_2010(Jan-D...,Such treatment resulted in very significant ch...
24423079,mechanical stress,gradual straightening of elastin fibers,1,http://www.mednet.gr/archives/2005-1/54abs.html,"In the longitudinal histological sections, mec..."
24423080,controlled substance violations,inadmissibility,1,http://www.zzi.net/news/20081208213119.shtml,Controlled substance violations will cause ina...
24423081,controlled substance violations,deportability,1,http://www.zzi.net/news/20081208213119.shtml,Controlled substance violations will cause ina...


In [6]:
def p_mean_threshold_combiner(cause, effect, p):
    """Combine a cause score and an effect score with the power mean of order ``p``.

    For finite, non-zero ``p`` this evaluates
    ``((cause ** p + effect ** p) / 2) ** (1 / p)``. The limiting orders, for
    which that expression cannot be evaluated directly, are handled explicitly
    when ``p`` is a scalar:

    * ``p == 0``    -> geometric mean ``sqrt(cause * effect)``
      (only meaningful for non-negative scores),
    * ``p == +inf`` -> elementwise maximum,
    * ``p == -inf`` -> elementwise minimum.

    Args:
        cause: Score of the cause term; a scalar or anything supporting
            elementwise arithmetic (the notebook passes pandas Series).
        effect: Score of the effect term, same kind/shape as ``cause``.
        p: Order of the power mean.

    Returns:
        The combined score, with the same shape as the broadcast inputs.
    """
    # Only scalar orders get the special cases; a non-scalar `p` keeps using
    # the plain formula exactly as before.
    if np.ndim(p) == 0:
        if p == 0:
            return np.sqrt(cause * effect)
        if p == np.inf:
            return np.maximum(cause, effect)
        if p == -np.inf:
            return np.minimum(cause, effect)
    return ((cause ** p + effect ** p) / 2) ** (1 / p)

def max_combiner(cause, effect):
    """Return the elementwise maximum of the cause and effect scores."""
    larger_score = np.maximum(cause, effect)
    return larger_score

def min_combiner(cause, effect):
    """Return the elementwise minimum of the cause and effect scores."""
    smaller_score = np.minimum(cause, effect)
    return smaller_score

# Registry of score combiners, keyed by name. Every value is a callable taking
# (cause, effect). "and" uses the minimum of the two scores and "p=inf_mean"
# the maximum; the entries in between are power means of the stated order.
ops = {
    "and": lambda cause, effect: min_combiner(cause, effect),
    **{
        # `order=order` freezes the current order for each lambda; without it
        # every entry would see the last value of the loop variable.
        f"p={order}_mean": (
            lambda cause, effect, order=order: p_mean_threshold_combiner(cause, effect, order)
        )
        for order in (1, 2, 5, 10)
    },
    "p=inf_mean": lambda cause, effect: max_combiner(cause, effect),
}

In [16]:
# One entry per exported resource. For each entry:
#   pattern   - glob used to find the score parquet files for that resource
#   threshold - minimum combined score (compared with >=) for a row to be flagged
#   op        - key into `ops` naming how cause and effect scores are combined
file_patterns = dict(
    full_mcc=dict(
        pattern="discriminative_weight-encyclopedia-2-(1_2)*.parquet",
        threshold=77.7778,
        op="p=1_mean",
    ),
    full_prec=dict(
        pattern="term_domain_specificity-encyclopedia-1-(1_2)*.parquet",
        threshold=1.5152,
        op="p=1_mean",
    ),
    support_mcc=dict(
        pattern="term_domain_specificity-encyclopedia-2-(1_3)*.parquet",
        threshold=1.1515,
        op="p=1_mean",
    ),
    support_prec=dict(
        pattern="term_domain_specificity-encyclopedia-1-(1_3)*.parquet",
        threshold=1.2121,
        op="p=1_mean",
    ),
)


# Add one flag column per entry of `file_patterns`: True where the combined
# cause/effect medical score reaches that entry's threshold.
full_causenet_medical = full_causenet.copy()
for name, kwargs in tqdm(list(file_patterns.items())):
    paths = sorted(pathlib.Path(constants.CAUSENET_PARQUET_PATH).glob(kwargs["pattern"]))
    if not paths:
        # Fail with a readable message instead of an opaque KeyError on the
        # score columns further down.
        raise FileNotFoundError(
            f"no score files matching {kwargs['pattern']!r} in {constants.CAUSENET_PARQUET_PATH}"
        )
    # Read every score file, then concatenate once (pd.concat inside the loop
    # re-copies the accumulated rows for each additional file).
    medical_score = pd.concat([pd.read_parquet(path) for path in paths])
    medical = ops[kwargs["op"]](medical_score["medical_score-cause"], medical_score["medical_score-effect"]) >= kwargs["threshold"]
    # Renumber 0..n-1 so the join below lines the flags up with the rows of
    # `full_causenet_medical` by position.
    # NOTE(review): this assumes the score files hold the same rows in the
    # same order as the CauseNet shards -- confirm; a mismatch would silently
    # misalign or leave NaN.
    medical = pd.Series(medical, name=name).reset_index(drop=True)
    full_causenet_medical = full_causenet_medical.join(medical)

# The "support" resources are blanked (NaN) for relations with support == 1.
support_columns = [column for column in file_patterns if "support" in column]
full_causenet_medical.loc[full_causenet_medical.support == 1, support_columns] = np.nan
# Relation-level view: drop the per-sentence columns and keep the first row of
# each (cause, effect) pair.
causenet_medical = full_causenet_medical.drop(["reference", "sentence"], axis=1).drop_duplicates(["cause", "effect"]).reset_index(drop=True)
full_causenet_medical

  0%|          | 0/4 [00:00<?, ?it/s]

Unnamed: 0,cause,effect,support,reference,sentence,full_mcc,full_prec,support_mcc,support_prec
0,accident,death,38,Forensic science,"For example, the book also described how to di...",False,False,False,False
1,accident,death,38,"Goodsprings, Nevada",The accident resulted in her death.,False,False,False,False
2,accident,death,38,Léon Gambetta,It seems she had just consented to become his ...,False,False,False,False
3,accident,death,38,Accidental death and dismemberment insurance,"In insurance, accidental death and dismemberme...",False,False,False,False
4,accident,death,38,Plaster,Plaster is used by many morticians and funeral...,False,False,False,False
...,...,...,...,...,...,...,...,...,...
24423078,treatment,changes in fat composition,1,http://www.the-funneled-web.com/N&V_2010(Jan-D...,Such treatment resulted in very significant ch...,True,True,,
24423079,mechanical stress,gradual straightening of elastin fibers,1,http://www.mednet.gr/archives/2005-1/54abs.html,"In the longitudinal histological sections, mec...",True,False,,
24423080,controlled substance violations,inadmissibility,1,http://www.zzi.net/news/20081208213119.shtml,Controlled substance violations will cause ina...,False,False,,
24423081,controlled substance violations,deportability,1,http://www.zzi.net/news/20081208213119.shtml,Controlled substance violations will cause ina...,False,True,,


In [52]:
# Sentence-level summary per resource column: "sum" (number of True flags),
# "mean" (share of True flags) and "count" (non-NaN rows). Cast to str for
# display (presumably to avoid abbreviated float formatting).
(
    full_causenet_medical[list(file_patterns)]
    .agg(["sum", "mean", "count"])
    .astype(str)
)

Unnamed: 0,full_mcc,full_prec,support_mcc,support_prec
sum,7842464.0,4928379.0,1320318.0,1259339.0
mean,0.321108682306816,0.2017918458533674,0.6810189509779294,0.6495660323540198
count,24423083.0,24423083.0,1938739.0,1938739.0


In [54]:
# Relation-level summary (one row per cause/effect pair) per resource column:
# "sum" (number of True flags), "mean" (share of True flags) and "count"
# (non-NaN rows). Cast to str for display (presumably to avoid abbreviated
# float formatting).
(
    causenet_medical[list(file_patterns)]
    .agg(["sum", "mean", "count"])
    .astype(str)
)

Unnamed: 0,full_mcc,full_prec,support_mcc,support_prec
sum,3206964.0,1851488.0,109985.0,103273.0
mean,0.2766313019724628,0.1597085392995965,0.6104851243339254,0.573229351687389
count,11592918.0,11592918.0,180160.0,180160.0


In [26]:
# Resources to export; comment entries in or out to choose which CSV files are
# (re)written by this cell.
resources = [
#     "full_mcc",
    "full_prec",
#     "support_mcc",
#     "support_prec"
]
n_samples = 1000  # rows written per resource (fewer if not enough rows qualify)
sample_seed = 0  # fixed seed so re-running the cell exports the same sample
sample_columns = ["cause", "effect", "support", "reference", "sentence"]
for resource in resources:
    print(resource)
    # .eq(True) keeps only rows flagged True and treats NaN as "not flagged",
    # which also works for the support columns that were blanked with NaN.
    is_flagged = full_causenet_medical[resource].eq(True)
    candidates = full_causenet_medical.loc[is_flagged, sample_columns]
    # Cap the sample size: DataFrame.sample raises when n exceeds the number
    # of available rows.
    samples = candidates.sample(n=min(n_samples, len(candidates)), random_state=sample_seed)
    samples = samples.reset_index(drop=True)
    # os.path.join gives the right path whether or not BASE_PATH ends with a
    # path separator.
    samples.to_csv(os.path.join(constants.BASE_PATH, "resources", f"{resource}.csv"))

full_prec
