In [1]:
import sys
import os

parent_path = ".."
sys.path.append(os.path.abspath(parent_path))

In [2]:
import pandas as pd
import pathlib
from health_causenet import constants
import extract_medical
from tqdm.autonotebook import tqdm
import numpy as np
import json

2022-09-27 10:51:20.566071: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0
2022-09-27 10:53:44.627709: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcuda.so.1
2022-09-27 10:53:44.635674: E tensorflow/stream_executor/cuda/cuda_driver.cc:328] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected
2022-09-27 10:53:44.635706: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:169] retrieving CUDA diagnostic information for host: gammaweb06
2022-09-27 10:53:44.635713: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:176] hostname: gammaweb06
2022-09-27 10:53:44.635809: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:200] libcuda reported version is: Not found: was unable to find libcuda.so DSO loaded into this program
2022-09-27 10:53:44.636039: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:204] kernel reported version

In [11]:
full_causenet = pd.DataFrame()
paths = sorted(pathlib.Path(constants.CAUSENET_PARQUET_PATH).glob("causenet_*.parquet"), key=lambda x: int(str(x).split("_")[-1][:-8]))
for path in tqdm(paths):
    from_file = pd.read_parquet(
        path, columns=["cause", "effect", "support", "reference", "sentence"]
    )
    full_causenet = pd.concat([full_causenet, from_file])
full_causenet = full_causenet.reset_index(drop=True)
full_causenet

  0%|          | 0/12 [00:00<?, ?it/s]

Unnamed: 0,cause,effect,support,reference,sentence
0,accident,death,38,Forensic science,"For example, the book also described how to di..."
1,accident,death,38,"Goodsprings, Nevada",The accident resulted in her death.
2,accident,death,38,Léon Gambetta,It seems she had just consented to become his ...
3,accident,death,38,Accidental death and dismemberment insurance,"In insurance, accidental death and dismemberme..."
4,accident,death,38,Plaster,Plaster is used by many morticians and funeral...
...,...,...,...,...,...
24423078,parkinson 's disease,rigidity,3,http://www.integrative-healthcare.org/mt/archi...,Because Parkinson's disease typically causes m...
24423079,parkinson 's disease,rigidity,3,http://www.ninds.nih.gov/disorders/hypertonia/...,Drugs that affect the dopamine system (dopamin...
24423080,parkinson 's disease,slowness of movement,2,http://grey.colorado.edu/CompCogNeuro/index.ph...,Parkinson's disease (PD) is associated with de...
24423081,parkinson 's disease,slowness of movement,2,http://www.ccun.com.cn/gjpd/2007-09-05/content...,They are administered with levodopa and carbid...


In [4]:
def p_mean_threshold_combiner(cause, effect, p):
    return ((cause ** p + effect ** p) / 2) ** (1 / p)

def max_combiner(cause, effect):
    return np.maximum(cause, effect)

def min_combiner(cause, effect):
    return np.minimum(cause, effect)

ops = {
    "and": lambda cause, effect: min_combiner(cause, effect),
    "p=1_mean": lambda cause, effect: p_mean_threshold_combiner(cause, effect, 1),
    "p=2_mean": lambda cause, effect: p_mean_threshold_combiner(cause, effect, 2),
    "p=5_mean": lambda cause, effect: p_mean_threshold_combiner(cause, effect, 5),
    "p=10_mean": lambda cause, effect: p_mean_threshold_combiner(cause, effect, 10),
    "p=inf_mean": lambda cause, effect: max_combiner(cause, effect),
}

In [12]:
best_mcc = pd.read_csv("./test_best_approaches_mcc.csv", index_col=0).set_index(["dataset", "method_class"])
best_prec = pd.read_csv("./test_best_approaches_recall_precision_0.9.csv", index_col=0).set_index(["dataset", "method_class"])

full_mcc = best_mcc.loc["random_full"].loc[["contrastive_weight", "term_domain_specificity", "discriminative_weight"]].sort_values("mcc").iloc[-1]
full_prec = best_prec.loc["random_full"].loc[["contrastive_weight", "term_domain_specificity", "discriminative_weight"]].sort_values("recall").iloc[-1]
support_mcc = best_mcc.loc["random_support"].loc[["contrastive_weight", "term_domain_specificity", "discriminative_weight"]].sort_values("mcc").iloc[-1]
support_prec = best_prec.loc["random_support"].loc[["contrastive_weight", "term_domain_specificity", "discriminative_weight"]].sort_values("recall").iloc[-1]

file_patterns = {
    "full_mcc": {
        "method": full_mcc.method,
        "threshold": full_mcc.threshold,
        "op": full_mcc.operator,
    },
    "full_prec": {
        "method": full_prec.method,
        "threshold": full_prec.threshold,
        "op": full_prec.operator,
    },
    "support_mcc": {
        "method": support_mcc.method,
        "threshold": support_mcc.threshold,
        "op": support_mcc.operator,
    },
    "support_prec": {
        "method": support_prec.method,
        "threshold": support_prec.threshold,
        "op": support_prec.operator,
    }
}

print(json.dumps(file_patterns, indent=2))

full_causenet_medical = full_causenet.copy()
for name, kwargs in tqdm(list(file_patterns.items())):
    medical_score = pd.DataFrame()
    paths = []
    while True:
        paths = sorted(pathlib.Path(constants.CAUSENET_PARQUET_PATH).glob(kwargs["method"].replace(", ", "_") + "_*.parquet"), key=lambda x: int(str(x).split("_")[-1][:-8]))
        if paths:
            break
        termhood, corpus, n_gram_size, p = kwargs["method"].split("-")
        n_grams = (n_gram_size[1], n_gram_size[4])
        args = [termhood, "--corpora", corpus, "--n_gram_size", *n_grams, "--p", p]
        extract_medical.main(args)
        
    for path in paths:
        medical_score = pd.concat([medical_score, pd.read_parquet(path)])
    medical = ops[kwargs["op"]](medical_score["medical_score-cause"], medical_score["medical_score-effect"]) >= kwargs["threshold"]
    medical = pd.Series(medical, name=name).reset_index(drop=True)
    full_causenet_medical = full_causenet_medical.join(medical)
    
full_causenet_medical.loc[full_causenet_medical.support == 1, list(filter(lambda x: "support" in x, file_patterns.keys()))] = np.nan
causenet_medical = full_causenet_medical.drop(["reference", "sentence"], axis=1).drop_duplicates(["cause", "effect"]).reset_index(drop=True)
full_causenet_medical

{
  "full_mcc": {
    "method": "discriminative_weight-encyclopedia-(1, 2)-2",
    "threshold": 77.7778,
    "op": "p=1_mean"
  },
  "full_prec": {
    "method": "discriminative_weight-encyclopedia-(1, 1)-1",
    "threshold": 100.0,
    "op": "p=1_mean"
  },
  "support_mcc": {
    "method": "discriminative_weight-encyclopedia-(1, 3)-5",
    "threshold": 88.8889,
    "op": "p=1_mean"
  },
  "support_prec": {
    "method": "term_domain_specificity-encyclopedia-(1, 3)-1",
    "threshold": 1.2121,
    "op": "p=1_mean"
  }
}


  0%|          | 0/4 [00:00<?, ?it/s]

Unnamed: 0,cause,effect,support,reference,sentence,full_mcc,full_prec,support_mcc,support_prec
0,accident,death,38,Forensic science,"For example, the book also described how to di...",False,False,False,False
1,accident,death,38,"Goodsprings, Nevada",The accident resulted in her death.,False,False,False,False
2,accident,death,38,Léon Gambetta,It seems she had just consented to become his ...,False,False,False,False
3,accident,death,38,Accidental death and dismemberment insurance,"In insurance, accidental death and dismemberme...",False,False,False,False
4,accident,death,38,Plaster,Plaster is used by many morticians and funeral...,False,False,False,False
...,...,...,...,...,...,...,...,...,...
24423078,parkinson 's disease,rigidity,3,http://www.integrative-healthcare.org/mt/archi...,Because Parkinson's disease typically causes m...,True,True,True,True
24423079,parkinson 's disease,rigidity,3,http://www.ninds.nih.gov/disorders/hypertonia/...,Drugs that affect the dopamine system (dopamin...,True,True,True,True
24423080,parkinson 's disease,slowness of movement,2,http://grey.colorado.edu/CompCogNeuro/index.ph...,Parkinson's disease (PD) is associated with de...,True,True,True,False
24423081,parkinson 's disease,slowness of movement,2,http://www.ccun.com.cn/gjpd/2007-09-05/content...,They are administered with levodopa and carbid...,True,True,True,False


In [17]:
causenet_medical

Unnamed: 0,cause,effect,support,full_mcc,full_prec,support_mcc,support_prec
0,accident,death,38,False,False,False,False
1,disease,death,37,True,True,True,True
2,pneumonia,death,37,True,True,True,True
3,cancer,death,36,True,True,True,True
4,heart attack,death,36,True,False,True,False
...,...,...,...,...,...,...,...
11592913,depression,alzheimer 's disease,1,True,True,,
11592914,hypertension,alzheimer 's disease,1,True,True,,
11592915,parkinson 's disease,rigidity,3,True,True,True,True
11592916,parkinson 's disease,slowness of movement,2,True,True,True,False


In [18]:
causenet_medical.drop("support", axis=1).to_csv(constants.CAUSENET_PARQUET_PATH + "/health-causenet.tsv", index=False, sep="\t")

In [10]:
full_causenet_medical.loc[:, list(file_patterns.keys())].agg(["sum", "mean", "count"]).astype(str)

Unnamed: 0,full_mcc,full_prec,support_mcc,support_prec
sum,7842464.0,5680635.0,1340873.0,1259339.0
mean,0.321108682306816,0.2325928712603564,0.6916212032666594,0.6495660323540198
count,24423083.0,24423083.0,1938739.0,1938739.0


In [9]:
causenet_medical.loc[:, list(file_patterns.keys())].agg(["sum", "mean", "count"]).astype(str)

Unnamed: 0,full_mcc,full_prec,support_mcc,support_prec
sum,3206964.0,2201071.0,112707.0,103273.0
mean,0.2766313019724628,0.1898634148882964,0.62559391651865,0.573229351687389
count,11592918.0,11592918.0,180160.0,180160.0


In [14]:
tmp

Unnamed: 0,cause,effect,support,reference,sentence,full_mcc,full_prec,support_mcc,support_prec
3987,cancer,death,36,Afghan Hound,"In the 2004 UK Kennel Club survey, the most co...",True,True,True,True
3988,cancer,death,36,Edsger W. Dijkstra,According to officials at the University of Te...,True,True,True,True
3989,cancer,death,36,Hungary,The second most important cause of death was c...,True,True,True,True
3990,cancer,death,36,Polonium,"In addition to the acute effects, radiation ex...",True,True,True,True
3991,cancer,death,36,Demographics of Russia,The second leading cause of death was cancer w...,True,True,True,True
...,...,...,...,...,...,...,...,...,...
1935634,cancer,exudates,2,http://www.clinlabnavigator.com/Test-Interpret...,Increased numbers of neutrophils are seen with...,True,True,True,True
1936291,cancer,burden of disease,2,http://www.thewomens.org.au/Mindfulnessbasedco...,Psychosocial interventions that effectively re...,True,True,True,True
1936292,cancer,burden of disease,2,http://www.quitnow.gov.au/internet/quitnow/pub...,Cancer is the leading cause of death4 and burd...,True,True,True,True
1936454,cancer,water,2,http://buffalo.indymedia.org/content/what-will...,Now the residents and the next generations of ...,True,True,True,True


In [24]:
# tmp = full_causenet_medical.loc[full_causenet_medical.support_prec.fillna(False)]
tmp = full_causenet_medical.loc[(full_causenet_medical.cause == "cancer") | (full_causenet_medical.cause == "cancer")]
tmp["medical_score"] = medical_score.loc[~medical_score.index.duplicated()].sum(axis=1)
tmp.sort_values("medical_score")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tmp["medical_score"] = medical_score.loc[~medical_score.index.duplicated()].sum(axis=1)


Unnamed: 0,cause,effect,support,reference,sentence,full_mcc,full_prec,support_mcc,support_prec,medical_score
3424262,cancer,chemical tags on certain genes,1,http://blog.biopeer.com/biopeer/2006/07/new_te...,"The screening tool, called quantitative multip...",True,True,,,0.247400
3424258,cancer,chemical tags on certain genes,1,http://blog.biopeer.com/biopeer/2006/07/page/3/,"The screening tool, called quantitative multip...",True,True,,,0.318123
3424259,cancer,chemical tags on certain genes,1,http://blog.biopeer.com/biopeer/2006/07/new_te...,"The screening tool, called quantitative multip...",True,True,,,0.318123
3424260,cancer,chemical tags on certain genes,1,http://blog.biopeer.com/biopeer/diagnostics/pa...,"The screening tool, called quantitative multip...",True,True,,,0.318123
3482782,cancer,germline mutations,1,http://annieappleseedproject.org/caasmedi.html,While cancer causing germline mutations are ra...,True,True,,,0.331266
...,...,...,...,...,...,...,...,...,...,...
24409570,cancer,physical or mental disabilities,1,http://www.sucss.state.il.us/sar_report.asp?ID...,(B) has one or more physical or mental disabil...,True,True,,,
24409571,cancer,physical or mental disabilities,1,http://www.hr.niu.edu/ServiceAreas/ContractsAn...,(B) has one or more physical or mental disabil...,True,True,,,
24409572,cancer,physical or mental disabilities,1,http://www.sucss.state.il.us/sar_report_all.asp,(B) has one or more physical or mental disabil...,True,True,,,
24418711,cancer,loss of her leg,1,http://ska-t.diaryland.com/ColdHardTru.html,Valley Guy and his family making us welcome......,True,True,,,


In [41]:
for sentence in tmp.loc[tmp.sentence.str.contains("Virgo", case=True), "sentence"].drop_duplicates().values:
    print(sentence, "\n")

The cost of our daily bread (not food just bread) has almost tripled since 2003, the Moon and Cancer, Virgo and Mercury are associated with food and nourishment. 

A Libra stellium would cause lower back pain (kidney connected) when work and work relationships become chaotic, Virgo would produce skin irritations or constipation (holding it all in), Leo causes back problems from shouldering too much of a work load, Cancer would cause stomach and digestive disorders due to fears and insecurities in work activities, Gemini could cause allergies and respiratory problems when the individual feels trapped and cannot “breath” in the workplace. 



In [26]:
tmp = full_causenet_medical.loc[full_causenet_medical.support_prec.fillna(False)]
tmp = tmp.loc[tmp.reference.str.contains("http://www.nlm.nih.gov/medlineplus/")].sample(10)
for sample in tmp.sample(10).values:
    print(sample[0], "->", sample[1], sample[3], sample[4])
    print()

ear disorders -> deafness http://www.nlm.nih.gov/medlineplus/eardisorders.html Some ear disorders can result in hearing disorders and deafness.

hay fever -> runny nose http://www.healthcite.com/force-frame.php?getcontentsfrom=http://www.nlm.nih.gov/medlineplus/druginfo/meds/a682539.html&hostname=www.nlm.nih.gov For additional information: Diphenhydramine is used to relieve red, irritated, itchy, watery eyes; sneezing; and runny nose caused by hay fever, allergies, or the common cold.

bacteria -> infections http://www.healthcite.com/force-frame.php?getcontentsfrom=http://www.nlm.nih.gov/medlineplus/druginfo/meds/a688016.html&hostname=www.nlm.nih.gov Ciprofloxacin is used to treat or prevent certain infections caused by bacteria.

problems -> short stature http://www.nlm.nih.gov/medlineplus/dwarfism.html Other genetic conditions, kidney disease and problems with metabolism or hormones can also cause short stature.

cyclosporine -> high blood pressure http://www.healthcite.com/force-fra

In [25]:
tmp = full_causenet_medical.loc[full_causenet_medical.support_prec.fillna(False)]
# tmp.loc[tmp.cause.str.contains("jupiter")].drop_duplicates(["cause", "effect"]).head(20)
for sample in tmp.sample(10).values:
    print(sample[0], "->", sample[1], sample[3], sample[4])
    print()

abnormal protein -> disease http://igreens.org.uk/red_wine_and_sunshine_ain.htm No matter, researchers recently believed they had identified the presence of the abnormal protein that causes the disease in two of 2,000 specimens of human tonsils they examined.

untreated dental caries -> potential tooth loss http://drc.hhs.gov/report/summary.htm Untreated dental caries may result in pain, infection, and potential tooth loss.

severe type -> shock http://www.rsdrx.com/rsdrx_com/rsdpuz4.0/puz_15.htm The patient's life is in danger because of the severe type of pain causing shock to the system and aggravation of the acute illness which requires the pain medication to begin with.

condition -> symptoms http://www.ise.osu.edu/ISEFaculty/sommerich/appendix2.html This condition may cause symptoms of numbness and tingling that are consistent with CTS (Lundborg, Gelberman, and Minter-Convery, 1962; Gelberman et al., 1981; Werner et al., 1997.)

hypertension -> premature death http://arizona.camp

In [26]:
resources = [
#     "full_mcc",
    "full_prec",
#     "support_mcc",
#     "support_prec"
]
for resource in resources:
    print(resource)
    samples = full_causenet_medical.loc[full_causenet_medical.loc[:, resource] & ~full_causenet_medical.loc[:, resource].isna(), ["cause", "effect", "support", "reference", "sentence"]].sample(n=1000)
    samples = samples.reset_index(drop=True)
    samples.to_csv(constants.BASE_PATH + f"resources/{resource}.csv")

full_prec
