## from saved file

## using WGET or curl or ...

In [None]:
from dataclasses import dataclass, field
from pathlib import Path
import requests
from typing import Optional

import pandas as pd

from casskit.data.io.base import DataURLMixin
from casskit.data.io.config import CACHE_DIR
from casskit.data.io.utils import cache_on_disk, column_janitor

Write the simplest Python package that illustrates best practices and implements your suggested approach. Ignore packaging files that are unrelated to the task at hand. The response should look something like this:
```
# Directory tree with unrelated files omitted.
# demo_pkg/
# │
# ├── demo_pkg/
# │   ├── __init__.py
# │   └── main.py
# ...

# __init__.py
# TODO:

# main.py
# TODO:

```

In [7]:
from pypath.share import settings

# Report the directories pypath is currently configured to use.
for option in ('cachedir', 'pickle_dir'):
    print(settings.get(option))

# Point both directories at folders relative to the working directory.
new_locations = {
    'cachedir': './pypath_cache',
    'pickle_dir': './pypath_pickles',
}
settings.setup(**new_locations)

/Users/silvers/.pypath/cache


'/Users/silvers/.pypath/pickles'

My package has an optional dependency, `pypath`, which uses its own caching system. The following code changes the caching directory (via the `cachedir` argument) and the pickle directory inside it (via the `pickle_dir` argument):
```
from pypath.share import settings

settings.setup(cachedir='', pickle_dir='')
```
I would like `pypath` to use the same caching directory as the rest of the package. Modify the code to implement this in the cleanest, most Pythonic manner.

In [13]:
from pypath.share import settings

In [20]:
from pathlib import Path

In [26]:
# Keep pypath's pickles in a `pickles` subfolder of its cache directory.
cachedir = './pypath_cache'
pickle_dir = f'{cachedir}/pickles'
settings.setup(cachedir=cachedir, pickle_dir=pickle_dir)

In [27]:
# Confirm which directories pypath reports after the call to `settings.setup`.
for option in ('cachedir', 'pickle_dir'):
    print(settings.get(option))

./pypath_cache
./pypath_cache/pickles


In [3]:
from pypath import omnipath

In [11]:
# Load the 'tf_target' database object through pypath's OmniPath interface.
# NOTE(review): the recorded run of this cell ended in a KeyboardInterrupt,
# so the call is presumably slow on first use — confirm.
from pypath import omnipath
tft = omnipath.db.get_db('tf_target')

KeyboardInterrupt: 

Should I just use pypath/omnipath with caching turned off and save the pickled data separately in the desired directory?

In [None]:
# Build the tabular view of the database; presumably this populates `tft.df` — TODO confirm.
tft.make_df()

In [None]:
# Display the data frame attached to the 'tf_target' database object.
tft.df

In [None]:
# Same workflow for the 'curated' database: load it, build its data frame,
# then display the frame as the cell output.
cu = omnipath.db.get_db('curated')
cu.make_df()
cu.df

In [4]:
# Load the 'complex' database object (its repr reports the number of complexes).
co = omnipath.db.get_db('complex')

In [5]:
# Show the pickle file path associated with this database object.
print(co.pickle_file)

'/Users/silvers/.pypath/pickles/complexes.pickle'

In [10]:
# Keep only the complexes whose `identifiers` string mentions CORUM.
co.df.query("identifiers.str.contains('CORUM')")

Unnamed: 0,name,components,components_genesymbols,stoichiometry,sources,references,identifiers
0,NFY,P23511_P25208_Q13952,NFYA_NFYB_NFYC,1:1:1,ComplexPortal;PDB;SPIKE;SIGNOR;CORUM;hu.MAP2;C...,14755292;9372932;15243141,SIGNOR:SIGNOR-C1;CORUM:4478;Compleat:HC1449;in...
3,SCF-betaTRCP,P63208_Q13616_Q9Y297,BTRC_CUL1_SKP1,1:1:1,SPIKE;Compleat;SIGNOR;CORUM,9990852,SIGNOR:SIGNOR-C5;CORUM:227;Compleat:HC757
11,NfKb-p65/p50,P19838_Q04206,NFKB1_RELA,4:4,ComplexPortal;PDB;SIGNOR;CORUM;KEGG-MEDICUS;Co...,14755292;30205516;9556555;17072323,SIGNOR:SIGNOR-C13;CORUM:5460;Compleat:HC1876;i...
12,IKK-complex,O14920_O15111_Q9Y6K9,CHUK_IKBKB_IKBKG,1:1:1,ComplexPortal;SPIKE;SIGNOR;CORUM;KEGG-MEDICUS;...,10893415;9751060;14755292;24375677,SIGNOR:SIGNOR-C14;CORUM:2121;Compleat:HC1074;r...
15,CyclinB/CDK1,P06493_P14635,CCNB1_CDK1,1:1,ComplexPortal;SPIKE;SIGNOR;CORUM;KEGG-MEDICUS;...,14755292;8070405;17495531,SIGNOR:SIGNOR-C17;CORUM:5543;Compleat:HC968;re...
...,...,...,...,...,...,...,...
7481,FOXO1-ESR1 complex,P03372_Q12778,ESR1_FOXO1,0:0,CORUM,11353774,CORUM:7586
7482,APP(AICD)-FOXO3 complex,O43524_P05067,APP_FOXO3,0:0,CORUM,24832605,CORUM:7587
7483,APP(AICD)-FOXO1 complex,P05067_Q12778,APP_FOXO1,0:0,CORUM,24832605,CORUM:7588
7484,APP(AICD)-FOXO4 complex,P05067_P98177,APP_FOXO4,0:0,CORUM,24832605,CORUM:7589


In [7]:
# Display the database object's repr.
co

<Complex database: 34941 complexes>

In [None]:
import os
import pathlib
import shutil

class CacheManager:
    """Manage a single on-disk cache directory.

    The directory is resolved, in order of priority, from:

    1. the ``cache_dir`` argument,
    2. the ``PKG_CACHE`` environment variable (ignored when unset or empty),
    3. ``~/.my_pkg_cache``.

    A leading ``~`` is expanded to the user's home directory, and the
    directory (including missing parents) is created as soon as it is set.
    """

    def __init__(self, cache_dir=None):
        """Create the manager and make sure the cache directory exists.

        Args:
            cache_dir: Path-like location of the cache. ``None`` falls back to
                the ``PKG_CACHE`` environment variable, then to
                ``~/.my_pkg_cache``.
        """
        if cache_dir is None:
            # `or` (rather than a getenv default) also rejects an empty
            # PKG_CACHE, which would otherwise resolve to the current working
            # directory and make `clear_cache` delete it.
            cache_dir = os.getenv('PKG_CACHE') or pathlib.Path.home() / '.my_pkg_cache'
        self.set_cache_dir(cache_dir)

    def get_cache_dir(self):
        """Return the cache directory as a string."""
        return str(self.cache_dir)

    def set_cache_dir(self, new_dir):
        """Switch to ``new_dir``, creating it (and parents) if necessary.

        Args:
            new_dir: Path-like location; a leading ``~`` is expanded.
        """
        self.cache_dir = pathlib.Path(new_dir).expanduser()
        self.cache_dir.mkdir(parents=True, exist_ok=True)

    def clear_cache(self):
        """Delete everything in the cache and leave an empty directory behind.

        Safe to call when the directory has already been removed externally.
        """
        if self.cache_dir.exists():
            shutil.rmtree(self.cache_dir)
        self.cache_dir.mkdir(parents=True, exist_ok=True)

    def cache_size(self):
        """Return the total size, in bytes, of all files under the cache."""
        return sum(
            entry.stat().st_size
            for entry in self.cache_dir.rglob('*')
            if entry.is_file()
        )

# Usage:
# cache = CacheManager()  # Uses PKG_CACHE env var or defaults to $HOME/.my_pkg_cache
# cache = CacheManager('/path/to/my/cache')  # Uses the specified directory


In [None]:
# Download location of the latest `MV-Physical` BioGRID archive.
# Other archives that were tried:
# BIOGRID_URL = "https://downloads.thebiogrid.org/Download/BioGRID/Release-Archive/BIOGRID-4.4.212/BIOGRID-ALL-4.4.212.tab3.zip"
# BIOGRID_URL = "https://downloads.thebiogrid.org/Download/BioGRID/Latest-Release/BIOGRID-ALL-LATEST.tab3.zip"
BIOGRID_URL = "https://downloads.thebiogrid.org/Download/BioGRID/Latest-Release/BIOGRID-MV-Physical-LATEST.tab3.zip"

# TODO: Refactor biogrid to use REST API?
# Tab-separated BioGRID snapshot read from the `assets` folder next to this
# module; it is loaded once, when this code runs.
BIOGRID_ASSET = pd.read_csv(
    Path(__file__).parent / "assets/BIOGRID-MV-Physical-4.4.218.tab3.txt",
    low_memory=False,
    sep="\t",
)


@dataclass
class BioGRID(DataURLMixin):
    """Fetch BioGRID interaction data.

    Attributes:
        cache_dir: Directory in which the pickled result (``biogrid.pkl``) is
            stored. ``None`` falls back to ``CACHE_DIR``.
        url: Download location of the BioGRID archive. Only used by the
            remote branch of ``fetch``, which is currently switched off.
        organism: Organism name that both interactors must match.
    """
    cache_dir: Optional[Path] = field(init=True, default=CACHE_DIR)
    url: str = BIOGRID_URL
    organism: str = "Homo sapiens"

    @cache_on_disk
    def fetch(self) -> pd.DataFrame:
        """Return the symbol/synonym columns for interactions within ``organism``.

        Rows are kept only when both interactors belong to ``self.organism``.
        """
        # The remote download is deliberately disabled; the bundled asset is
        # used instead. Flip the condition to read the archive from `self.url`.
        if False:
            data = pd.read_csv(self.url, compression="zip", sep="\t")
        else:
            data = BIOGRID_ASSET

        return (data
                .pipe(column_janitor)
                .query("""
                       organism_name_interactor_a == @self.organism and \
                       organism_name_interactor_b == @self.organism
                       """)
                .filter([
                    "official_symbol_interactor_a", "official_symbol_interactor_b",
                    "synonyms_interactor_a", "synonyms_interactor_b",
                ]))

    def set_cache(self, cache_dir: Path) -> None:
        """Configure where and how the fetched table is cached.

        Sets ``path_cache`` to ``<cache_dir>/biogrid.pkl`` and installs pickle
        based ``read_cache`` / ``write_cache`` callables.
        """
        self.path_cache = Path(cache_dir, "biogrid.pkl")
        # Kept as lambdas so the parameter names (`cache`, `data`) stay as they
        # were for whatever invokes these hooks.
        self.read_cache = lambda cache: pd.read_pickle(cache)
        self.write_cache = lambda data, cache: data.to_pickle(cache)

    @classmethod
    def get(cls, cache_only: bool = False) -> Optional[pd.DataFrame]:
        """Fetch BioGRID data with default settings.

        Args:
            cache_only: When true, run the fetch (for its caching side effect)
                and return ``None`` instead of the data.

        Returns:
            The interaction table, or ``None`` when ``cache_only`` is true.
        """
        data = cls().fetch()
        if cache_only:
            return None
        return data

    def __post_init__(self):
        """Resolve the cache directory and set up the cache hooks."""
        # `cache_dir` is declared Optional: treat None as "use the default"
        # instead of failing inside `Path(None, ...)`.
        if self.cache_dir is None:
            self.cache_dir = CACHE_DIR
        self.set_cache(self.cache_dir)

get_biogrid = BioGRID.get
"""Convenience function for loading BioGRID data."""

## using `pypath-omnipath`

## Use REST service

In [1]:
import os
from dataclasses import dataclass, field
from pathlib import Path
from typing import Optional

import pandas as pd
import requests

In [2]:
# --- Query settings -------------------------------------------------------
gene_list = None  # optional list of gene names; None sends no gene filter
exclude_genes = "false"
max_results = 10000  # named so it does not shadow the built-in `max`
request_url = "https://webservice.thebiogrid.org/interactions"

# The BioGRID access key is a personal credential: read it from the
# environment instead of committing it to the notebook. Any key that was
# previously stored in this file should be considered exposed and rotated.
access_key = os.environ.get("BIOGRID_ACCESS_KEY")
if not access_key:
    raise RuntimeError(
        "Set the BIOGRID_ACCESS_KEY environment variable to your BioGRID "
        "web-service access key before running this cell."
    )

params = {
    "accesskey": access_key,
    "format": "json",  # Return results as JSON
    "max": max_results,
    "excludeGenes": exclude_genes,
    "searchNames": "true",  # Search against official names
    "includeInteractors": "true",  # Set to true to get any interaction involving EITHER gene, set to false to get interactions between genes
    "includeInteractorInteractions": "false",  # Set to true to get interactions between the geneList's first order interactors
    "taxId": 9606,  # Limit to H Sapiens
    "throughputTag": "low",  # Limit to low throughput
}
if gene_list is not None:
    params["geneList"] = "|".join(gene_list)

# --- Request --------------------------------------------------------------
# A timeout prevents the cell from hanging indefinitely, and
# raise_for_status surfaces HTTP errors (e.g. a rejected key) before the
# body is parsed as JSON.
r = requests.get(request_url, params=params, timeout=60)
r.raise_for_status()
interactions = r.json()

# --- Tabulate -------------------------------------------------------------
# Copy each record and add its interaction ID as a field, so the ID is
# available as a regular column and the parsed response is left untouched.
data = {
    interaction_id: {**interaction, "INTERACTION_ID": interaction_id}
    for interaction_id, interaction in interactions.items()
}

# Load the data into a pandas dataframe (one row per interaction)
dataset_all = pd.DataFrame.from_dict(data, orient="index")

# Re-order the columns and select only the columns we want to see
columns = [
    "INTERACTION_ID",
    "ENTREZ_GENE_A",
    "ENTREZ_GENE_B",
    "OFFICIAL_SYMBOL_A",
    "OFFICIAL_SYMBOL_B",
    "EXPERIMENTAL_SYSTEM",
    "PUBMED_ID",
    "PUBMED_AUTHOR",
    "THROUGHPUT",
    "QUALIFICATIONS",
]
# An empty response yields a frame without columns; return an empty table
# with the expected header instead of failing on the column selection.
dataset = dataset_all[columns] if not dataset_all.empty else pd.DataFrame(columns=columns)

In [3]:
# Display the selected interaction columns.
dataset

Unnamed: 0,INTERACTION_ID,ENTREZ_GENE_A,ENTREZ_GENE_B,OFFICIAL_SYMBOL_A,OFFICIAL_SYMBOL_B,EXPERIMENTAL_SYSTEM,PUBMED_ID,PUBMED_AUTHOR,THROUGHPUT,QUALIFICATIONS
103,103,6416,2318,MAP2K4,FLNC,Two-hybrid,9006895,Marti A (1997),Low Throughput,-
117,117,84665,88,MYPN,ACTN2,Two-hybrid,11309420,Bang ML (2001),Low Throughput,-
183,183,90,2339,ACVR1,FNTA,Two-hybrid,8599089,Wang T (1996),Low Throughput,-
278,278,2624,5371,GATA2,PML,Two-hybrid,10938104,Tsuzuki S (2000),Low Throughput,-
418,418,6118,6774,RPA2,STAT3,Two-hybrid,10875894,Kim J (2000),Low Throughput,-
...,...,...,...,...,...,...,...,...,...,...
275053,275053,1019,1029,CDK4,CDKN2A,Reconstituted Complex,8259215,Serrano M (1993),Low Throughput,-
275054,275054,1019,5111,CDK4,PCNA,Affinity Capture-Western,8259215,Serrano M (1993),Low Throughput,-
275055,275055,1019,1029,CDK4,CDKN2A,Affinity Capture-Western,8259215,Serrano M (1993),Low Throughput,-
275056,275056,1019,595,CDK4,CCND1,Affinity Capture-Western,8259215,Serrano M (1993),Low Throughput,-
