In [51]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import re
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
import time
import numpy as np

In [63]:
samples_df = pd.read_csv('drivers_and_data/screen-tsv-download.tsv', header=None, delimiter='\t')
samples_df.columns = ['cell_type','tissue']

##  Creating a Comprehensive Profile of Available Leukemia Cell Lines in ENCODE

Acute Lymphoblastic Leukemia Cell Lines: 

https://www.cell.com/trends/immunology/abstract/S1471-4906(20)30307-0 

https://www.nature.com/articles/s41467-022-29224-5

Acute Myleoid Leukemia Cell lines: 

https://www.proteinatlas.org/humanproteome/cell+line/leukemia 

https://pmc.ncbi.nlm.nih.gov/articles/PMC10049680/table/ijms-24-05377-t001/ 



In [28]:
samples_ALL = ['DND-41','Jurkat','Loucy']
samples_AML = ['HL-60', 'NB4', 'CMK']


## Scraping Protein Atlas for Cell Lines to Query for all Leukemia Samples for Verification of Above

## Implementing Cellosaurus API to Search all known synonyms/aliases of cell-lines to query files available in ENCODE 

https://www.proteinatlas.org/humanproteome/cell+line/leukemia

In [29]:
# URL of the Protein Atlas Leukemia page
url = "https://www.proteinatlas.org/humanproteome/cell+line/leukemia"

# Send a GET request to the webpage
response = requests.get(url)

# Check if the request was successful
if response.status_code == 200:
    # Parse the webpage content
    soup = BeautifulSoup(response.text, 'html.parser')

    # Locate the script tag containing the desired identifier
    script_tags = soup.find_all('script')
    for script in script_tags:
        # Look for the specific identifier in the script content
        if "var plot = $('#celline_prio_leukemia_LAML')" in script.text:
            # Extract the full line containing the scatterPlot data
            full_line = re.search(r"var plot = \$\('#celline_prio_leukemia_LAML'\)\.scatterPlot\((.*?)\);", script.text, re.DOTALL)
            # if full_line:
            full_line_text = full_line.group(1)  # Extract the full JSON-like data
            cell_lines = re.findall(r'"name":"(.*?)"', full_line_text)
            #Strip trailers
            cell_lines = [x.split(' ')[0].replace('\\','') for x in cell_lines]
            print('All Leukemia Cell Lines in Protein Atlas:',cell_lines)  


All Leukemia Cell Lines in Protein Atlas: ['Kasumi-6', 'MOLM-6', 'KO52', 'MUTZ-3', 'GDM-1', 'Kasumi-1', 'Mono-Mac-1', 'OCI-AML-5', 'KG-1', 'OCI-AML-2', 'SKM-1', 'SIG-M5', 'NOMO-1', 'P31/FUJ', 'AML-193', 'Mono-Mac-6', 'UKE-1', 'EoL-1', 'CMK', 'Loucy', 'U-937', 'ME-1', 'THP-1', 'MOLT-13', 'NALM-1', 'HNT-34', 'NB4', 'HL-60', 'MOLT-16', 'PLB-985', 'MHH-CALL-2', 'EM-2', 'MOLM-13', 'SET-2', 'MHH-CALL-4', 'MV4-11', 'SEM', 'PL-21', 'SKNO-1', 'HEL', 'LAMA-84', 'M-07e', 'MHH-CALL-3', 'KCL-22', 'TF-1', 'Kasumi-2', 'Ku812', 'KE-37', 'CML-T1', 'ML-2', 'MOLM-16', 'NCO2', 'SUP-B15', 'HEL', 'JK-1', 'P12-Ichikawa', 'NALM-19', 'PF-382', 'RPMI-8402', 'MOLT-3', 'OCI-AML-3', 'MUTZ-5', 'HMC-1', 'F-36P', 'TALL-1', 'CCRF-SB', 'RCH-ACV', 'Peer', 'JURKAT', 'JURL-MK1', 'Jurkat', 'KOPN-8', 'JVM-3', 'NALM-6', 'REH', 'ALL-SIL', 'MEC-1', 'DND-41', 'BDCM', 'BV-173', 'OCI-M1', 'MEG-01', 'MOLT-4', 'SUP-T11', 'HPB-ALL', 'SUP-T1', 'JM-1', '697', 'OCI-M2', 'RS4;11', 'K-562', 'KYO-1', 'HAP1']


In [30]:
def fetch_disease_label(cell_line_entry):
    """
    Fetch the label of a disease from a cell line entry, prioritizing diseases containing
    "leukemia" or "lymphoma." If not found, fetch the next available label.

    Args:
        cell_line_entry (dict): A single cell line dictionary.

    Returns:
        str: The label of a matching disease, or None if no diseases are found.
    """
    disease_list = cell_line_entry.get('disease-list', [])
    if isinstance(disease_list, list):
        for disease in disease_list:
            label = disease.get('label', '')
            if "leukemia" in label.lower() or "lymphoma" in label.lower():
                return label
        # Return the first label if no match for "leukemia" or "lymphoma"
        if disease_list:
            return disease_list[0].get('label')
    return None


def extract_name_list_synonyms(data, query):
    """
    Extract the name list, synonyms, and disease label for a specific query from the API response.

    Args:
        data (dict): Parsed JSON response from the Cellosaurus API.
        query (str): The cell line query string.

    Returns:
        dict: A dictionary containing the identifier, its synonyms, and the disease label.
    """
    results = {}
    # Iterate over each cell line in the 'cell-line-list'
    for cell_line_entry in data.get('cell-line-list', []):
        # Find the identifier
        identifier = None
        synonyms = []
        for name_entry in cell_line_entry.get('name-list', []):
            if name_entry['type'] == 'identifier':
                identifier = name_entry['value']
            elif name_entry['type'] == 'synonym':
                synonyms.append(name_entry['value'])

        # Fetch the disease label
        disease_label = fetch_disease_label(cell_line_entry)

        # Store the identifier, synonyms, and disease label if it matches the query
        if identifier and identifier.lower() == query.lower():
            results[identifier] = {
                "synonyms": synonyms,
                "disease_label": disease_label
            }

    return results

def find_matching_rows(cell_lines, samples_df):
    """
    Matches rows in the samples DataFrame based on cell line names, synonyms, and diseases, case-insensitively.

    Args:
        cell_lines (list): List of cell lines to query.
        samples_df (pd.DataFrame): DataFrame containing "cell_type" and "tissue" columns.

    Returns:
        pd.DataFrame: DataFrame with matched rows and additional columns for the identifier and disease label.
    """
    matched_rows = []

    # Normalize cell_lines to lowercase for case-insensitive matching
    normalized_cell_lines = [line.lower() for line in cell_lines]

    for query in normalized_cell_lines:
        url = f"https://api.cellosaurus.org/search/cell-line?q={query}"
        response = requests.get(url)

        if response.status_code == 200:
            # Parse JSON response
            api_data = response.json()
            # Extract synonyms and disease label
            synonyms_data = extract_name_list_synonyms(api_data.get('Cellosaurus', {}), query)

            for identifier, data in synonyms_data.items():
                # Combine identifier and synonyms into one list of query strings
                query_strings = [identifier.lower()] + [synonym.lower() for synonym in data['synonyms']]

                # Match rows where cell_type contains any of the query strings (case-insensitively)
                for _, row in samples_df.iterrows():
                    if any(query_string in row['cell_type'].lower() for query_string in query_strings):
                        matched_row = row.to_dict()
                        matched_row['cell_line'] = identifier
                        matched_row['disease_label'] = data['disease_label']
                        matched_rows.append(matched_row)
        else:
            print(f"Failed to fetch data for {query}. HTTP Status Code: {response.status_code}")

    # Create a new DataFrame from the matched rows
    return pd.DataFrame(matched_rows)


# Expanding Query to Include the LL-100 Leukemia Panel

In [None]:
# Path to your WebDriver
driver_path = 'drivers_and_data/geckodriver'

# Set up the Selenium WebDriver
service = Service(driver_path)
driver = webdriver.Firefox(service=service)

try:
    # Open the URL
    url = "https://www.cellosaurus.org/search?query=%22LL-100%20blood%20cancer%20cell%20line%20panel%22"
    driver.get(url)

    # Wait for the page to load completely
    time.sleep(5)  # Adjust the sleep time as needed

    # Locate the rows of the table
    rows = driver.find_elements(By.XPATH, "//table[@class='type-1']/tbody/tr")

    # Extract the second column values
    ll_100 = []
    for row in rows:
        cols = row.find_elements(By.TAG_NAME, "td")
        if len(cols) > 1:  # Ensure the row has at least two columns
            ll_100.append(cols[1].text.strip())

    # Print the extracted entries
    print(len(ll_100),'out of 100 Samples Found')

finally:
    # Close the WebDriver
    driver.quit()


100 out of 100 Samples Found


In [32]:
ll_100 = [value.split()[0] for value in ll_100]
new_vals_to_query = [value for value in ll_100 if value not in cell_lines]
cell_lines += new_vals_to_query

In [33]:
# Find matching rows
matched_df = find_matching_rows(cell_lines, samples_df)

# Display the output DataFrame
print(matched_df)

                                              cell_type               tissue  \
0                                                   CMK                blood   
1                                                 Loucy         bodily fluid   
2                                                   NB4          bone marrow   
3                                                 HL-60         bodily fluid   
4                              amniotic epithelial cell           epithelium   
...                                                 ...                  ...   
1323              keratinocyte female donor ENCDO302AAA           epithelium   
1324  muscle of arm tissue male embryo 96 days donor...  musculature of body   
1325  muscle of back tissue male embryo 96 days dono...  musculature of body   
1326  muscle of leg tissue male embryo 96 days donor...  musculature of body   
1327                    myocyte originated from LHCN-M2  musculature of body   

     cell_line                         

In [34]:
# matched_df.to_csv('hits2.csv')

In [35]:
matched_df = matched_df.drop_duplicates()

In [36]:
# Keywords to include
keywords = ["myelogenous", "myeloid", "acute", "aml", "lymphoblastic", "all", "lymphocytic"]

# Create masks
contains_keywords = matched_df['disease_label'].str.contains(
    "|".join(keywords), case=False, na=False
)
does_not_contain_chronic = ~matched_df['disease_label'].str.contains(
    "chronic", case=False, na=False
)
is_natural_killer = matched_df['disease_label'].str.contains(
    "Natural killer cell lymphoblastic leukemia/lymphoma", case=False, na=False
)

# Move "Natural killer cell lymphoblastic leukemia/lymphoma" to a new DataFrame
nat_killer_cell_lymphoblastic_leuk_lymph = matched_df[is_natural_killer]

# Filter the remaining rows
all_aml_all_samples = matched_df[contains_keywords & does_not_contain_chronic & ~is_natural_killer]

# Display the resulting DataFrames
print("Filtered DataFrame:")
print(all_aml_all_samples)

print("\nNatural Killer Cell DataFrame:")
print(nat_killer_cell_lymphoblastic_leuk_lymph)

Filtered DataFrame:
              cell_type        tissue cell_line  \
0                   CMK         blood       CMK   
1                 Loucy  bodily fluid     Loucy   
2                   NB4   bone marrow       NB4   
3                 HL-60  bodily fluid     HL-60   
158  Jurkat; Clone E6-1  bodily fluid    Jurkat   
160              DND-41  bodily fluid    DND-41   

                                      disease_label  
0    Myeloid leukemia associated with Down syndrome  
1              Adult T acute lymphoblastic leukemia  
2        Acute promyelocytic leukemia with PML-RARA  
3                      Adult acute myeloid leukemia  
158        Childhood T acute lymphoblastic leukemia  
160        Childhood T acute lymphoblastic leukemia  

Natural Killer Cell DataFrame:
                                              cell_type               tissue  \
1284                                          adipocyte    connective tissue   
1285    adipocyte originated from mesenchymal stem c

In [37]:
all_aml_all_samples

Unnamed: 0,cell_type,tissue,cell_line,disease_label
0,CMK,blood,CMK,Myeloid leukemia associated with Down syndrome
1,Loucy,bodily fluid,Loucy,Adult T acute lymphoblastic leukemia
2,NB4,bone marrow,NB4,Acute promyelocytic leukemia with PML-RARA
3,HL-60,bodily fluid,HL-60,Adult acute myeloid leukemia
158,Jurkat; Clone E6-1,bodily fluid,Jurkat,Childhood T acute lymphoblastic leukemia
160,DND-41,bodily fluid,DND-41,Childhood T acute lymphoblastic leukemia


In [38]:
ENCODE_API_URL = 'https://www.encodeproject.org/search/'

In [61]:
all_aml_all_samples_cell_types = np.array(all_aml_all_samples['cell_type'])
all_aml_all_samples_cell_types

array(['CMK', 'Loucy', 'NB4', 'HL-60', 'Jurkat; Clone E6-1', 'DND-41'],
      dtype=object)

In [57]:
aml_all_cell_types = all_aml_all_samples_cell_types[0]

In [59]:
aml_all_cell_types

'CMK'

In [60]:
import requests
import pandas as pd
import time


# Prepare to store results
results = []
headers = {'accept': 'application/json'}

# Query the ENCODE API for DNase-seq or ATAC-seq data related to these cell types
for cell_type in aml_all_cell_types:
    params = {
        'type': 'Experiment',
        'assay_title': ['DNase-seq', 'ATAC-seq'],
        'biosample_ontology.term_name': cell_type,
        'status': 'released',
        'format': 'json',
        'limit': 'all'
    }
    try:
        response = requests.get(ENCODE_API_URL, params=params, headers=headers)
        time.sleep(0.1)  # Ensure no more than 10 GET requests per second
        if response.status_code == 200:
            data = response.json()
            for experiment in data.get('@graph', []):
                results.append({
                    "Experiment": experiment.get('accession', 'N/A'),
                    "Assay": experiment.get('assay_title', 'N/A'),
                    "Cell Type": cell_type,
                    "Description": experiment.get('description', 'N/A')
                })
        else:
            print(f"Failed to fetch data for cell type {cell_type}: {response.status_code}")
    except Exception as e:
        print(f"An error occurred for cell type {cell_type}: {str(e)}")

# Convert results to a DataFrame
results_df = pd.DataFrame(results)

# Display or save the results
if not results_df.empty:
    print("Relevant open chromatin data found:")
    print(results_df.head())  # Show a preview of the results
else:
    print("No relevant experiments found for AML/ALL-related cell types.")

Failed to fetch data for cell type C: 404
Failed to fetch data for cell type M: 404
Failed to fetch data for cell type K: 404
No relevant experiments found for AML/ALL-related cell types.


---------------------------------------------------------------------------------
### Attempt at Scraping Screen Directly --> Do BIGWIG download from API directly 
---------------------------------------------------------------------------------

In [39]:
# from selenium import webdriver
# from selenium.webdriver.common.by import By
# from selenium.webdriver.common.keys import Keys
# import time

# # # Set up Selenium WebDriver
# # options = webdriver.ChromeOptions()
# # options.add_argument('--headless')
# # options.add_argument('--no-sandbox')
# # options.add_argument('--disable-dev-shm-usage')     

# # # Path to the chromedriver, make sure chromedriver is in your PATH or provide the full path
# # driver = webdriver.Chrome(executable_path='/usr/local/bin/chromedriver', options=options)

# # Use above selenium implementation if needed or API

# try:
#     # Navigate to SCREEN
#     url = "https://screen.encodeproject.org/search/?q=EIF6&assembly=GRCh38&uuid=985f9f7f-730a-49d4-8dbf-ee36eab1ab43"
#     driver.get(url)
#     time.sleep(5)  # Wait for the page to fully load

#     # Search for consensus cis-regulatory elements
#     search_box = driver.find_element(By.XPATH, "//input[@type='search']")
#     search_box.send_keys("EIF6")
#     search_box.send_keys(Keys.RETURN)
#     time.sleep(5)  # Wait for search results

#     # Extract consensus cis-regulatory elements information
#     results = driver.find_elements(By.CLASS_NAME, "result-item")
#     for result in results:
#         element_text = result.text
#         if "TP53" in element_text:
#             print("Found TP53 binding site information:\n")
#             print(element_text)

# finally:
#     # Close the WebDriver
#     driver.quit()

# # Note: Adjust the XPATHs and CLASS_NAME according to the actual HTML structure of the SCREEN page.
# # The sleep commands are used to wait for the page to load, but it's better to use explicit waits.
