# A closer look at the GSE attributes

In [1]:
import concurrent.futures
import requests
from bs4 import BeautifulSoup
import pandas as pd
import csv
import re
import json
from tqdm import tqdm
import spacy

In [2]:
# Each JSON file is transposed after loading so that every row is one record
# (the rendered index suggests one row per GSE accession).
df_human = pd.read_json('data/metadata_human.json').T
df_mouse = pd.read_json('data/metadata_mouse.json').T

# Human rows first, then mouse rows, in a single frame.
df = pd.concat((df_human, df_mouse))

df

Unnamed: 0,Title,Experiment type,Organism,Summary,Overall design,SRA,Samples,Platforms
GSE100027,Hit-and-run epigenetic editing prevents senesc...,Expression profiling by high throughput sequen...,Homo sapiens,Aberrant promoter DNA hypermethylation is a ha...,RNA-seq experiment with n=3 biological replica...,SRP109188,"[GSM2668081, GSM2668082, GSM2668083, GSM266808...",GPL20301\nIllumina HiSeq 4000 (Homo sapiens)\n
GSE100040,Human TFIIH kinase CDK7 regulates transcriptio...,Expression profiling by high throughput sequen...,Homo sapiens,CDK7 phosphorylates the RNA polymerase II (pol...,WT and analogue sensitive Cdk7as mutant cells ...,SRP109292,"[GSM2670975, GSM2670976, GSM2670977, GSM267097...",GPL20301\nIllumina HiSeq 4000 (Homo sapiens)\n...
GSE100075,Discovery of naturally occurring ESR1 mutation...,Expression profiling by high throughput sequen...,Homo sapiens,We report the first discovery of naturally occ...,RNAseq examination of gene expression changes ...,SRP109286,"[GSM2670874, GSM2670875, GSM2670876, GSM267087...",GPL16791\nIllumina HiSeq 2500 (Homo sapiens)\n...
GSE100081,Mining the stiffness-sensitive transcriptome i...,Expression profiling by high throughput sequen...,Homo sapiens,Vascular extracellular matrix (ECM) stiffening...,4 unique sample types with 4 replicates (16 to...,SRP109287,"[GSM2670929, GSM2670930, GSM2670931, GSM267093...",GPL16791\nIllumina HiSeq 2500 (Homo sapiens)\n
GSE100092,The E3 ubiquitin ligase HectD1 suppresses EMT ...,Expression profiling by high throughput sequen...,Homo sapiens,Cancer cells exploit the epithelial-to-mesench...,Differential gene expression profile following...,SRP109301,"[GSM2671095, GSM2671096, GSM2671097, GSM267109...",GPL11154\nIllumina HiSeq 2000 (Homo sapiens)\n
...,...,...,...,...,...,...,...,...
GSE99973,Thiol-linked alkylation for the metabolic sequ...,Expression profiling by high throughput sequen...,Mus musculus,Gene expression profiling by high-throughput s...,Wildtype (wt) mouse embryonic stem (mES) cells...,SRP109172,"[GSM2666852, GSM2666853, GSM2666854, GSM266685...",GPL17021\nIllumina HiSeq 2500 (Mus musculus)\n
GSE99974,Thiol-linked alkylation for the metabolic sequ...,Expression profiling by high throughput sequen...,Mus musculus,Gene expression profiling by high-throughput s...,Wildtype (wt) mouse embryonic stem (mES) cells...,SRP109093,"[GSM2666870, GSM2666871, GSM2666872, GSM266687...",GPL17021\nIllumina HiSeq 2500 (Mus musculus)\n
GSE99975,Thiol-linked alkylation for the metabolic sequ...,Expression profiling by high throughput sequen...,Mus musculus,Gene expression profiling by high-throughput s...,5 Âµg/ml Actinomycin D was added to wildtype m...,SRP109094,"[GSM2666888, GSM2666889, GSM2666890, GSM266689...",GPL17021\nIllumina HiSeq 2500 (Mus musculus)\n
GSE99977,Thiol-linked alkylation for the metabolic sequ...,Expression profiling by high throughput sequen...,Mus musculus,Gene expression profiling by high-throughput s...,Total RNA from wildtype mouse embryonic stem (...,SRP109095,"[GSM2666912, GSM2666913, GSM2666914, GSM266691...",GPL17021\nIllumina HiSeq 2500 (Mus musculus)\n


In [3]:
# Column names, non-null counts and dtypes of the combined metadata table.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 7461 entries, GSE100027 to GSE99989
Data columns (total 8 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Title            7461 non-null   object
 1   Experiment type  7461 non-null   object
 2   Organism         7461 non-null   object
 3   Summary          7461 non-null   object
 4   Overall design   7461 non-null   object
 5   SRA              7461 non-null   object
 6   Samples          7461 non-null   object
 7   Platforms        7461 non-null   object
dtypes: object(8)
memory usage: 524.6+ KB


In [4]:
# Distinct values found in the 'Experiment type' column.
unique_values = pd.unique(df['Experiment type'])
print(unique_values)

['Expression profiling by high throughput sequencing'
 'Expression profiling by high throughput sequencingGenome binding/occupancy profiling by high throughput sequencing'
 'Expression profiling by high throughput sequencingNon-coding RNA profiling by high throughput sequencing'
 'Expression profiling by high throughput sequencingOther'
 'Genome binding/occupancy profiling by high throughput sequencingExpression profiling by high throughput sequencing'
 'Expression profiling by high throughput sequencingGenome binding/occupancy profiling by high throughput sequencingOther'
 'Expression profiling by high throughput sequencingGenome binding/occupancy profiling by high throughput sequencingMethylation profiling by high throughput sequencing'
 'Genome binding/occupancy profiling by high throughput sequencingExpression profiling by high throughput sequencingOther'
 'Expression profiling by high throughput sequencingMethylation profiling by high throughput sequencing'
 'OtherExpression profi

In [5]:
# Placeholder text to look for in 'Experiment type'
# (presumably written when the field could not be scraped -- TODO confirm).
value_to_check = 'Experiment type not found on the page.'

# Row-wise boolean Series: True where 'Experiment type' equals the placeholder.
mask = df['Experiment type'].eq(value_to_check)

# Keep only the flagged rows.
rows_with_value = df.loc[mask]

# Show the flagged rows.
print(rows_with_value)

                                  Title  \
GSE109702  Title not found on the page.   
GSE67934   Title not found on the page.   

                                  Experiment type  \
GSE109702  Experiment type not found on the page.   
GSE67934   Experiment type not found on the page.   

                                  Organism                         Summary  \
GSE109702  Organism not found on the page.  Summary not found on the page.   
GSE67934   Organism not found on the page.  Summary not found on the page.   

                                  Overall design                         SRA  \
GSE109702  Overall design not found on the page.  SRA not found on the page.   
GSE67934   Overall design not found on the page.  SRA not found on the page.   

          Samples                         Platforms  
GSE109702      []  Platforms not found on the page.  
GSE67934       []  Platforms not found on the page.  


GSE109702 (https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE109702) and GSE67934
were deleted from GEO, so we drop these GSEs.

In [6]:
# Drop the GSEs whose 'Experiment type' is the "not found" placeholder.
# The filter is recomputed from the column instead of reusing `mask`, because
# `mask` is reassigned further down the notebook; recomputing keeps this cell
# safe to re-run in any order.
# The existing index is kept as is (no reset_index), so rows stay addressable
# by their original labels.
df = df[df['Experiment type'] != value_to_check]

In [7]:
# (rows, columns) of the table after the placeholder rows were dropped.
df.shape

(7459, 8)

In [8]:
# Cell-wise search for the literal text "HeLa" in the string form of every value.
# A dedicated name is used so that the earlier `mask` Series is not overwritten
# with a DataFrame, and the vectorised .str accessor replaces
# DataFrame.applymap (deprecated in pandas 2.1+).
hela_hits = df.astype(str).apply(lambda col: col.str.contains("HeLa", regex=False))

# A row matches when any of its columns contains the keyword.
rows_with_keyword = hela_hits.any(axis=1)

# Filter the DataFrame down to the matching rows.
result_df = df[rows_with_keyword]

result_df


Unnamed: 0,Title,Experiment type,Organism,Summary,Overall design,SRA,Samples,Platforms
GSE100982,hMTR4 plays a central role in creating balance...,Expression profiling by high throughput sequen...,Homo sapiens,To examine whether the competition between hMT...,"rRNA-depleted RNAs isolated from control, ALYR...",SRP111407,"[GSM2698456, GSM2698457, GSM2698458, GSM2698459]",GPL20795\nHiSeq X Ten (Homo sapiens)\n
GSE101518,Large-scale low-cost NGS library preparation u...,Expression profiling by high throughput sequen...,Homo sapiens,"In recent years, tagmentation-based library pr...","For samples labeled i5i7, full-length cDNA was...",SRP112552,"[GSM2705279, GSM2705280, GSM2705281, GSM270528...",GPL18573\nIllumina NextSeq 500 (Homo sapiens)\n
GSE101519,Large-scale low-cost NGS library preparation u...,Expression profiling by high throughput sequen...,Homo sapiens,"In recent years, tagmentation-based library pr...","For samples labeled polyA, full-length cDNA wa...",SRP112553,"[GSM2705324, GSM2705325, GSM2705326, GSM270532...",GPL18573\nIllumina NextSeq 500 (Homo sapiens)\n
GSE102893,An Mtr4/ZFC3H1 complex facilitates turnover of...,Expression profiling by high throughput sequen...,Homo sapiens,Many long noncoding RNAs (lncRNAs) are unstabl...,"3'READS of siMtr4, siRBM7, siZCCHC8 and siCont...",SRP115916,"[GSM2747880, GSM2747881, GSM2747882, GSM2747883]",GPL11154\nIllumina HiSeq 2000 (Homo sapiens)\n
GSE104736,Non-synchronized cell cycle transcriptomics in...,Expression profiling by high throughput sequen...,Homo sapiens,Sorting U2OS and HeLa cells genetically modifi...,"HeLa cells were sorted at three timepoints, wh...",SRP119610,"[GSM2806897, GSM2806898, GSM2806899, GSM280690...",GPL11154\nIllumina HiSeq 2000 (Homo sapiens)\n
...,...,...,...,...,...,...,...,...
GSE95455,Assessing the impact of the R252W Charcot-Mari...,Expression profiling by high throughput sequen...,Homo sapiens,HeLa cells lacking MORC2 generated through CRI...,"Total RNA-seq of MORC2 knockout cells, either ...",SRP100835,"[GSM2514492, GSM2514493, GSM2514494]",GPL16791\nIllumina HiSeq 2500 (Homo sapiens)\n
GSE96777,ZBTB48 is both a vertebrate telomere-binding p...,Expression profiling by high throughput sequen...,Homo sapiens,Here we show that ZBTB48 binds directly both t...,ZBTB48 WT clones were generated by single sort...,SRP102096,"[GSM2543167, GSM2543168, GSM2543169, GSM254317...",GPL16791\nIllumina HiSeq 2500 (Homo sapiens)\n
GSE97009,Widespread activation of antisense transcripti...,Expression profiling by high throughput sequen...,Homo sapiens,We show that Herpes simplex virus 1 (HSV-1) in...,Human WI-38 and HeLa cells were infected with ...,SRP102444,"[GSM2549749, GSM2549750, GSM2549751, GSM254975...",GPL11154\nIllumina HiSeq 2000 (Homo sapiens)\n...
GSE97725,Episomal S/MAR-based replicons do not alter ex...,Expression profiling by high throughput sequen...,Homo sapiens,Methods: mRNA profiles of untransfected HeLa c...,Examination of mRNA profiles in two different ...,SRP103855,"[GSM2576185, GSM2576186]",GPL18573\nIllumina NextSeq 500 (Homo sapiens)\n


In [9]:
# Rows whose organism is exactly 'Mus musculus'.
condition_organism = df['Organism'] == 'Mus musculus'

# Rows where any column, rendered as text, contains the literal "cancer".
# Uses the vectorised .str accessor instead of DataFrame.applymap
# (deprecated in pandas 2.1+).
# NOTE(review): the match is case-sensitive, so "Cancer" at the start of a
# sentence does not count -- confirm this is intended.
condition_keyword = (
    df.astype(str)
    .apply(lambda col: col.str.contains("cancer", regex=False))
    .any(axis=1)
)

# Both conditions must hold.
result_df_mouse = df[condition_organism & condition_keyword]
result_df_mouse

Unnamed: 0,Title,Experiment type,Organism,Summary,Overall design,SRA,Samples,Platforms
GSE100276,Epigenetic memory of obesity predisposes to co...,Expression profiling by high throughput sequen...,Mus musculus,Colorectal cancer (CRC) is the third most comm...,Ribo-zero RNA-Seq was performed in colonic epi...,SRP109869,"[GSM2676966, GSM2676967, GSM2676968, GSM267696...",GPL19057\nIllumina NextSeq 500 (Mus musculus)\n
GSE100406,Binding of High Mobility Group A proteins to t...,Expression profiling by high throughput sequen...,Mus musculus,High mobility group (HMG) proteins are of simi...,"RNA-seq was performed in WT, Hmga1 KO and Hmga...",SRP110243,"[GSM2680435, GSM2680436, GSM2680437, GSM268043...",GPL17021\nIllumina HiSeq 2500 (Mus musculus)\n
GSE100771,cGAS surveillance of micronuclei links genome ...,Expression profiling by high throughput sequen...,Mus musculus,DNA is strictly compartmentalised within the n...,RNA-seq of 35 individual mouse embryonic fibro...,SRP111009,"[GSM2693064, GSM2693065, GSM2693066, GSM269306...",GPL21626\nNextSeq 550 (Mus musculus)\n
GSE101859,The RhoJ-BAD Signaling Network: An Achilles He...,Expression profiling by high throughput sequen...,Mus musculus,Genes and pathways that allow cells to cope wi...,5 Braf CA/+; Pten fl/+; Cre +; RhoJ +/+ melano...,SRP113541,"[GSM2717318, GSM2717319, GSM2717320, GSM271732...",GPL17021\nIllumina HiSeq 2500 (Mus musculus)\n
GSE102345,Whole tumor RNA-sequencing and deconvolution r...,Expression profiling by high throughput sequen...,Mus musculus,The concept that solid tumors are maintained b...,"We profiled homogenized optic nerve, optic gli...",SRP114994,"[GSM2735070, GSM2735071, GSM2735072, GSM273507...",GPL17021\nIllumina HiSeq 2500 (Mus musculus)\n...
...,...,...,...,...,...,...,...,...
GSE98860,Mtor knockout in PDAC cells,Expression profiling by high throughput sequen...,Mus musculus,A pancreatic ductal adenocarcinoma cell line w...,Tamoxifen induced knockout of Mtor in a Pdx1-F...,SRP106986,"[GSM2616852, GSM2616853, GSM2616854, GSM2616855]",GPL13112\nIllumina HiSeq 2000 (Mus musculus)\n
GSE98960,Epigenetic restriction of embryonic and extrae...,Expression profiling by high throughput sequen...,Mus musculus,Concerted efforts over past decades have estab...,Comparison of gene expression patterns in Extr...,SRP107205,"[GSM2628310, GSM2628311, GSM2628312, GSM262831...",GPL17021\nIllumina HiSeq 2500 (Mus musculus)\n
GSE99101,Enhancer-binding of Brd4 controls cell differe...,Expression profiling by high throughput sequen...,Mus musculus,The epigenomic reader Brd4 is an important dru...,"Expression profiling by RNA-seq, and profiling...",SRP107492,"[GSM2633453, GSM2633454, GSM2633455, GSM263345...",GPL17021\nIllumina HiSeq 2500 (Mus musculus)\n
GSE99364,Stabilization of c-Myc protein by CAMKIIÎ³ pro...,Expression profiling by high throughput sequen...,Mus musculus,Although high c-Myc protein expression is obse...,We investigated how deleting CAMKIIÎ³ suppress...,SRP108130,"[GSM2643072, GSM2643073, GSM2643074, GSM2643075]",GPL13112\nIllumina HiSeq 2000 (Mus musculus)\n


https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSM2676351
If the Characteristics field contains "cell line", classify the study as a cell-line study.

Source name	colonic epithelium
Characteristics	strain: C57BL/6
tissue: colon
diet: 10% fat diet (LF)

Rule: if the source name does not contain "cell line" and the characteristics contain a tissue entry, classify the samples as tissue samples.

In [10]:
# Rows where any column, rendered as text, contains the literal
# "colonic epithelium" (case-sensitive). Uses the vectorised .str accessor
# instead of DataFrame.applymap (deprecated in pandas 2.1+).
tissue_keyword = (
    df.astype(str)
    .apply(lambda col: col.str.contains("colonic epithelium", regex=False))
    .any(axis=1)
)

result_df = df[tissue_keyword]

result_df

Unnamed: 0,Title,Experiment type,Organism,Summary,Overall design,SRA,Samples,Platforms
GSE62520,RNA-sequencing of mRNAs from control and CAP-D...,Expression profiling by high throughput sequen...,Homo sapiens,BACKGROUND & AIMS- More frequent interaction o...,Three RNA samples from 3 independent experimen...,SRP049063,"[GSM1528198, GSM1528199, GSM1528200, GSM152820...",GPL16791\nIllumina HiSeq 2500 (Homo sapiens)\n
GSE100276,Epigenetic memory of obesity predisposes to co...,Expression profiling by high throughput sequen...,Mus musculus,Colorectal cancer (CRC) is the third most comm...,Ribo-zero RNA-Seq was performed in colonic epi...,SRP109869,"[GSM2676966, GSM2676967, GSM2676968, GSM267696...",GPL19057\nIllumina NextSeq 500 (Mus musculus)\n
GSE63256,Reg4+ Deep Crypt Secretory cells function as e...,Expression profiling by high throughput sequen...,Mus musculus,Lgr5+ stem cells reside at crypt bottoms of th...,To define a global gene expression signature o...,SRP049774,"[GSM1544527, GSM1544528, GSM1544529, GSM1544530]",GPL17021\nIllumina HiSeq 2500 (Mus musculus)\n
GSE85731,Epigenetic memory of obesity in mouse colonic ...,Expression profiling by high throughput sequen...,Mus musculus,Colorectal cancer is the third most common can...,"Whole-genome bisulfite sequencing, bisulfite P...",SRP082360,"[GSM2283268, GSM2283269, GSM2283270, GSM228327...",GPL13112\nIllumina HiSeq 2000 (Mus musculus)\n...
GSE86111,Time course profiling of IEC and total colonic...,Expression profiling by high throughput sequen...,Mus musculus,In this experiment we profiled the transcripto...,Groups of three mice were inoculated with 2x10...,SRP083074,"[GSM2293852, GSM2293853, GSM2293854, GSM229385...",GPL13112\nIllumina HiSeq 2000 (Mus musculus)\n


In [11]:
# Rows where any column, rendered as text, contains the literal "HepG2"
# (case-sensitive). Uses the vectorised .str accessor instead of
# DataFrame.applymap (deprecated in pandas 2.1+).
cancer_cell_line_keyword = (
    df.astype(str)
    .apply(lambda col: col.str.contains("HepG2", regex=False))
    .any(axis=1)
)

result_df = df[cancer_cell_line_keyword]

result_df

Unnamed: 0,Title,Experiment type,Organism,Summary,Overall design,SRA,Samples,Platforms
GSE100210,RNA editing by ADAR1 leads to context-dependen...,Expression profiling by high throughput sequen...,Homo sapiens,Adenosine deaminase acting on RNA 1 (ADAR1) is...,Examination of control and ADAR1 KD HepG2 cell...,SRP109827,"[GSM2676351, GSM2676352, GSM2676353, GSM267635...",GPL16791\nIllumina HiSeq 2500 (Homo sapiens)\n
GSE102560,Cooperative and Antagonistic Transcriptional R...,Expression profiling by high throughput sequen...,Homo sapiens,We monitored the transcription changes associa...,Cells were transduced with shRNA targeting the...,SRP115304,"[GSM2740942, GSM2740943, GSM2740944, GSM274094...",GPL20301\nIllumina HiSeq 4000 (Homo sapiens)\n
GSE108366,RNA-seq Facilitates Quantitative Analysis of W...,Expression profiling by high throughput sequen...,Homo sapiens,The goals of this study are to check mRNA leve...,Examination of 2 different mRNA profiles in 2 ...,SRP121524,"[GSM2896229, GSM2896230]",GPL10999\nIllumina Genome Analyzer IIx (Homo s...
GSE109072,Toxicogenomics of the flame retardant tris (2-...,Expression profiling by high throughput sequen...,Homo sapiens,Tris (2-butoxyethyl) phosphate (TBOEP) is a co...,HepG2 cells were treated with low (2.5 uM) or ...,SRP128876,"[GSM2931164, GSM2931165, GSM2931166, GSM293116...",GPL18460\nIllumina HiSeq 1500 (Homo sapiens)\n
GSE109140,Valproic acid attenuates hyperglycemia induced...,Expression profiling by high throughput sequen...,Homo sapiens,Diabetes is a multifactorial disorder and epig...,RNA-seq was performed to understand gene expre...,SRP128998,"[GSM2932791, GSM2932792, GSM2932793, GSM293279...",GPL16791\nIllumina HiSeq 2500 (Homo sapiens)\n
GSE109463,A toxicogenomics approach to screen chlorinate...,Expression profiling by high throughput sequen...,Homo sapiens,Tris(2-chloroethyl) phosphate (TCEP) is a perv...,HepG2 cells were treated with low (25 uM) or h...,SRP131103,"[GSM2943835, GSM2943836, GSM2943837, GSM294383...",GPL18460\nIllumina HiSeq 1500 (Homo sapiens)\n
GSE35464,Genome-wide analysis of histone methylation re...,Expression profiling by high throughput sequen...,Homo sapiens,Hepatitis B virus (HBV) is a hepatotropic viru...,A large-scale analysis of gene expression of 2...,SRP010670,"[GSM869015, GSM869016]",GPL11154\nIllumina HiSeq 2000 (Homo sapiens)\n
GSE36242,Transcriptomic response to benzo[a]pyrene trea...,Expression profiling by high throughput sequen...,Homo sapiens,Whole-genome transcriptome measurements are pi...,Examination of 2 biological replicates at 2 di...,SRP011233,"[GSM884981, GSM884982, GSM884983, GSM884984, G...",GPL9052\nIllumina Genome Analyzer (Homo sapien...
GSE37001,METTL3 KD in HepG2 cells,Expression profiling by high throughput sequen...,Homo sapiens,To gain insight into possible processes that r...,Differential expression analysis of METTL3 KD ...,SRP012096,"[GSM908325, GSM908326, GSM908327, GSM908328]",GPL10999\nIllumina Genome Analyzer IIx (Homo s...
GSE37002,m6A mapping in human RNA (with treatments),Expression profiling by high throughput sequen...,Homo sapiens,"We developed a novel approach, m6A-seq, for hi...",Identification of m6A modified sequences in He...,SRP012098,"[GSM908329, GSM908330, GSM908331, GSM908332, G...",GPL10999\nIllumina Genome Analyzer IIx (Homo s...


ATCC cell lines: https://www.atcc.org/

    Hepa 1-6: https://www.atcc.org/products/crl-1830

    Hep G2 [HEPG2]: https://www.atcc.org/products/hb-8065

Cellosaurus - a knowledge resource on cell lines:
https://www.cellosaurus.org/index.html

In [12]:
# Spelling variants / identifiers to search for (case-sensitive, literal text).
cancer_cell_line_keywords = ['Hepa1-6','Hepa 1-6', 'TIB-75', 'BNL 1ME A.7R.1', 'Hep 55.1C', 'Hep-55.1C', '55.1C']

# One alternation pattern; every keyword is escaped so characters such as "."
# match literally rather than as regex metacharacters.
keyword_pattern = '|'.join(re.escape(keyword) for keyword in cancer_cell_line_keywords)

# Rows where any column, rendered as text, contains at least one keyword.
# A single vectorised .str.contains per column replaces DataFrame.applymap
# (deprecated in pandas 2.1+) and the per-cell Python-level any().
cancer_cell_line_keyword = (
    df.astype(str)
    .apply(lambda col: col.str.contains(keyword_pattern, regex=True))
    .any(axis=1)
)


result_df = df[cancer_cell_line_keyword]

result_df

# Hepa1-6 cells


Unnamed: 0,Title,Experiment type,Organism,Summary,Overall design,SRA,Samples,Platforms
GSE87511,Dietary interventions modulate global non-codi...,Expression profiling by high throughput sequen...,Mus musculus,Using high-throughput deep sequencing we profi...,Mouse Hepa1-6 cells were treated with lentivir...,SRP090682,"[GSM2332931, GSM2332932, GSM2332933, GSM2332934]",GPL17021\nIllumina HiSeq 2500 (Mus musculus)\n


In [13]:
# Load the spaCy pipeline "en_core_sci_sm"
# (the alternative noted here was en_core_web_sm).
nlp = spacy.load("en_core_sci_sm")

# Two example passages to run entity recognition on.
text = """
To define a global gene expression signature of DCS cells, we performed RNA-sequencing (RNA-seq) of sorted Reg4-dsRed+ and Lgr5-GFP+ cells from colonic epithelium.
Sorting and RNA-seq library preparation was performed twice, to obtain a biological replicate.

Examination of control and ADAR1 KD HepG2 cells using PARSseq method in order to find changes in RNA 2D structure induced by A-to-I RNA editing. 
We used PARSseq for HepG2 control and ADAR KD cells in two replicates with S1/V1 nucleases treatment for each ((control+KD)*2*2= 8 PARSseq samples). We used RNAseq of HepG2 control and ADAR KD cells with additional Ribo-seq (control and ADAR KD in 2 replicates=4 Ribo-seq samples) to further confirm the changes we observed in the PARSseq experiment.
"""

# Run the pipeline over the text.
doc = nlp(text)

# Pair every recognised span with the label the pipeline assigned to it.
entities = [(span.text, span.label_) for span in doc.ents]

# expecting:  Reg4-dsRed+, Lgr5-GFP+, HepG2
print(entities)

  deserializers["tokenizer"] = lambda p: self.tokenizer.from_disk(  # type: ignore[union-attr]


[('global', 'ENTITY'), ('gene expression', 'ENTITY'), ('DCS cells', 'ENTITY'), ('RNA-sequencing', 'ENTITY'), ('RNA-seq', 'ENTITY'), ('sorted Reg4-dsRed+', 'ENTITY'), ('Lgr5-GFP+ cells', 'ENTITY'), ('colonic epithelium', 'ENTITY'), ('Sorting', 'ENTITY'), ('RNA-seq library', 'ENTITY'), ('preparation', 'ENTITY'), ('biological', 'ENTITY'), ('replicate', 'ENTITY'), ('ADAR1 KD', 'ENTITY'), ('HepG2 cells', 'ENTITY'), ('PARSseq method', 'ENTITY'), ('RNA', 'ENTITY'), ('2D structure', 'ENTITY'), ('induced', 'ENTITY'), ('A-to-I RNA', 'ENTITY'), ('PARSseq', 'ENTITY'), ('HepG2', 'ENTITY'), ('ADAR', 'ENTITY'), ('replicates', 'ENTITY'), ('S1/V1 nucleases', 'ENTITY'), ('treatment', 'ENTITY'), ('(control+KD)*2*2=', 'ENTITY'), ('PARSseq', 'ENTITY'), ('samples', 'ENTITY'), ('RNAseq', 'ENTITY'), ('HepG2 control', 'ENTITY'), ('ADAR', 'ENTITY'), ('Ribo-seq', 'ENTITY'), ('control', 'ENTITY'), ('Ribo-seq', 'ENTITY'), ('samples', 'ENTITY'), ('changes', 'ENTITY'), ('PARSseq', 'ENTITY'), ('experiment', 'ENTITY')

In [14]:
# Known cell-line names; classify_cell_line looks for each one as a
# case-insensitive substring of the entity text.
known_cell_lines = ["HepG2"]

def classify_cell_line(entity_text, known_cell_lines):
    """Describe whether an entity mentions one of the known cell lines.

    The comparison is a case-insensitive substring test. Names are tried in
    the order given and the first one found in the entity text wins.

    Args:
        entity_text: Text of the entity to check.
        known_cell_lines: Iterable of cell-line names to look for.

    Returns:
        "<entity_text> is a <name> cell line" for the first matching name,
        otherwise "<entity_text> is not recognized as a known cell line".
    """
    matched_name = next(
        (name for name in known_cell_lines if name.lower() in entity_text.lower()),
        None,
    )
    if matched_name is None:
        return f"{entity_text} is not recognized as a known cell line"
    return f"{entity_text} is a {matched_name} cell line"

# Keep only the entity spans whose text mentions "cell" (case-insensitive).
cell_line_entities = [span.text for span in doc.ents if 'cell' in span.text.lower()]

# Label each retained span against the known cell-line list.
classified_entities = [
    classify_cell_line(candidate, known_cell_lines)
    for candidate in cell_line_entities
]

print(classified_entities)

['DCS cells is not recognized as a known cell line', 'Lgr5-GFP+ cells is not recognized as a known cell line', 'HepG2 cells is a HepG2 cell line']
