In [None]:
pip install lxml

In [None]:
pip install BeautifulSoup4

In [2]:
import time
from csv import DictWriter
from threading import Thread
from concurrent.futures import ThreadPoolExecutor
from multiprocessing import Queue, Process
import os
import sys
import re

import pandas as pd
import requests
import lxml
from lxml import etree
from bs4 import BeautifulSoup

In [6]:
class GEO_crawler:
    """Scrape the sample pages linked from one GEO accession page into a CSV.

    ``download`` requests the accession page for ``GEO_id``; ``parse`` then
    follows every link whose text is a GSM accession, turns the two-column
    table rows of that page into a dict and stores it via ``item2csv`` in
    ``<GEO_id>.csv`` (relative to the current working directory).
    """

    # The progress bar is drawn with one character per percent.
    BAR_WIDTH = 100
    # Seconds to wait for the server before a request is abandoned.
    REQUEST_TIMEOUT = 30
    # Link texts that denote a sample accession, e.g. "GSM1234567".
    SAMPLE_PATTERN = re.compile(r"GSM[0-9]+")

    def __init__(self, GEO_id):
        """Store the accession to crawl and the GEO query endpoint."""
        self.GEO_id = GEO_id
        self.base_url = "https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi"

    def progress_bar(self, num, total):
        """Redraw a fixed-width text progress bar on stdout.

        Args:
            num: number of items handled so far.
            total: number of items overall; nothing is drawn when it is
                not positive (there is no meaningful ratio to show).
        """
        if total <= 0:
            return

        # Clamp so that an unexpected num never produces a bar wider than
        # BAR_WIDTH or a negative fill.
        rate = min(max(float(num) / total, 0.0), 1.0)
        ratenum = int(rate * 100)

        # The padding is relative to the bar width, not to ``total``;
        # otherwise the bar changes length with the number of samples.
        show = "\r[{}{}]{}".format(
            "*" * ratenum,
            "-" * (self.BAR_WIDTH - ratenum),
            str(ratenum) + "%",
        )

        sys.stdout.write(show)
        if ratenum == 100:
            sys.stdout.write("\n{} is over\n".format(self.GEO_id))
        sys.stdout.flush()

    def item2csv(self, item):
        """Add one record to ``<GEO_id>.csv``, keeping columns aligned.

        The header already present in the file decides the column order of
        appended rows. If ``item`` has keys the file does not know yet, the
        file is rewritten once with the widened header so earlier rows stay
        aligned. Empty records are ignored.

        Args:
            item: mapping of column name to cell text.
        """
        # Local import: the file only imports DictWriter from csv, and a
        # notebook global may be named ``csv``.
        import csv

        if not item:
            return

        path = self.GEO_id + ".csv"

        header = []
        if os.path.exists(path):
            with open(path, newline="", encoding="utf-8") as file:
                header = next(csv.reader(file), [])

        new_keys = [key for key in item if key not in header]

        if header and not new_keys:
            # Common case: same columns as before, so a plain append suffices.
            with open(path, "a", newline="", encoding="utf-8") as file:
                DictWriter(file, header, restval="").writerow(item)
            return

        # New file, or new columns appeared: write everything with the
        # complete header.
        rows = []
        if header:
            with open(path, newline="", encoding="utf-8") as file:
                rows = list(csv.DictReader(file))

        fieldnames = header + new_keys
        with open(path, "w", newline="", encoding="utf-8") as file:
            # extrasaction="ignore" tolerates over-long rows left behind by
            # earlier, misaligned writes (DictReader files them under None).
            writer = DictWriter(file, fieldnames, restval="", extrasaction="ignore")
            writer.writeheader()
            writer.writerows(rows)
            writer.writerow(item)

    def _fetch(self, accession):
        """Return the page text for ``accession``, or None on a non-200 reply."""
        resp = requests.get(
            self.base_url,
            params={"acc": accession},
            timeout=self.REQUEST_TIMEOUT,
        )
        if resp.status_code != 200:
            # Leading newline keeps the message off the progress-bar line.
            sys.stderr.write(
                "\n{}: HTTP {}, skipped\n".format(accession, resp.status_code)
            )
            return None
        resp.encoding = "utf-8"
        return resp.text

    @staticmethod
    def _sample_fields(html):
        """Collect ``{label: value}`` from the two-column rows of one page."""
        rows = BeautifulSoup(html, "lxml").select("tr[valign = top]")

        item = {}
        # The first matching row is skipped, as the original code did
        # (presumably it is not a data row; verify against the page layout).
        for row in rows[1:]:
            cells = row.select("td")
            if len(cells) > 1:
                label = cells[0].get_text()
                # A non-breaking space marks a row without a label.
                if label != "\xa0":
                    item[label] = cells[1].get_text()
        return item

    def download(self):
        """Fetch the page for ``GEO_id`` and crawl the samples it links to."""
        html = self._fetch(self.GEO_id)
        if html is not None:
            self.parse(html)

    def parse(self, html):
        """Crawl every GSM link found in ``html`` and store each as a CSV row.

        Args:
            html: text of the accession page for ``GEO_id``.
        """
        root = BeautifulSoup(html, "lxml")

        # One filter decides both the total and what is fetched, so the
        # progress bar cannot drift away from the real amount of work.
        accessions = []
        for link in root.select("a"):
            match = self.SAMPLE_PATTERN.match(link.get_text().strip())
            if match:
                accessions.append(match.group(0))

        total = len(accessions)
        if total == 0:
            sys.stderr.write("{}: no GSM links found\n".format(self.GEO_id))
            return

        for number, accession in enumerate(accessions, start=1):
            sample_html = self._fetch(accession)
            if sample_html is not None:
                self.item2csv(self._sample_fields(sample_html))
            # Advance even when a sample was skipped so the bar can finish.
            self.progress_bar(num=number, total=total)
    
                
                
            

In [7]:
# To crawl a single GEO accession, run the code below; the rows are appended to <GEO_id>.csv in the working directory.
a = GEO_crawler("GSE201369")
a.download()

[****************************************************************************************************]100%
GSE201369 is over


In [8]:
# Have a look at the first rows of the CSV file.
csv = pd.read_csv("GSE201369.csv")
csv.head(3)

Unnamed: 0,Status,Title,Sample type,Source name,Organism,Characteristics,Treatment protocol,Growth protocol,Extracted molecule,Extraction protocol,...,Organization name,Department,Street address,City,ZIP/Postal code,Country,Platform ID,Series (1),BioSample,SRA
0,"Public on Aug 24, 2022",CON1,SRA,ovary,Sus scrofa,tissue: ovarybreed: LandracexYorkshire crossbr...,The cells were teated with medium containing 5...,Porcine primary granulosa cells were maintaine...,total RNA,The TRIzol reagent was used to isolate total R...,...,Henan Academy of Agricultural Sciences,Institute of Animal Husbandry and Veterinary S...,No. 116 Huayuan Road,Zhengzhou,450002,China,GPL19176,GSE201369\nOxidative stress-mediated alteratio...,SAMN27779695,SRX14979532
1,"Public on Aug 24, 2022",CON2,SRA,ovary,Sus scrofa,tissue: ovarybreed: LandracexYorkshire crossbr...,The cells were teated with medium containing 5...,Porcine primary granulosa cells were maintaine...,total RNA,The TRIzol reagent was used to isolate total R...,...,Henan Academy of Agricultural Sciences,Institute of Animal Husbandry and Veterinary S...,No. 116 Huayuan Road,Zhengzhou,450002,China,GPL19176,GSE201369\nOxidative stress-mediated alteratio...,SAMN27779694,SRX14979533
2,"Public on Aug 24, 2022",CON3,SRA,ovary,Sus scrofa,tissue: ovarybreed: LandracexYorkshire crossbr...,The cells were teated with medium containing 5...,Porcine primary granulosa cells were maintaine...,total RNA,The TRIzol reagent was used to isolate total R...,...,Henan Academy of Agricultural Sciences,Institute of Animal Husbandry and Veterinary S...,No. 116 Huayuan Road,Zhengzhou,450002,China,GPL19176,GSE201369\nOxidative stress-mediated alteratio...,SAMN27779693,SRX14979534


In [None]:
# To crawl several GEO accessions at once, call main() and pass them as GEO_id_ls.
def main(max_workers = 4, GEO_id_ls = None):
    """Crawl several GEO accessions concurrently and print the elapsed time.

    Any existing ``<GEO_id>.csv`` is deleted before its crawl starts, so each
    run produces a fresh file.

    Args:
        max_workers: size of the thread pool.
        GEO_id_ls: iterable of accession strings; a single string is treated
            as one accession, and ``None`` means nothing to crawl.
    """
    if GEO_id_ls is None:
        geo_ids = []
    elif isinstance(GEO_id_ls, str):
        # Iterating a bare string would crawl one "accession" per character.
        geo_ids = [GEO_id_ls]
    else:
        # Drop duplicates (order kept): two workers appending to the same
        # CSV file would interleave their rows.
        geo_ids = list(dict.fromkeys(GEO_id_ls))

    start = time.time()
    futures = []
    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        for GEO_id in geo_ids:
            csv_path = GEO_id + ".csv"
            if os.path.exists(csv_path):
                os.remove(csv_path)
            futures.append((GEO_id, pool.submit(GEO_crawler(GEO_id).download)))

    # Leaving the with-block waits for every worker, so each future is done
    # here. Report failures instead of letting them disappear unnoticed.
    for GEO_id, future in futures:
        error = future.exception()
        if error is not None:
            sys.stderr.write(f"{GEO_id} failed: {error!r}\n")

    end = time.time()
    print(f"consuming: {end -start:.3f} seconds")
    
# Crawl the two example accessions when this code is run directly rather than imported.
if __name__ == "__main__":
    main(GEO_id_ls=["GSE201369", "GSE44801"])