# This notebook downloads the GEO datasets and imports them.

In [None]:
import GEOparse
import datetime
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
import sys
warnings.filterwarnings('ignore')

In [None]:
# Load the four platform (GPL) records through GEOparse, using ./datasets as
# the destination directory. The generator is consumed left to right, so the
# requests are issued in the same order as the names on the left-hand side.
GPL96, GPL570, GPL571, GPL2507 = (
    GEOparse.get_GEO(geo=accession, destdir="./datasets", silent=True)
    for accession in ("GPL96", "GPL570", "GPL571", "GPL2507")
)

In [None]:
# Numeric parts of the GEO series accessions to process ("GSE" is prepended
# where the series is fetched).
GSE_IDs = [
    "68310",
    "6269",
    "20346",
    "21802",
    "27131",
    "28750",
    "40012",
    "40396",
    "42026",
    "57065",
    "60244",
    "63990",
    # Disabled: "66099"
    #   https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE66099
    #   This dataset thus includes all unique patients from GSE4607, GSE8121,
    #   GSE9692, GSE13904, GSE26378, and GSE26440.
    # Disabled duplicate: "68310"
    #   Original note: may be too large for the machine to handle; 1610
    #   healthy adults, 880 arrays in total corresponding to 133 individuals.
    #   NOTE(review): "68310" is nevertheless active as the first entry above.
    "111368",
]
# Substring keywords, one list per infection class. They are matched with
# `in` against sample metadata when a sample exposes no usable
# characteristics, so short entries such as "Vir" or "Healt" act as prefixes.
# Duplicated entries are kept exactly as they were.
none_key = [
    "Uninfected type 2 diabetes mellitus",
    "non-infectious illness",
    "Vac",
    "Healt",
    "Blood_Control",
    "Control_Healthy",
    "PAXgene whole blood, healthy control",
    "Blood_HV",
    "whole blood-Healthy Control",
    "healthy subject",
    "Uninfected healthy",
    "Uninfected healthy",
    "HC",
    "Vac",
    "pathogen: None",
    "WB-control",
    "disease: Control",
]
bacteria_key = [
    "bacterial",
    "bacterial pneumonia",
    "Experiment_Post_Surgical",
    "Experiment_Sepsis",
    "PAXgene whole blood, bacterial pneumonia",
    "Blood_P",
    "whole blood-BACTERIA",
    "intensive-care unit patient",
    "Septicemic melioidosis",
    "Other sepsis",
    "PBMC_S.aureus_MRSA_INF",
    "disease: Sepsis",
    "disease: SepticShock",
    "PBMC_S.pneumoniae_INF",
    "PBMC_S.aureus_MSSA_INF",
    "bacterial pneumonia_day",
    "pathogen: E.coli",
    "pathogen: Bacteria",
    "pathogen: MRSA",
    "pathogen: Salmonella",
    "pathogen: MSSA",
    "WB-bact",
    "PBMC_E.coli",
]
virus_key = [
    "infection: influenza A virus",
    "infection: influenza B virus",
    "infection: human coronavirus NL63",
    "infection: human coronavirus HKU1",
    "infection: human rhinovirus",
    "infection: enterovirus",
    "infection: respiratory syncytial virus A",
    "Vir",
    "viral",
    "Severe Influenza",
    "Pande",
    "blood_day",
    "PAXgene whole blood, SIRS",
    "PAXgene whole blood, influenza A pneumonia",
    "whole blood-VIRUS",
    "H1N1",
    "H3N2",
    "PBMC_InfluenzaA_INF",
    "PBMC_InfluenzaB_INF",
    "Severe Influenza_day",
    "pathogen: Adenovirus",
    "disease: SIRS",
    "pathogen: HHV6",
    "pathogen: Enterovirus",
    "pathogen: Rhinovirus",
    "WB-H1N1",
    "WB-RSV",
    "Adenovirus",
    "pathogen: Adenovirus",
]
both_key = [
    "PAXgene whole blood, mixed bacterial and influenza A pneumonia",
    "whole blood-COINFECTION",
]
other_key = [
    "infection: our tests did not detect one of the viruses sought",
    "Sarcoid",
    "Pneumonia",
    "Cancer",
]

# Class label -> keyword list. Insertion order is the search order used by the
# keyword fallback, so "both" is tried before "bacteria" and "virus".
infection_types = dict(
    both=both_key,
    bacteria=bacteria_key,
    virus=virus_key,
    other=other_key,
    none=none_key,
)
# Exact-match lookup from a status string to its infection class. The table is
# written grouped by class and flattened into one dict; insertion order matches
# the order of the groups below.
infection_map = {
    status: infection_class
    for infection_class, statuses in (
        ("None", (
            "non-infectious illness",
            "None",
            "Control",
        )),
        ("virus", (
            "viral",
            "Adenovirus",
            "HHV6",
            "Enterovirus",
            "Rhinovirus",
            "bacterial_status: No",
            "bacterial_status: Excluded",
            "influenza A virus",
            "influenza A virus and human rhinovirus",
            "influenza A virus and human coronavirus OC43",
            "influenza A virus and respiratory syncytial virus B",
        )),
        ("bacteria", (
            "bacterial",
            "E.coli",
            "Bacteria",
            "MRSA",
            "Salmonella",
            "MSSA",
        )),
        ("other", (
            "bacterial_status: N/A",
            "our tests did not detect one of the viruses sought",
        )),
        ("both", (
            "bacterial_status: Yes",
        )),
    )
    for status in statuses
}
# VALUE-column description -> flag. A True flag makes the sample loop replace
# the values with 2**value; a False flag leaves them as they are.
value_map = {
    **dict.fromkeys(
        (
            "Log2 normalized signal",
            "RMA-normalised signal intensity from Partek Genomics Suite",
            "RMA signal intensity (log base 2)",
            "RMA",
            "log2 quantile normalized",
            "RMA normalized and log2 transformed",
            "RMA normalized and Log2 transformed",
            "gcRMA normalized gene expression values",
            "Raw signal intensities were normalized by background adjustment, variance stabilization transformation, and robust spline normalization in R package lumi",
        ),
        True,
    ),
    **dict.fromkeys(
        (
            "MAS5-calculated Signal intensity",
            "Average normalized",
            "Normalized value from BeadStudio software",
            "cubic spline normalised intensity",
            "GenomeStudio quantile normalised average signal intensity",
            "quantile-normalized signal intensity",
            "Illumina calculated signal intensity [AVG_Signal]",
        ),
        False,
    ),
}

# Platform accessions consulted for samples whose VALUE description is not a
# key of value_map.
platforms = ["GPL570", "GPL571"]

In [None]:
def _collect_statuses(characteristics_ch1):
    """Return the infection-related entries of a sample's characteristics.

    For entries labelled pathogen / infection_status / disease / infection the
    text after the first colon is kept; a ``bacterial_status`` entry is kept
    whole (label included) because the lookup tables key on the full string.
    Entries without a colon no longer raise IndexError, and text after a
    second colon is no longer cut off.
    """
    useful_labels = ('pathogen', 'infection_status', 'disease', 'infection')
    statuses = []
    for entry in characteristics_ch1:
        label, _, content = entry.partition(':')
        label = label.strip()
        if label in useful_labels:
            statuses.append(content.strip())
        if label == 'bacterial_status':
            statuses.append(entry)
    return statuses


def _classify_status(status):
    """Map one status string to an infection class, or None if unrecognised.

    ``infection_map`` is consulted first so that explicitly listed statuses
    win over the substring heuristic; e.g. "our tests did not detect one of
    the viruses sought" is "other" and is no longer caught by the "virus"
    substring test.
    """
    if status in infection_map:
        return infection_map[status]
    if "bacterial" not in status and ("virus" in status or "influenza" in status):
        return "virus"
    return None


def _classify_by_keywords(metadata):
    """Classify a sample by searching keywords in its metadata.

    Only the first entry of each metadata field is searched. Classes are tried
    in the order of ``infection_types``; within a class, fields are scanned in
    metadata order and the first matching keyword wins.

    Returns:
        ``(infection_class, matched_keyword)``, or ``("unknown", "")`` when
        nothing matches.
    """
    for infection_type, keys in infection_types.items():
        for field_values in metadata.values():
            if not field_values:
                continue  # an empty field has no first entry to search
            first_entry = field_values[0]
            for k in keys:
                if k in first_entry:
                    return infection_type, k
    return "unknown", ""


# Rows are collected in a list and turned into a DataFrame once at the end:
# DataFrame.append was removed in pandas 2.0 and copied the frame on every call.
sample_rows = []
for gseid in GSE_IDs:
    GEONAME = "GSE" + gseid
    print(GEONAME)
    gse = GEOparse.get_GEO(geo=GEONAME, destdir="./datasets", silent=True)

    for gsm_name, gsm in gse.gsms.items():
        metadata = gsm.metadata
        title = gsm.get_metadata_attribute('title')

        # Decide which infection class the sample belongs to. Defaults are
        # reset for every sample so an unrecognised sample can no longer
        # inherit the label of the previous one.
        infection = "unknown"
        datatype = ""
        status = _collect_statuses(metadata.get('characteristics_ch1', []))
        if len(status) == 1:
            datatype = status[0]
            classified = _classify_status(status[0])
            if classified is None:
                print(status[0])  # unrecognised status: report it, keep "unknown"
            else:
                infection = classified
        elif len(status) > 1:
            # Several candidate statuses: report them and keep the sample as
            # "unknown"; the raw statuses are stored so the row stays traceable.
            datatype = "; ".join(status)
            print(status)
        else:
            infection, datatype = _classify_by_keywords(metadata)
            if infection == "unknown":
                print('unknown')
                print(metadata)

        # Value processing.
        value = gsm.table["VALUE"]
        # platform_id is a list; compare its first entry. Comparing the list
        # itself against `platforms` could never match.
        sample_gpl = metadata['platform_id'][0]
        # Second row of the column-description table (presumably the VALUE
        # column -- TODO confirm); .iloc keeps the lookup positional.
        description = gsm.columns["description"].iloc[1]

        if description in value_map:
            if value_map[description]:
                value = 2 ** value
        elif sample_gpl in platforms:
            # NOTE(review): this branch was unreachable before the
            # platform_id fix above; confirm that unlisted descriptions on
            # these platforms really are log2 values.
            value = 2 ** value
        else:
            print(description)
            print(sample_gpl)
            print('Now just use the raw value')
            print('Samples: ', value.head())
            print('-'*50)

        sample_rows.append({
            'GSE': GEONAME,
            'GSM': metadata['geo_accession'][0],
            'GPL': sample_gpl,
            'description': description,
            'value': value,
            'infection': infection,
            'type': datatype,
            'title': title,
        })

# NOTE(review): the characteristics path labels controls "None" while the
# keyword path labels them "none"; left unchanged because later cells may
# depend on either spelling.
data = pd.DataFrame(
    sample_rows,
    columns=['GSE', 'GSM', 'GPL', 'description', 'value', 'infection', 'type', 'title'],
)
data
                

GSE68310
GSE6269
GSE20346
GSE21802
GSE27131
GSE28750
GSE40012
GSE40396
GSE42026
GSE57065
GSE60244
GSE63990
GSE111368


Unnamed: 0,GSE,GSM,GPL,description,value,infection,type,title
0,GSE68310,GSM1667382,GPL10558,Raw signal intensities were normalized by back...,0 174.014428 1 178.135385 2 ...,virus,influenza A virus,WholeBloodRNA_JR0021_Baseline
1,GSE68310,GSM1667383,GPL10558,Raw signal intensities were normalized by back...,0 178.375804 1 182.438011 2 ...,virus,influenza A virus,WholeBloodRNA_JR0021_Day0
2,GSE68310,GSM1667384,GPL10558,Raw signal intensities were normalized by back...,0 177.493280 1 180.617391 2 ...,virus,influenza A virus,WholeBloodRNA_JR0021_Day2
3,GSE68310,GSM1667385,GPL10558,Raw signal intensities were normalized by back...,0 175.914112 1 182.514468 2 ...,virus,influenza A virus,WholeBloodRNA_JR0021_Day4
4,GSE68310,GSM1667386,GPL10558,Raw signal intensities were normalized by back...,0 180.519156 1 181.402787 2 ...,virus,influenza A virus,WholeBloodRNA_JR0021_Day6
...,...,...,...,...,...,...,...,...
2452,GSE111368,GSM3029687,GPL10558,Log2 normalized signal,0 26.090496 1 22.641748 2 ...,both,bacterial_status: Yes,LON0177_H1N1_T1
2453,GSE111368,GSM3029688,GPL10558,Log2 normalized signal,0 41.539173 1 22.578267 2 ...,both,bacterial_status: Yes,LON0146_H1N1_T3
2454,GSE111368,GSM3029689,GPL10558,Log2 normalized signal,0 35.475081 1 27.385960 2 ...,virus,bacterial_status: No,LON0113_H1N1_T2
2455,GSE111368,GSM3029690,GPL10558,Log2 normalized signal,0 33.681343 1 26.777165 2 ...,other,bacterial_status: N/A,2027_HC_HC


In [None]:
# Write one table per platform: the sample metadata columns followed by one
# column per probe, saved as both CSV and pickle under ./datasets.
#
# Grouping by the scalar key (not a one-element list) keeps the group keys
# plain strings, which the path concatenation below needs.
databygpl = data.groupby('GPL')
meta_columns = ['GSE', 'GSM', 'GPL', 'description', 'infection', 'type', 'title']
for gpl_id, row_labels in databygpl.groups.items():
    gpl = GEOparse.get_GEO(geo=gpl_id, destdir="./datasets", silent=True)

    # Build both halves in one step each; DataFrame.append was removed in
    # pandas 2.0 and re-copied the frame for every sample.
    metadata = data.loc[row_labels, meta_columns].reset_index(drop=True)
    values = pd.DataFrame(
        [data.loc[row_label, 'value'] for row_label in row_labels]
    ).reset_index(drop=True)
    save_data = pd.concat([metadata, values], axis=1)

    # The probe columns are currently named by row position, which is not
    # useful; rename them to the platform's GB_ACC accessions, or to the
    # platform IDs when the platform table has no GB_ACC column.
    try:
        GB_ACC = gpl.table['GB_ACC']
    except KeyError:
        print('error in rename columns for gpl: ', gpl_id, ' with error: ', sys.exc_info()[0])
        GB_ACC = gpl.table['ID']

    # The ID -> GB_ACC relation lives in gpl.table (columns ID and GB_ACC).
    # Some IDs have no GB_ACC; those columns keep their original name instead
    # of being renamed to NaN or an empty string.
    # NOTE(review): the pairing is positional, i.e. it assumes every sample's
    # VALUE rows are in the same order as the platform table -- confirm, or
    # key the values by ID_REF upstream.
    probe_columns = save_data.columns[len(meta_columns):]
    rename_map = {
        column: accession
        for column, accession in zip(probe_columns, GB_ACC)
        if pd.notna(accession) and str(accession).strip()
    }
    save_data.rename(columns=rename_map, inplace=True)

    save_data.to_csv('./datasets/' + gpl_id + '_save.csv', index=False)
    # Also keep a pickle copy of the same table.
    save_data.to_pickle('./datasets/' + gpl_id + '_save.pkl')