In [None]:
# To install the grein_loader package
#!pip install grein_loader
!pwd

In [1]:
import grein_loader as loader
import pandas as pd
import os
import time

In [2]:
# Load log file from disk to see if this data has already been added
def load_log(logFile):
    log = pd.read_csv(logFile, names=["GSE", "status"])
    log = log[log["status"] == "end"]
    return set(log["GSE"].unique())

In [15]:
def load_data(geo_accession, count_matrix_df):
    description, metadata, count_matrix = loader.load_dataset(geo_accession)
        
    # Merge in to existing count matrix DF
    if count_matrix_df.empty:
        return count_matrix
    else:
        return pd.merge(count_matrix_df, count_matrix,  how='left', on=["gene", "gene_symbol"])

In [11]:
# Loading grein overview

# loading a subset of the data
# number_of_datasets = 1000
# overview = loader.load_overview(number_of_datasets)

# loading all the data
overview = loader.load_overview()

# Accessing the geo_accession ID and study species from overview
geo_accession_ids = []
species = []
for i in range(len(overview)):
    geo_accession_ids.append(overview[i]['geo_accession'])
    species.append(overview[i]['species'])

In [12]:
# saving the species in each GSE
GSE_species = {'geo_accession_id': geo_accession_ids, 'species': species}
GSE_species_df = pd.DataFrame.from_dict(GSE_species)

In [13]:
# Prepare to download human data

# Filter dataframe for human data only
GSE_species_df_filtered = GSE_species_df[GSE_species_df['species'] == 'Homo sapiens']

GSE_species_df_filtered

# Check if there is already expression data downloaded or whether we are starting fresh
humanLog = "greinLoadHuman.log"
humanData = "download/grein_count_matrix_human.pkl"
if os.path.exists(humanLog) and os.path.exists(humanData): 
    print("...loading human data")
    count_matrix_human_df = pd.read_pickle(humanData)
    completed = load_log(humanLog)
else:
    print("starting human download from scratch")
    count_matrix_human_df = pd.DataFrame()
    completed = set()
    !touch {humanLog}

...loading human data


In [17]:
# Saving human gene expression data
with open(humanLog, "a") as logfile:
    for geo_accession in GSE_species_df_filtered['geo_accession_id']:
        
        # Check if this data was logged as loaded
        if geo_accession in completed:
            print("Already done: {0}".format(geo_accession))
            continue
            
        # If new data, download data and add to df
        print("********* |", geo_accession, "| ***********")
        logfile.write(geo_accession + ",start\n")
        
        count_matrix_human_df = load_data(geo_accession, count_matrix_human_df)
        
        logfile.write(geo_accession + ",write\n")
        count_matrix_human_df.to_pickle(humanData)
        logfile.write(geo_accession + ",end\n")
        


Already done: GSE100007
Already done: GSE100027
Already done: GSE100040
Already done: GSE100075
Already done: GSE100081
Already done: GSE100092
Already done: GSE100099
Already done: GSE100118
Already done: GSE100183
Already done: GSE100206
Already done: GSE100210
Already done: GSE100223
Already done: GSE100258
Already done: GSE100266
Already done: GSE100268
Already done: GSE100291
Already done: GSE100297
********* | GSE100327 | ***********
********* | GSE100338 | ***********
********* | GSE100359 | ***********
********* | GSE100382 | ***********
********* | GSE100392 | ***********
********* | GSE100408 | ***********
********* | GSE100417 | ***********
********* | GSE100427 | ***********
********* | GSE100501 | ***********
********* | GSE100520 | ***********
********* | GSE100530 | ***********
********* | GSE100562 | ***********
********* | GSE100568 | ***********
********* | GSE100572 | ***********
********* | GSE100574 | ***********
********* | GSE100618 | ***********
********* | GSE1

ConnectionError: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))

In [18]:
pd.read_pickle(humanData)

Unnamed: 0,gene,gene_symbol,GSM2667747,GSM2667748,GSM2667749,GSM2667750,GSM2667751,GSM2667752,GSM2667753,GSM2667754,...,GSM2692393,GSM2692394,GSM2692395,GSM2692396,GSM2692397,GSM2692398,GSM2692399,GSM2692400,GSM2692401,GSM2692402
0,ENSG00000000003,TSPAN6,1766.0677,420.8206,300.8009,3142.8439,2207.8392,4367.5056,228.5759,650.3287,...,36.5392,18.2983,28.3281,8.5205,17.8816,20.3450,17.4323,11.6830,11.4493,6.7943
1,ENSG00000000005,TNMD,43.9280,18.0091,0.0000,115.8917,63.5936,39.7803,11.7588,1.0153,...,0.0000,0.0000,2.9989,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000
2,ENSG00000000419,DPM1,1097.7855,367.2333,316.5226,3895.0588,1536.1375,1084.5276,127.7205,245.1014,...,152.1948,106.2851,223.5236,171.2681,188.3446,133.3678,127.3812,223.1960,165.0938,108.3981
3,ENSG00000000457,SCYL3,601.4702,270.9239,163.6869,1177.2241,873.9074,845.1423,229.8548,134.4694,...,378.9609,252.2508,307.5954,264.8942,297.0646,373.8453,272.0667,166.7782,220.1159,147.8496
4,ENSG00000000460,C1orf112,1040.8861,349.1418,298.1800,1967.3975,2115.2425,980.9359,253.4457,145.7038,...,141.2696,121.5965,555.7400,364.5267,360.3099,158.3518,89.2894,73.9050,56.6242,51.1808
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27985,ENSG00000283688,MIR6715B,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,...,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000
27986,ENSG00000283690,MIR3116-2,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,...,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000
27987,ENSG00000283694,MIR3202-2,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,...,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000
27988,ENSG00000283697,LOC101928917,1.7315,0.0000,0.0000,0.0000,2.0039,3.9899,2.5184,4.1298,...,,,,,,,,,,


In [None]:
# Prepare to download mouse data

# Filter dataframe for mouse data only
GSE_species_df_filtered = GSE_species_df[GSE_species_df['species'] == 'Mus musculus']

GSE_species_df_filtered

# Check if there is already expression data downloaded or whether we are starting fresh
miceLog = "greinLoadMice.log"
miceData = "download/grein_count_matrix_mice.pkl"
if os.path.exists(miceLog) and os.path.exists(miceData): 
    print("...loading mouse data")
    count_matrix_mice_df = pd.read_pickle(miceData)
    completed = load_log(miceLog)
else:
    print("starting mouse download from scratch")
    count_matrix_mice_df = pd.DataFrame()
    completed = set()
    !touch {miceLog}

In [None]:
# Saving mouse gene expression data
with open(miceLog, "a") as logfile:
    for geo_accession in GSE_species_df_filtered['geo_accession_id']:
        
        # Check if this data was logged as loaded
        if geo_accession in completed:
            print("Already done: {0}".format(geo_accession))
            continue
            
        # If new data, download data and add to df
        print("********* |", geo_accession, "| ***********")
        logfile.write(geo_accession + ",start\n")
        
        count_matrix_mice_df = load_data(geo_accession, count_matrix_mice_df)
        
        logfile.write(geo_accession + ",write\n")
        count_matrix_mice_df.to_pickle(miceData)
        logfile.write(geo_accession + ",end\n")

In [None]:
pd.read_pickle("download/grein_count_matrix_mice.pkl")


In [9]:
# Loading data from the data sets using the geo_accession IDs
# NOTE this does not save/pickle the description or metadata at all

with open("greinLoad.log", "a") as logfile:
    for geo_accession in geo_accession_ids:
        
        # Check if this data was succesfully pickled
        if geo_accession in completed:
            continue
            
        # If new data, download data and add to df
        print("********* |", geo_accession, "| ***********")
        logfile.write(geo_accession + ",start\n")
        description, metadata, count_matrix = loader.load_dataset(geo_accession)
        # print("Description: ", description)
        
        # Merge in to existing count matrix DF
        if count_matrix_df.empty:
            count_matrix_df = count_matrix
        else:
            count_matrix_df = pd.merge(count_matrix_df, count_matrix,  how='left', on=["gene", "gene_symbol"], suffixes=[None,"y"])
        
        logfile.write(geo_accession + ",write\n")
        count_matrix_df.to_pickle("download/grein_count_matrix.pkl")
        logfile.write(geo_accession + ",end\n")
        
        break

********* | GSE100297 | ***********


NameError: name 'count_matrix_df' is not defined

In [None]:
# Timimg the loading of data sets
start = time.time()
print(loader.load_overview(10))
end = time.time()
print(end-start)

## Save the data loaded above

In [None]:
import random
from sklearn.model_selection import train_test_split
import json
import sys

**Count_matrix**

In [None]:
# still relevant??
# path = os.path.join(os.getcwd(), 'download', 'count_matrix.tsv')
# with open(path, 'w') as file:
#     count_matrix_df.to_csv(path, sep="\t") 

**Description**

In [None]:
# still relevant??
# Save description. Currently saves description for only one data set at a time
# path = os.path.join('download', 'description.json')
# with open(path, 'w') as file:
#     # Serialize and write the variable to the file
#     json.dump(description, file)

**Metadata**

In [None]:
# still relevant??

# # Create a dataframe from metadata
# metadata_df = pd.DataFrame.from_dict(metadata)

# # list of row names in metadata_df
# row_names = list(metadata_df.index)

# # Creating a list of rows that we do not require to process this data set
# unneeded_rows = ['Consent', ' ', 'channel_count', 'organism_ch1', 'relation', 'status']
# for index in row_names:
#     if ('contact' in index) or ('date' in index) or ('data_processing' in index) or ('Hash' in index) or ('Date' in index) or ('library' in index):
#         unneeded_rows.append(index)
        
# # remove unnecessary roles from count_matrix
# metadata_df.drop(unneeded_rows, axis = 0)