In [None]:
# To install the grein_loader package, uncomment the next line
#!pip install grein_loader
!pwd

In [22]:
import grein_loader as loader
import pandas as pd
import os
import time

In [27]:
# Load log file from disk to see if this data has already been added
def load_log(species):
    """Return the set of GSE accessions that were fully processed for `species`.

    The log file "greinLoad<species>.log" is read as a header-less CSV with the
    columns ``GSE`` and ``status``. Only rows whose status is ``"end"`` count as
    completed.

    Parameters
    ----------
    species : str
        Suffix used to build the log file name, e.g. ``"Human"`` reads
        ``greinLoadHuman.log`` and ``""`` reads ``greinLoad.log``.

    Returns
    -------
    set
        Unique GSE accessions with an ``"end"`` entry. An empty set is returned
        when the log file does not exist or contains no rows.
    """
    try:
        log = pd.read_csv("greinLoad" + species + ".log", names=["GSE", "status"])
    except (FileNotFoundError, pd.errors.EmptyDataError):
        # No log (or nothing in it yet) means nothing has been completed so far
        return set()
    log = log[log["status"] == "end"]
    return set(log["GSE"].unique())

In [24]:
# Loading metadata for the datasets

# To load only a subset of the data, pass the number of datasets instead:
# number_of_datasets = 1000
# overview = loader.load_overview(number_of_datasets)

# loading all the data
overview = loader.load_overview()

# Accessing the geo_accession ID and study species from overview
geo_accession_ids = [entry['geo_accession'] for entry in overview]
species = [entry['species'] for entry in overview]

# Check if there is already data downloaded or whether we are starting fresh.
# Resuming requires BOTH the log and the pickled matrix; otherwise start over.
if os.path.exists("greinLoadHuman.log") and os.path.exists("download/grein_count_matrix_human.pkl"):
    count_matrix_human_df = pd.read_pickle("download/grein_count_matrix_human.pkl")
    completed = load_log("Human")
else:
    count_matrix_human_df = pd.DataFrame()
    completed = set()
    # Create the log file if it is missing without shelling out, so this also
    # works on systems that have no `touch` command. Existing content is kept.
    open("greinLoadHuman.log", "a").close()


In [25]:
# Tabulate the species of every GSE accession: one row per data set
GSE_species = {
    'geo_accession_id': geo_accession_ids,
    'species': species,
}
GSE_species_df = pd.DataFrame(GSE_species)

In [26]:
# Keep only the data sets whose species is 'Homo sapiens'.
# This cell only filters the accession table; no expression data is loaded here.
# NOTE this does not save/pickle the description or metadata at all

is_human = GSE_species_df['species'] == 'Homo sapiens'
GSE_species_df_filtered = GSE_species_df.loc[is_human]

GSE_species_df_filtered

Unnamed: 0,geo_accession_id,species
0,GSE100007,Homo sapiens
1,GSE100027,Homo sapiens
2,GSE100040,Homo sapiens
3,GSE100075,Homo sapiens
4,GSE100081,Homo sapiens
...,...,...
20357,GSE131592,Homo sapiens
20358,GSE131705,Homo sapiens
20359,GSE131512,Homo sapiens
20360,GSE226189,Homo sapiens


In [None]:
# Saving human gene expression data
# NOTE this does not save/pickle the description or metadata at all

# Make sure the output directory exists before the first pickle is written
os.makedirs("download", exist_ok=True)

with open("greinLoadHuman.log", "a") as logfile:
    # Only walk the accessions whose species is 'Homo sapiens'
    for geo_accession in GSE_species_df_filtered['geo_accession_id']:

        # Skip data sets that were already successfully pickled
        if geo_accession in completed:
            continue

        # If new data, download data and add to df
        print("********* |", geo_accession, "| ***********")
        logfile.write(geo_accession + ",start\n")
        description, metadata, count_matrix = loader.load_dataset(geo_accession)

        # Merge in to the existing human count matrix DF.
        # how='left' keeps only the genes already present in the accumulated matrix.
        if count_matrix_human_df.empty:
            count_matrix_human_df = count_matrix
        else:
            count_matrix_human_df = pd.merge(
                count_matrix_human_df, count_matrix, how='left', on=["gene", "gene_symbol"]
            )

        # "end" is only logged after the pickle was written, so an interrupted
        # run leaves the accession without an "end" entry and it is retried.
        logfile.write(geo_accession + ",write\n")
        count_matrix_human_df.to_pickle("download/grein_count_matrix_human.pkl")
        logfile.write(geo_accession + ",end\n")
        completed.add(geo_accession)

        # NOTE(review): this stops after the first new data set per run, which
        # looks like a testing leftover; remove to process all pending data sets.
        break

In [None]:
# Loading data from the data sets using the geo_accession IDs
# NOTE this does not save/pickle the description or metadata at all

# Make sure the output directory exists before the first pickle is written
os.makedirs("download", exist_ok=True)

# Resume from a previous run of THIS cell if both its log and its pickle exist.
# The human-only `completed` set must not be used here: it tracks a different
# log file and a different pickle.
if os.path.exists("greinLoad.log") and os.path.exists("download/grein_count_matrix.pkl"):
    count_matrix_df = pd.read_pickle("download/grein_count_matrix.pkl")
    completed_all = load_log("")  # an empty suffix reads "greinLoad.log"
else:
    count_matrix_df = pd.DataFrame()
    completed_all = set()

with open("greinLoad.log", "a") as logfile:
    for geo_accession in geo_accession_ids:

        # Skip data sets that were already successfully pickled
        if geo_accession in completed_all:
            continue

        # If new data, download data and add to df
        print("********* |", geo_accession, "| ***********")
        logfile.write(geo_accession + ",start\n")
        description, metadata, count_matrix = loader.load_dataset(geo_accession)

        # Merge in to existing count matrix DF.
        # how='left' keeps only the genes already present in the accumulated matrix.
        if count_matrix_df.empty:
            count_matrix_df = count_matrix
        else:
            count_matrix_df = pd.merge(count_matrix_df, count_matrix, how='left', on=["gene", "gene_symbol"])

        # "end" is only logged after the pickle was written
        logfile.write(geo_accession + ",write\n")
        count_matrix_df.to_pickle("download/grein_count_matrix.pkl")
        logfile.write(geo_accession + ",end\n")
        completed_all.add(geo_accession)

        # NOTE(review): this stops after the first new data set per run, which
        # looks like a testing leftover; remove to process all pending data sets.
        break

In [None]:
# Timing the loading of data sets: fetch the overview for 10 data sets and
# report the elapsed wall-clock seconds (the printing is part of the timed span)
start = time.time()
overview_sample = loader.load_overview(10)
print(overview_sample)
end = time.time()
elapsed_seconds = end - start
print(elapsed_seconds)

## Save the data loaded above

In [None]:
import random
from sklearn.model_selection import train_test_split
import json
import sys

**Count_matrix**

In [None]:
# Write the accumulated count matrix to download/count_matrix.tsv (tab-separated).
path = os.path.join(os.getcwd(), 'download', 'count_matrix.tsv')
os.makedirs(os.path.dirname(path), exist_ok=True)
# to_csv opens and closes the target itself; opening the same path a second
# time in a surrounding `with` block only produced an unused file handle.
count_matrix_df.to_csv(path, sep="\t")

**Description**

In [None]:
# Persist the description as JSON in download/description.json.
# Only a single data set's description is stored at a time; the file is
# overwritten on every run.
path = os.path.join('download', 'description.json')
description_json = json.dumps(description)
with open(path, 'w') as json_file:
    json_file.write(description_json)

**Metadata**

In [None]:
# Create a dataframe from metadata
metadata_df = pd.DataFrame.from_dict(metadata)

# list of row names in metadata_df
row_names = list(metadata_df.index)

# Rows that are not required to process this data set: a fixed list of labels ...
unneeded_rows = ['Consent', ' ', 'channel_count', 'organism_ch1', 'relation', 'status']
# ... plus every row whose label contains one of these fragments
unneeded_fragments = ('contact', 'date', 'data_processing', 'Hash', 'Date', 'library')
unneeded_rows.extend(
    index for index in row_names
    if any(fragment in index for fragment in unneeded_fragments)
)

# Remove the unnecessary rows from the metadata. `metadata_df` itself is left
# untouched; the cleaned frame gets its own name so it can be used later.
# errors="ignore" skips labels from the fixed list that this data set lacks,
# instead of raising a KeyError.
metadata_clean_df = metadata_df.drop(unneeded_rows, axis=0, errors="ignore")
metadata_clean_df

In [None]:
# Debug aid: show the Python type of `metadata` as returned by loader.load_dataset
print(type(metadata))

In [None]:
# Debug aid: show the Python type of `description` as returned by loader.load_dataset
print(type(description))

In [None]:
# Debug aid: print the accumulated (merged) count matrix
# NOTE(review): consider ending the cell with count_matrix_df.head() to avoid dumping a large frame
print(count_matrix_df)

In [None]:
# Debug aid: print the count matrix of the most recently loaded data set
print(count_matrix)