In [1]:
import pandas as pd
import numpy as np
import os
from zipfile import ZipFile
import gzip
import shutil
from tqdm import tqdm
import time
from numpy import genfromtxt
import pickle
import tarfile
import glob
import json


In [3]:
def unzip_rna_seq_data(file_name, desired_folder_name):
    if 'tar' in file_name: 
        open_tar = tarfile.open(file_name)
        open_tar.extractall(f'{desired_folder_name}')
        open_tar.close()
    else:
        print('Not a tarfile')

In [4]:
unzip_rna_seq_data('/Users/dinakats/Desktop/SPICED/final_proj_git_renew/Genetics-to-Therapuetics/Data/kidney2test/kidney_v2_rna/gdc_download_20200529_001144.112503.tar.gz','spooky')

In [5]:
def unzip_individual_rna_seq_file(root_dir):
    files_to_unpack =[]
    dfs = []
    meta_data_file = ''.join(glob.glob('**/**metadata.cart**', recursive=True))
    with open(meta_data_file, 'r') as f:
        meta_data= json.load(f)
    convert_filename_caseuuid= {meta_data[i]['file_id']:meta_data[i]['associated_entities'][0]['case_id'] for i in range(0,len(meta_data))}
    #dictionary of file_id:case_id
    for directory in os.listdir(root_dir):
        try:
            for filename in os.listdir(root_dir+'/'+ directory):
                if ".gz" in filename: #change to .gz to unpack
                    files_to_unpack.append(os.path.join(root_dir,directory,filename))

        except NotADirectoryError:
            continue
    for file in files_to_unpack:
        dfs.append(pd.read_csv(file, compression = 'gzip',sep="\t", names =['gene', convert_filename_caseuuid[file.split('/')[-2]]], index_col='gene'))
        # these dfs already have the correct case id name
    return files_to_unpack, dfs, convert_filename_caseuuid

In [6]:
def concat_all_rna_seq(dfs):
    rna_seq_data = pd.concat(dfs, join = "outer", axis = 1).T
    if type(rna_seq_data.index[0])== str:
        rna_seq_data.reset_index(inplace=True)
    return rna_seq_data

In [19]:
def convert_ensg_to_gene_name(dataframe_with_genes):
    change_name_file = 'mart_export.txt'
    gene_names = {}
    with open(change_name_file) as fh:
        for line in fh:
            engs, gene_name = line.split(',', 1)
            gene_names[gene_name.split('.')[0]] = engs
    dataframe = dataframe_with_genes.rename(columns = lambda x: x.split('.')[0]).rename(columns = gene_names)
    genes = dataframe.columns[1:-1].tolist()
    return dataframe, genes, gene_names
   

In [8]:
def concat_rna_to_clinical_data(clinical_dataframe, rna_dataframe):
    full_data = pd.merge(rna_dataframe, clinical_dataframe,how = 'right', left_on =['index'], right_on=['bcr_patient_uuid'])
    return full_data
    

In [9]:
def limit_full_data_for_pca(full_data, genes):
    limit_full_data = full_data.loc[(full_data.standard_drugs!='')
                                &(full_data.standard_drugs!='[not available]')
                                &(full_data.standard_drugs!='[unknown]')].copy()

    limit_full_data.dropna(subset = ['index'], inplace=True)
    
    columns_needed = genes+['standard_drugs', 'vital_status']
   
    return limit_full_data.loc[:,columns_needed]

In [10]:
rootdir_test = '/Users/dinakats/Desktop/SPICED/final_proj_git_renew/Genetics-to-Therapuetics/Data/kidney2test/kidney_v2_rna/spooky'

In [11]:
kidney_files, kidney_dfs, kidney_metadata_dict = unzip_individual_rna_seq_file(rootdir_test)

In [12]:
kidney_dfs[0]

Unnamed: 0_level_0,455fd7d4-5ff4-423f-83ce-4630aef89bb7
gene,Unnamed: 1_level_1
ENSG00000242268.2,0.000000
ENSG00000270112.3,1.535474
ENSG00000167578.15,1.432444
ENSG00000273842.1,0.000000
ENSG00000078237.5,3.042826
...,...
ENSG00000105063.17,13.336782
ENSG00000231119.2,0.287535
ENSG00000280861.1,0.000000
ENSG00000123685.7,0.191725


In [13]:
len(kidney_files), len(kidney_dfs)

(241, 241)

In [14]:
kidney_rna_seq = concat_all_rna_seq(kidney_dfs)

In [15]:
kidney_rna_seq.shape

(241, 60484)

In [50]:
kidney_rna_seq.head()

gene,index,ENSG00000242268.2,ENSG00000270112.3,ENSG00000167578.15,ENSG00000273842.1,ENSG00000078237.5,ENSG00000146083.10,ENSG00000225275.4,ENSG00000158486.12,ENSG00000198242.12,...,ENSG00000238244.3,ENSG00000186115.11,ENSG00000216352.1,ENSG00000267117.1,ENSG00000273233.1,ENSG00000105063.17,ENSG00000231119.2,ENSG00000280861.1,ENSG00000123685.7,ENSG00000181518.3
0,455fd7d4-5ff4-423f-83ce-4630aef89bb7,0.0,1.535474,1.432444,0.0,3.042826,13.129536,0.0,0.013725,143.973173,...,0.0,0.0,0.0,0.0,0.032132,13.336782,0.287535,0.0,0.191725,0.0
1,822cf6c1-dd65-4814-94b1-0c335208ad9b,0.022254,0.074107,1.745773,0.0,2.043631,6.803041,0.0,0.014786,74.978455,...,0.0,25.374941,0.0,0.035828,0.03709,8.028534,0.148482,0.0,0.244604,0.0
2,9fc40b03-0b56-4b1f-93ed-dbc455ae3fd9,0.759959,0.143791,1.059033,0.0,2.141863,27.87986,0.0,1.682061,110.210481,...,0.0,0.546409,0.0,2.224577,0.0,9.560019,0.271155,0.0,1.084819,0.0
3,32377dd2-c2a2-44ff-8938-b8cfb35db79c,0.023698,0.003288,2.149666,0.258854,3.389593,5.223044,0.0,0.004499,125.32149,...,0.0,0.0,0.0,0.076306,0.0,14.921367,0.260428,0.0,5.873116,0.0
4,3f0a2571-c088-4735-8152-61f6fd60ac63,0.206307,0.004771,8.749481,0.0,0.605681,10.567453,0.0,0.009791,186.633033,...,0.0,0.147628,0.018097,0.0,0.286537,7.632067,0.337382,0.0,0.90885,0.0


In [51]:
kidney_rna_rename, genes, gene_names_dict = convert_ensg_to_gene_name(kidney_rna_seq)

In [52]:
genes = kidney_rna_rename.columns[1:-1].tolist()

In [53]:
clinical_data_df6= pickle.load(open('clinical_data_df6.pickle','rb'))

In [54]:
clinical_data_df6.head()

Unnamed: 0,bcr_patient_uuid,drug_name,karnofsky_performance_score,therapy_type,vital_status,standard_drugs
0,5cbf0134-023b-4bd0-97da-eb836ccbc729,lupron,,,alive,lupron
1,5cbf0134-023b-4bd0-97da-eb836ccbc729,leuprolide,,,alive,leuprolide
2,cff68090-09df-492b-874c-0caeb29f9361,[not available],,,alive,[not available]
3,5570758c-1f07-4ff9-a570-b7ae39d15a89,[not available],,,alive,[not available]
4,f9c593d1-f204-4026-8a9b-a6af0922c885,[not available],100.0,,alive,[not available]


In [55]:
full_data= concat_rna_to_clinical_data(clinical_data_df6, kidney_rna_rename)

In [56]:
full_data

Unnamed: 0,index,LINC02082,AC090241.2,RAB4B,ENSG00000273842,TIGAR,RNF44,NUP210P2,DNAH3,RPL23A,...,AL031666.1,ENSG00000280861,BATF3,OR8D4,bcr_patient_uuid,drug_name,karnofsky_performance_score,therapy_type,vital_status,standard_drugs
0,455fd7d4-5ff4-423f-83ce-4630aef89bb7,0.000000,1.535474,1.432444,0.0,3.042826,13.129536,0.0,0.013725,143.973173,...,0.287535,0.0,0.191725,0.0,455fd7d4-5ff4-423f-83ce-4630aef89bb7,sorafenib,,targeted molecular therapy,dead,sorafenib
1,455fd7d4-5ff4-423f-83ce-4630aef89bb7,0.049100,0.112411,3.236406,0.0,2.588924,12.977443,0.0,0.017477,102.836528,...,0.337241,0.0,0.507561,0.0,455fd7d4-5ff4-423f-83ce-4630aef89bb7,sorafenib,,targeted molecular therapy,dead,sorafenib
2,822cf6c1-dd65-4814-94b1-0c335208ad9b,0.022254,0.074107,1.745773,0.0,2.043631,6.803041,0.0,0.014786,74.978455,...,0.148482,0.0,0.244604,0.0,822cf6c1-dd65-4814-94b1-0c335208ad9b,oncophage vaccine,,"other, specify in notes",dead,oncophage vaccine
3,822cf6c1-dd65-4814-94b1-0c335208ad9b,0.029526,0.000000,7.044585,0.0,3.217665,8.207564,0.0,0.009809,358.025155,...,0.162237,0.0,3.121710,0.0,822cf6c1-dd65-4814-94b1-0c335208ad9b,oncophage vaccine,,"other, specify in notes",dead,oncophage vaccine
4,9fc40b03-0b56-4b1f-93ed-dbc455ae3fd9,0.759959,0.143791,1.059033,0.0,2.141863,27.879860,0.0,1.682061,110.210481,...,0.271155,0.0,1.084819,0.0,9fc40b03-0b56-4b1f-93ed-dbc455ae3fd9,interferon,0.0,immunotherapy,dead,interferon-alpha
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1628,,,,,,,,,,,...,,,,,01c5f902-ec3c-4eb7-9e38-1d29ae6ab959,,,,dead,
1629,,,,,,,,,,,...,,,,,0a4af73d-3c12-4740-bf92-c0eedd3dd135,,,,dead,
1630,,,,,,,,,,,...,,,,,ab706ea0-d642-4f58-ba8d-f5d930b91286,,,,dead,
1631,,,,,,,,,,,...,,,,,c849127a-2f15-4fab-8815-db09b017f48c,,,,dead,


In [57]:
limited_data = limit_full_data_for_pca(full_data, genes)

In [58]:
limited_data.head()

Unnamed: 0,LINC02082,AC090241.2,RAB4B,ENSG00000273842,TIGAR,RNF44,NUP210P2,DNAH3,RPL23A,EHD4-AS1,...,CYP4F2,AL590824.1,AC010525.1,AC097724.1,PPP6R1,AL031666.1,ENSG00000280861,BATF3,standard_drugs,vital_status
0,0.0,1.535474,1.432444,0.0,3.042826,13.129536,0.0,0.013725,143.973173,0.106523,...,0.0,0.0,0.0,0.032132,13.336782,0.287535,0.0,0.191725,sorafenib,dead
1,0.0491,0.112411,3.236406,0.0,2.588924,12.977443,0.0,0.017477,102.836528,0.226077,...,3.666961,0.0,0.0,0.081834,12.288496,0.337241,0.0,0.507561,sorafenib,dead
2,0.022254,0.074107,1.745773,0.0,2.043631,6.803041,0.0,0.014786,74.978455,0.184439,...,25.374941,0.0,0.035828,0.03709,8.028534,0.148482,0.0,0.244604,oncophage vaccine,dead
3,0.029526,0.0,7.044585,0.0,3.217665,8.207564,0.0,0.009809,358.025155,0.163139,...,0.226847,0.0,0.095072,0.0,5.626958,0.162237,0.0,3.12171,oncophage vaccine,dead
4,0.759959,0.143791,1.059033,0.0,2.141863,27.87986,0.0,1.682061,110.210481,0.0,...,0.546409,0.0,2.224577,0.0,9.560019,0.271155,0.0,1.084819,interferon-alpha,dead


In [61]:
limited_data.shape

(577, 60484)

In [62]:
limited_data.standard_drugs.value_counts()

gemcitabine      63
sunitinib        60
doxorubicin      51
avastin          34
sorafenib        34
                 ..
levothyroxine     1
inf               1
r1507             1
 mitomycin        1
gleevec           1
Name: standard_drugs, Length: 100, dtype: int64

In [63]:
pickle.dump(limited_data, open( "limited_data_0624.pickle", "wb" ) )

In [1]:
thing = 'YES'

In [3]:
thing.lower()

'yes'

In [7]:
for directory in os.listdir('/Data'):
    print (directory)

FileNotFoundError: [Errno 2] No such file or directory: '/Data'

In [8]:
cwd = os.getcwd()

In [10]:
cwd+'/Data'

'/Users/dinakats/Desktop/SPICED/final_proj_git_renew/Genetics-to-Therapuetics/Data'

In [11]:
bop = '/Data'

In [12]:
cwd+f'lakdjf{bop}'

'/Users/dinakats/Desktop/SPICED/final_proj_git_renew/Genetics-to-Therapueticslakdjf/Data'

In [13]:
os.getcwd+f'adkfja{bop}'

TypeError: unsupported operand type(s) for +: 'builtin_function_or_method' and 'str'

In [15]:
os.path.join(os.getcwd(), '/data')

'/data'