In [47]:
# Imports
import pandas as pd
import numpy as np
import sklearn
from sklearn import utils
from sklearn import model_selection

In [3]:
# Getting Labels
# data/MANIFEST.txt holds one label per line. Each label is later used as the
# name of a sub-directory under data/, so the text must be kept verbatim.
with open('data/MANIFEST.txt') as manifest_file:
    # Iterating the handle yields one line at a time. Only the trailing newline
    # is stripped so the label itself is untouched. Blank lines (e.g. a stray
    # empty line at the end of the manifest) are skipped, because an empty
    # label would produce a broken 'data//MANIFEST.txt' path downstream.
    labels = [line.rstrip('\n') for line in manifest_file if line.strip()]

In [4]:
# Test for format
# Add to file list
#
# For every label directory, read the data files named in that directory's
# MANIFEST.txt and keep only those whose 'miRNA_ID' column is identical (same
# values, same order) to a reference file. Matching files are recorded in
# `files[label]`; mismatching ones are collected in `bad_files`.
#
# Produces: files, bad_files, feature_order, features (plus first_manifest and
# first_file, kept for inspection).

files = dict() # Dictionary - key = label, value = list of filenames
bad_files = list()

# The reference feature ordering comes from the first *data* file of the first
# label. Manifest entries ending in 'annotations.txt' are skipped here for the
# same reason they are skipped in the loop below: they are not data files, so
# row 0 of the manifest is not guaranteed to be usable as a reference.
first_manifest = pd.read_csv('data/' + labels[0] + '/MANIFEST.txt', delimiter='\t')
reference_candidates = [
    name for name in first_manifest['filename']
    if not name.endswith('annotations.txt')
]
if not reference_candidates:
    raise ValueError('no data files listed in data/' + labels[0] + '/MANIFEST.txt')
first_file = pd.read_csv('data/' + labels[0] + '/' + reference_candidates[0], delimiter='\t')
feature_order = first_file['miRNA_ID']
features = feature_order

for label in labels:
    print('starting with label: ', label)

    manifest = pd.read_csv('data/' + label + '/MANIFEST.txt', delimiter='\t')
    files[label] = []

    for filename in manifest['filename']:
        # Annotation files are listed in the manifest but are not data files.
        if filename.endswith('annotations.txt'):
            continue

        candidate = pd.read_csv('data/' + label + '/' + filename, delimiter='\t')

        # Series.equals requires identical values in identical order, which is
        # what allows later cells to align features purely by position.
        if candidate['miRNA_ID'].equals(feature_order):
            files[label].append(filename)
        else:
            bad_files.append(filename)

if bad_files:
    print(bad_files)
else:
    print('no bad files! :)')

starting with label:  Breast Invasive Carcinoma
starting with label:  Kidney Renal Clear Cell Carcinoma
starting with label:  Lung Adenocarcinoma
starting with label:  Lung Squamous Cell Carcinoma
starting with label:  Pancreatic Adenocarcinoma
starting with label:  Uveal Melanoma
no bad files! :)


In [45]:
# Reading in files
#
# Builds `data`: one row per accepted file, one column per entry of `features`,
# plus a trailing 'label' column. Values are taken positionally from each
# file's 'reads_per_million_miRNA_mapped' column, which relies on the earlier
# check that every accepted file lists miRNA_ID in the same order as `features`.

# Rows are collected in a plain list and the frame is built once at the end.
# Appending with data.loc[len(data.index)] copies the whole frame on every
# insert, which is quadratic in the number of files.
rows = []
label_list = list()
for label in labels:
    print('working on label: ', label)
    for file in files[label]:
        file_frame = pd.read_csv('data/' + label + '/' + file, delimiter='\t')
        rows.append(file_frame['reads_per_million_miRNA_mapped'].tolist())
        label_list.append(label)

data = pd.DataFrame(rows, columns=features)
data['label'] = label_list
data.head()

working on label:  Breast Invasive Carcinoma
working on label:  Kidney Renal Clear Cell Carcinoma
working on label:  Lung Adenocarcinoma
working on label:  Lung Squamous Cell Carcinoma
working on label:  Pancreatic Adenocarcinoma
working on label:  Uveal Melanoma


miRNA_ID,hsa-let-7a-1,hsa-let-7a-2,hsa-let-7a-3,hsa-let-7b,hsa-let-7c,hsa-let-7d,hsa-let-7e,hsa-let-7f-1,hsa-let-7f-2,hsa-let-7g,...,hsa-mir-942,hsa-mir-943,hsa-mir-944,hsa-mir-95,hsa-mir-9500,hsa-mir-96,hsa-mir-98,hsa-mir-99a,hsa-mir-99b,label
0,17225.641202,17168.697999,17165.904559,37132.340301,4217.020216,410.420821,1099.541016,4846.188888,4989.513855,655.169154,...,1.93392,0.0,3.86784,5.58688,0.0,31.157602,47.703362,1462.043595,37550.067043,Breast Invasive Carcinoma
1,9675.101346,9620.924588,9710.866472,11593.826262,1817.884192,334.477994,1189.031704,4199.545275,4264.938315,511.504784,...,5.925583,0.0,1.269768,1.05814,0.0,26.136053,44.547686,668.744359,13635.189582,Breast Invasive Carcinoma
2,9947.288063,10160.137808,10204.137755,9738.288314,1366.198361,243.649708,813.999023,735.899117,752.399097,195.799765,...,0.549999,0.0,0.0,3.849995,0.0,26.399968,24.199971,228.249726,33884.359339,Breast Invasive Carcinoma
3,18022.771624,18041.827151,18067.323984,17540.076324,3884.643741,230.679238,2396.299685,8376.112098,8537.815694,681.4364,...,3.891622,0.0,1.476132,2.549683,0.0,26.033608,49.785921,1360.323117,17850.466713,Breast Invasive Carcinoma
4,4686.419964,4688.795641,4698.100379,2814.682994,323.884043,234.697147,1773.938243,4904.982301,5131.364576,262.215412,...,0.29696,0.0,0.098987,0.29696,0.0,123.535234,43.554089,89.978788,28231.760414,Breast Invasive Carcinoma


In [49]:
# Data preprocessing
#
# Shuffles the samples and holds out 20% of them as a test set, then separates
# the feature columns (`features`) from the target column ('label').

# Fixed seed so that Restart & Run All reproduces the same shuffle and the same
# train/test partition; the particular value is arbitrary.
SEED = 42

data = utils.shuffle(data, random_state=SEED)

# NOTE(review): the split is not stratified by label; consider
# stratify=data['label'] if class proportions must match in both sets.
train, test = model_selection.train_test_split(data, test_size=0.2, random_state=SEED)

X_train = train[features]
y_train = train['label']
X_test = test[features]
y_test = test['label']

miRNA_ID,hsa-let-7a-1,hsa-let-7a-2,hsa-let-7a-3,hsa-let-7b,hsa-let-7c,hsa-let-7d,hsa-let-7e,hsa-let-7f-1,hsa-let-7f-2,hsa-let-7g,...,hsa-mir-941-5,hsa-mir-942,hsa-mir-943,hsa-mir-944,hsa-mir-95,hsa-mir-9500,hsa-mir-96,hsa-mir-98,hsa-mir-99a,hsa-mir-99b
1505,4415.302601,4581.246859,4418.265891,18977.334639,1443.969048,226.480047,276.855983,1129.860272,1118.430438,382.687781,...,0.0,2.539963,0.0,0.0,5.503253,0.0,7.619889,29.632903,374.644564,11807.865335
494,7398.597818,7392.896502,7444.778478,6915.316262,2684.369633,258.649704,1198.036543,4762.499328,4911.303676,419.996948,...,0.0,2.47057,0.0,1.140263,2.090483,0.0,84.189433,48.461186,628.094983,14499.396896
2197,7415.229319,7385.274365,7510.974907,13856.646986,414.958806,1368.739231,1597.168724,3296.331308,3372.780761,566.938846,...,0.0,10.291272,0.918864,158.595859,1.653954,0.0,44.289226,156.390586,135.256723,63036.065029
729,10755.420113,10756.109585,10923.651306,20164.301203,910.792652,256.828359,1398.24943,1699.204004,1711.959238,445.054244,...,0.0,2.068416,0.0,2.068416,0.689472,0.0,39.644646,43.092007,203.049535,24656.556707
195,7557.176943,7276.388608,7396.929055,15782.998834,2509.368474,531.087027,882.781506,1939.992129,2013.73452,835.274389,...,0.0,1.418123,0.0,0.709061,2.127184,0.0,91.468927,53.179609,324.750144,21868.873266
