# Get Probe Annotation Function

In [82]:
def gene_ids(filename):
# gene_ids returns a list containing the probe annotation data including 
# Probe ID (ID) 
# GeneBank accession number (GB_ACC) 
# Gene symbol (GENE_SYMBOL) 
# Unigene Cluster ID (UNIGENE) 
# Descriptive gene name (GENE_NAME) 
# Entrez Gene ID (ENTREZ_GENE)
    with open(filename) as f: 
        gene_ids = []
        test = False
        for line in f:
            if line.strip() == '!platform_table_begin':
                test = True
            elif line.strip() == '!platform_table_end':
                return gene_ids
            elif test:
                gene_ids.append(line.strip())

#### Test Case for gene_ids (Thymus, Agemap, GSE7829.txt)

Note: GSE7829.txt originally downloaded from GEO as GSE7829_family.soft file.

Only changed to file was extension to '.txt'.

Series_matrix_file contains Z-scores of raw values for each array

In [78]:
test = gene_ids(r'GSE7829.txt')

In [79]:
len(test)

8449

In [80]:
test[0:2]

['ID\tGB_ACC\tGENE_SYMBOL\tUNIGENE\tGENE_NAME\tENTREZ_GENE',
 '1\tAA407331\tNo Value\tNo Value\tNo Value\tNo Value']

In [84]:
test[50] # look to see that data contains expected info

'50\tBG077957\tPdcd6\tMm.24254\tProgrammed cell death 6\t18570'

In [86]:
test[8448] # expected value BG070587 with No Value for all entries

'8448\tBG070587\tNo Value\tNo Value\tNo Value\tNo Value'

# Get Sample ID Function

In [87]:
def sample_id(filename):    
# sample_ids returns a list containing each sample id present in file    
    with open(filename) as f:
        sample_id = []
        for line in f:
            if '^SAMPLE' in line:
                sample_id.append(line.rstrip('/n')[9:])
        return sample_id

#### Test Case for sample_ids (Thymus, Agemap, GSE7829.txt)

In [89]:
second_test = sample_id(r'GSE7829.txt')

In [90]:
second_test

[' GSM189997\n',
 ' GSM189998\n',
 ' GSM189999\n',
 ' GSM190000\n',
 ' GSM190001\n',
 ' GSM190002\n',
 ' GSM190003\n',
 ' GSM190004\n',
 ' GSM190005\n',
 ' GSM190006\n',
 ' GSM190007\n',
 ' GSM190008\n',
 ' GSM190009\n',
 ' GSM190010\n',
 ' GSM190011\n',
 ' GSM190012\n',
 ' GSM190013\n',
 ' GSM190014\n',
 ' GSM190015\n',
 ' GSM190016\n',
 ' GSM190017\n',
 ' GSM190018\n',
 ' GSM190019\n',
 ' GSM190020\n',
 ' GSM190021\n',
 ' GSM190022\n',
 ' GSM190023\n',
 ' GSM190024\n',
 ' GSM190025\n',
 ' GSM190026\n',
 ' GSM190027\n',
 ' GSM190028\n',
 ' GSM190029\n',
 ' GSM190030\n',
 ' GSM190031\n',
 ' GSM190032\n',
 ' GSM190033\n',
 ' GSM190034\n',
 ' GSM190035\n',
 ' GSM190036\n',
 ' GSM190037\n',
 ' GSM190038\n',
 ' GSM190039\n',
 ' GSM190040\n',
 ' GSM190041\n',
 ' GSM190042\n',
 ' GSM190043\n',
 ' GSM190044\n',
 ' GSM190045\n',
 ' GSM190046\n',
 ' GSM190047\n',
 ' GSM190048\n',
 ' GSM190049\n',
 ' GSM190050\n',
 ' GSM190051\n',
 ' GSM190052\n',
 ' GSM190053\n',
 ' GSM190054\n',
 ' GSM190055\n

In [92]:
len(second_test) # expected 134 from combined platforms A/B

134

# Get Raw and Z-Transformed Data Function

In [128]:
def data_extractor(filename):    
    dataset = {}
    with open(filename) as f:
        array_data = []
        test = False
        for line in f:
            if '^SAMPLE' in line:
                thekey = line.rstrip('/n')[10:]
                if thekey not in dataset:
                    dataset[thekey] = []
                    continue
            if '!sample_table_begin' in line:
                test = True
                continue
            if test:
                array_data.append(line)
            if '!sample_table_end' in line:
                dataset[thekey] = array_data[:-1]
                test = False
                array_data = []
                continue
        return dataset

In [129]:
third_test = data_extractor(r'GSE7829.txt')

In [130]:
third_test.keys()

dict_keys(['GSM189998\n', 'GSM190076\n', 'GSM190003\n', 'GSM190006\n', 'GSM190128\n', 'GSM190067\n', 'GSM190122\n', 'GSM190046\n', 'GSM190055\n', 'GSM190066\n', 'GSM190028\n', 'GSM190120\n', 'GSM190106\n', 'GSM190124\n', 'GSM190025\n', 'GSM190035\n', 'GSM190060\n', 'GSM190084\n', 'GSM190073\n', 'GSM190115\n', 'GSM190098\n', 'GSM190129\n', 'GSM190030\n', 'GSM190105\n', 'GSM190126\n', 'GSM190002\n', 'GSM190114\n', 'GSM190127\n', 'GSM190039\n', 'GSM190096\n', 'GSM190019\n', 'GSM190029\n', 'GSM190101\n', 'GSM190080\n', 'GSM190069\n', 'GSM190023\n', 'GSM190108\n', 'GSM190059\n', 'GSM190123\n', 'GSM190086\n', 'GSM190047\n', 'GSM190087\n', 'GSM190041\n', 'GSM190104\n', 'GSM190112\n', 'GSM190000\n', 'GSM190021\n', 'GSM190063\n', 'GSM190103\n', 'GSM190051\n', 'GSM190043\n', 'GSM190013\n', 'GSM190012\n', 'GSM190075\n', 'GSM190119\n', 'GSM190016\n', 'GSM190050\n', 'GSM190099\n', 'GSM190053\n', 'GSM190007\n', 'GSM190088\n', 'GSM190130\n', 'GSM190083\n', 'GSM190020\n', 'GSM190017\n', 'GSM190040\n',

In [131]:
len([key for key in third_test]) # expected 134 unique sample id's from thymus dataset

134

In [134]:
len(third_test['GSM190019\n'])

8449

In [135]:
fourth_test = third_test['GSM190019\n']

In [136]:
fourth_test[-1] # expected last element of GSM190019 to be 8448, 1.63, 5842.98

'8448\t1.63\t5842.98\n'

In [137]:
fourth_test[0:2] # expected first elements to be list headers ID_REF, VALUE, RAW

['ID_REF\tVALUE\tRAW\n', '1\t1.68\t6190.01\n']

In [138]:
fifth_test = third_test['GSM190097\n'] 

In [139]:
fifth_test[-1] # expected last elements of GSM190097 to be 16896, -0.39, 241.84

'16896\t-0.39\t241.84\n'

In [140]:
sixth_test = third_test['GSM190130\n']

In [141]:
sixth_test[-1] # expected last elements of GSM190130 to be 16896, -0.20, 250.57

'16896\t-0.20\t250.57\n'

#### ALL TESTS PASSED!!!!

# FUNCTION TO MERGE DATA SETS AND WRITE TO LARGE TXT FILE