In [65]:

# The %... is an iPython thing, and is not part of the Python language.
# In this case we're just telling the plotting library to draw things on
# the notebook, instead of on a separate window.
%matplotlib inline
# See all the "as ..." contructs? They're just aliasing the package names.
# That way we can call methods like plt.plot() instead of matplotlib.pyplot.plot().
import numpy as np
import scipy as sp
import matplotlib as mpl
import matplotlib.cm as cm
import matplotlib.pyplot as plt
import pandas as pd
import time
pd.set_option('display.width', 500)
pd.set_option('display.max_columns', 100)
pd.set_option('display.notebook_repr_html', True)
import seaborn as sns
sns.set_style("whitegrid")
sns.set_context("poster")
import urllib2
import re
from subprocess import PIPE, Popen
import os

In [79]:
geo_noncancer_list = []
other_noncancer_list = []
#read the non-cancer dataset list from csv file
df=pd.read_csv('paper_material/dna-methylation-non-cancer.csv')
#extract the geo identifiers
for i in df.Availability : 
    if re.search("GSE",i):
        geo_noncancer_list.append(i.replace(" ",""))
    else:
        other_noncancer_list.append(i.replace(" ",""))
geo_noncancer_list = list(set(geo_noncancer_list))
print len(geo_noncancer_list)

51


In [80]:
#ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE40nnn/GSE40700/matrix/GSE40700_series_matrix.txt.gz
def check_urllink(ftplink):
    #check if the ftp link exists and check for edge cases 
    try:
        urllib2.urlopen(ftplink)
        return ftplink
    except urllib2.HTTPError, e:
        print(e.code)
        return 1
    except urllib2.URLError, e:
        print(e.args)
        return 1
    
#iterate over the dataset list and fetch files.
def fetch_datasets(geo_noncancer_list):
    gse_noncancer_edge=[]
    failed_fetch_list = []
    for i in geo_noncancer_list:
        key_num = i.split("GSE")[1]
        url_key= "GSE"+key_num[0:2]
        file_n = i + "_series_matrix.txt.gz"
        url = "ftp://ftp.ncbi.nlm.nih.gov/geo/series/"+url_key.replace(" ","")+ "nnn/" + i + "/matrix/" 
        ftplink= (url + file_n).replace(" ","")
        result = check_urllink(ftplink)
        if result == 1:
            gse_noncancer_edge.append(i)
        else:
            print i, " success"
            '''pfetch = Popen("ftp " + ftplink,stdout=PIPE,stderr=PIPE,shell=True,close_fds=True)
            std_out, std_err = pfetch.communicate()
            exit_code = pfetch.returncode
            if exit_code:
                failed_fetch_list.append(i)'''
            #os.system("ftp " + ftplink)
    return gse_noncancer_edge, failed_fetch_list        

In [81]:
gse_noncancer_edge, failed_fetch_list = fetch_datasets(geo_noncancer_list)

GSE28746  success
GSE40279  success
GSE38873  success
GSE27317  success
GSE41826  success
GSE36064  success
('ftp error: [Errno ftp error] 550 GSE15745_series_matrix.txt.gz: No such file or directory',)
GSE37988  success
GSE19711  success
GSE20242  success
GSE32149  success
GSE26974  success
GSE32146  success
GSE38291  success
GSE41782  success
GSE40700  success
GSE37008  success
GSE30601  success
GSE41037  success
GSE30870  success
GSE43269  success
GSE38608  success
GSE34639  success
GSE27097  success
GSE26126  success
GSE30653  success
GSE20236  success
GSE36642  success
GSE37066  success
GSE22595  success
GSE42865  success
GSE35069  success
GSE47627  success
GSE42861  success
GSE36812  success
GSE36166  success
GSE34257  success
GSE34035  success
GSE31848  success
GSE25892  success
GSE42700  success
GSE42510  success
GSE26033  success
GSE30456  success
GSE41169  success
GSE32393  success
GSE30758  success
GSE20067  success
GSE17448  success
GSE44667  success
GSE30090  success


In [71]:
print "number of GEO non-cancer edge cases", len(gse_noncancer_edge)
print "number of other non-cancer sets", len(other_noncancer_list)
print "number of failed fetches for normal GEO data links" , len(failed_fetch_list)


number of GEO non-cancer edge cases 1
number of other non-cancer sets 19
number of failed fetches for normal GEO data links 0


In [166]:
import glob
import gzip
from itertools import islice

def create_methyldf(file1):
    #approx number of lines to fetch tissue information
    N = 2000
    maindf=pd.read_csv(file1,comment="!",sep="\t",index_col='ID_REF')
    f = gzip.open(file1, 'rb')
    #Reading first 2000 lines, header lines are inconsistent but cannot contain such large number. 
    headerlines = islice(f, N)
    for i in headerlines:
        if re.search('!Sample_source_name_ch1',i):
            #tissue_arr = i.replace("!Sample_source_name_ch1",'').split("\t")
            tissue_arr = i.split("\t")
            tissuedf = pd.DataFrame({'tissue_src':tissue_arr[1:]})
            break
    return maindf, tissuedf

In [177]:
datasets = glob.glob("data/geo-noncancer/*txt.gz")
#for now commenting this section out, we will only test one file
'''for file1 in datasets:
    maindf, tissuedf = create_methydf(file1)'''
file1 = 'data/geo-noncancer/GSE20067_series_matrix.txt.gz'
#file1 = 'data/geo-noncancer/GSE15745-GPL8490_series_matrix.txt.gz'
maindf, tissuedf = create_methyldf(file1)

we need to transpose maindf and concatenate with tissuedf to have all the tissue data associated with samples in one place.  Easy for group by operations. For now, just printing maindf and tissuedf below.

In [176]:
maindf

Unnamed: 0_level_0,GSM501487,GSM501488,GSM501489,GSM501490,GSM501491,GSM501492,GSM501493,GSM501494,GSM501495,GSM501496,GSM501497,GSM501498,GSM501499,GSM501500,GSM501501,GSM501502,GSM501503,GSM501504,GSM501505,GSM501506,GSM501507,GSM501508,GSM501509,GSM501510,GSM501511,GSM501512,GSM501513,GSM501514,GSM501515,GSM501516,GSM501517,GSM501518,GSM501519,GSM501520,GSM501521,GSM501522,GSM501523,GSM501524,GSM501525,GSM501526,GSM501527,GSM501528,GSM501529,GSM501530,GSM501531,GSM501532,GSM501533,GSM501534,GSM501535,GSM501536,...,GSM501632,GSM501633,GSM501634,GSM501635,GSM501636,GSM501637,GSM501638,GSM501639,GSM501640,GSM501641,GSM501642,GSM501643,GSM501644,GSM501645,GSM501646,GSM501647,GSM501648,GSM501649,GSM501650,GSM501651,GSM501652,GSM501653,GSM501654,GSM501655,GSM501656,GSM501657,GSM501658,GSM501659,GSM501660,GSM501661,GSM501662,GSM501663,GSM501664,GSM501665,GSM501666,GSM501667,GSM501668,GSM501669,GSM501670,GSM501671,GSM501672,GSM501673,GSM501674,GSM501675,GSM501676,GSM501677,GSM501678,GSM501679,GSM501680,GSM501681
ID_REF,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1,Unnamed: 82_level_1,Unnamed: 83_level_1,Unnamed: 84_level_1,Unnamed: 85_level_1,Unnamed: 86_level_1,Unnamed: 87_level_1,Unnamed: 88_level_1,Unnamed: 89_level_1,Unnamed: 90_level_1,Unnamed: 91_level_1,Unnamed: 92_level_1,Unnamed: 93_level_1,Unnamed: 94_level_1,Unnamed: 95_level_1,Unnamed: 96_level_1,Unnamed: 97_level_1,Unnamed: 98_level_1,Unnamed: 99_level_1,Unnamed: 100_level_1,Unnamed: 101_level_1
cg00000292,0.7450658,0.6817859,0.7086142,0.850587,0.894222,0.8384137,0.7854356,0.817623,0.8269932,0.8282334,0.8093318,0.8289963,0.8456739,0.8053415,0.8522336,0.8036212,0.819,0.8091733,0.8130969,0.8168403,0.7903646,0.7294981,0.8586541,0.8595303,0.8641366,0.7496689,0.8380737,0.7861302,0.7248227,0.7707404,0.7401157,0.8257223,0.7671681,0.8584147,0.8219367,0.8143022,0.75,0.8176665,0.8270572,0.8071842,0.8077446,0.809795,0.816152,0.7286968,0.8259136,0.8573334,0.7468737,0.8132734,0.7831825,0.8465495,...,0.8213097,0.7963875,0.7621998,0.7332293,0.8109366,0.8014842,0.8305476,0.7955233,0.7799876,0.7092449,0.7635025,0.7729694,0.7755539,0.7010551,0.6783349,0.6726886,0.7894737,0.7633726,0.3298969,0.7957983,0.7968606,0.6328012,0.5617021,0.7296768,0.7348485,0.6497396,0.7377819,0.8277512,0.8469914,0.7077315,0.7916386,0.7919109,0.7793765,0.832618,0.8557323,0.8129197,0.8102241,0.8360378,0.8528654,0.7496274,0.8586239,0.827748,0.8143048,0.8440703,0.8672183,0.8380041,0.8750833,0.8170009,0.7350944,0.8591328
cg00002426,0.7645161,0.7158301,0.8224734,0.7941364,0.7883087,0.829765,0.8144186,0.8180991,0.7498341,0.7288609,0.7437223,0.7602639,0.7754078,0.8109552,0.8236727,0.8055925,0.7740793,0.779402,0.7778447,0.6832765,0.7757921,0.8471154,0.8368915,0.7676259,0.8153682,0.7935035,0.8331429,0.7884498,0.7873183,0.8205128,0.8007419,0.7872341,0.8386483,0.8348008,0.8238532,0.7807592,0.7657841,0.797676,0.7736298,0.7727273,0.7151849,0.7946928,0.8215266,0.8342375,0.826178,0.8186302,0.8634051,0.6944206,0.8698558,0.75811,...,0.7101293,0.7480226,0.7448559,0.7468085,0.7766234,0.7827381,0.7402827,0.8239228,0.767364,0.7834483,0.8,0.7642857,0.7556237,0.7146433,0.7970732,0.6801427,0.7187808,0.7073171,0.6094069,0.7001128,0.6957123,0.6260032,0.5515587,0.7785433,0.784396,0.6100841,0.7280488,0.8239896,0.7928389,0.768797,0.7778846,0.8058902,0.7346514,0.8186453,0.8169717,0.7895414,0.8328004,0.880655,0.8391249,0.8446327,0.8592364,0.851864,0.809823,0.8461199,0.8265583,0.7975575,0.7604938,0.9015692,0.7930062,0.848166
cg00003994,0.2880215,0.2041932,0.2,0.1823755,0.1812245,0.1212121,0.1278882,0.1542253,0.2297078,0.1818996,0.1806209,0.1902874,0.1864275,0.1698473,0.193007,0.1398437,0.170339,0.2266527,0.2223169,0.1436123,0.2105263,0.2053291,0.1552483,0.1494141,0.1083004,0.2109557,0.1246032,0.2117963,0.1909354,0.1934959,0.148686,0.1628866,0.1556728,0.1624088,0.1734213,0.1739447,0.2681992,0.1499203,0.2151482,0.150289,0.2954048,0.2050833,0.2433735,0.2405063,0.1376518,0.2029851,0.1619355,0.2362288,0.2207715,0.1792693,...,0.2071429,0.189781,0.2837209,0.2174419,0.1875,0.2171779,0.1274725,0.1601273,0.1181102,0.138245,0.1340524,0.1809524,0.1576227,0.1996616,0.2296296,0.1689441,0.1878225,0.1845238,0.2698795,0.2238806,0.1551205,0.2318841,0.2583333,0.1720818,0.1708861,0.2659381,0.1824324,0.1437909,0.1410382,0.2205683,0.1767554,0.1268908,0.1272345,0.1717724,0.1184211,0.09774882,0.1131742,0.07986447,0.08927424,0.09357923,0.1560219,0.07921929,0.1235955,0.129085,0.1508197,0.09598031,0.1034226,0.07397737,0.1078509,0.09951587
cg00005847,0.1762134,0.1732909,0.2127822,0.1773224,0.1555036,0.2586588,0.2190753,0.2076304,0.2133205,0.1649233,0.1847015,0.211938,0.2407746,0.1908669,0.1736043,0.1932682,0.2302968,0.1497191,0.18976,0.1976436,0.2211713,0.1591038,0.1587237,0.1100478,0.1752222,0.1445131,0.1835473,0.2172785,0.1917225,0.1420815,0.171985,0.2928548,0.2026266,0.1715508,0.2296395,0.1516449,0.2218128,0.2176052,0.2042813,0.1968415,0.2601377,0.2039496,0.2159287,0.2870559,0.2171926,0.1643607,0.1671587,0.2313916,0.2542042,0.2283024,...,0.1465354,0.1917984,0.1902174,0.1980924,0.1948203,0.1937477,0.1966732,0.1798992,0.1898815,0.2004096,0.1725937,0.2036156,0.2010724,0.1981383,0.1806073,0.2277462,0.183941,0.2531095,0.343955,0.1681932,0.2350706,0.2510149,0.2339707,0.1523207,0.1532,0.2152493,0.1568908,0.1603075,0.1609421,0.1760887,0.1889566,0.1762568,0.1598952,0.1810802,0.1309592,0.2845588,0.1624238,0.1761948,0.1697063,0.1511296,0.1327485,0.1204998,0.1278668,0.162511,0.1158455,0.1679731,0.113879,0.1457286,0.1891377,0.1979577
cg00006414,0.2406111,0.1321462,0.1997292,0.1558609,0.1503889,0.1923251,0.1299709,0.1892562,0.191689,0.1434821,0.1578947,0.189781,0.1702396,0.1629779,0.1769448,0.1859485,0.1893491,0.1593006,0.214703,0.196701,0.1423221,0.1414691,0.1916606,0.1729167,0.1672055,0.2100508,0.1169715,0.1825813,0.1743221,0.1528384,0.1742682,0.1488908,0.1437934,0.1443228,0.1097301,0.1419244,0.2383991,0.1686593,0.2051492,0.227066,0.2574688,0.255466,0.2285149,0.1947326,0.1705749,0.1940195,0.1665601,0.2057113,0.1775266,0.2358444,...,0.1232571,0.1333333,0.184874,0.174685,0.1446446,0.1903266,0.1759494,0.1716172,0.1490978,0.1524598,0.2019802,0.1814375,0.1661538,0.1690141,0.167328,0.2098009,0.2114785,0.1985185,0.4466258,0.2025886,0.1394065,0.2398815,0.2232747,0.151071,0.1597701,0.197378,0.1580098,0.1709717,0.156304,0.1127737,0.1546811,0.153112,0.1550498,0.23219,0.1337611,0.06187138,0.07822553,0.04932735,0.05726341,0.05811369,0.06853967,0.05022458,0.05719733,0.07337237,0.04782848,0.06864785,0.08174289,0.08267801,0.1179818,0.06479736
cg00007981,0.1082902,0.09004093,0.06759546,0.07819992,0.07059994,0.05051857,0.07845986,0.06199366,0.09944191,0.05378062,0.08152174,0.05975522,0.04684318,0.06156015,0.05259698,0.08187135,0.06842312,0.07790368,0.06175018,0.06452973,0.07479464,0.05469047,0.05422222,0.04678363,0.04604262,0.04916114,0.04737839,0.06950673,0.04809763,0.05613057,0.03318584,0.06062819,0.05727763,0.05119644,0.06315382,0.04658583,0.05735528,0.04072044,0.06326964,0.06434108,0.0793312,0.07628866,0.08233789,0.1130155,0.05795352,0.05609146,0.04626334,0.05947955,0.05602801,0.05043695,...,0.07076719,0.06504065,0.08263198,0.06670224,0.05535437,0.06564799,0.07058088,0.04616588,0.05741627,0.04958678,0.06535488,0.0540054,0.05181347,0.07448276,0.09634318,0.08484456,0.05310621,0.06065858,0.2256097,0.07953773,0.07853403,0.08709677,0.1377245,0.04643206,0.05710241,0.06699929,0.07469717,0.04669811,0.04822778,0.05581181,0.0520175,0.05352591,0.05998868,0.05400697,0.06684492,0.04766399,0.05633803,0.0392198,0.03574811,0.03915023,0.05837626,0.06027907,0.05909752,0.04125326,0.04195804,0.06828745,0.06019307,0.06555513,0.05787424,0.04243176
cg00008493,0.9493656,0.9561678,0.9614012,0.9670108,0.9484871,0.9553334,0.9488436,0.9582141,0.9390407,0.9496972,0.9551049,0.95902,0.9501087,0.9443041,0.9459545,0.9309537,0.9428968,0.9465551,0.9429644,0.9500429,0.9470129,0.9524735,0.9513249,0.9618924,0.9579272,0.9598323,0.9488893,0.9507834,0.9592328,0.9576535,0.958698,0.9608967,0.9572526,0.9516734,0.9608039,0.9567311,0.9423843,0.953243,0.9494447,0.9329194,0.9539401,0.9434409,0.9466006,0.9465759,0.9452068,0.9476952,0.9487695,0.9562296,0.9571684,0.9558107,...,0.9667188,0.9605739,0.9597242,0.9554473,0.9633517,0.9592476,0.9584756,0.9598132,0.9557269,0.9683791,0.958654,0.9549757,0.9499246,0.9519047,0.9598477,0.9491741,0.9498388,0.9469697,0.7108434,0.9528959,0.9568042,0.9702884,0.9550394,0.9640322,0.9578186,0.9639377,0.9638695,0.9675777,0.9610351,0.9656782,0.9676468,0.9621789,0.9650901,0.9585479,0.9465621,0.9746382,0.9768856,0.976245,0.9774122,0.9746366,0.9790815,0.9689292,0.9682993,0.9763979,0.9782133,0.9714704,0.9673973,0.9738263,0.9725274,0.9706984
cg00008713,0.03946289,0.05534143,0.03953338,0.03646573,0.0410409,0.03823367,0.03738202,0.0341444,0.03638071,0.03337374,0.03350957,0.03311393,0.04250677,0.04076682,0.03959877,0.04426425,0.04194138,0.04578741,0.04443313,0.04015869,0.03880225,0.04615929,0.03818072,0.0360521,0.03200428,0.03754266,0.04447711,0.03594207,0.03938004,0.03106377,0.03486673,0.0311864,0.03695787,0.03341442,0.02790144,0.03760543,0.04652263,0.04224342,0.0464283,0.04828865,0.05292226,0.03709147,0.03557692,0.046306,0.04655855,0.03697955,0.044895,0.0377215,0.03743243,0.03373065,...,0.03044,0.03574565,0.02909409,0.04333606,0.02895944,0.02983088,0.03414512,0.03026084,0.03774792,0.03179425,0.02184376,0.03025961,0.03347245,0.02317537,0.03417382,0.03804085,0.04261314,0.04101715,0.1349107,0.03416856,0.03062417,0.02986888,0.02435111,0.03119367,0.03556742,0.02781806,0.02720807,0.02881229,0.03557284,0.02983076,0.02909711,0.03677166,0.0341281,0.03643595,0.0394525,0.01465931,0.01649418,0.01956968,0.01680108,0.01876344,0.02200732,0.0223976,0.01935024,0.019123,0.02377814,0.02167638,0.02671562,0.0322393,0.02724094,0.03113027
cg00009407,0.0850601,0.1469164,0.1151924,0.1157904,0.1132865,0.09066109,0.1101332,0.1089125,0.1011106,0.08953837,0.1125392,0.1058555,0.1336988,0.1169522,0.1208448,0.1335267,0.1400294,0.13365,0.1333479,0.1219512,0.09789695,0.08744471,0.1249105,0.1184966,0.1408969,0.1113057,0.09172473,0.1111957,0.1141395,0.119802,0.09838873,0.09976777,0.09152864,0.1275502,0.08985999,0.1149831,0.1456984,0.1329426,0.1008542,0.1027592,0.1225792,0.1253217,0.1403425,0.1189991,0.120485,0.1293402,0.1231978,0.1108202,0.1353792,0.1165734,...,0.07194245,0.1068313,0.09093776,0.117552,0.1067876,0.1161442,0.0722365,0.1241852,0.1004436,0.06909321,0.06839462,0.1002341,0.1074495,0.09079776,0.1198929,0.1030496,0.1158537,0.1084475,0.4392014,0.08359957,0.09232463,0.1165983,0.06732765,0.130068,0.1198068,0.1162909,0.1409051,0.1047093,0.1058747,0.1101579,0.1206333,0.09206696,0.1086253,0.1107191,0.07223476,0.01569428,0.0181188,0.01689492,0.01646294,0.01471176,0.01870306,0.01742574,0.01523608,0.01459538,0.01913378,0.01889449,0.01811758,0.02122855,0.02232639,0.02053566
cg00010193,0.6494058,0.6077065,0.6343441,0.626888,0.6266056,0.7045891,0.6602054,0.6625449,0.683715,0.642929,0.6694009,0.6086005,0.6417408,0.6705715,0.6606622,0.6604648,0.6218338,0.6719512,0.5992351,0.6646284,0.6513865,0.6336074,0.6403621,0.6168432,0.7149627,0.6703547,0.6473905,0.635127,0.6588649,0.6890182,0.6421662,0.671285,0.6341168,0.6413493,0.6787782,0.6253884,0.648738,0.6270241,0.6212682,0.6657378,0.6406356,0.6194836,0.7038405,0.644749,0.6927588,0.6635553,0.6905475,0.6709577,0.6708547,0.6411138,...,0.6109958,0.580662,0.5912662,0.6440338,0.6958305,0.639074,0.6779256,0.6170684,0.5843776,0.6636413,0.688739,0.7033432,0.6673641,0.6330541,0.6368272,0.6383992,0.6714017,0.6405677,0.1880854,0.6042331,0.5710297,0.5791447,0.6553329,0.6104368,0.6056776,0.625254,0.6216872,0.6221864,0.6126495,0.6264835,0.6462311,0.6388081,0.622058,0.5573181,0.621334,0.6091846,0.6418512,0.6392862,0.6090105,0.5944842,0.6006892,0.5893969,0.5966964,0.6291556,0.6058047,0.5526965,0.6390413,0.5883995,0.5545971,0.5655535


In [178]:
tissuedf

Unnamed: 0,tissue_src
0,"""genomic DNA from whole blood DDC117"""
1,"""genomic DNA from whole blood DN331"""
2,"""genomic DNA from whole blood DDC129"""
3,"""genomic DNA from whole blood DC478"""
4,"""genomic DNA from whole blood DC147"""
5,"""genomic DNA from whole blood DN120"""
6,"""genomic DNA from whole blood DN178"""
7,"""genomic DNA from whole blood DC133"""
8,"""genomic DNA from whole blood DN199"""
9,"""genomic DNA from whole blood DN150"""


In [None]:
#ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE38nnn/GSE38291/suppl/GSE38291_signal_intensities.txt.gz
from urllib2 import urlopen
import ftplib

def ftp_listfiles(ftplink):
    files = []
    ftp = ftplib.FTP("www.python.org")
    try:
        files = ftp.nlst()
    except ftplib.error_perm, resp:
        if str(resp) == "550 No files found":
            print "No files in this directory"
        else:
            raise
    for f in files:
        print f


def fetch_signalfiles(geo_noncancer_list):
    gse_noncancer_edge=[]
    failed_fetch_list = []
    for i in geo_noncancer_list:
        key_num = i.split("GSE")[1]
        url_key= "GSE"+key_num[0:2]
        file_n = i + "_series_matrix.txt.gz"
        url = "ftp://ftp.ncbi.nlm.nih.gov/geo/series/"+url_key.replace(" ","")+ "nnn/" + i + "/suppl/"
        print url
        ftp_listfiles(url)
        ftplink= (url + file_n).replace(" ","")
        if result == 1:
            gse_noncancer_edge.append(i)
        else:
            print i, " success"
            '''pfetch = Popen("ftp " + ftplink,stdout=PIPE,stderr=PIPE,shell=True,close_fds=True)
            std_out, std_err = pfetch.communicate()
            exit_code = pfetch.returncode
            if exit_code:
                failed_fetch_list.append(i)'''
            #os.system("ftp " + ftplink)
    return gse_noncancer_edge, failed_fetch_list    

In [None]:
fetch_signalfiles(geo_noncancer_list)