In [9]:
import sys
import ftplib
import pandas as pd

In [40]:
#connect to ftp and get the list of samples
ftp_url = 'ftp.pride.ebi.ac.uk'
samples_dir = 'pride/data/proteomes/proteogenomics/cell-lines'
id_dir = 'proteomics_lfq'  #sub-directory inside each sample directory
peps_file = 'out_triqler.tsv'  #file fetched from each sample's id_dir

output_file = 'identified_peptides.tsv'

#ftp starts as None so the error handler can tell whether a connection object
#was created before the failure and needs closing
ftp = None
try:
    ftp = ftplib.FTP(ftp_url, timeout=3600)
    ftp.login()  #no credentials given -> anonymous login
    ftp.cwd(samples_dir)
    samples_list = ftp.nlst()
except ftplib.all_errors as err:
    print('Failed to connect to the ftp site: {}'.format(err))
    if ftp is not None:
        ftp.close()
    #re-raise: without a connection and a sample list nothing below can run,
    #and continuing would only fail later with a misleading NameError (or
    #silently reuse a samples_list left over from an earlier run)
    raise

print(samples_list)

['PXD005946-Sample-10', 'PXD005946-Sample-11', 'PXD005946-Sample-12', 'PXD005946-Sample-13', 'PXD005946-Sample-15', 'PXD005946-Sample-16', 'PXD005946-Sample-17', 'PXD005946-Sample-18', 'PXD005946-Sample-19', 'PXD005946-Sample-20', 'PXD005946-Sample-21', 'PXD005946-Sample-22', 'PXD005946-Sample-23', 'PXD005946-Sample-24', 'PXD005946-Sample-39', 'PXD012255-Sample-1', 'PXD015270-Sample-1']


In [41]:
#increase max limit to get larger files
#(ftplib refuses to read lines longer than maxline characters)
ftplib.FTP.maxline = 100000

#get all peptides from the finished samples
#one DataFrame per sample is collected here and concatenated once after the
#loop, instead of re-copying the growing table on every iteration
sample_frames = []
for sample in samples_list:
    peps = []
    try:
        #only the download itself is guarded: a permanent FTP error here
        #means the file could not be retrieved for this sample
        ftp.retrlines('RETR {}/{}/{}'.format(sample, id_dir, peps_file), peps.append)
    except ftplib.error_perm as e: #the search is not done yet!
        print('failed to extract all lines from {}, error:{}'.format(sample, e))
        continue #to next sample

    if not peps:
        #an empty file has no header line to build the columns from
        print('no lines found in {} for {}, skipping'.format(peps_file, sample))
        continue #to next sample

    #first line is the tab-separated header, the rest are data rows
    header = peps[0].split('\t')
    rows = [line.split('\t') for line in peps[1:]]
    #every row is indexed by the sample it came from
    sample_frames.append(pd.DataFrame(data=rows,
                                      index=[sample] * len(rows),
                                      columns=header))

#pd.concat raises on an empty list, so fall back to an empty frame
peps_df = pd.concat(sample_frames) if sample_frames else pd.DataFrame()

print(peps_df.head(2))

#unique() keeps the samples in the order they were read, so the summary is
#stable between runs (a set would print them in arbitrary order)
extracted_samples = peps_df.index.unique()
print('\nExtracted {} peptides from these {} samples:{}\n'.format(len(peps_df), len(extracted_samples), '\n'.join(extracted_samples)))

failed to extract all lines from PXD005946-Sample-12, error:550 Failed to open file.
failed to extract all lines from PXD005946-Sample-17, error:550 Failed to open file.
failed to extract all lines from PXD012255-Sample-1, error:550 Failed to open file.
                    run condition charge searchScore    intensity  \
PXD005946-Sample-10  10      T47D      2   0.9166502  8.785152e05   
PXD005946-Sample-10   5      T47D      3   0.9568733  1.952683e06   

                                      peptide  \
PXD005946-Sample-10  .(Acetyl)AAALVVALESSPGGR   
PXD005946-Sample-10      .(Acetyl)AADIDQEVKER   

                                                              proteins  
PXD005946-Sample-10                  ncRNA_ENST00000438290.2_3_codon_4  
PXD005946-Sample-10  ENSP00000442318.1;altorf_ENST00000545606.6_1_c...  

Extracted 125048 peptides from these 14 samples:PXD005946-Sample-19
PXD005946-Sample-24
PXD015270-Sample-1
PXD005946-Sample-15
PXD005946-Sample-13
PXD005946-Sample-16
PXD

In [31]:
#close ftp connection
#quit() sends QUIT to the server before closing; if the connection has
#already dropped it raises, so fall back to a unilateral close()
try:
    ftp.quit()
except ftplib.all_errors:
    ftp.close()

In [43]:
#save the combined peptide table as a tab-separated file; index=False means
#the row index is not written to the file
peps_df.to_csv(path_or_buf=output_file, index=False, sep='\t')