In [10]:
import os
import pandas as pd

# Folder that holds the exported CSV query results
folder_path = './PDB-query'

# Collect the CSV files in sorted order. os.listdir() returns entries in
# arbitrary order, and the row order of the combined frame depends on the
# order in which the files are concatenated, so sorting keeps runs reproducible.
file_list = sorted(file for file in os.listdir(folder_path) if file.endswith('.csv'))

# Fail early with a clear message instead of letting pd.concat() raise
# "No objects to concatenate" on an empty list.
if not file_list:
    raise FileNotFoundError(f"No CSV files found in {folder_path!r}")

# Loaded frames, one per CSV file
dataframes = []

# Load each CSV file; header=1 takes the column names from the second line
# of the file and skips the line before it.
for file in file_list:
    file_path = os.path.join(folder_path, file)
    df = pd.read_csv(file_path, header=1)
    dataframes.append(df)

# Stack all files into a single frame with a fresh 0..n-1 index
combined_df = pd.concat(dataframes, ignore_index=True)

In [2]:
# Remove the identifier, database/accession and molecular-weight columns,
# plus the 'Unnamed: 10' column.
# errors='ignore' keeps this cell re-runnable: it reassigns combined_df, so a
# second execution would otherwise raise KeyError for the already-removed labels.
combined_df = combined_df.drop(
    columns=[
        'Entry ID', 'Entity ID', 'Asym ID',
        'Auth Asym ID', 'Database Name',
        'Accession Code(s)', 'Molecular Weight (Entity)',
        'Unnamed: 10',
    ],
    errors='ignore',
)

In [3]:
# Keep only the rows whose macromolecule type is an L- or D-polypeptide
polypeptide_types = ['polypeptide(L)', 'polypeptide(D)']
is_polypeptide = combined_df['Entity Macromolecule Type'].isin(polypeptide_types)
filtered_df = combined_df[is_polypeptide]

In [4]:
# The type column is no longer needed once the rows have been filtered on it.
# errors="ignore" keeps this cell re-runnable: it reassigns filtered_df, so a
# second execution would otherwise raise KeyError for the missing label.
filtered_df = filtered_df.drop(columns=["Entity Macromolecule Type"], errors="ignore")

In [5]:
# Persist the filtered table; the DataFrame index is written as the first,
# unnamed column of the CSV.
output_csv = "PDB_sequences-all-200k.csv"
filtered_df.to_csv(output_csv)

In [6]:
# Sanity check: (rows, columns) of the filtered table
filtered_df.shape

(387550, 2)

In [7]:
# Display the filtered table (the notebook renders a truncated view)
filtered_df

Unnamed: 0,Sequence,Polymer Entity Sequence Length
1,TSRPYACELCAKQFQSPSTLKMHMRCHTGEKPYQCKTCGRCFSVQG...,119.0
3,TSRPYACELCAKQFQSPSTLKMHMRCHTGEKPYQCKTCGRCFSVQG...,119.0
6,GSHMKQLEDKVEELLSKNYHLENEVARLKYKRNQEEIETYYEYTLK...,250.0
8,GSHMLDGCTPWPAEFAVRYREAGYWTGETFSDFVTDRTRRFADRLA...,546.0
9,GSWGQCSTGSIQCCQNVVPGDSDLGTLLLDELGIVLEDPTVLIGDG...,84.0
...,...,...
416617,MHHHHHHSSGVDLGTENLYFQSMSYDIQAWKKQCEELLNLIFQCED...,149.0
416618,MHHHHHHSSGVDLGTENLYFQSMSYDIQAWKKQCEELLNLIFQCED...,149.0
416619,MHHHHHHSSGVDLGTENLYFQSMSYDIQAWKKQCEELLNLIFQCED...,149.0
416620,MHHHHHHSSGVDLGTENLYFQSMSYDIQAWKKQCEELLNLIFQCED...,149.0


In [12]:
# Same polypeptide filter as used for filtered_df, kept under a separate
# name so the rows can be inspected independently.
test = combined_df.loc[
    combined_df['Entity Macromolecule Type'].isin(['polypeptide(L)', 'polypeptide(D)'])
]

In [30]:
# (rows, columns) of the polypeptide rows whose Entity ID equals 1.
# NOTE(review): this needs the 'Entity ID' column, which the column-drop cell
# removes from combined_df; it only works when `test` was built from freshly
# loaded data (the recorded output has all 11 columns).
test.loc[test["Entity ID"].eq(1.0)].shape

(197239, 11)

In [31]:
# Display the polypeptide subset (the notebook renders a truncated view)
test

Unnamed: 0,Entry ID,Entity ID,Asym ID,Auth Asym ID,Database Name,Accession Code(s),Sequence,Polymer Entity Sequence Length,Entity Macromolecule Type,Molecular Weight (Entity),Unnamed: 10
1,,1.0,A,A,UniProt,Q8NAP3,TSRPYACELCAKQFQSPSTLKMHMRCHTGEKPYQCKTCGRCFSVQG...,119.0,polypeptide(L),14.235,
3,6E94,1.0,A,A,UniProt,Q8NAP3,TSRPYACELCAKQFQSPSTLKMHMRCHTGEKPYQCKTCGRCFSVQG...,119.0,polypeptide(L),14.263,
6,6E95,1.0,"A, B","A, B",UniProt,P03069,GSHMKQLEDKVEELLSKNYHLENEVARLKYKRNQEEIETYYEYTLK...,250.0,polypeptide(L),28.844,
8,6E97,1.0,"A, B","A, B",UniProt,Q47NR5,GSHMLDGCTPWPAEFAVRYREAGYWTGETFSDFVTDRTRRFADRLA...,546.0,polypeptide(L),59.746,
9,6E98,1.0,A,A,UniProt,K5VRK4,GSWGQCSTGSIQCCQNVVPGDSDLGTLLLDELGIVLEDPTVLIGDG...,84.0,polypeptide(L),8.286,
...,...,...,...,...,...,...,...,...,...,...,...
416617,7FVM,1.0,A,A,UniProt,Q8WWQ0,MHHHHHHSSGVDLGTENLYFQSMSYDIQAWKKQCEELLNLIFQCED...,149.0,polypeptide(L),17.628,
416618,7FVN,1.0,A,A,UniProt,Q8WWQ0,MHHHHHHSSGVDLGTENLYFQSMSYDIQAWKKQCEELLNLIFQCED...,149.0,polypeptide(L),17.628,
416619,7FVO,1.0,A,A,UniProt,Q8WWQ0,MHHHHHHSSGVDLGTENLYFQSMSYDIQAWKKQCEELLNLIFQCED...,149.0,polypeptide(L),17.628,
416620,7FVP,1.0,A,A,UniProt,Q8WWQ0,MHHHHHHSSGVDLGTENLYFQSMSYDIQAWKKQCEELLNLIFQCED...,149.0,polypeptide(L),17.628,
