# Lesson 11: Files and I/O

## Reading a file

In [47]:
import os
import glob

In [1]:
# open() hands back a text-mode file object; show its type
with open('data/1OLG.pdb', 'r') as f:
    file_type = type(f)
    print(file_type)

<class '_io.TextIOWrapper'>


In [2]:
# Read the whole file into one string
with open('data/1OLG.pdb', 'r') as f:
    f_str = f.read()

# Show just the first 1000 characters
preview_length = 1000
f_str[:preview_length]

'HEADER    ANTI-ONCOGENE                           13-JUN-94   1OLG              \nTITLE     HIGH-RESOLUTION SOLUTION STRUCTURE OF THE OLIGOMERIZATION             \nTITLE    2 DOMAIN OF P53 BY MULTI-DIMENSIONAL NMR                               \nCOMPND    MOL_ID: 1;                                                            \nCOMPND   2 MOLECULE: TUMOR SUPPRESSOR P53 (OLIGOMERIZATION DOMAIN);             \nCOMPND   3 CHAIN: A, B, C, D;                                                   \nCOMPND   4 ENGINEERED: YES                                                      \nSOURCE    MOL_ID: 1;                                                            \nSOURCE   2 ORGANISM_SCIENTIFIC: HOMO SAPIENS;                                   \nSOURCE   3 ORGANISM_COMMON: HUMAN;                                              \nSOURCE   4 ORGANISM_TAXID: 9606                                                 \nKEYWDS    ANTI-ONCOGENE                                                         \nEXPDTA    SOLUT

In [21]:
# Read the file as a list with one entry per line (newlines are kept)
with open('data/1OLG.pdb', 'r') as f:
    f_list = f.readlines()

# Display only the first ten entries
n_preview = 10
f_list[:n_preview]

['HEADER    ANTI-ONCOGENE                           13-JUN-94   1OLG              \n',
 'TITLE     HIGH-RESOLUTION SOLUTION STRUCTURE OF THE OLIGOMERIZATION             \n',
 'TITLE    2 DOMAIN OF P53 BY MULTI-DIMENSIONAL NMR                               \n',
 'COMPND    MOL_ID: 1;                                                            \n',
 'COMPND   2 MOLECULE: TUMOR SUPPRESSOR P53 (OLIGOMERIZATION DOMAIN);             \n',
 'COMPND   3 CHAIN: A, B, C, D;                                                   \n',
 'COMPND   4 ENGINEERED: YES                                                      \n',
 'SOURCE    MOL_ID: 1;                                                            \n',
 'SOURCE   2 ORGANISM_SCIENTIFIC: HOMO SAPIENS;                                   \n',
 'SOURCE   3 ORGANISM_COMMON: HUMAN;                                              \n']

In [4]:
# Take the first line of the list; rstrip() removes trailing whitespace,
# including the newline, from the end of the string
first_line = f_list[0]
first_line.rstrip()

'HEADER    ANTI-ONCOGENE                           13-JUN-94   1OLG'

In [20]:
# Print the first ten lines of the file, one readline() call per line
with open('data/1OLG.pdb', 'r') as f:
    for i in range(10):
        # rstrip() drops the trailing newline so print() does not double it
        print(f.readline().rstrip())

HEADER    ANTI-ONCOGENE                           13-JUN-94   1OLG
TITLE     HIGH-RESOLUTION SOLUTION STRUCTURE OF THE OLIGOMERIZATION
TITLE    2 DOMAIN OF P53 BY MULTI-DIMENSIONAL NMR
COMPND    MOL_ID: 1;
COMPND   2 MOLECULE: TUMOR SUPPRESSOR P53 (OLIGOMERIZATION DOMAIN);
COMPND   3 CHAIN: A, B, C, D;
COMPND   4 ENGINEERED: YES
SOURCE    MOL_ID: 1;
SOURCE   2 ORGANISM_SCIENTIFIC: HOMO SAPIENS;
SOURCE   3 ORGANISM_COMMON: HUMAN;


## Writing to a file

In [None]:
# Check whether the path points to an existing regular file
pdb_path = 'data/1OLG.pdb'
os.path.isfile(pdb_path)

In [6]:
# Opening with mode 'w' creates the file (or overwrites an existing one)
quotes = (
    'When you come to a fork in the road, take it.\n',
    'You can observe a lot by just watching.\n',
    'I never said most of the things I said.\n',
)
with open('yogi.txt', 'w') as f:
    # Each quote already ends with its own newline
    for quote in quotes:
        f.write(quote)

In [11]:
cat yogi.txt

When you come to a fork in the road, take it.
You can observe a lot by just watching.
I never said most of the things I said.


In [15]:
# Golden ratio, formatted to eight decimal places with an f-string
phi = 1.61803398875
message = f'Phi= {phi:.8f}.'
print(message)

Phi= 1.61803399.


In [17]:
with open('gimme_phi.txt', 'w') as f:
    f.write('The golden ratio is φ = ')
    f.write('{phi:.8f}'.format(phi=1.61803398875))

!cat gimme_phi.txt

The golden ratio is φ = 1.61803399

In [45]:
# Opening for reading yields a text file object; print its class
with open('data/1OLG.pdb', 'r') as f:
    handle_type = type(f)
    print(handle_type)

<class '_io.TextIOWrapper'>


In [None]:
with open('data/1OLG.pdb', 'r') as f, open('atoms_chain_A.txt', 'w') as f_out:
    # Select the ATOM lines whose character at index 21 is 'A' (chain A);
    # the length check keeps the index lookup safe on short lines
    chain_a_lines = (
        line for line in f
        if len(line) > 21 and line.startswith('ATOM') and line[21] == 'A'
    )
    # Copy the selected lines, unchanged, into the new file
    f_out.writelines(chain_a_lines)

In [48]:
# Collect every file in the data directory whose name ends in .pdb
pdb_pattern = 'data/*.pdb'
file_list = glob.glob(pdb_pattern)

file_list

['data/1OLG.pdb', 'data/1J6Z.pdb', 'data/1FAG.pdb', 'data/2ERK.pdb']

In [None]:
# Dictionary to hold sequences, keyed by PDB ID
seqs = {}

# Loop through all matching files
for file_name in file_list:
    # Extract PDB ID: the file name without its directory or extension.
    # os.path copes with whichever path separator the file names contain.
    pdb_id = os.path.splitext(os.path.basename(file_name))[0]

    # Three-letter residue codes for chain A, collected as we go along
    residues = []
    with open(file_name, 'r') as f:
        # The loop has to be indented inside the with block so that the
        # file is still open while its lines are read
        for line in f:
            # Keep SEQRES lines whose character at index 11 is 'A';
            # the residue codes are the whitespace-separated words from
            # index 19 onward
            if len(line) > 11 and line[:6] == 'SEQRES' and line[11] == 'A':
                residues.extend(line[19:].split())

    # Build the sequence as dash-joined three-letter codes and store it
    # in the dictionary under this file's PDB ID
    seqs[pdb_id] = '-'.join(residues)