# Set up the environment

In [None]:
# Install the third-party packages imported in the next cell:
# pickle5 (imported there as `pickle`) and fastaparser.
!pip install pickle5
!pip install fastaparser

Collecting pickle5
[?25l  Downloading https://files.pythonhosted.org/packages/f7/4c/5c4dd0462c8d3a6bc4af500a6af240763c2ebd1efdc736fc2c946d44b70a/pickle5-0.0.11.tar.gz (132kB)
[K     |██▌                             | 10kB 13.6MB/s eta 0:00:01[K     |█████                           | 20kB 9.2MB/s eta 0:00:01[K     |███████▍                        | 30kB 5.7MB/s eta 0:00:01[K     |██████████                      | 40kB 5.4MB/s eta 0:00:01[K     |████████████▍                   | 51kB 2.9MB/s eta 0:00:01[K     |██████████████▉                 | 61kB 3.2MB/s eta 0:00:01[K     |█████████████████▍              | 71kB 3.4MB/s eta 0:00:01[K     |███████████████████▉            | 81kB 3.7MB/s eta 0:00:01[K     |██████████████████████▎         | 92kB 3.6MB/s eta 0:00:01[K     |████████████████████████▉       | 102kB 3.9MB/s eta 0:00:01[K     |███████████████████████████▎    | 112kB 3.9MB/s eta 0:00:01[K     |█████████████████████████████▊  | 122kB 3.9MB/s eta 0:00:01[K

In [None]:
import time
import numpy as np
import random
import csv
import os
import os.path
import pandas as pd
import math
import sys
import editdistance

import fastaparser
import zipfile, gzip, shutil
import json

import tabulate
import pickle5 as pickle

# We will mostly use dictionaries
def save_obj(obj, name):
    """Pickle `obj` to the file `<name>.pkl`.

    Parameters
    ----------
    obj : object
        Any picklable Python object (in this notebook, mostly dictionaries).
    name : str
        Destination path without extension; '.pkl' is appended to it.

    The file is opened in binary write mode, so an existing file with the
    same name is overwritten. The highest pickle protocol offered by the
    `pickle` module in scope is used.
    """
    target_path = name + '.pkl'
    with open(target_path, 'wb') as handle:
        pickle.dump(obj, handle, protocol=pickle.HIGHEST_PROTOCOL)

def load_obj(name):
    """Load and return the object pickled in the file `<name>.pkl`.

    Parameters
    ----------
    name : str
        Source path without extension; '.pkl' is appended to it.

    Returns
    -------
    object
        Whatever object was stored in the pickle file.

    Note: unpickling can execute arbitrary code, so only use this on files
    that come from a trusted source.
    """
    source_path = name + '.pkl'
    with open(source_path, 'rb') as handle:
        restored = pickle.load(handle)
    return restored

# Some helper methods
def format_time(t):
    """Render a number of seconds as a zero-padded 'HH:MM:SS' string.

    `t` is interpreted through `time.gmtime`, i.e. as seconds since the
    epoch in UTC, and only the time-of-day part is kept. Values of 24 hours
    or more therefore wrap around (86405 -> '00:00:05').
    """
    broken_down = time.gmtime(t)
    return '{:02d}:{:02d}:{:02d}'.format(
        broken_down.tm_hour, broken_down.tm_min, broken_down.tm_sec
    )

# Per-sequence fields to store (when available): amino-acid sequence fields ...
relevant_fields_seq = [
    'cdr3_aa',
    'sequence_alignment_aa',
    'germline_alignment_aa',
]
# ... and the remaining fields kept alongside them
relevant_fields = [
    'Isotype',
    'v_identity',
    'j_identity',
    'v_call',
    'j_call',
]

# Characters to remove when purifying sequences
chars_to_remove = [
    'X',
    'Z',
    '*',
]

# Download URL of the clustering tool (USEARCH)
url_usearch = 'https://drive5.com/downloads/usearch11.0.667_i86linux32.gz'

# Best neutralizing

Best neutralizing antibody ids (MAb) were obtained from https://www.hiv.lanl.gov/content/immunology/tables/ab_best_neutralizing_summary.html. The corresponding nucleotide and amino acid sequences of the heavy chain were found here: https://www.hiv.lanl.gov/components/sequence/HIV/neutralization/download_db.comp.

In the cases where we had only the amino-acid sequence, we used the tool available at http://meilerlab.org/index.php/servers/IgReconstruct to reconstruct the nucleotide sequence ('-' symbols are simply removed). When the sequence was missing altogether, the antibody was excluded from the training data (even searching on https://www.ncbi.nlm.nih.gov/protein/ didn't provide the missing sequences). We therefore had to remove 7 sequences, leaving a total of 70 usable best neutralizing antibodies.

In [None]:
best_neutralizing = [
    ('10E8', 'GAGGTGCAGCTGGTGGAGTCTGGGGGAGGCTTGGTGAAGCCTGGAGGATCCCTTAGACTCTCATGTTCAGCCTCTGGTTTCGACTTCGATAACGCCTGGATGACTTGGGTCCGCCAGCCTCCAGGGAAGGGCCTCGAATGGGTTGGTCGTATTACGGGTCCAGGTGAAGGTTGGTCAGTGGACTATGCTGCACCCGTGGAAGGCAGATTTACCATCTCGAGACTCAATTCAATAAATTTCTTATATTTGGAGATGAACAATTTAAGAATGGAAGACTCAGGCCTTTACTTCTGTGCCCGCACGGGAAAATATTATGATTTTTGGAGTGGCTATCCGCCGGGAGAAGAATACTTCCAAGACTGGGGCCGGGGCACCCTGGTCACCGTCTCCTCA'),
    ('12A12', 'TCCCAGCATTTGGTGCAATCTGGGACTCAGGTGAAGAAGCCTGGGGCCTCAGTGAGGATCTCATGCCAGGCTTCTGGATACAGCTTCACCGACTACGTTCTCCACTGGTGGCGACAGGCCCCAGGCCAAGGGCTGGAGTGGATGGGGTGGATCAAGCCTGTCTACGGTGCCAGAAACTACGCGCGCAGGTTTCAGGGCAGGATAAACTTTGATCGGGACATCTACAGGGAGATAGCCTTCATGGACTTGAGTGGACTGAGATCTGACGACACGGCCCTATATTTTTGTGCGAGAGATGGGAGCGGGGACGACACCTCTTGGCACTTAGATCCCTGGGGCCAGGGAACGCTGGTCATTGTCTCCGCAGCGTCGACCAAGGGCC'),
    ('12A21', 'TCCCAGCATTTGGTGCAATCTGGGACTCAGGTGAAGAAGCCTGGGGCCTCAGTGCGGGTCTCCTGCCAGGCTTCTGGATATACCTTCACCAATTACATTCTCCACTGGTGGCGACAGGCCCCTGGACAAGGGCTGGAGTGGATGGGATTGATCAAGCCTGTCTTTGGTGCCGTAAATTACGCGCGCCAGTTTCAGGGCAGGATTCAGTTGACTAGGGACATCTACAGGGAGATAGCCTTCCTGGACCTGAGTGGCCTCAGATCTGACGACACGGCCGTCTATTACTGTGCGCGAGATGAGAGCGGGGACGACCTCAAGTGGCACCTACATCCCTGGGGCCAGGGAACGCAGGTCATAGTTTCCCCAGCGTCGACCAAGGGCCC'),
    ('1B2530', 'CAGGTGCAGCTGGAACAATCGGGGACTGCGGTGAGGAAGCCTGGGGCCTCGGTGACGCTTTCCTGCCAGGCGTCCGGTTACAACTTCGTCAAATACATCATTCACTGGGTGCGCCAGAAACCTGGACTCGGCTTTGAGTGGGTTGGCATGATCGACCCCTACCGTGGCCGGCCATGGTCCGCGCACAAATTTCAGGGTCGACTCTCCCTGAGTCGAGACACTTCCATGGAAATACTATATATGACCCTGACCAGCCTGAAATCTGACGACACGGCCACCTATTTCTGTGCGAGGGCTGAGGCAGCATCCGACAGTCATTCTCGACCCATCATGTTCGACCACTGGGGCCAGGGCTCCCTGGTCACCGTCTCCTCAGCGTCGACCAAGGGCCCATCGGTCTTC'),
    ('2F5', 'AGGATCACGTTAAAGGAATCGGGTCCTCCGCTGGTGAAACCCACACAGACTCTCACGCTGACCTGTTCCTTCTCTGGGTTCTCACTGTCCGATTTTGGAGTGGGTGTAGGCTGGATCCGTCAGCCCCCAGGAAAGGCCCTAGAGTGGCTTGCAATCATTTATTCGGATGATGATAAGCGCTACAGCCCATCGCTGAACACCAGACTCACCATCACCAAGGACACCTCCAAAAATCAAGTTGTCCTTGTCATGACTAGGGTGAGTCCTGTGGACACAGCCACGTATTTCTGTGCACAC'),
    ('2G12', 'GAGGTGCAGCTGGTGGAGTCTGGGGGAGGCCTGGTCAAGGCGGGAGGATCCCTCATACTCTCCTGTGGAGTCTCTAATTTTAGAATCTCTGCCCATACCATGAATTGGGTCCGCCGGGTTCCAGGGGGGGGGCTGGAGTGGGTCGCTTCCATTAGTACGAGTTCCACTTATAGAGACTATGCAGACGCTGTGAAGGGCCGATTCACCGTTTCCAGAGACGACCTCGAAGACTTTGTGTATTTGCAAATGCACAAAATGAGAGTCGAAGACACGGCTATTTATTACTGCGCCAGA'),
    ('35O22', 'CAGGGTCAACTAGTCCAGTCTGGAGCTGAATTGAAAAAGCCTGGCGCCTCGGTGAAGATTTCCTGTAAGACTTCGGGTTATAGGTTTAATTTCTATCATATTAATTGGATTCGACAAACTGCAGGACGTGGACCTGAGTGGATGGGATGGATCAGCCCTTACAGTGGTGACAAAAACCTCGCACCTGCCTTTCAAGACAGAGTCATTATGACGACAGACACAGAAGTCCCTGTGACCTCATTCACGTCCACGGGCGCAGCCTACATGGAAATAAGGAACCTGAAATTTGACGACACAGGCACCTATTTCTGTGCAAAAGGCCTCCTGCGTGACGGTTCGTCGACGTGGCTTCCTTATTTGTGGGGCCAGGGTACCCTACTCACCGTCTCGTCA'),
    ('3BNC117', 'CAGGTCCAATTGTTACAGTCTGGGGCAGCGGTGACGAAGCCCGGGGCCTCAGTGAGAGTCTCCTGCGAGGCTTCTGGATACAACATTCGTGACTACTTTATTCATTGGTGGCGACAGGCCCCAGGACAGGGCCTTCAGTGGGTGGGATGGATCAATCCTAAGACAGGTCAGCCAAACAATCCTCGTCAATTTCAGGGTAGAGTCAGTCTGACTCGACACGCGTCGTGGGACTTTGACACATTTTCCTTTTACATGGACCTGAAGGCACTAAGATCGGACGACACGGCCGTTTATTTCTGTGCGCGACAGCGCAGCGACTATTGGGATTTCGACGTCTGGGGCAGTGGAACCCAGGTCACTGTCTCGTCAGCGTCGACCAAGGGCCCA'),
    ('3BNC60', 'CAGGTCCATTTGTCACAGTCTGGGGCAGCGGTGACGAAGCCCGGGGCCTCAGTGAGAGTCTCCTGCGAGGCTTCCGGATACAAGATTAGTGACCACTTTATTCATTGGTGGCGACAGGCCCCAGGACAGGGCCTTCAGTGGGTGGGGTGGATCAATCCTAAGACTGGTCAGCCAAACAATCCTCGTCAATTTCAGGGTAGAGTCAGTCTGACTCGACAGGCGTCGTGGGACTTTGACACATATTCCTTTTACATGGACCTCAAGGCAGTAAGATCGGACGACACGGCCATTTATTTCTGTGCGCGACAACGCAGCGACTTTTGGGATTTCGACGTCTGGGGCAGCGGCACGCAGGTCACTGTCTCGTCAGCGTCGACCAAGGGCCCATCGG'),
    ('8ANC131', 'CAGGGGCAGTTGGTGCAGTCCGGGGGTGGACTGAAGAAACCTGGGACGTCAGTGACGATTTCCTGCCTGGCATCTGAATACACATTCAACGAATTCGTTATTCACTGGATTCGACAGGCCCCTGGACAGGGGCCTCTGTGGCTGGGTCTAATCAAACGTAGCGGTCGTTTGATGACTGCCTATAATTTTCAAGACAGACTCAGTCTGCGAAGAGACCGTTCGACGGGAACAGTCTTCATGGAGTTGCGGGGTCTCAGACCTGACGACACGGCCGTGTATTATTGTGCGAGGGATGGATTGGGGGAGGTAGCGCCGGACTATCGTTACGGCATTGACGTCTGGGGTCAGGGGTCCACGGTCATCGTCACCTCAGCGTCGACCAAGGGCCCATCGGTCTTCCC'),
    ('8ANC195', 'CAGATACACCTCGTACAATCTGGGACCGAAGTCAAGAAGCCTGGGTCCTCGGTGACGGTCTCCTGCAAGGCCTATGGAGTCAACACTTTCGGTCTCTATGCCGTCAATTGGGTGCGACAGGCCCCAGGACAAAGTCTTGAGTACATAGGACAGATATGGCGGTGGAAATCCAGCGCTTCACATCATTTCCGGGGCCGAGTCCTCATTTCCGCGGTGGACCTCACGGGCTCCTCGCCGCCTATTTCTTCCTTGGAGATCAAGAACCTGACCTCTGACGACACGGCCGTCTATTTTTGCACGACAACCTCGACGTACGACAAGTGGAGTGGCCTTCATCACGACGGGGTCATGGCATTTTCCTCTTGGGGCCAGGGAACCCTCATCTCCGTCTCCGCGGCGTCGACCAAGGGCCCATCGGTCTTC'),
    ('CH01', 'GAGGTTCAGCTGGTGGAGTCTGGGGCAAATGTTGTACGGCCGGGGGGGTCCCTGAGACTCTCCTGTAAAGCGTCCGGATTCATCTTTGAAAATTTTGGTTTTAGTTGGGTCCGCCAGGCTCCAGGGAAGGGGCTTCAGTGGGTCGCTGGTCTTAATTGGAATGGTGGTGACACACGTTATGCAGACTCTGTGAAGGGCCGATTCAGAATGTCCAGAGACAACTCCAGGAATTTTGTGTATTTGGACATGGATAAAGTGGGAGTCGACGACACGGCCTTCTATTACTGTGCGAGAGGGACCGATTACACTATTGACGACGCGGGGATCCATTACCAAGGTTCGGGGACCTTCTGGTACTTCGATCTCTGGGGCCGTGGCACCCTGGTCAGTGTCTCTTCAG'),
    ('CH02', 'GAGGTTCAGCTGGTGGAGTCTGGGGGAAGTGTGGTGCGGCCGGGGGGGTCCCTGAGACTCTCCTGTAGAGCGTCCGGATTCATCTTTGAGAACTATGGCCTGACTTGGGTCCGCCAAGTTCCAGGGAAAGGGCTACATTGGGTCTCCGGGATGAATTGGAATGGTGGTGACACGCGTTATGCAGACTCTGTGAGGGGCCGATTTAGCATGTCCAGAGACAACAGCAACAACATCGCATATCTGCAAATGAATAATCTGAGAGTGGAGGACACGGCCTTGTATTACTGCGCGAGAGGGACCGATTACACGATAGACGACCAGGGAAGATTTTATCAAGGATCGGGGACCTTCTGGTACTTCGATTTTTGGGGCCGTGGCACACTGGTCACTGTCTCTTCAG'),
    ('CH03', 'GAGGTTCAGCTGGTGGAGTCTGGGGGAGGTGTGGTGCGGCCGGGGGGGTCCCTGAGACTCTCCTGTGCAGCGTCCGGATTCATTTTTGAGAACTACGGCTTGACTTGGGTCCGCCAAGTTCCAGGGAAAGGGCTGCATTGGGTCTCCGGTATGAATTGGAATGGTGGTGACACGCGTTATGCAGACTCTGTGAGGGGCCGATTCAGCATGTCCAGAGACAACAGCAATAATATCGCATATCTGCAAATGAAAAATCTGAGAGTCGACGACACGGCCTTGTATTACTGTGCGAGAGGGACCGATTACACGATAGACGACCAGGGAATTTTTTATAAAGGTTCGGGGACCTTCTGGTACTTCGATCTCTGGGGCCGTGGCACCCTGGTCACTGTCTCTTCAG'),
    ('CH04', 'GAGGTTCAGCTGGTGGAGTCTGGGGGAGGTCTCATACGGCCGGGGGGGTCCCTGAGACTCTCCTGTAAAGGCTCCGGTTTCATCTTTGAGAATTTTGGCTTCGGCTGGGTCCGCCAAGGTCCAGGGAAGGGGCTGGAGTGGGTGTCTGGCACTAATTGGAATGGAGGTGACTCACGTTATGGAGACTCTGTGAAGGGCCGATTCACAATCTCCAGAGACAACAGCAACAATTTCGTCTACCTGCAAATGAACAGTCTGAGACCCGAGGACACGGCCATATATTATTGTGCGAGAGGGACCGATTACACTATTGACGATCAGGGGATCCGTTATCAAGGTTCGGGGACTTTCTGGTACTTCGATGTCTGGGGCCGCGGCACCCTGGTCACGGTCTCCTCAG'),
    ('CH103', 'TCGGAGACCCTGTCCCTCACTTGCACTGTCTCTGGTGGCTCCATGGGTGGGACTTATTGGAGTTGGCTGCGCCTGTCCCCCGGGAAGGGACTGGAATGGATTGGCTATATCTTTCATACTGGAGAGACCAATTACAGTCCCTCCCTGAAGGGTCGAGTCTCCATATCAGTGGACACGTCCGAGGACCAGTTCTCCCTGAGACTGAGGTCTGTGACCGCTGCGGACACGGCCGTCTATTTTTGTGCCAGTCTGCCCAGGGGCCAATTAGTCAATGCCTACTTTCGCAATTGGGGCCGCGGATCTCTGGTCTCCGTCACCGCA'),
    ('CH235.12', 'CAGGTGCAACTAGCCCAATATGGTGGTGGGGTGAAGAGGCTAGGGGCCACAATGACCCTTTCCTGCGTGGCATCTGGATACACCTTCAACGACTACTACATACATTGGGTGCGGCAGGCCCCTGGACAAGGCTTTGAGTTGTTGGGATACATCGACCCCGCTAATGGTCGCCCAGACTACGCAGGGGCGTTGAGGGAGAGACTCTCCTTCTACAGGGACAAGTCCATGGAGACGCTGTACATGGACCTGAGGAGCCTAAGATATGACGACACGGCCATGTATTATTGTGTTAGAAATGTGGGGACCGCTGGCAGCTTGCTGCATTATGACCACTGGGGCTCGGGAAGCCCGGTCATCGTCTCCTCC'),
    ('CH98', 'GAGGTTCAGCTGGTGGAGTCTGGGGGAGGCGTGGTCGAACCCGGAAAGTCCCTGACAGTCTCGTGTGCCGCCTTTGGAGTCAACATTGACACGTATGACTTCCACTGGGTCCGCCAGGCTCCAGGCAAGGGTCTGGAGTGGGTGGCCTGTTGTTCAAACAGTGGAGAACACTCTTTCAACGCAGACATTGTCAAGGGCCGCTTTAGCGTCTCCAGGGACAATATGAGGAACAAAATGACTTTGCAGATGACGAGAGTCATGGTTGACGACACGGCTACATACTTCTGTGTGAAGGAGAGGGGAAGACACCGAACACGGGACACACTTGAATTCGGAAGATTTTTTGACTCATGGGGGCAGGGAGCCCGGGTCACCGTCTCGTCA'),
    ('DH270.1', 'CAGGTGCAGCTGGTGCAGTCTGGGGCTGAGATGAAGAAGCCTGGGGCCTCAGTGAGGGTCTCCTGCAAGGCTTCTGGATACACCTTCACCGACTACTATATACACTGGGTGCGACAGGCCCCTGGACAAGGGCCTGAGTGGATGGGATGGATCAACCCTAGCACTGGTCGCACAAACTCTCCACAGAAGTTTCAGGGCAGGGTCACCATGACCAGGGACACGTCCATCAGCACAGCCTACATGGACCTGAACAGACTGACGTCTGACGACACGGCCATGTATTACTGTACGACCGGGGGGTGGATCGGTCTTTACTCTGATACTAGTGGTTACCCTAACTTTGACTACTGGGGCCAGGGAACCCTGGTCACCGTCTCCTCAG'),
    ('DH270.4', 'GAGGTGCAGCTGGTGCAGTCTGGGGCTGAAATGAAGAACCCTGGGGCCTCAGTGAAAGTCTCCTGCGCGGCTTCTGGATATGGTTTCACCGACTTCTACATACACTGGGTGCGACTGGCCCCTGGACACGGGCTCCAGTGGATGGGATGGATGAACCCTAAGACTGGTCGCACAAATAATGCACAAGATTTTCAGGGCAGGGTCACCCTGACCAGGGACACGTCCATCGGCACAGCCTACATGGAGCTGAGGAGGCTGACATCTGACGACACGGCCGTCTATTACTGTGTGACAGGGGGGTGGATCAGTCCTTATTATGATAGTAGTTATTACCCTAATTTTGACCACTGGGGTCAGGGAACCCTGATCACCGTCTCCTCAG'),
    ('DH270.5', 'CAGGTGCAGCTGGTGCAGTCTGGGGCTGAAGTGAAGAACCCTGGGGCCTCAGTGAAAGTCTCCTGCGCGCCTTCTGGATATACCTTCACTGACTTCTACATACACTGGGTGCGACTGGCCCCTGGACAAGGGCTTGAGTGGCTGGGGTGGATGAACCCTAAGACTGGTCGCACAAATCAAGGACAAAACTTTCAGGGCAGGGTCACCATGACCAGGGACACGTCCATCGGCACAGCCTACATGGAGTTGAGGAGCCTCACATCTGACGACACGGCCGTCTATTACTGTGTGACAGGGGCCTGGATCAGTGATTATTATGATAGTAGTTATTATCCTAACTTTGACCACTGGGGTCAGGGAACCCTGGTCACCGTCTCCTCAG'),
    ('DH270.6', 'CAGGTGCAGCTGGTGCAGTCTGGGGCTCAAATGAAGAACCCTGGGGCCTCAGTGAAGGTCTCCTGCGCGCCTTCTGGATATACCTTCACCGACTTTTACATACATTGGTTGCGCCAGGCCCCTGGCCAGGGGCTTCAGTGGATGGGATGGATGAACCCTCAGACTGGTCGCACAAACACTGCACGAAACTTTCAGGGGAGGGTCACCATGACCAGGGACACGTCCATCGGCACAGCCTACATGGAGTTGAGAAGCCTGACATCTGACGACACGGCCATATATTACTGTACGACAGGGGGATGGATCAGTCTTTACTATGATAGTAGTTATTACCCCAACTTTGACCACTGGGGTCAGGGAACCCTGCTCACCGTCTCCTCAG'),
    ('HJ16', 'CAGATGAAGTTGATGCAGTCGGGGGGAGTCATGGTCCGGCCTGGAGAATCGGCGACACTGTCTTGTGTTGCCTCTGGATTCGACTTCAGTCGCAATGGGTTCGAGTGGCTCCGCCAGGGTCCCGGCAAGGGGCTGCAGTGGCTGGCCACAGTCACCTTCGAAAGTAAGACACATGTCACGGCCTCCGCGCGGGGCCGATTCACTATTTCTAGAGACAATTCCAGGAGAACCGTCTATTTGCAAATGACTAATCTGCAGCCTGACGATACGGCAATGTATTTCTGTGTTAAAGACCAGACTATTTTTCACAAAAATGGAGCCGTCGATTTCTTCTCGTACTTCGACCTGTGGGGCCGTGGCGCCCCGGTCATAGTCTCCGCAG'),
    ('N6', 'CGAGCGCACCTGGTACAATCAGGGACTGCGATGAAGAAACCGGGGGCCTCAGTAAGAGTCTCCTGCCAGACCTCTGGATACACCTTTACCGCCCACATATTATTTTGGTTCCGACAGGCCCCCGGGCGAGGACTTGAGTGGGTGGGGTGGATCAAGCCACAATATGGGGCCGTGAATTTTGGTGGTGGTTTTCGGGACAGGGTCACATTGACTCGAGACGTATATAGAGAGATTGCGTACATGGACATCAGAGGCCTTAAACCTGACGACACGGCCGTCTATTACTGTGCGAGAGACCGTTCCTATGGCGACTCCTCTTGGGCCTTAGATGCCTGGGGACAGGGAACGACGGTCGTCGTCTCCGCG'),
    ('NIH45-46', 'CAAGTGCGACTGTCGCAGTCTGGAGGTCAGATGAAGAAGCCTGGCGAGTCGATGAGACTTTCCTGTCGGGCTTCCGGATATGAATTTCTGAATTGTCCAATAAATTGGATTCGCCTGGCCCCCGGAAGACGGCCTGAGTGGATGGGATGGCTGAAGCCTAGGGGAGGGGCCGTCAATTACGCACGTAAATTTCAGGGCAGAGTGACCATGACTCGAGACGTGTATTCCGACACAGCCTTTTTGGAGTTGCGCTCCTTGACATCAGACGACACGGCCGTCTATTTTTGTACTAGGGGAAAATATTGTACTGCGCGCGACTATTATAATTGGGACTTCGAACACTGGGGCCGGGGTGCCCCGGTCACCGTCTCATCAGCGTCGACCAAGGGCCCATCGGT'),
    ('PG16', 'CAGGAACAACTGGTGGAGTCTGGGGGAGGCGTGGTCCAGCCGGGGGGGTCCCTGAGACTCTCCTGTTTAGCGTCTGGATTCACGTTTCACAAATATGGCATGCACTGGGTCCGCCAGGCTCCAGGCAAGGGCCTGGAGTGGGTGGCACTCATCTCAGATGACGGAATGAGGAAATATCATTCAGACTCCATGTGGGGCCGAGTCACCATCTCCAGAGACAATTCCAAGAACACTCTTTATCTGCAATTCAGCAGCCTGAAAGTCGAAGACACGGCTATGTTCTTCTGTGCGAGAGAGGCTGGTGGGCCAATCTGGCATGACGACGTCAAATATTACGATTTTAATGACGGCTACTACAACTACCACTACATGGACGTCTGGGGCAAGGGGACCACGGTCACCGTCTCGAGC'),
    ('PG9', 'CAGCGATTAGTGGAGTCTGGGGGAGGCGTGGTCCAGCCTGGGTCGTCCCTGAGACTCTCCTGTGCAGCGTCCGGATTCGACTTCAGTAGACAAGGCATGCACTGGGTCCGCCAGGCTCCAGGCCAGGGGCTGGAGTGGGTGGCATTTATTAAATATGATGGAAGTGAGAAATATCATGCTGACTCCGTATGGGGCCGACTCAGCATCTCCAGAGACAATTCCAAGGATACGCTTTATCTCCAAATGAATAGCCTGAGAGTCGAGGACACGGCTACATATTTTTGTGTGAGAGAGGCTGGTGGGCCCGACTACCGTAATGGGTACAACTATTACGATTTCTATGATGGTTATTATAACTACCACTATATGGACGTCTGGGGCAAAGGGACCACGGTCACCGTCTCGAGC'),
    ('PGDM1400', 'CAGGTGCATCTGACGCAGTCTGGGCCTGAGGTGAGGAAGCCTGGGACCTCCGTAAAGGTCTCCTGCAAGGCCCCTGGAAACACATTGAAGACTTATGATCTACACTGGGTGCGCAGCGTCCCTGGACAAGGCCTTCAGTGGATGGGATGGATAAGCCATGAGGGCGACAAGAAGGTCATTGTGGAAAGATTCAAGGCCAAAGTCACCATTGATTGGGACAGGTCCACCAATACGGCCTATCTCCAACTGAGCGGCCTCACATCTGGCGACACGGCCGTCTATTATTGTGCGAAAGGCTCAAAACACAGGCTGCGAGATTACGCTCTCTACGACGACGACGGCGCATTGAATTGGGCTGTCGATGTTGACTACCTTTCGAACTTGGAATTCTGGGGCCAAGGGACCGCCGTCACCGTCTCTTCA'),
    ('PGT121', 'CAGATGCAGTTACAGGAGTCGGGCCCCGGACTGGTGAAGCCTTCGGAAACCCTGTCCCTCACGTGCAGTGTGTCTGGTGCCTCCATAAGTGACAGTTACTGGAGCTGGATCCGGCGGTCCCCAGGGAAGGGACTTGAGTGGATTGGGTATGTCCACAAAAGCGGCGACACAAATTACAGCCCCTCCCTCAAGAGTCGAGTCAACTTGTCGTTAGACACGTCCAAAAATCAGGTGTCCCTGAGCCTTGTGGCCGCGACCGCTGCGGACTCGGGCAAATATTATTGCGCGAGAACACTGCACGGGAGGAGAATTTATGGAATCGTTGCCTTCAATGAGTGGTTCACCTACTTCTACATGGACGTCTGGGGCAATGGGACTCAGGTCACCGTCTCCTCA'),
    ('PGT122', 'CAGGTTCATCTGCAGGAGTCGGGCCCCGGACTGGTGAAGCCTTCGGAGACCCTGTCCCTCACGTGCAATGTGTCTGGGACCCTCGTGCGTGATAACTACTGGAGCTGGATCAGACAACCCCTCGGGAAGCAACCTGAGTGGATTGGCTATGTCCATGACAGCGGGGACACGAATTACAACCCCTCCCTGAAGAGTCGAGTCCACTTATCGTTGGACAAGTCCAAAAACCTGGTGTCCCTGAGGCTGACCGGCGTGACCGCCGCGGACTCGGCCATATATTATTGCGCGACAACAAAACACGGGAGGAGGATTTATGGCGTCGTTGCCTTCAAAGAGTGGTTCACCTATTTCTACATGGACGTCTGGGGCAAAGGGACTTCGGTCACCGTCTCCTCA'),
    ('PGT123', 'CAGCTGCACCTGCAGGAGTCGGGCCCAGGGCTGGTGAAGCCTCCGGAGACCCTGTCCCTCACGTGTAGTGTGTCTGGCGCCTCCATCAATGATGCCTATTGGAGTTGGATTCGGCAGTCCCCAGGGAAGCGGCCTGAGTGGGTTGGATATGTCCATCACAGCGGTGACACAAATTATAATCCCTCACTCAAGAGGCGCGTCACGTTTTCATTAGACACGGCCAAGAATGAAGTGTCCCTGAAATTAGTAGACCTGACCGCTGCGGACTCGGCCACATATTTTTGTGCGCGAGCACTTCACGGGAAGAGGATTTATGGGATAGTTGCCCTCGGAGAGTTGTTCACCTACTTCTACATGGACGTCTGGGGCAAGGGGACTGCGGTCACCGTCTCCTCA'),
    ('PGT125', 'CAGTCGCAGCTGCAGGAGTCGGGCCCACGACTGGTGGAGGCCTCGGAGACCCTGTCACTCACGTGCAATGTGTCCGGCGAGTCCACTGGTGCCTGTACTTATTTCTGGGGCTGGGTCCGGCAGGCCCCAGGGAAGGGGCTGGAGTGGATCGGGAGTTTGTCCCATTGTCAGAGTTTCTGGGGTTCCGGTTGGACCTTCCACAACCCGTCTCTCAAGAGTCGACTCACGATTTCACTCGACACGCCCAAGAATCAGGTCTTCCTCAAGCTCACTTCTCTGACTGCCGCGGACACGGCCACTTACTACTGTGCGCGATTCGACGGCGAAGTCTTGGTCTATAATCATTGGCCAAAGCCGGCCTGGGTGGACCTCTGGGGCCGCGGAATACCGGTCACCGTCACCGTCTCCTCA'),
    ('PGT126', 'CAGCCGCAGCTGCAGGAGTCGGGGCCAGGACTGGTGGAGGCTTCGGAGACCCTGTCCCTCACCTGCACTGTGTCCGGCGACTCCACTGCTGCTTGTGACTATTTCTGGGGCTGGGTCCGGCAGCCCCCAGGGAAGGGCCTGGAGTGGATTGGGGGTTTGTCACATTGTGCAGGTTACTACAATACTGGCTGGACCTACCACAACCCGTCTCTCAAGAGTCGGCTCACGATTTCACTCGACACCCCCAAGAATCAGGTCTTCCTGAAGTTAAATTCTGTGACCGCCGCGGACACGGCCATTTACTACTGTGCGCGATTCGACGGCGAAGTTTTGGTGTACCACGATTGGCCAAAGCCGGCCTGGGTCGACCTCTGGGGCCGGGGAACTTTGGTCACCGTCACCGTCTCCTCA'),
    ('PGT127', 'CAGCCGCAGCTGCAGGAGTCGGGCCCAGGACTGGTGGAGGCTTCGGAGACCCTGTCCCTCACGTGCACTGTGTCCGGCGACTCCACTGGTCGTTGTAATTATTTCTGGGGCTGGGTCCGGCAGCCCCCAGGGAAGGGGCTGGAGTGGATTGGGAGTTTGTCCCACTGTAGAAGTTACTACAATACTGACTGGACCTACCACAACCCGTCTCTCAAGAGTCGACTCACTATTTCACTCGACACGCCCAAGAATCAGGTCTTCCTGAGATTGACCTCTGTGACCGCCGCGGACACGGCCACTTATTACTGTGCGCGATTCGGCGGCGAAGTTCTAGTGTACAGAGATTGGCCAAAGCCGGCCTGGGTCGACCTCTGGGGCCGGGGAACGCTGGTCACCGTCTCGAGC'),
    ('PGT128', 'CAGCCGCAGCTGCAGGAGTCGGGCCCAACACTGGTGGAGGCTTCGGAGACTCTGTCCCTCACCTGCGCTGTGTCCGGCGACTCCACTGCTGCATGTAATTCTTTCTGGGGCTGGGTCCGGCAGCCCCCAGGGAAGGGGCTGGAGTGGGTTGGGAGTTTGTCCCATTGTGCAAGCTATTGGAATCGTGGGTGGACCTACCACAACCCGTCTCTCAAGAGTCGGCTCACGCTTGCTCTCGACACACCCAAGAATCTGGTCTTCCTCAAATTAAATTCTGTGACTGCCGCGGACACGGCCACTTACTACTGTGCGCGATTCGGCGGCGAAGTTTTACGCTACACGGATTGGCCAAAGCCGGCCTGGGTCGACCTCTGGGGCCGGGGAACGCTGGTCACCGTCTCGAGC'),
    ('PGT130', 'CAGGTGCAGCTGCAGGAGTCGGGCCCAGGACTGGTGAAGCCTGCGGAGACCCTGTCCCTCACCTGCAGTGTCTCTGGAGAATCTATCAATACTGGTCATTACTACTGGGGCTGGGTCCGTCAGGTCCCAGGGAAGGGACTTGAGTGGATAGGTCATATCCATTATACGACGGCTGTCCTGCACAACCCGTCCCTCAAGAGTCGACTCACCATCAAAATTTACACGTTGAGAAACCAGATTACCCTGAGGCTCAGTAATGTGACGGCCGCGGACACGGCCGTCTATCACTGCGTACGATCCGGCGGCGACATCTTATATTATTATGAGTGGCAAAAGCCGCACTGGTTCTCTCCCTGGGGCCCGGGAATCCACGTCACCGTCTCGAGC'),
    ('PGT131', 'CAGGTGCAACTACAGGAGTCGGGCCCAGGACTGGTGAAGCCTTCGGAGACCCTTTCCCTCACCTGCACTGTCTCTGGTGACTCCATCAACACTGGTCATCACTACTGGGGCTGGGTCCGTCAGGTCCCAGGGAAGGGACCGGAATGGATTGCTCACATCCACTATAATACGGCTGTCTTACACAATCCGGCCCTCAAGAGTCGAGTCACCATTTCGATTTTCACCCTGAAGAATCTGATTACCCTGAGCCTCAGTAATGTGACCGCCGCGGACACGGCCGTCTATTTCTGCGTTCGATCCGGCGGCGACATTTTATACTATATTGAGTGGCAAAAACCCCACTGGTTCTATCCCTGGGGCCCGGGAATTTTGGTCACCGTCTCGAGC'),
    ('PGT135', 'CAGTTGCAGATGCAGGAGTCGGGCCCAGGACTGGTGAAGCCTTCGGAGACCCTGTCTCTGAGTTGCACTGTCTCTGGTGACTCCATAAGGGGTGGCGAGTGGGGCGATAAAGATTATCATTGGGGCTGGGTCCGCCACTCAGCAGGAAAGGGCCTGGAGTGGATTGGGAGTATCCATTGGAGGGGGACCACCCACTACAAAGAGTCCCTCAGGAGAAGAGTGAGTATGTCGATCGACACGTCCAGGAATTGGTTCTCCCTGAGGCTGGCCTCTGTGACCGCCGCGGACACGGCCGTCTACTTTTGTGCGAGACACCGACATCATGATGTTTTCATGTTGGTCCCTATTGCGGGCTGGTTCGACGTCTGGGGCCCGGGAGTCCAGGTCACCGTCTCGAGC'),
    ('PGT141', 'CAGGTGCAGCTGGTGCAGTCTGGGCCGGAGGTGAAGAAGCCTGGGTCCTCAGTGAAGGTCTCCTGCAAGGCCTCTGGAAACACCTTCAGTAAATATGATGTCCACTGGGTACGACAGGCCACTGGACAGGGGCTTGAATGGGTGGGATGGATGAGTCATGAGGGTGATAAGACAGAATCTGCACAGAGATTTAAGGGCCGAGTCACCTTCACGAGGGACACTTCCGCAAGCACAGCCTACATGGAACTGCGCGGCCTGACATCTGACGACACGGCCATCTATTATTGTACGAGAGGCTCAAAACATCGTTTGCGAGACTACGTTCTCTACGATGACTACGGCTTAATTAATTATCAAGAGTGGAATGACTACCTTGAATTTTTGGACGTCTGGGGCCATGGAACCGCGGTCACCGTCTCCTCA'),
    ('PGT142', 'CAGGTGCAGCTGGTGCAGTCTGGGCCTGAGGTGAAGAAGCCTGGGTCCTCAGTGAAGGTCTCCTGCAAGGCCTCTGGAAACACCTTCAGTAAATATGATGTCCACTGGGTACGACAGGCCACTGGACAGGGGCTTGAATGGGTGGGATGGATTAGTCATGAGCGTGATAAGACAGAATCTGCACAGAGATTTAAGGGCCGAGTCACCTTCACGAGGGACACTTCCGCAACCACAGCCTACATGGAACTGCGCGGCCTGACATCTGACGACACGGCCATTTATTATTGTACGAGAGGCTCAAAACATCGCTTGCGAGACTACGTTCTCTACGATGACTACGGCTTAATTAATTATCAAGAGTGGAATGACTACCTTGAATTTTTGGACGTCTGGGGCCATGGAACCGCGGTCACCGTCTCCTCA'),
    ('PGT143', 'CAGGTGCAGCTGGAGCAGTCTGGGGCTGAGGTGAAGAAGCCTGGGTCCTCAGTGAAGGTCTCCTGCAAGGCCTCTGGAAACACCTTCAGTAAATATGATGTCCACTGGGTACGACAGGCCACTGGACAGGGGCTTGAATGGGTGGGATGGATGAGTCATGAGGGTGATAAGACAGAATCTGCACAGAGATTTAAGGGGCGAGTCACCTTCACGAGGGACACTTCCGCAAGCACAGCCTACATGGAACTGCGCGGCCTGACATCTGACGACACGGCCATTTATTATTGTACGAGAGGTTCAAAACATCGCTTGCGAGACTACGTTCTCTACGATGACTACGGCTTAATTAATTATCAAGAGTGGAATGACTACCTTGAATTTTTGGACGTCTGGGGCCATGGAACCGCGGTCACCGTCTCCTCA'),
    ('PGT144', 'CAGGTGCAGCTGGTGCAGTCTGGGGCTGAGGTGAAGAAGCCTGGGTCCTCAGTGAAGGTCTCCTGCAAGGCCTCTGGAAACACCTTCAGGAAATATGATGTCCACTGGGTACGACAGGCCACTGGACAGGGGCTTGAATGGGTGGGATGGATGAGTCATGAGGGTGATAAGACAGAATCTGCACAGAGATTTAAGGGCCGAGTCTCTTTCACGAGGGACAATTCCGCAAGCACAGCCTACATTGAACTGCGCGGCCTGACATCTGACGACACGGCCATTTATTATTGTACCGGAGGCTCAAAACATCGCTTGCGAGACTACGTTCTCTACGATGATTACGGCCTAATAAATCAGCAAGAGTGGAATGACTACCTTGAATTTTTGGACGTCTGGGGCCATGGAACCGCGGTCACCGTCTCCTCA'),
    ('PGT145', 'CAGGTGCAGTTGGTGCAGTCTGGGGCTGAAGTGAAGAAGCCTGGGTCCTCAGTGAAGGTCTCCTGCAAGGCCTCTGGAAACAGTTTCAGTAATCATGATGTCCACTGGGTACGACAGGCCACTGGACAGGGGCTTGAATGGATGGGATGGATGAGTCATGAGGGTGATAAGACAGGCTTGGCACAAAAGTTTCAGGGCAGAGTCACCATCACGAGGGACAGTGGCGCAAGTACAGTCTACATGGAGTTGCGCGGCCTGACAGCTGACGACACGGCCATTTATTATTGTTTGACCGGCTCAAAACATCGCCTGCGAGATTATTTTCTGTACAATGAATATGGCCCCAATTATGAAGAGTGGGGTGACTACCTTGCGACTTTGGACGTCTGGGGCCATGGGACCGCGGTCACCGTCTCGAGC'),
    ('PGT151', 'CGGGTGCAGTTGGTGGAGTCGGGGGGAGGCGTGGTCCAGCCTGGGAAGTCCGTGAGACTTTCCTGTGTAGTCTCCGATTTCCCCTTCAGCAAGTATCCTATGTATTGGGTTCGCCAGGCTCCAGGCAAGGGGCTGGAGTGGGTGGCAGCCATCTCCGGTGATGCCTGGCATGTGGTCTACTCAAATTCCGTGCAGGGCCGATTTCTCGTCTCCAGGGACAATGTCAAGAACACTCTATATTTAGAAATGAACAGCCTGAAAATTGAGGATACGGCCGTATATCGCTGCGCGAGAATGTTCCAGGAGTCTGGTCCACCACGTTTGGATCGTTGGAGCGGTCGAAATTATTACTATTATTCTGGTATGGACGTCTGGGGCCAAGGGACCACGGTCACCGTCTCGAGC'),
    ('PGT152', 'CGGGTGCAGTTGGTGGAGTCGGGGGGAGGCGTGGTCCAGCCTGGGAAGTCCGTGAGACTTTCCTGTGTAGTCTCTGATTTCCCCTTCAGCAAGTATCCTATGTATTGGGTTCGCCAGGCTCCAGGCAAGGGGCTGGAGTGGGTGGCAGCCATCTCCGCTGATGCCTGGCATGTGGTCTACTCAGGCTCCGTGCAGGGCCGATTTCTCGTCTCCAGGGACAACTCCAAGAACATTCTGTATTTGGAAATGAACACCCTGAAAATTGAGGACACGGCCGTATATCGCTGCGCGAGAATGTTCCAGGAGTCTGGTCCACCACGTTTCGATTCTTGGAGCGGTCGAAATTACTACTATTACTCTGGTATGGACGTCTGGGGCCAAGGGACCACGGTCACCGTCTCGAGC'),
    ('VRC-CH30', 'CAGGTGCAGCTGGTGCAGTCAGGGGCTGCCGTGAGGAAGCCTGGGGCCTCAGTGACTGTCTCCTGCAAATTCGCTGAAGACGACGACTACTCTCCACACTGGGTGAATCCGGCCCCTGAACACTATATTCACTTTCTACGACAGGCCCCTGGACAGCAACTGGAGTGGTTGGCATGGATGAACCCTACGAATGGCGCCGTCAATTATGCATGGCAGCTTCATGGCAGGCTCACGGCGACCAGAGACGGGTCCATGACTACAGCCTTTTTGGAAGTGAGGAGTCTAAGATCTGACGACACGGCCGTCTATTATTGTGCGAGGGCCCAGAAAAGGGGGCGGAGTGAATGGGCCTACGCCCACTGGGGCCAGGGAACCCCGGTCGCCGTCTCCTCA'),
    ('VRC-CH31', 'CAGGTGCAGCTGGTGCAGTCAGGGGCTGCCGTGAGGAAGCCTGGGGCCTCAGTGACTGTCTCCTGTAAATTCGCTGAAGACGACGACTACTCTCCATACTGGGTGAATCCGGCCCCTGAACATTTTATTCACTTTTTGCGACAGGCCCCTGGACAACAACTAGAGTGGCTGGCATGGATGAACCCAACGAATGGCGCCGTTAATTATGCATGGTACCTTAATGGCAGGGTCACGGCGACCAGGGACAGGTCCATGACTACAGCCTTTTTGGAAGTGAAGAGTCTAAGATCTGACGACACGGCCGTCTACTATTGTGCGAGGGCCCAGAAAAGGGGGCGGAGTGAGTGGGCCTACGCCCACTGGGGTCAGGGCACTCCGGTCGTCGTCTCGTCA'),
    ('VRC-CH32', 'CAGGTGCAGCTGGTGCAGTCAGGGGCTGCCGTGAGGAAGCCTGGGGCCTCAGTGACTGTCTCCTGCAAGTTCGCTGAAGACGACGACTTCTCTCCACACTGGGTGAATCCGGCCCCTGAACACTATATTCATTTTCTGCGACAGGCACCTGGACAACAACTAGAGTGGTTGGCATGGATGAAGCCTACGAATGGTGCCGTCAATTATGCATGGCAACTTCAGGGCAGGGTCACGGTGACCAGGGACAGGTCCCAGACTACAGCCTTTTTGGAAGTTAAGAATCTGAGATCTGACGACACGGCCGTCTATTATTGTGCGAGGGCCCAGAAAAGGGGGCGCAGCGAGTGGGCCTATGCCCACTGGGGCCAGGGAACCCCGGTCGTCATCTCCGCA'),
    ('VRC-CH33', 'CAGGTGCAGCTGGTGCAGTCAGGGGCTGCCGTGAGGAAGCCCGGGGCCTCAATTAGTGTCTCCTGCAAATTCGCTGATGCCGACGACTACTCTCCGCACTGGATGAATCCGGCCCCTGAACACTATATTCACTTTCTGCGCCAGGCCCCTGGACAGCAATTAGAGTGGTTGGCGTGGATGAATCCTACGAATGGCGCCGTTAATTATGCCTGGTACCTTAATGGCAGGGTCACGGCGACCAGAGACAGGTCCATGACCACAGCGTTTCTGGAAGTGAGGAGTCTGAGATCTGACGACACGGCCGTCTATTATTGTGCGAGGGCCCAGAAAAGGGCGCGGAGTGAATGGGCCTACGCCCACTGGGGCCAGGGAACCCCGGTCGTCGTCTCCTCA'),
    ('VRC-CH34', 'CAGGTGCAGCTGGTGCAGTCGGGGGCTGCCGTGAGGAAGCCTGGGGCCTCAGTAACTGTCTCCTGCAAATTCGCTGAAGACGACGACTGGTCTCCACACTGGGTGAATCCGGCCCCTGAACACTATATTCATTTTCTACGGCAGGCCCCTGGACAGCAATTAGAGTGGTTGGCATGGATGAACCCTACAAATGGTGCCGTCAATTATGCATGGCAGCTTAACGGCAGGCTCACGGCGACCAGAGACACGTCCATGACTACAGCCTTTTTGGAGGTGAAGAGTCTGAGATCTGACGACACGGCCGTCTATTATTGTGCGAGGGCCCAAAAAAGGGGGCGCAGTGAATGGGCCTACGCCCACTGGGGCCAGGGAACCCCGGTCGTCGTCTCCTCA'),
    ('VRC-PG04', 'CAGGTGCAGCTGGTGCAGTCTGGGTCTGGAGTGAAGAAGCCTGGGGCTTCGGTGAGAGTCTCCTGTTGGACCTCTGAGGACATCTTCGAAAGAACCGAGTTGATTCATTGGGTGCGACAGGCCCCTGGACAAGGACTTGAGTGGATTGGGTGGGTGAAAACCGTCACTGGCGCCGTCAATTTTGGTTCACCAGATTTTCGACAGAGAGTCTCTCTGACCCGCGACAGGGACCTCTTCACAGCCCATATGGACATTCGCGGACTGACACAAGGCGACACGGCCACATATTTTTGCGCGAGACAGAAATTTTATACGGGCGGCCAAGGCTGGTACTTCGATCTCTGGGGCCGTGGAACCCTCATTGTTGTCTCGTCA'),
    ('VRC-PG04b', 'CAGGTGCAGCTGGTGCAGTCTGGGTCTGGAGTGAAGAAGCCTGGGGCTTCGGTGAGAGTCTCCTGTTGGACCTCTGAGGACATCTTCGAAAGAACCGAATTGATTCATTGGGTGCGACAGGCCCCTGGACAAGGACTTGAGTGGATTGGGTGGGTGAAAACTGTCACTGGCGCCGTCAATTTTGGTTCACCAAATTTTCGACATAGAGTCTCTCTGACCCGCGACAGGGACCTCTTCACAGCCCATATGGACATTCGCGGACTGACACAAGGCGACACGGCCACATATTTTTGCGCGAGACAGAAATTTGAGAGGGGCGGCCAAGGCTGGTATTTCGATCTCTGGGGCCGTGGAACCCTCATTGTTGTCTCGTCA'),
    ('VRC01', 'CAGGTGCAGCTGGTGCAGTCTGGGGGTCAGATGAAGAAGCCTGGCGAGTCGATGAGAATTTCTTGTCGGGCTTCTGGATATGAATTTATTGATTGTACGCTAAATTGGATTCGTCTGGCCCCCGGAAAAAGGCCTGAGTGGATGGGATGGCTGAAGCCTCGGGGGGGGGCCGTCAACTACGCACGTCCACTTCAGGGCAGAGTGACCATGACTCGAGACGTTTATTCCGACACAGCCTTTTTGGAGCTGCGCTCGTTGACAGTAGACGACACGGCCGTCTACTTTTGTACTAGGGGAAAAAACTGTGATTACAATTGGGACTTCGAACACTGGGGCCGGGGCACCCCGGTCATCGTCTCATCA'),
    ('VRC02', 'CAGGTGCAGCTGGTGCAGTCTGGGGGCCAGATGAAGAAGCCTGGCGAGTCGATGAGAATTTCTTGTCAGGCTTCCGGATATGAATTTATTGATTGTACACTAAATTGGGTTCGCCTGGCCCCCGGAAGAAGGCCTGAATGGATGGGATGGCTGAAGCCTCGAGGGGGGGCCGTCAACTACGCACGTCCACTTCAAGGCAGAGTGACCATGACTCGAGACGTGTATTCCGACACAGCCTTTTTGGAGCTGCGCTCCTTGACAGCAGACGACACGGCCGTCTACTATTGTACTAGGGGAAAAAATTGTGATTACAATTGGGACTTCGAACACTGGGGCCGGGGTACCCCGGTCACCGTCTCATCA'),
    ('VRC03', 'CAGGTGCAGCTGGTGCAGTCTGGGGCTGTGATTAAGACGCCTGGGTCCTCAGTGAAGATCTCATGTCGGGCTTCTGGATACAACTTTCGTGATTATTCGATCCATTGGGTCCGCCTCATTCCTGACAAGGGATTTGAGTGGATTGGATGGATTAAACCTCTGTGGGGTGCCGTCAGTTATGCCCGGCAACTTCAGGGCCGAGTCTCTATGACTCGACAATTATCTCAAGACCCAGACGACCCGGACTGGGGCGTTGCCTACATGGAGTTCAGTGGACTGACGCCCGCCGACACGGCCGAATATTTTTGTGTCCGGAGAGGGTCCTGTGATTATTGCGGAGACTTTCCCTGGCAATACTGGGGTCAGGGCACCGTCGTCGTCGTCTCGTCA'),
    ('VRC06b', 'CAGGTGCAGCTGGTGGAGTCTGGGTCTGCGATGAGGAAGCCGGGGTCGTCAGTGAAGATCTCATGTCGGGCTTCTGGATTCAATTTTCGCGAATATTCGATACATTGGGTCCGACTGATTCCTGGCAGGGGACTTGAGTGGATGGGGTGGATAAAGGGTATGTGGGGAGCCGTCAATTATGCCCGGCAACTTCAGGGCCGGGTCTCTATGACTCGACAATTGTCTCAGGACCCAGACGACCCAGACTGGGGCGTCGCCTACCTGGACTTCAGTGGACTGACGTCCGGCGATACAGGCGAATATTTTTGTGTGAGAAAAGGACCCTCCTGTCCTCACTGCGGAGACTTCCACTGGCAACATTGGGGTCAGGGCACACTCGTCGTCGTCTCGACA'),
    ('VRC13', 'CAGGTGCAGCTGGTGCAGCCCGGGACTGCGATGAAGTCTCTTGGATCATCACTGACGATCACTTGCAGAGTCTCCGGAGACGACCTCGGCTCTTTCCACTTCGGCACTTATTTTATGATCTGGGTGCGTCAAGCCCCTGGACAGGGCCTTGAGTACATGGGAGGCATCCTCCCCTCCACGAAGACACCCACTTACGCGCACAAATTTCGAGGCCGCGTCTCTATTTCCGCTCCCGGGGTTCCGCCCGTGCTGTCCCTCGCGCTGACCAACCTCACATACGACGACACGGCCACCTACTTCTGTGCGCGCGAGAGGGGGCGACATTTTGAGCCAAAGAACAGGGATAATCTGGAAGGCAAATTTTTCGATTTATGGGGTCGTGGCACCTTCGTTCGCGTCTCGCCG'),
    ('VRC16', 'GAGGTGCAGTTATCAGAGTCGGGGGGAGGCTTCGTAAAGCCGGGGGGGTCCCTGAGACTCTCCTGTGAGGCCTCTGGATTCACCTTCAATAATTATGCCATGGGTTGGGTCCGCCAGGCTCCAGGGAAGGGGCTGGAGTGGGTCTCTGTGACGAGTGCTCATGGTGGTAGTGCATACTTTGGAGAATTCGTGAAGGGCCGCTTCACCATGTCCAGAGACCACTTCATAGACACGGTCTACTTGGAAATGAACAGACTGACAGTCGAGGACACGGCCGTCTACTACTGCGTCAGAGTGACATTTTACCATGAGGGCAGTGGCTACTATTATCGTGCCGGGAATTACTTTGATTCCTGGGGCCAGGGAACCCTAGTCATCGTCTCCGCA'),
    ('VRC26.08', 'GAAGTGCAGCTGGTGGAGTCTGGGGGAGGCGTGGTCCAGCCTGGGAGGTCCCTGACACTGTCCTGTGCAGCCTCTCAATTCACTTTTTCGAATTATGGCATGCACTGGGTCCGCCAGGCTCCAGGCAAGGGACTGGAGTGGGTGGCAAGTGTATCAAATGATGGAACCAAGAAATATCATGGAGATTCCGTGTGGGGCCGATTCACCATCTCCAGAGACAATTCCAAGAACACACTGTTTCTACAAATGAGCAGCCTGCGAGCTGAGGACACGGCTGTATATTTCTGTGTGAGAGATCAACGAGAAGACGAGTGTGAAGAGTGGTGGTCGGATTATTATGATTTTGGGAGAGAGCTCCCTTGCCGAAAATTCCGGGGCCTGGGCCTGGCTGGAATTTTTGATATCTGGGGCCACGGGACAATGGTCACCGTCTCTTC'),
    ('VRC26.25', 'CAGGTGCAGTTGGTGGAGTCTGGGGGAGGCGTGGTCCAGCCTGGGACGTCCCTGAGACTCTCCTGTGCAGCCTCTCAATTCAGGTTTGATGGTTATGGCATGCACTGGGTCCGCCAGGCCCCAGGCAAGGGGCTGGAGTGGGTGGCATCTATATCACATGATGGAATTAAAAAGTATCACGCAGAAAAAGTGTGGGGCCGCTTCACCATCTCCAGAGACAATTCCAAGAACACACTGTATCTACAAATGAACAGCCTGCGACCTGAGGACACGGCTCTCTACTACTGTGCGAAAGATTTGCGAGAAGACGAATGTGAAGAGTGGTGGTCGGATTATTACGATTTTGGGAAACAACTCCCTTGCGCAAAGTCACGCGGCGGCTTGGTTGGAATTGCTGATAACTGGGGCCAAGGGACAATGGTCACCGTCTCTTCA'),
    ('VRC26.26', 'GAGGTGCAGCTGGTGGAGTCTGGGGGAGGCATAGTCCAGCCTGGGAGGTCCCTGACACTGTCTTGTGTAGCCTCTCAATTCGCTTTTTCGCATTATGGCATGCACTGGGTCCGCCAGGCTCCAGGCAAGGGACTGGAGTGGGTGGCCAGTATCTCAAGGGATGAGACCAAGAAATATCATGGAGATTCCGTGTGGGACCGATTCAGTATCTCCAGAGACAATTCCAAGAATACCCTGTTTCTACAAATGAACAGCCTGCGAGCTGAGGACACGGCACTATATTTCTGTGTGAGAGATCAGCGAGAAGACGAATGTGAGGAGTGGTGGTCGGACTATTATGATTTTGGGAAAGAACTCCCTTGCCGAAAATTCCGGGGCCTGGGCCTGGCTGGAATTTTTGATGTCTGGGGCCACGGGACAATGGTCATCGTCTCTTCA'),
    ('b12', 'CTCGTGCAGTCTGGGGCTGAGGTGAAGAAGCCTGGGGCCTCAGTGAAGGTTTCTTGTCAGGCTTCTGGATACAGATTCAGTAACTTTGTTATTCATTGGGTGCGCCAGGCCCCCGGACAGAGGTTTGAGTGGATGGGATGGATCAATCCTTACAACGGAAACAAAGAATTTTCAGCGAAGTTCCAGGACAGAGTCACCTTTACCGCGGACACATCCGCGAACACAGCCTACATGGAGTTGAGGAGCCTCAGATCTGCAGACACGGCTGTTTATTATTGTGCGAGAGTGGGGCCATATAGTTGGGATGATTCTCCCCAGGACAATTATTATATGGACGTCTGGGGCAAAGGGACCACGGTCATCGTCTCCTCAGCCTCCACCAAG'),
    ('3BC176', 'CAGGTGCAGCTGATGCAGTCTGGGGCTCAGCTGAGGGACCCTGGGGACTCACTGAAGATTTCCTGCAAGGCATCTGGATACAACTTCATCGACTACCATATACACTGGGTGCGACTGGCCCCTGGACGAGGGCTTGAGTGGATGGGATGGATCGACCCTGTTGGTGGTATCACAAAGTACGCAGGGCAGTTCCAGGGCAGACTCTCCTTGACCAGGGACACGTCCACGAACACACTCTTCTTGGAGCTGAGCAGACTGACAGCTGGGGACACGGCCGTGTATTTCTGTGCTAGATCGATGCGACCCGTTGATCACGGGATTGATTATTCAGGGTTGTTCGTCTTCCATTTTTGGGGCCGAGGGTCCGACGTCCTCGTCTCCTCA'),
    ('3BNC55', 'CAGGTGCAGCTGGTGCAGTCTGGGACTGCGGTGAAGAGGCCTGGGGCCTCAGTGAGGGTCTCCTGCCAGGCTTCTGGATACACCTTCACCGACTACTTTATATACTGGTGGCGACAGGCCCCTGGACAAGGGCTTGAGTGGCTGGGATGGATCAACCCTCTCACTAGTCAACCAAGCTATCCATCGAGGTTTCAGGGCAGGCTCACCTTGACCAGGGACACGTTCGACGAAATGCTCTACATGGACCTGAGGGGGCTGAGATCTGACGACACGGGCATATATTTCTGTGCGAGACGGCATTCGGACTATTGTGATTTTGATATCTGGGGCTCAGGGACACAGATCATCGTCTCTTCA'),
    ('VRC07', 'CAGGTGCGGCTGTCGCAGTCTGGGGGTCAGATGAAGAAGCCTGGGGACTCAATGAGGATCTCCTGCAGGGCTTCTGGATACGAGTTCATCAATTGTCCTATCAACTGGATCCGACTGGCCCCTGGAAAAAGGCCTGAGTGGATGGGATGGATGAAACCTAGGGGTGGTGCCGTAAGCTATGCACGGCAGTTGCAGGGCAGAGTCACCATGACCAGGGACATGTACTCAGAGACAGCCTTCTTGGAGCTGAGGAGCCTGACATCTGACGACACGGCCGTGTATTTCTGTACGAGAAAGTACTGTACTGCGAGGGATTATTATAACTGGGACTTCGAGCACTGGGGCCAGGGCACCCCGGTCACCGTCTCCTCA'),
    ('VRC-PG05', 'GAGGTGCAGCTGGTGGAGTCTGGGGGAGGCTTGGTCCAGCCTGGGGGGTCCCTGAGACTCTCCTGTGCAGCCTCTGGATTCCCCTTTAATAGGGATTGGATGACCTGGGTCCGCCAGGCTCCAGGGAAAGGGCTGGAGTGGGTGGCCAACATAAACATGGATGGAGATAAGAAAGACTATGTGGACTCTGTGAAGGGCCGATTCACCATCTCCAGAGACAACGCCAAGACCTCACTGTATCTGCAAATGAACAGCCTGAGAGCCGGGGACACGGCTGTGTATTACTGTGCGAGAATCCGGCAGGTGAGTAAGTATTTGCAGTGGTATCCCGGCGTCTTTGAAATGTGGGGCCAAGGGACCATGGTCACCGTCTCCTCA'),
    ('4E10', 'CAGGTCCAGCTGGTGCAGTCTGGGGCTGAGGTGAAGAGGCCTGGGTCCTCAGTGACGGTCTCCTGCAAGGCTTCTGGAGGCTCCTTCAGCACCTATGCTCTCAGCTGGGTGCGACAGGCCCCTGGACGAGGGCTTGAGTGGATGGGAGGGGTCATCCCTCTCCTTACTATAACAAACTACGCACCGAGGTTCCAGGGCAGAATCACGATTACCGCGGACAGATCCACGAGCACAGCCTACCTGGAGCTGAACAGCCTGAGACCTGAGGACACGGCCGTGTATTACTGTGCGAGAGGTACAACTGGATGGGGGTGGTTGGGGAAGCCCATCGGCGCGTTCGCGCACTGGGGCCAGGGCACCCTGGTCACCGTCTCCTCA'),
    ('Z13e1', 'CTGCTGGAGTCGGGCCCAGGACTGCTGAAGCCTTCGGAGACCCTGTCCCTCACCTGCACTGTCTCTGGTGGCTCCATGATTAATTACTACTGGAGCTGGATCCGGCAGCCCCCAGGGGAGAGACCGCAGTGGCTTGGGCATATCATTTACGGTGGGACCACCAAGTACAACCCCTCCCTCGAGAGTCGAATCACCATATCAAGAGACATATCCAACAGCCAGTTCTCCCTGAGGCTGAACTCTGTGAGAGCTGCGGACACGGCCATCTATTACTGTGCGAGAGTGGCGATTGGTGTTTCGGGGTTTTTGAATTACTACTATTATATGGACGTCTGGGGCAGTGGGACCGCGGTCACCGTCTCCTCA'),
    ('10-1074', 'CAGGTGCAGCTGCAGGAGTCGGGCCCAGGACTGGTGAAGCCTTCGGAGACCCTGTCCGTCACCTGCAGTGTCTCTGGTGACTCCATGAATAATTACTACTGGACCTGGATCCGGCAGTCCCCAGGGAAGGGACTGGAGTGGATTGGGTATATCTCTGACCGTGAGAGCGCCACCTACAACCCCTCCCTCAACAGTCGAGTCGTCATATCCAGAGACACGTCCAAGAACCAGTTGTCCCTGAAGCTGAACTCTGTGACCCCCGCAGACACGGCCGTGTATTACTGTGCGACTGCCAGGCGAGGACAGCGTATTTATGGTGTCGTCTCCTTTGGAGAGTTCTTCTATTATTACTCTATGGACGTCTGGGGCAAAGGGACCACGGTCACCGTCTCCTCA'),
    ('10-996', 'CAGGTGCAGCTGCAGGAGTCGGGCCCAGGACTGGTGAAGCCTTCGGAGACCCTGTCCCTCACCTGCAGTGTCTCTAATGGCTCCGTCAGTGGTCGCTTCTGGAGCTGGATCCGGCAGTCCCCAGGGAGGGGACTGGAGTGGATTGGGTATTTCTCTGACACTGAGAAATCCAACTACAACCCCTCCCTCAGGAGTCGACTCACCCTATCCGTAGACGCGTCCAAGAACCAGTTGTCCCTGAAGCTGAACTCTGTGACCGCCGCAGACTCGGCCACGTATTACTGTGCGAGAACCCAGCAGGGGAAAAGGATTTATGGTGTGGTGTCGTTTGGGGAGTTCTTCCACTACTATTATATGGACGCCTGGGGCAAAGGGACCGCGGTCACCGTCTCCTCA')
]

# Create fasta file for later: write every (antibody id, nucleotide sequence)
# pair of best_neutralizing to a single FASTA file. The file is opened in 'w'
# mode, so re-running this cell overwrites the previous copy.
with open('best_neutralizing_heavy_chain_na.fasta', 'w') as fasta_file:
    writer = fastaparser.Writer(fasta_file)
    for i, seq in best_neutralizing:
        # i is the antibody id, seq its nucleotide sequence.
        # NOTE(review): presumably fastaparser uses the first tuple element
        # as the FASTA header and the second as the sequence — confirm.
        writer.writefasta((i, seq))

Then we run these nucleotide sequences through PyIR (https://github.com/crowelab/PyIR), which provides all the data needed for training and clustering, specifically the fields listed in `relevant_fields_seq` and `relevant_fields`.

First, we need to install PyIR:


In [None]:
!git clone https://github.com/crowelab/PyIR
!cd PyIR/ ; sudo pip3 install .
!sudo pyir setup

Cloning into 'PyIR'...
remote: Enumerating objects: 1122, done.[K
remote: Counting objects: 100% (51/51), done.[K
remote: Compressing objects: 100% (51/51), done.[K
remote: Total 1122 (delta 29), reused 1 (delta 0), pack-reused 1071[K
Receiving objects: 100% (1122/1122), 128.84 MiB | 22.54 MiB/s, done.
Resolving deltas: 100% (524/524), done.
Checking out files: 100% (480/480), done.
Processing /content/PyIR
Building wheels for collected packages: pyir
  Building wheel for pyir (setup.py) ... [?25l[?25hdone
  Created wheel for pyir: filename=pyir-1.3.1-cp37-none-any.whl size=42330338 sha256=4e1aff9a51b9c6021656dfbf0798f683cfcf3f161c7ca1b5ffa3f45e03616bfe
  Stored in directory: /tmp/pip-ephem-wheel-cache-k6r3jyye/wheels/3e/4f/1f/2c5c0ee4db24f0fc063fadc893f3482682062e025a13839635
Successfully built pyir
Installing collected packages: pyir
Successfully installed pyir-1.3.1
/usr/local/lib/python3.7/dist-packages/pyir/data/bin
/usr/local/lib/python3.7/dist-packages/pyir/data/germlines


In [None]:
# Names of the per-sequence fields looked up in PyIR result records.
# The processing cells intersect `relevant_fields` with this list so that only
# fields PyIR provides are copied (e.g. 'Isotype' is not listed here).
# NOTE(review): presumably mirrors the AIRR-style keys emitted by the installed
# PyIR version — confirm when upgrading PyIR.
PyIR_fields = [
    'sequence_id',
    'sequence',
    'locus',
    'stop_codon',
    'vj_in_frame',
    'productive',
    'rev_comp',
    'complete_vdj',
    'v_call',
    'd_call',
    'j_call',
    'sequence_alignment',
    'germline_alignment',
    'sequence_alignment_aa',
    'germline_alignment_aa',
    'v_alignment_start',
    'v_alignment_end',
    'd_alignment_start',
    'd_alignment_end',
    'j_alignment_start',
    'j_alignment_end',
    'v_sequence_alignment',
    'v_sequence_alignment_aa',
    'v_germline_alignment',
    'v_germline_alignment_aa',
    'd_sequence_alignment',
    'd_sequence_alignment_aa',
    'd_germline_alignment',
    'd_germline_alignment_aa',
    'j_sequence_alignment',
    'j_sequence_alignment_aa',
    'j_germline_alignment',
    'j_germline_alignment_aa',
    'fwr1',
    'fwr1_aa',
    'cdr1',
    'cdr1_aa',
    'fwr2',
    'fwr2_aa',
    'cdr2',
    'cdr2_aa',
    'fwr3',
    'fwr3_aa',
    'fwr4',
    'fwr4_aa',
    'cdr3',
    'cdr3_aa',
    'junction',
    'junction_length',
    'junction_aa',
    'junction_aa_length',
    'v_score',
    'd_score',
    'j_score',
    'v_cigar',
    'd_cigar',
    'j_cigar',
    'v_support',
    'd_support',
    'j_support',
    'v_identity',
    'd_identity',
    'j_identity',
    'v_sequence_start',
    'v_sequence_end',
    'v_germline_start',
    'v_germline_end',
    'd_sequence_start',
    'd_sequence_end',
    'd_germline_start',
    'd_germline_end',
    'j_sequence_start',
    'j_sequence_end',
    'j_germline_start',
    'j_germline_end',
    'fwr1_start',
    'fwr1_end',
    'cdr1_start',
    'cdr1_end',
    'fwr2_start',
    'fwr2_end',
    'cdr2_start',
    'cdr2_end',
    'fwr3_start',
    'fwr3_end',
    'fwr4_start',
    'fwr4_end',
    'cdr3_start',
    'cdr3_end',
    'np1',
    'np1_length',
    'np2',
    'np2_length',
    'v_family',
    'd_family',
    'j_family',
    'cdr3_aa_length'
]

In [None]:
from pyir import PyIR
FILE = 'best_neutralizing_heavy_chain_na.fasta'

# Annotate the nucleotide sequences with PyIR. With '--outfmt dict' the run
# returns a dictionary whose keys are the FASTA ids (here: the antibody names).
pyirfiltered = PyIR(query=FILE, args=['--outfmt', 'dict'])
result = pyirfiltered.run()
save_obj(result, 'best_PyIR_result')

# Keep a human-readable copy of the raw PyIR output next to the pickle
with open("best_PyIR_result.json", "w") as write_file:
    json.dump(result, write_file, indent=4)

# Non-sequence fields that PyIR actually provides, in the fixed order of
# `relevant_fields`. (Iterating over a set intersection would give a key order
# that changes from run to run.)
available_fields = [lab for lab in relevant_fields if lab in PyIR_fields]

best_dict = dict()
# `i` counts every PyIR record, including excluded ones, so the numbering in
# the generated keys ('best_<i>_[<name>]') may contain gaps.
for i, k in enumerate(result.keys()):
    res = dict(); valid = True
    for lab in relevant_fields_seq:
        tmp = result[k].get(lab)
        if isinstance(tmp, str):
            # Purify the sequence from unwanted characters
            for ch in chars_to_remove:
                tmp = tmp.replace(ch,'')
        # A missing, non-string or (after purification) empty field means the
        # region is not available -> the whole sequence is excluded
        if not isinstance(tmp, str) or tmp == '':
            valid = False
            print('For {} the {} region couldn\'t be computed :C -> exclude from dataset'.format(k,lab))
            break
        res[lab] = tmp
    if valid:
        for lab in available_fields:
            res[lab] = result[k][lab]
        best_dict['best_'+str(i)+'_['+k+']'] = res

Splitting input fasta file best_neutralizing_heavy_chain_na.fasta
70 sequences successfully split into 7 pieces
Starting process pool using 2 processors


  parser.parse(collected_args)
  parser.parse(collected_args)
  parser.parse(collected_args)
  parser.parse(collected_args)
  parser.parse(collected_args)
  parser.parse(collected_args)
  parser.parse(collected_args)
100%|██████████| 70/70 [00:02<00:00, 26.12seq/s]

70 sequences processed in 2.76 seconds, 25 sequences / s
Analysis complete, returning dictionary





As can be seen, we have to exclude 2 more sequences (2F5 and 2G12), since PyIR is unable to compute their CDR3 region. Now create a FASTA file of the available CDR3 regions.

In [None]:
# Write each distinct CDR3 amino-acid sequence to the FASTA file exactly once
# (the first key carrying it wins). Keys whose CDR3 was already written are
# collected in `duplicates` so they can be re-attached to a cluster afterwards.
already_seen = set()
duplicates = set()
no_cdr3 = set()  # never filled in this cell; read by the sanity check after clustering
with open('best_neutralizing_heavy_chain_cdr3_aa.fasta', 'w') as cdr3_fasta:
    cdr3_writer = fastaparser.Writer(cdr3_fasta)
    for name, entry in best_dict.items():
        cdr3 = entry['cdr3_aa']
        if cdr3 in already_seen:
            duplicates.add(name)
            continue
        cdr3_writer.writefasta((name, cdr3))
        already_seen.add(cdr3)

print('Number of unique CDR3 regions =',len(already_seen))
print('Duplicates:',duplicates)

Number of unique CDR3 regions = 62
Duplicates: {'best_39_[PGT142]', 'best_53_[VRC02]', 'best_40_[PGT143]', 'best_47_[VRC-CH32]', 'best_46_[VRC-CH31]', 'best_49_[VRC-CH34]'}


The duplicates have already been removed; they will be added to the correct cluster later. We use UCLUST (https://drive5.com/usearch/manual/uclust_algo.html) to cluster the CDR3 regions; the Linux version of usearch can be downloaded from https://drive5.com/usearch/download.html. The identity threshold was chosen such that there are roughly 40 clusters, which turned out to be 0.55.

In [None]:
# Download and set up usearch (skipped when the binary is already present).
# Every shell command is checked: a failed download must not leave behind an
# empty 'usearch' file that the isfile() guard would accept on the next run.
usearch_bin = os.getcwd()+'/usearch'
if not os.path.isfile(usearch_bin):
    status = os.system('wget '+url_usearch+' -O usearch.gz')
    if status != 0:
        if os.path.exists('usearch.gz'):
            os.remove('usearch.gz')  # drop the partial download
        raise RuntimeError('Downloading usearch failed (status {})'.format(status))
    with gzip.open('usearch.gz', 'rb') as f_in:
        with open('usearch', 'wb') as f_out:
            shutil.copyfileobj(f_in, f_out)
    os.remove('usearch.gz')
    status = os.system('chmod +rwx usearch') # Change access permission
    if status != 0:
        raise RuntimeError('Could not make usearch executable (status {})'.format(status))

os.makedirs(os.getcwd()+'/usearch_results', exist_ok=True)

# One clustering run writes both outputs:
#   -centroids : FASTA file with one centroid per cluster (usearch_results/nr.fasta)
#   -clusters  : filename prefix; the members of each cluster go to
#                usearch_results/nr.fasta0, nr.fasta1, ...
status = os.system(usearch_bin+' -cluster_fast best_neutralizing_heavy_chain_cdr3_aa.fasta -id 0.55'
                   ' -centroids usearch_results/nr.fasta -clusters usearch_results/nr.fasta')
if status != 0:
    raise RuntimeError('usearch clustering failed (status {})'.format(status))

0

In [None]:
# Map cluster index -> {'centroid': id of the centroid, 'set': ids of all members}
best_clusters_dict = dict()

with open('usearch_results/nr.fasta') as fasta_file: # centroids
    parser = fastaparser.Reader(fasta_file)
    for i, seq in enumerate(parser):
        # Members of cluster i are read from the per-cluster file 'nr.fasta<i>'.
        # NOTE(review): relies on the i-th centroid in nr.fasta belonging to
        # cluster file i — confirm against the usearch documentation.
        with open('usearch_results/nr.fasta'+str(i)) as fasta_file_cluster:
            parser_cluster = fastaparser.Reader(fasta_file_cluster)
            keys_in_cluster = list(); seqs_in_cluster = set()
            for cls_seq in parser_cluster:
                keys_in_cluster.append(cls_seq.id)
                seqs_in_cluster.add(cls_seq.sequence_as_string())
            # Check if any duplicate belongs to this cluster, i.e. shares its
            # CDR3 with a member; matched duplicates are removed from `duplicates`
            to_rem = set()
            for d in duplicates:
                if best_dict[d]['cdr3_aa'] in seqs_in_cluster:
                    keys_in_cluster.append(d)
                    to_rem.add(d)
            duplicates.difference_update(to_rem)
            best_clusters_dict[i] = {
                'centroid': seq.id,
                'set': keys_in_cluster
            }
# Count the clusters actually stored instead of relying on the loop variable,
# which is undefined (or stale from an earlier cell) when no centroid was read
print('Number of clusters =',len(best_clusters_dict))

# Sanity checks: every duplicate found a cluster and every sequence is in exactly one cluster
assert len(duplicates) == 0, 'Duplicates without a cluster: {}'.format(duplicates)
s = sum(len(cluster['set']) for cluster in best_clusters_dict.values())
assert s == len(best_dict) - len(no_cdr3), \
    'Clustered {} sequences, expected {}'.format(s, len(best_dict) - len(no_cdr3))

shutil.rmtree(os.getcwd()+'/usearch_results', ignore_errors=True) # remove intermediate results

Number of clusters = 41


Save intermediate results for later use.

In [None]:
# Pickle both dictionaries below `source_folder` (file name = variable name)
for _name, _obj in (('best_clusters_dict', best_clusters_dict), ('best_dict', best_dict)):
    save_obj(_obj, source_folder+_name)

Visualize clusters.

In [None]:
# Reload the pickled results so this cell can be run on its own
best_clusters_dict = load_obj(source_folder+'best_clusters_dict')
best_dict = load_obj(source_folder+'best_dict')

# One section per cluster: the cluster index followed by the aligned
# amino-acid sequence of every member
for cluster_id, cluster in best_clusters_dict.items():
    print('------------------------------------------------------------')
    print(cluster_id,':')
    for member in cluster['set']:
        aligned_aa = best_dict[member]['sequence_alignment_aa']
        print('\t',member,':',aligned_aa)

------------------------------------------------------------
0 :
	 best_0_[10E8] : EVQLVESGGGLVKPGGSLRLSCSASGFDFDNAWMTWVRQPPGKGLEWVGRITGPGEGWSVDYAAPVEGRFTISRLNSINFLYLEMNNLRMEDSGLYFCARTGKYYDFWSGYPPGEEYFQDWGRGTLVTVSS
------------------------------------------------------------
1 :
	 best_1_[12A12] : SQHLVQSGTQVKKPGASVRISCQASGYSFTDYVLHWWRQAPGQGLEWMGWIKPVYGARNYARRFQGRINFDRDIYREIAFMDLSGLRSDDTALYFCARDGSGDDTSWHLDPWGQGTLVIVSA
------------------------------------------------------------
2 :
	 best_2_[12A21] : SQHLVQSGTQVKKPGASVRVSCQASGYTFTNYILHWWRQAPGQGLEWMGLIKPVFGAVNYARQFQGRIQLTRDIYREIAFLDLSGLRSDDTAVYYCARDESGDDLKWHLHPWGQGTQV
------------------------------------------------------------
3 :
	 best_3_[1B2530] : QVQLEQSGTAVRKPGASVTLSCQASGYNFVKYIIHWVRQKPGLGFEWVGMIDPYRGRPWSAHKFQGRLSLSRDTSMEILYMTLTSLKSDDTATYFCARAEAASDSHSRPIMFDHWGQGSLVTVSS
------------------------------------------------------------
4 :
	 best_6_[35O22] : QGQLVQSGAELKKPGASVKISCKTSGYRFNFYHINWIRQTAGRGPEWMGWISPYSGDKNLAPAFQDRVIMTTDTEVPVTS

In [None]:
best_dict = load_obj(source_folder+'best_dict')

# For every sequence print the mean of its v- and j-identity, then the
# average of these means over the whole dictionary
identity_sum = 0
for name, entry in best_dict.items():
    vj_mean = (float(entry['v_identity']) + float(entry['j_identity']))/2
    identity_sum += vj_mean
    print(name,vj_mean)

print('\nAverage v-j-identity: {}'.format(identity_sum/len(best_dict)))

best_0_[10E8] 86.1005
best_1_[12A12] 83.27699999999999
best_2_[12A21] 85.189
best_3_[1B2530] 82.939
best_6_[35O22] 78.89250000000001
best_7_[3BNC117] 77.9475
best_8_[3BNC60] 77.1545
best_9_[8ANC131] 79.6815
best_10_[8ANC195] 79.36500000000001
best_11_[CH01] 90.03399999999999
best_12_[CH02] 88.54249999999999
best_13_[CH03] 92.2205
best_14_[CH04] 90.2205
best_15_[CH103] 80.2825
best_16_[CH235.12] 79.7225
best_17_[CH98] 79.0055
best_18_[DH270.1] 97.089
best_19_[DH270.4] 90.5795
best_20_[DH270.5] 92.0155
best_21_[DH270.6] 90.0565
best_22_[HJ16] 78.026
best_23_[N6] 75.929
best_24_[NIH45-46] 76.01050000000001
best_25_[PG16] 90.05799999999999
best_26_[PG9] 90.9625
best_27_[PGDM1400] 82.70750000000001
best_28_[PGT121] 85.57050000000001
best_29_[PGT122] 87.9645
best_30_[PGT123] 84.8525
best_31_[PGT125] 77.4605
best_32_[PGT126] 82.47999999999999
best_33_[PGT127] 85.0795
best_34_[PGT128] 83.529
best_35_[PGT130] 82.4105
best_36_[PGT131] 82.50649999999999
best_37_[PGT135] 82.28
best_38_[PGT141] 88.


# Broad and non-neutralizing

The data was obtained from http://opig.stats.ox.ac.uk/webapps/oas/oas by selecting:

* Chain: Heavy
* Isotype: IGHG
* Disease: HIV-Non-Neutralizing || HIV-Broad-Neutralizing

This led to 46 files with an average size of around 150 MB for the broad-neutralizing sequences and 50 files for the non-neutralizing sequences. One file corresponds to one patient.

Steps:


1. Extract the relevant information (see the lists `relevant_fields_seq` and `relevant_fields`) from the csv files in MiAIRR format: the aligned sequence, the germline sequence and the CDR3 region.
2. Exclude sequences with incomplete data.
3. Run the clustering algorithm locally for each file separately, leading to 96 centroid files (1 for each patient). The identity threshold is defined in `cluster_identity`.
4. Merge the CDR3 centroids of the broad-neutralizing files and of the non-neutralizing files, respectively, and run the clustering again on each merged file.
5. Use the resulting sequence ids to compute the final training set.

In [None]:
# All available data about each sequence
# (field names of an OAS record; the processing cells read only the subset
# named in `relevant_fields_seq` and `relevant_fields` from each row).
# NOTE(review): presumably the column headers of the OAS csv files — confirm
# against a downloaded file.
OAS_fields = [
    'sequence',
    'locus',
    'stop_codon',
    'vj_in_frame',
    'productive',
    'rev_comp',
    'v_call',
    'd_call',
    'j_call',
    'sequence_alignment',
    'germline_alignment',
    'sequence_alignment_aa',
    'germline_alignment_aa',
    'v_alignment_start',
    'v_alignment_end',
    'd_alignment_start',
    'd_alignment_end',
    'j_alignment_start',
    'j_alignment_end',
    'v_sequence_alignment',
    'v_sequence_alignment_aa',
    'v_germline_alignment',
    'v_germline_alignment_aa',
    'd_sequence_alignment',
    'd_sequence_alignment_aa',
    'd_germline_alignment',
    'd_germline_alignment_aa',
    'j_sequence_alignment',
    'j_sequence_alignment_aa',
    'j_germline_alignment',
    'j_germline_alignment_aa',
    'fwr1',
    'fwr1_aa',
    'cdr1',
    'cdr1_aa',
    'fwr2',
    'fwr2_aa',
    'cdr2',
    'cdr2_aa',
    'fwr3',
    'fwr3_aa',
    'cdr3',
    'cdr3_aa',
    'junction',
    'junction_length',
    'junction_aa',
    'junction_aa_length',
    'v_score',
    'd_score',
    'j_score',
    'v_cigar',
    'd_cigar',
    'j_cigar',
    'v_support',
    'd_support',
    'j_support',
    'v_identity',
    'd_identity',
    'j_identity',
    'v_sequence_start',
    'v_sequence_end',
    'v_germline_start',
    'v_germline_end',
    'd_sequence_start',
    'd_sequence_end',
    'd_germline_start',
    'd_germline_end',
    'j_sequence_start',
    'j_sequence_end',
    'j_germline_start',
    'j_germline_end',
    'fwr1_start',
    'fwr1_end',
    'cdr1_start',
    'cdr1_end',
    'fwr2_start',
    'fwr2_end',
    'cdr2_start',
    'cdr2_end',
    'fwr3_start',
    'fwr3_end',
    'cdr3_start',
    'cdr3_end',
    'np1',
    'np1_length',
    'np2',
    'np2_length',
    'c_region',
    'Isotype',
    'Redundancy',
    'ANARCI_numbering',
    'ANARCI_status'
]

# Print the field names, one per line, as a quick reference
for x in OAS_fields:
    print(x)

In [None]:
# URLs obtained from OAS. All files share the same location and naming scheme,
# so only the numeric file identifiers are spelled out (one file per patient).
_OAS_ROSHKIN_URL = 'http://opig.stats.ox.ac.uk/webapps/ngsdb/json/Roshkin_2020/{}_igblastn_anarci_Heavy_IGHG.csv.gz'

# Broad-neutralizing files
urls_broad_neutrlizing = [_OAS_ROSHKIN_URL.format(file_id) for file_id in (
    '703010269', '706010383', '704010581', '700010111', '703010957',
    '707010175', '703010619', '703010547', '705010765', '707010598',
    '707010536', '706010090', '707010134', '703011749', '704010461',
    '703010874', '707010219', '702010537', '703010468', '703011852',
    '703010314', '705010741', '707010277', '713080510', '702010440',
    '702010432', '705010534', '707010585', '703010596', '702010293',
    '703011477', '703010564', '704010210', '705010782', '707010060',
    '707010562', '707010191', '707010476', '704010343', '700010333',
    '707010457', '713080258', '703010073', '704010028', '701010211',
    '707010763',
)]

# Non-neutralizing files
urls_non_neutrlizing = [_OAS_ROSHKIN_URL.format(file_id) for file_id in (
    '707010750', '705010645', '707010169', '702010514', '707010225',
    '705010303', '705010474', '704010453', '705010381', '702010350',
    '707010142', '705010830', '704010207', '703011871', '702010133',
    '702010403', '704010486', '700010501', '703010523', '704010171',
    '705010366', '704010532', '704010715', '703010234', '704010408',
    '700010329', '705010661', '702010675', '705010614', '700010516',
    '700010094', '706010391', '705010801', '702010118', '706010375',
    '700010260', '704010392', '703010539', '713080339', '707010789',
    '702010322', '707010038', '707010630', '707010156', '705010699',
    '704010566', '703010167', '703010835', '706010413', '703010632',
)]

In [None]:
# Clustering identity, the higher the more sequences (~> more redundancy)
cluster_identity = 0.80

def _run_checked(command):
    """Run `command` through the shell and raise RuntimeError on a non-zero exit status.

    Failing early gives a clear error instead of a confusing follow-up failure
    (e.g. an empty download being unpacked and parsed).
    """
    status = os.system(command)
    if status != 0:
        raise RuntimeError('Command failed (status {}): {}'.format(status, command))

# Download and set up usearch
if not os.path.isfile(os.getcwd()+'/usearch'):
    _run_checked('wget '+url_usearch+' -O usearch.gz')
    with gzip.open('usearch.gz', 'rb') as f_in:
        with open('usearch', 'wb') as f_out:
            shutil.copyfileobj(f_in, f_out)
    os.remove('usearch.gz')
    _run_checked('chmod +rwx usearch') # Change access permission

# Only these columns are read from the patient files; loading nothing else
# keeps the memory footprint of the large csv files small
needed_columns = relevant_fields_seq + relevant_fields

t0 = time.time()
# The whole pipeline runs once per class and produces one pickled dictionary each
for urls, typ in [(urls_broad_neutrlizing, 'broad'), (urls_non_neutrlizing, 'non')]:
    # rel_info: purified record of every valid sequence, keyed by sequence id
    rel_info = dict(); original_length = 0; current_num_seq = 0
    with open('all_centroids.fasta', 'w') as fasta_file_write:
        writer = fastaparser.Writer(fasta_file_write)
        for i, url in enumerate(urls):
            file_name = typ+'_patient_'+str(i)+'.csv'
            _run_checked('wget '+url+' -O '+file_name+'.gz')

            # Extract csv file
            with gzip.open(file_name+'.gz', 'rb') as f_in:
                with open(file_name, 'wb') as f_out:
                    shutil.copyfileobj(f_in, f_out)
            os.remove(file_name+'.gz')

            # header=1: the column names are on the second line of the file
            df = pd.read_csv(file_name, header=1, usecols=needed_columns)
            original_length += len(df)
            cdr3_file_name = typ+'_patient_'+str(i)+'_cdr3.fasta'
            with open(cdr3_file_name, 'w') as fasta_file_write_single:
                writer_single = fastaparser.Writer(fasta_file_write_single)
                for index, row in df.iterrows():
                    seq_id = typ+'_patient_'+str(i)+'_'+str(index)

                    # Already purify sequences
                    res = dict(); valid = True
                    for lab in relevant_fields_seq:
                        tmp = row[lab]
                        # pandas yields NaN (a float) for an empty cell:
                        # treat every non-string value as "field not available"
                        if not isinstance(tmp, str):
                            valid = False
                            break
                        for ch in chars_to_remove:
                            tmp = tmp.replace(ch,'')
                        # Exclude sequences for which not all seq.-fields are available
                        if tmp == '':
                            valid = False
                            break
                        res[lab] = tmp

                    if valid:

                        # Add non-sequence fields
                        for lab in relevant_fields:
                            res[lab] = row[lab]

                        # Store (purified) relevant information
                        rel_info[seq_id] = res

                        # Store (unpurified) cdr3 regions in fasta file for clustering
                        writer_single.writefasta((seq_id, row['cdr3_aa']))

            if os.path.getsize(cdr3_file_name): # Check if any sequence in current patient file was found

                # Cluster each cdr3 file separately
                _run_checked(os.getcwd()+'/usearch -cluster_fast '+cdr3_file_name+' -id '+str(cluster_identity)+' -centroids centroids.fasta')

                with open('centroids.fasta') as fasta_file_read:
                    parser = fastaparser.Reader(fasta_file_read)
                    for seq in parser:

                        # Merge cdr3 centroids
                        writer.writefasta((seq.id, seq.sequence_as_string()))
                        current_num_seq += 1
                # Clean up
                os.remove('centroids.fasta')

            # Clean up
            os.remove(file_name)
            os.remove(cdr3_file_name)

            sys.stdout.write('\r| {}-neutralizing | Progress {} % | Curr. num. of seq. = {} (before final clustering)'\
                             .format(typ,int(100*(i+1)/len(urls)),current_num_seq))
            sys.stdout.flush()

    # Cluster merged cdr3 centroids (also to remove possible duplicates)
    _run_checked(os.getcwd()+'/usearch -cluster_fast all_centroids.fasta -id '+str(cluster_identity)+' -centroids centroids.fasta')

    final_dict = dict()
    with open('centroids.fasta') as fasta_file:
        parser = fastaparser.Reader(fasta_file)
        for seq in parser:
            # Pick selected sequences from original dict
            final_dict[seq.id] = rel_info[seq.id]

    # Clean up and save results
    os.remove('all_centroids.fasta')
    os.remove('centroids.fasta')
    save_obj(final_dict, source_folder+typ+'-neutralizing_final_dict')
    print('\n\nTotal {}-neutralizing sequences: {} (original ds len = {})\n'.format(typ,len(final_dict),original_length))

os.remove('usearch')
t1 = time.time()
print('Process took {}'.format(format_time(t1-t0)))

| broad-neutralizing | Progress 100 % | Curr. num. of seq. = 413543 (before final clustering)

Total broad-neutralizing sequences: 409622 (original ds len = 2156066)

| non-neutralizing | Progress 100 % | Curr. num. of seq. = 387479 (before final clustering)

Total non-neutralizing sequences: 384028 (original ds len = 1655909)

Process took 00:22:44


# Others (UZH)

File received from tlemmin on 28.04.2021, named `IGHG_heavy`; it contains the UZH sequences.

In [None]:
!git clone https://github.com/thomasfraling/bnAbs-GAN.git

In [None]:
# The PyIR annotation of the UZH dataset is loaded from a cached pickle.
# To regenerate it, uncomment the lines below (and comment out the load_obj call).
# NOTE(review): the regeneration code saves to `destination_folder` while the
# cached result is loaded from `source_folder` — confirm which folder is intended.
# from pyir import PyIR
# FILE = os.getcwd()+'/bnAbs-GAN/datasets/UZH_dataset.fasta'

t0 = time.time()
# pyirfiltered = PyIR(query=FILE, args=['--outfmt', 'dict'])
# result = pyirfiltered.run()
# save_obj(result, destination_folder+'UZH_dataset_PyIR_result')
result = load_obj(source_folder+'UZH_dataset_PyIR_result')

# Non-sequence fields that PyIR provides, in the fixed order of
# `relevant_fields`. (Iterating over a set intersection would give a key order
# that changes from run to run.)
available_fields = [lab for lab in relevant_fields if lab in PyIR_fields]

final_dict = dict()
# `i` counts every record, including excluded ones, so the numbering of the
# generated keys ('UZH_dataset_<i>') may contain gaps
for i, k in enumerate(result.keys()):
    res = dict(); valid = True
    for lab in relevant_fields_seq:
        tmp = result[k].get(lab)
        # A missing or non-string field counts as "not available"
        if not isinstance(tmp, str):
            valid = False
            break
        # Purify the sequence from unwanted characters
        for ch in chars_to_remove:
            tmp = tmp.replace(ch,'')
        if tmp == '':
            valid = False
            # print('For {} the {} region couldn\' be computed :C -> exclude from dataset'.format(k,lab))
            break
        res[lab] = tmp

    if valid: # and v_j_avg < v_j_identity_threshold:

        # Add non-sequence fields
        for lab in available_fields:
            res[lab] = result[k][lab]

        # Store (purified) relevant information
        final_dict['UZH_dataset_'+str(i)] = res

print('Total number of sequences {} (original ds len = {})'.format(len(final_dict),len(result)))
save_obj(final_dict, source_folder+'UZH_dataset_dict')
t1 = time.time()
print('Process took {}'.format(format_time(t1-t0)))

Total number of sequences 12664 (original ds len = 12689)
Process took 00:00:00


# Other study (Simonich 2020)

In [None]:
# OAS files of the Simonich 2020 study; they differ only in the run identifier
urls_simonich = [
    'http://opig.stats.ox.ac.uk/webapps/ngsdb/json/Simonich_2020/{}_igblastn_anarci_Heavy_IGHG.csv.gz'.format(run_id)
    for run_id in ('SRR8321518', 'SRR8321519', 'SRR8321521', 'SRR8321520')
]

In [None]:
t0 = time.time(); final_dict = dict(); seq_counter = 0

# Only these columns are read from the csv files; loading nothing else keeps
# the memory footprint small
needed_columns = relevant_fields_seq + relevant_fields

for url in urls_simonich:

    # Fail early on a broken download instead of parsing an empty file
    status = os.system('wget '+url+' -O tmp_file.gz')
    if status != 0:
        raise RuntimeError('Download failed (status {}): {}'.format(status, url))

    # Extract csv file
    with gzip.open('tmp_file.gz', 'rb') as f_in:
        with open('tmp_file.csv', 'wb') as f_out:
            shutil.copyfileobj(f_in, f_out)
    os.remove('tmp_file.gz')

    # header=1: the column names are on the second line of the file
    df = pd.read_csv('tmp_file.csv', header=1, usecols=needed_columns)
    for index, row in df.iterrows():

        # Already purify sequences
        res = dict(); valid = True
        for lab in relevant_fields_seq:
            tmp = row[lab]
            # pandas yields NaN (a float) for an empty cell:
            # treat every non-string value as "field not available"
            if not isinstance(tmp, str):
                valid = False
                break
            for ch in chars_to_remove:
                tmp = tmp.replace(ch,'')
            # Exclude sequences for which not all seq.-fields are available
            if tmp == '':
                valid = False
                break
            res[lab] = tmp

        if valid:

            # Add non-sequence fields
            for lab in relevant_fields:
                res[lab] = row[lab]

            # Store (purified) relevant information; the counter only advances
            # for stored sequences, so the keys are numbered without gaps
            final_dict['simonich_'+str(seq_counter)] = res
            seq_counter += 1

    os.remove('tmp_file.csv')

t1 = time.time()
print('Total number of sequences = {}. Process took: {}'.format(seq_counter,format_time(t1-t0)))
save_obj(final_dict, source_folder+'simonich_dict')

Total number of sequences = 202166. Process took: 00:00:46
