In [4]:
import pandas as pd
import io, collections

from Bio import SeqIO, Entrez, SearchIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio.Blast import NCBIXML, NCBIWWW

# Contact address attached to Biopython's Entrez module.
# NOTE(review): this is a placeholder (example.com) value — presumably it
# should be replaced with a real address before any Entrez request is made;
# confirm against the Biopython Entrez documentation.
Entrez.email = "A.N.Other@example.com"

In [27]:
# Input sequences as FASTA-style text: every record is a '>'-prefixed header
# line followed by one line of residues. Three records are defined here.
sequences = '''>Platypus_H2B.O.1
WENYVYKVLKQVHPLTSISTKAVGIVDSFIDIFKRITSDASHLARYNKCSTITSREIQTAVQLMLPGELDRYAGSEGTKAITKYTT
>Platypus_H2B.O.2
YSIYVYKVLKQVHPLTSISTKAVGIMDSFINDIFERIASEASRLARYNKRSTITSREIQTAVLLTLPGELARHAVSEGTKAITKYTS
>Platypus_H2B.O.3
YSIYVYKVLKQVHPLTSISTKAVGIMDSFINDIFDRIASEASRLARYTKRSTIASREIQTAVLLTLPGELARHAVSEGTKAITKYTS'''
# Echo the raw text so the exact input is recorded in the cell output.
print(sequences)

>Platypus_H2B.O.1
WENYVYKVLKQVHPLTSISTKAVGIVDSFIDIFKRITSDASHLARYNKCSTITSREIQTAVQLMLPGELDRYAGSEGTKAITKYTT
>Platypus_H2B.O.2
YSIYVYKVLKQVHPLTSISTKAVGIMDSFINDIFERIASEASRLARYNKRSTITSREIQTAVLLTLPGELARHAVSEGTKAITKYTS
>Platypus_H2B.O.3
YSIYVYKVLKQVHPLTSISTKAVGIMDSFINDIFDRIASEASRLARYTKRSTIASREIQTAVLLTLPGELARHAVSEGTKAITKYTS


In [28]:
def _parse_fasta(text):
    """Split FASTA-formatted text into a list of ``{'name', 'sequence'}`` dicts.

    A record starts at each ``>``: the remainder of that line is the name and
    every following line up to the next ``>`` belongs to the sequence.
    Sequence lines are stripped and concatenated, so a record wrapped over
    several lines is read in full rather than cut off after its first line.
    Text before the first ``>`` and records that are entirely blank are
    ignored.

    Parameters
    ----------
    text : str
        FASTA-formatted text.

    Returns
    -------
    list of dict
        One ``{'name': str, 'sequence': str}`` entry per record, in input order.
    """
    records = []
    for entry in text.split('>')[1:]:
        if not entry.strip():
            continue  # stray '>' with nothing after it
        # splitlines() also copes with '\r\n' line endings
        header, *body = entry.splitlines()
        records.append({
            'name': header.strip(),
            'sequence': ''.join(part.strip() for part in body),
        })
    return records


df = pd.DataFrame(_parse_fasta(sequences))
# Index the rows by record name; the 'name' column itself is kept as well,
# so later cells can address rows with df.loc[<name>, ...].
df.index = df.name
df

Unnamed: 0_level_0,name,sequence
name,Unnamed: 1_level_1,Unnamed: 2_level_1
Platypus_H2B.O.1,Platypus_H2B.O.1,WENYVYKVLKQVHPLTSISTKAVGIVDSFIDIFKRITSDASHLARY...
Platypus_H2B.O.2,Platypus_H2B.O.2,YSIYVYKVLKQVHPLTSISTKAVGIMDSFINDIFERIASEASRLAR...
Platypus_H2B.O.3,Platypus_H2B.O.3,YSIYVYKVLKQVHPLTSISTKAVGIMDSFINDIFDRIASEASRLAR...


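## BLAST sequences

The cells below do not run the BLAST search themselves; they load a previously saved hit table (`A8MVKD6P013-Alignment-HitTable.csv`).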

In [6]:
# First look at the saved BLAST hit table. The CSV is read without a header
# row, so the columns are labelled by position (0, 1, 2, ...).
# NOTE(review): the columns look like the usual BLAST tabular layout (query,
# subject, % identity, alignment length, mismatches, gap opens, q. start,
# q. end, s. start, s. end, e-value, bit score, ...) — confirm against the
# export before relying on these meanings.
df_blast = pd.read_csv("A8MVKD6P013-Alignment-HitTable.csv", header=None)
df_blast.head(5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
0,Platypus_H2B.O.1,XP_001521160.2,100.0,86,0,0,1,86,27,112,4.51e-56,179,100.0
1,Platypus_H2B.O.1,XP_001511074.1,84.524,84,12,1,4,86,40,123,3.41e-39,136,90.48
2,Platypus_H2B.O.1,XP_038606306.1,83.333,84,13,1,4,86,40,123,8.26e-38,133,89.29
3,Platypus_H2B.O.1,MXQ84858.1,75.0,84,20,1,4,86,41,124,2.46e-37,132,90.48
4,Platypus_H2B.O.1,XP_028926540.1,82.143,84,14,1,4,86,40,123,3.62e-37,131,86.9


In [29]:
# Reload the hit table from disk and keep only its first eight columns (0-7).
df_blast = pd.read_csv("A8MVKD6P013-Alignment-HitTable.csv", header=None).iloc[:, :8]

# Keep rows where column 2 is exactly 100, column 6 is 1 and column 7 equals
# column 3.
# NOTE(review): presumably "100 % identity, alignment starts at query position
# 1 and ends at the alignment length" — confirm the column meanings.
is_exact = df_blast[2].eq(100) & df_blast[6].eq(1) & df_blast[7].eq(df_blast[3])
df_blast = df_blast[is_exact]

# For each remaining row, store column 1 in df['accession'] for the df row
# named in column 0, but only when column 3 equals the length of the stored
# sequence. Rows are processed in table order, so if several rows qualify for
# the same name the later one overwrites the earlier one.
for query, subject, _identity, aln_len, *_rest in df_blast.itertuples(index=False, name=None):
    if len(df.loc[query, 'sequence']) == aln_len:
        df.at[query, 'accession'] = subject
df_blast

Unnamed: 0,0,1,2,3,4,5,6,7
0,Platypus_H2B.O.1,XP_001521160.2,100.0,86,0,0,1,86
101,Platypus_H2B.O.2,XP_001511074.1,100.0,87,0,0,1,87
202,Platypus_H2B.O.3,XP_028926540.1,100.0,87,0,0,1,87
204,Platypus_H2B.O.3,XP_028926523.1,100.0,87,0,0,1,87
205,Platypus_H2B.O.3,XP_028926523.1,100.0,87,0,0,1,87


In [30]:
# Shape of the filtered hit table, next to the number of distinct values in
# column 0; a row count above the distinct count means some value repeats.
(df_blast.shape, len(set(df_blast[0].tolist())))

((5, 8), 3)

In [31]:
# Rows whose column-0 value already appeared in an earlier row (the first
# occurrence of each value is not flagged).
repeat_mask = df_blast[0].duplicated()
df_blast.loc[repeat_mask]

Unnamed: 0,0,1,2,3,4,5,6,7
204,Platypus_H2B.O.3,XP_028926523.1,100.0,87,0,0,1,87
205,Platypus_H2B.O.3,XP_028926523.1,100.0,87,0,0,1,87


In [32]:
# Every row — first occurrence included — for each column-0 value that occurs
# more than once in the filtered hit table.
repeated_queries = df_blast.loc[df_blast[0].duplicated(), 0]
df_blast.loc[df_blast[0].isin(repeated_queries)]

Unnamed: 0,0,1,2,3,4,5,6,7
202,Platypus_H2B.O.3,XP_028926540.1,100.0,87,0,0,1,87
204,Platypus_H2B.O.3,XP_028926523.1,100.0,87,0,0,1,87
205,Platypus_H2B.O.3,XP_028926523.1,100.0,87,0,0,1,87


In [33]:
# Accession currently stored in df for each repeated column-0 value; the
# result has one entry per repeated hit row, so names can appear several times.
repeated_queries = df_blast.loc[df_blast[0].duplicated(), 0]
df.loc[repeated_queries, 'accession']

name
Platypus_H2B.O.3    XP_028926523.1
Platypus_H2B.O.3    XP_028926523.1
Name: accession, dtype: object

In [34]:
# Print every accession that is not tied to exactly one distinct sequence in
# df; no output means each accession maps to a single sequence.
for accession in df['accession'].unique():
    linked_sequences = df.loc[df['accession'] == accession, 'sequence']
    if len(linked_sequences.unique()) != 1:
        print(accession)

In [35]:
# Show the column labels currently present in df.
df.columns

Index(['name', 'sequence', 'accession'], dtype='object')

In [36]:
# Number of distinct sequence strings held in df.
len(df['sequence'].unique())

3

In [37]:
# Display the table with its name, sequence and accession columns.
df

Unnamed: 0_level_0,name,sequence,accession
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Platypus_H2B.O.1,Platypus_H2B.O.1,WENYVYKVLKQVHPLTSISTKAVGIVDSFIDIFKRITSDASHLARY...,XP_001521160.2
Platypus_H2B.O.2,Platypus_H2B.O.2,YSIYVYKVLKQVHPLTSISTKAVGIMDSFINDIFERIASEASRLAR...,XP_001511074.1
Platypus_H2B.O.3,Platypus_H2B.O.3,YSIYVYKVLKQVHPLTSISTKAVGIMDSFINDIFDRIASEASRLAR...,XP_028926523.1


In [39]:
# Write the table to 'h2bo_histones.csv'. index=False leaves the index out of
# the file, so only the regular columns are written.
df.to_csv('h2bo_histones.csv', index=False)