In [1]:
import pandas as pd
import io, collections

from Bio import SeqIO, Entrez, SearchIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio.Blast import NCBIXML, NCBIWWW

Entrez.email = "A.N.Other@example.com"

In [2]:
df = pd.read_csv('h2am_histones.csv')
df = df[df['Name'].str.contains('H2A.M')]
df['accession'] = ['']*df.shape[0]
df.index = df.Name
df

Unnamed: 0_level_0,Species,Name,Sequence,accession
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Pa_H2A.M,Picea abies,Pa_H2A.M,MEPATQGSGGRGGKKKPVSKSERAGLQFPVGRLARYLKKGRYAKRV...,
Sm_H2A.M,Selaginella moellendorffii,Sm_H2A.M,MVVQGGGRKGKKKSVSKSARAGLQFPVGRLARYLKNGRYAKRVGSG...,
Pp_H2A.M2,Physcomitrella patens,Pp_H2A.M2,MSGRGKGAGAAARKKSVTKSAKAGLQFPVGRLGRYLKKGRYAQRVG...,
Pp_H2A.M3,Physcomitrella patens,Pp_H2A.M3,MSGRGKGAGAAARKKSVSRSAKAGLQFPVGRLGRYLKKGRYARRVG...,
Pp_H2A.M1,Physcomitrella patens,Pp_H2A.M1,MSGRGKGAGAAARKKSVTKSAKAGLQFPVGRLGRYLKKGRYAQRVG...,
Mp_H2A.M1,Marchantia polymorpha,Mp_H2A.M1,MSARSGTTAVKKKPVSKSQKAGLQFPVGRMARFLKNGRYAKRIGAG...,
Mp_H2A.M2,Marchantia polymorpha,Mp_H2A.M2,MSGRGHSAKAKRKAISKSARAGLQFPVGRLARYLKNGRYAKRVGAG...,


In [3]:
set(df['Species'])

{'Marchantia polymorpha',
 'Physcomitrella patens',
 'Picea abies',
 'Selaginella moellendorffii'}

In [4]:
species_iter = iter(set(df['Species']))

In [17]:
species = next(species_iter)
sequences = "\n".join([SeqRecord(Seq(row['Sequence']), id=row['Name'], name=row['Name'],
                                 description=f"organism={row['Species']}").format("fasta") for i, row in df[df['Species']==species].iterrows()])
print(sequences)

StopIteration: 

## ... Blast sequences explicitly for each species

In [15]:
df_blast = pd.read_csv(f"{species.replace(' ', '_')}-Alignment-HitTable.csv", header=None)
df_blast = df_blast.iloc[:,:8]
df_blast = df_blast[(df_blast[2]==100) & (df_blast[6]==1) & (df_blast[7]==df_blast[3])]
for i, row in df_blast.iterrows():
    if len(df.loc[row[0], 'Sequence']) != row[3]: continue
    df.at[row[0], 'accession'] = row[1]
df_blast

Unnamed: 0,0,1,2,3,4,5,6,7


In [16]:
df[df['Species']==species]

Unnamed: 0_level_0,Species,Name,Sequence,accession
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Sm_H2A.M,Selaginella moellendorffii,Sm_H2A.M,MVVQGGGRKGKKKSVSKSARAGLQFPVGRLARYLKNGRYAKRVGSG...,


## Give HISTDB_H2A_M_\<Number\> accession for not found sequences 

In [18]:
df[df['accession']==''].shape

(3, 4)

In [19]:
histdb_accessions = iter([f'HISTDB_H2A_M_{i}' for i in range(df[df['accession']==''].shape[0])])

In [20]:
df['accession'] = [a if a else next(histdb_accessions) for a in df['accession']]
df[df['accession'].str.startswith('HISTDB')].shape

(3, 4)

In [21]:
df[df['accession'].str.startswith('HISTDB')]

Unnamed: 0_level_0,Species,Name,Sequence,accession
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Pa_H2A.M,Picea abies,Pa_H2A.M,MEPATQGSGGRGGKKKPVSKSERAGLQFPVGRLARYLKKGRYAKRV...,HISTDB_H2A_M_0
Sm_H2A.M,Selaginella moellendorffii,Sm_H2A.M,MVVQGGGRKGKKKSVSKSARAGLQFPVGRLARYLKNGRYAKRVGSG...,HISTDB_H2A_M_1
Mp_H2A.M2,Marchantia polymorpha,Mp_H2A.M2,MSGRGHSAKAKRKAISKSARAGLQFPVGRLARYLKNGRYAKRVGAG...,HISTDB_H2A_M_2


In [22]:
df.head()

Unnamed: 0_level_0,Species,Name,Sequence,accession
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Pa_H2A.M,Picea abies,Pa_H2A.M,MEPATQGSGGRGGKKKPVSKSERAGLQFPVGRLARYLKKGRYAKRV...,HISTDB_H2A_M_0
Sm_H2A.M,Selaginella moellendorffii,Sm_H2A.M,MVVQGGGRKGKKKSVSKSARAGLQFPVGRLARYLKNGRYAKRVGSG...,HISTDB_H2A_M_1
Pp_H2A.M2,Physcomitrella patens,Pp_H2A.M2,MSGRGKGAGAAARKKSVTKSAKAGLQFPVGRLGRYLKKGRYAQRVG...,XP_024376581.1
Pp_H2A.M3,Physcomitrella patens,Pp_H2A.M3,MSGRGKGAGAAARKKSVSRSAKAGLQFPVGRLGRYLKKGRYARRVG...,XP_024376580.1
Pp_H2A.M1,Physcomitrella patens,Pp_H2A.M1,MSGRGKGAGAAARKKSVTKSAKAGLQFPVGRLGRYLKKGRYAQRVG...,XP_024377711.1


In [23]:
len(set(df['accession']))

7

In [24]:
c = collections.Counter(df['accession'])
[ci[0] for ci in c.most_common() if ci[1]>1]

[]

In [25]:
df.to_csv('h2am_histones.csv', index=False)