In [1]:
%cd '..'

/mnt/scratch/l_singh/hdb/project_dir/histonedb/CURATED_SET


In [2]:
import inspect
import os
import re
import sys

import pandas as pd
from Bio import Entrez, SeqIO
from curated_set_services import CuratedSet

In [3]:
# Load data from histones.csv
curated_set = CuratedSet()
cs = curated_set

cs.data.shape, cs.data.columns

((555, 16),
 Index(['accession', 'type', 'variant_group', 'variant', 'doublet', 'gi',
        'ncbi_gene_id', 'hgnc_gene_name', 'taxonomy_id', 'organism', 'phylum',
        'class', 'taxonomy_group', 'info', 'references', 'sequence'],
       dtype='object'))

In [4]:
my_columns = cs.data.columns

In [5]:
list(cs.has_duplicates())

[]

In [6]:
cs.data.head()

Unnamed: 0,accession,type,variant_group,variant,doublet,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,references,sequence
NP_505463.1,NP_505463.1,H2A,cH2A,cH2A_(Animals),,17562014,,,6239,Caenorhabditis elegans,Nematoda,Chromadorea,,,26989147 22650316(?),MSGRGKGGKAKTGGKAKSRSSRAGLQFPVGRLHRILRKGNYAQRVG...
EEC09557.1,EEC09557.1,H2A,cH2A,cH2A_(Animals),,215500063,,,6945,Ixodes scapularis,Arthropoda,Arachnida,,,26989147 22650316(?),MSGRGKGGKVKGKSKTRSSRAGLQFPVGRIHRLLRKGNYAERVGAG...
NP_724343.1,NP_724343.1,H2A,cH2A,cH2A_(Animals),,24585673,,,7227,Drosophila melanogaster,Arthropoda,Insecta,,,26989147 22650316(?),MSGRGKGGKVKGKAKSRSNRAGLQFPVGRIHRLLRKGNYAERVGAG...
XP_001119899.1,XP_001119899.1,H2A,cH2A,cH2A_(Animals),,110764935,,,7460,Apis mellifera,Arthropoda,Insecta,,,26989147 22650316(?),MSGRGKGGKAKAKAKSRSNRAGLQFPVGRIHRLLRKGNYAERVGAG...
EDO48405.1,EDO48405.1,H2A,cH2A,cH2A_(Animals),,156227602,,,45351,Nematostella vectensis,Cnidaria,Anthozoa,,,26989147 22650316(?),MSGRGKGKAKGTKSKTRSSRAGLQFPVGRIHRHLRKGNYAERVGAG...


In [7]:
cs.data.shape

(555, 16)

## Данные были взяты из статьи DOI:10.5772/intechopen.81409. С помощью Blastp были найдены accessions и записаны в curated_service/H2BZ.fasta

In [8]:
h2bz_sequences = SeqIO.index("curated_service/H2BZ.fasta", "fasta")
cs.data.loc[cs.data.index.intersection(list(h2bz_sequences.keys())), :]

Unnamed: 0,accession,type,variant_group,variant,doublet,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,references,sequence
XP_001349046.1,XP_001349046.1,H2B,H2B.Z,H2B.Z,,124511826,,,36329,Plasmodium falciparum 3D7,Apicomplexa,Aconoidasida,,,,MSGKGPAQKSQAAKKTAGKTLGPRHKRKRRTESFSLYIFKVLKQVH...
XP_002369740.1,XP_002369740.1,H2B,H2B.Z,H2B.Z,,237840885,,,508771,Toxoplasma gondii ME49,Apicomplexa,Conoidasida,,,,MSGKGPAQKSQAAKKTAGKSLGPRYRRRKRTESFALYIYKVLKQVH...


In [12]:
list(map(lambda x: str(x.seq), h2bz_sequences.values()))

['MSGKGPAQKSQAAKKTAGKTLGPRHKRKRRTESFSLYIFKVLKQVHPETGVTKKSMNIMNSFINDIFDRLVTEATRLIRYNKKRTLSSREIQTAVRLLLPGELSKHAVSEGTKAVTKYTTSAA',
 'MSGKGPAQKSQAAKKTAGKSLGPRYRRRKRTESFALYIYKVLKQVHPETGVSKKSMSIMNSFINDIFDRLADEAVRLIRYNKKRTLSSREIQTAVRLLLPGELSKHAVSEGTKAVTKYTTSGA',
 'MSGKGPAQKSQAAKKTAGKSLGPRYRRRKRTESFALYIYKVLKQVHPETGVSKKSMSIMNSFINDIFDRLADEAVRLIRYNKKRTLSSREIQTAVRLLLPGELSKHAVSEGTKAVSKYTTSGA',
 'MPGKGPAEKRQAAKKTAGKTPAEAGKKRRRKRTESFALYIYKVLKQVHPETGISKKSMSIMNSFINDIFDRMATEATNLIRFNKKKTLSSREVQTSVRLMLPGELSKHAVSEGTKAVTKYTTAAGN',
 'MSGKVPSSKSQAAKKTAGKSLGIRYRRKKRIESFSLYIYKVLKQVHPETGVSKKSMSIMNSFINDIFDRMALEATRLIRYNKKSTLSSREIQTAVRLLLPGELSKHAVSEGTKAVTKYTTSGA',
 'MSGKVPSTKSQAAKKTAGKTLGVRYRRKKRIESFALYIYKVLKQVHPETGVSKKSMSIMNSFINDIFDRLALEATRLIRYNKKSTLSSREIQTAVRLLLPGELSKHAVSEGTKAVTKYTTSGV',
 'MSGKGGKQQLAKKTAANKLPSHHLDKNKKRRRRGETFSIYIYKVLRQVQPKLGMSRKSMAIMNSFINDIFERLATEAVKLIQYNKKRTLSSREMQTSVRLLLPGELSKHAATEGAKAVEKYENRPIA',
 'MSGKSGKSIKGPAQKQQAAKKTAGKSPADGGKRKRRKRTESFALYIYKVLKQVHPETGISKKSMSIMNSFINDVFDRLSAEAVKLVQYNKKRTLS

In [13]:
df = pd.DataFrame(
    {
        "accession": list(h2bz_sequences.keys()),
        "type": ["H2B"] * len(h2bz_sequences),
        "variant_group": ["H2B.Z"] * len(h2bz_sequences),
        "variant": ["H2B.Z"] * len(h2bz_sequences),
        "references": ["DOI:10.5772/intechopen.81409"] * len(h2bz_sequences),
        "sequence": list(map(lambda x: str(x.seq), h2bz_sequences.values())),
    }
)
df.index = df.accession
# df.at[cs.data.index.intersection(list(h2bz_sequences.keys())), "references"] = (
#     "DOI:10.5772/intechopen.81409 "
#     + cs.data.loc[cs.data.index.intersection(list(h2bz_sequences.keys())), "references"]
# )
df.shape, df.columns

((8, 6),
 Index(['accession', 'type', 'variant_group', 'variant', 'references',
        'sequence'],
       dtype='object'))

In [14]:
df.head()

Unnamed: 0_level_0,accession,type,variant_group,variant,references,sequence
accession,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
XP_001349046.1,XP_001349046.1,H2B,H2B.Z,H2B.Z,DOI:10.5772/intechopen.81409,MSGKGPAQKSQAAKKTAGKTLGPRHKRKRRTESFSLYIFKVLKQVH...
XP_002369740.1,XP_002369740.1,H2B,H2B.Z,H2B.Z,DOI:10.5772/intechopen.81409,MSGKGPAQKSQAAKKTAGKSLGPRYRRRKRTESFALYIYKVLKQVH...
XP_013228334.1,XP_013228334.1,H2B,H2B.Z,H2B.Z,DOI:10.5772/intechopen.81409,MSGKGPAQKSQAAKKTAGKSLGPRYRRRKRTESFALYIYKVLKQVH...
CEM32013.1,CEM32013.1,H2B,H2B.Z,H2B.Z,DOI:10.5772/intechopen.81409,MPGKGPAEKRQAAKKTAGKTPAEAGKKRRRKRTESFALYIYKVLKQ...
XP_001610608.1,XP_001610608.1,H2B,H2B.Z,H2B.Z,DOI:10.5772/intechopen.81409,MSGKVPSSKSQAAKKTAGKSLGIRYRRKKRIESFSLYIYKVLKQVH...


In [16]:
# cs.data = cs.data[cs.data['accession'] != cs.data.index.intersection(g_h2b_accessions).values[0]]
# cs.data.shape
df = df[~df['accession'].isin(cs.data.index.intersection(list(h2bz_sequences.keys())).values)]
df.shape

(6, 6)

In [17]:
df

Unnamed: 0_level_0,accession,type,variant_group,variant,references,sequence
accession,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
XP_013228334.1,XP_013228334.1,H2B,H2B.Z,H2B.Z,DOI:10.5772/intechopen.81409,MSGKGPAQKSQAAKKTAGKSLGPRYRRRKRTESFALYIYKVLKQVH...
CEM32013.1,CEM32013.1,H2B,H2B.Z,H2B.Z,DOI:10.5772/intechopen.81409,MPGKGPAEKRQAAKKTAGKTPAEAGKKRRRKRTESFALYIYKVLKQ...
XP_001610608.1,XP_001610608.1,H2B,H2B.Z,H2B.Z,DOI:10.5772/intechopen.81409,MSGKVPSSKSQAAKKTAGKSLGIRYRRKKRIESFSLYIYKVLKQVH...
HISTDB_H2B_Z_0,HISTDB_H2B_Z_0,H2B,H2B.Z,H2B.Z,DOI:10.5772/intechopen.81409,MSGKVPSTKSQAAKKTAGKTLGVRYRRKKRIESFALYIYKVLKQVH...
XP_011128492.1,XP_011128492.1,H2B,H2B.Z,H2B.Z,DOI:10.5772/intechopen.81409,MSGKGGKQQLAKKTAANKLPSHHLDKNKKRRRRGETFSIYIYKVLR...
XP_628349.1,XP_628349.1,H2B,H2B.Z,H2B.Z,DOI:10.5772/intechopen.81409,MSGKSGKSIKGPAQKQQAAKKTAGKSPADGGKRKRRKRTESFALYI...


In [18]:
cs.data = pd.concat([cs.data, df]).fillna("")
cs.data.shape, list(cs.has_duplicates())

((561, 16), [])

In [19]:
cs.data.tail()

Unnamed: 0,accession,type,variant_group,variant,doublet,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,references,sequence
CEM32013.1,CEM32013.1,H2B,H2B.Z,H2B.Z,,,,,,,,,,,DOI:10.5772/intechopen.81409,MPGKGPAEKRQAAKKTAGKTPAEAGKKRRRKRTESFALYIYKVLKQ...
XP_001610608.1,XP_001610608.1,H2B,H2B.Z,H2B.Z,,,,,,,,,,,DOI:10.5772/intechopen.81409,MSGKVPSSKSQAAKKTAGKSLGIRYRRKKRIESFSLYIYKVLKQVH...
HISTDB_H2B_Z_0,HISTDB_H2B_Z_0,H2B,H2B.Z,H2B.Z,,,,,,,,,,,DOI:10.5772/intechopen.81409,MSGKVPSTKSQAAKKTAGKTLGVRYRRKKRIESFALYIYKVLKQVH...
XP_011128492.1,XP_011128492.1,H2B,H2B.Z,H2B.Z,,,,,,,,,,,DOI:10.5772/intechopen.81409,MSGKGGKQQLAKKTAANKLPSHHLDKNKKRRRRGETFSIYIYKVLR...
XP_628349.1,XP_628349.1,H2B,H2B.Z,H2B.Z,,,,,,,,,,,DOI:10.5772/intechopen.81409,MSGKSGKSIKGPAQKQQAAKKTAGKSPADGGKRKRRKRTESFALYI...


In [20]:
cs.data[cs.data["variant"] == "H2B.Z"].shape

(9, 16)

In [21]:
cs.update_accession_version()
cs.data = cs.data.set_index(cs.data.accession.values)

Email address is not specified.

To make use of NCBI's E-utilities, NCBI requires you to specify your
email address with each request.  As an example, if your email address
is A.N.Other@example.com, you can specify it as follows:
   from Bio import Entrez
   Entrez.email = 'A.N.Other@example.com'
In case of excessive usage of the E-utilities, NCBI will attempt to contact
a user at the email address provided before blocking access to the
E-utilities.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  updating_data['accession'] = new_accessions


In [22]:
cs.data[cs.data["variant"] == "H2B.Z"].shape

(9, 16)

In [23]:
curated_set.update_taxids(blank_data=True)

Fetched taxid from NCBI 5802
Fetched taxid from NCBI 1169540
Fetched taxid from NCBI 484906
Fetched taxid from NCBI 110365
Fetched taxid from NCBI 353152
 changes to 5802
 changes to Eimeria tenella
 changes to Apicomplexa
 changes to Conoidasida
 changes to 1169540
 changes to Vitrella brassicaformis CCMP3155
 changes to None
 changes to None
 changes to 484906
 changes to Babesia bovis T2Bo
 changes to Apicomplexa
 changes to Aconoidasida
 changes to 110365
 changes to Gregarina niphandrodes
 changes to Apicomplexa
 changes to Conoidasida
 changes to 353152
 changes to Cryptosporidium parvum Iowa II
 changes to Apicomplexa
 changes to Conoidasida


In [24]:
cs.data[cs.data["variant"] == "H2B.Z"].shape

(9, 16)

In [25]:
cs.data[cs.data["variant"] == "H2B.Z"]

Unnamed: 0,accession,type,variant_group,variant,doublet,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,references,sequence
XP_013228334.1,XP_013228334.1,H2B,H2B.Z,H2B.Z,,,,,5802.0,Eimeria tenella,Apicomplexa,Conoidasida,,,DOI:10.5772/intechopen.81409,MSGKGPAQKSQAAKKTAGKSLGPRYRRRKRTESFALYIYKVLKQVH...
CEM32013.1,CEM32013.1,H2B,H2B.Z,H2B.Z,,,,,1169540.0,Vitrella brassicaformis CCMP3155,,,,,DOI:10.5772/intechopen.81409,MPGKGPAEKRQAAKKTAGKTPAEAGKKRRRKRTESFALYIYKVLKQ...
XP_001610608.1,XP_001610608.1,H2B,H2B.Z,H2B.Z,,,,,484906.0,Babesia bovis T2Bo,Apicomplexa,Aconoidasida,,,DOI:10.5772/intechopen.81409,MSGKVPSSKSQAAKKTAGKSLGIRYRRKKRIESFSLYIYKVLKQVH...
XP_011128492.1,XP_011128492.1,H2B,H2B.Z,H2B.Z,,,,,110365.0,Gregarina niphandrodes,Apicomplexa,Conoidasida,,,DOI:10.5772/intechopen.81409,MSGKGGKQQLAKKTAANKLPSHHLDKNKKRRRRGETFSIYIYKVLR...
XP_628349.1,XP_628349.1,H2B,H2B.Z,H2B.Z,,,,,353152.0,Cryptosporidium parvum Iowa II,Apicomplexa,Conoidasida,,,DOI:10.5772/intechopen.81409,MSGKSGKSIKGPAQKQQAAKKTAGKSPADGGKRKRRKRTESFALYI...
XP_678689.1,XP_678689.1,H2B,H2B.Z,H2B.Z,,68073549.0,,,5823.0,Plasmodium berghei ANKA,Apicomplexa,Aconoidasida,,,,MSGKGPAQKSQAAKKTAGKTLGPRHKRKRRTESFSLYIFKVLKQVH...
XP_001349046.1,XP_001349046.1,H2B,H2B.Z,H2B.Z,,124511826.0,,,36329.0,Plasmodium falciparum 3D7,Apicomplexa,Aconoidasida,,,,MSGKGPAQKSQAAKKTAGKTLGPRHKRKRRTESFSLYIFKVLKQVH...
XP_002369740.1,XP_002369740.1,H2B,H2B.Z,H2B.Z,,237840885.0,,,508771.0,Toxoplasma gondii ME49,Apicomplexa,Conoidasida,,,,MSGKGPAQKSQAAKKTAGKSLGPRYRRRKRTESFALYIYKVLKQVH...
HISTDB_H2B_Z_0,HISTDB_H2B_Z_0,H2B,H2B.Z,H2B.Z,,,,,,,,,,,DOI:10.5772/intechopen.81409,MSGKVPSTKSQAAKKTAGKTLGVRYRRKKRIESFALYIYKVLKQVH...


In [26]:
cs.save()

Added sequence with accession XP_001610608.1
Added sequence with accession HISTDB_H2B_Z_0
Added sequence with accession XP_011128492.1
Added sequence with accession XP_013228334.1
Added sequence with accession XP_628349.1
Added sequence with accession CEM32013.1
cp histones.csv backups/histones.csv-Mar0624162721
Previous data backuped to backups/histones.csv-Mar0624162721
Results saved to histones.csv
