In [1]:
%cd '..'

/mnt/scratch/v_kulikov/histonedb_Vladimir/CURATED_SET


In [2]:
import inspect
import os
import re
import sys

import pandas as pd
from Bio import Entrez, SeqIO
from curated_set_services import CuratedSet

In [3]:
# Load data from histones.csv
curated_set = CuratedSet()
cs = curated_set

cs.data.shape, cs.data.columns

((564, 16),
 Index(['accession', 'type', 'variant_group', 'variant', 'doublet', 'gi',
        'ncbi_gene_id', 'hgnc_gene_name', 'taxonomy_id', 'organism', 'phylum',
        'class', 'taxonomy_group', 'info', 'references', 'sequence'],
       dtype='object'))

In [4]:
list(cs.has_duplicates())

[]

In [5]:
cs.data[cs.data["accession"] == "P16889.3"]

Unnamed: 0,accession,type,variant_group,variant,doublet,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,references,sequence
P16889.3,P16889.3,H2B,late_H2B,late_H2B.3__???,,,,,7668,Strongylocentrotus purpuratus,Echinodermata,Echinoidea,,,3697096,MPAKAQAAGKKGSKKAKAPKPSGDKKRRRKRKESYGIYIYKVLKQV...


In [6]:
cs.data.head()

Unnamed: 0,accession,type,variant_group,variant,doublet,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,references,sequence
NP_505463.1,NP_505463.1,H2A,cH2A,cH2A_(Animals),,17562014,,,6239,Caenorhabditis elegans,Nematoda,Chromadorea,,,26989147 22650316(?),MSGRGKGGKAKTGGKAKSRSSRAGLQFPVGRLHRILRKGNYAQRVG...
EEC09557.1,EEC09557.1,H2A,cH2A,cH2A_(Animals),,215500063,,,6945,Ixodes scapularis,Arthropoda,Arachnida,,,26989147 22650316(?),MSGRGKGGKVKGKSKTRSSRAGLQFPVGRIHRLLRKGNYAERVGAG...
NP_724343.1,NP_724343.1,H2A,cH2A,cH2A_(Animals),,24585673,,,7227,Drosophila melanogaster,Arthropoda,Insecta,,,26989147 22650316(?),MSGRGKGGKVKGKAKSRSNRAGLQFPVGRIHRLLRKGNYAERVGAG...
XP_001119899.1,XP_001119899.1,H2A,cH2A,cH2A_(Animals),,110764935,,,7460,Apis mellifera,Arthropoda,Insecta,,,26989147 22650316(?),MSGRGKGGKAKAKAKSRSNRAGLQFPVGRIHRLLRKGNYAERVGAG...
EDO48405.1,EDO48405.1,H2A,cH2A,cH2A_(Animals),,156227602,,,45351,Nematostella vectensis,Cnidaria,Anthozoa,,,26989147 22650316(?),MSGRGKGKAKGTKSKTRSSRAGLQFPVGRIHRHLRKGNYAERVGAG...


In [7]:
cs.data.shape

(564, 16)

In [8]:
late_h2b_accessions = [
    "P16889.3",
]

In [9]:
df = pd.DataFrame(
    {
        "accession": late_h2b_accessions,
        "type": ["H2B"] * len(late_h2b_accessions),
        "variant_group": ["late_H2B_(Echinoidea)"] * len(late_h2b_accessions),
        "variant": ["late_H2B_(Echinoidea)"] * len(late_h2b_accessions),
        "references": ["3697096"] * len(late_h2b_accessions),
    }
)
df.index = df.accession
df.shape, df.columns

((1, 5),
 Index(['accession', 'type', 'variant_group', 'variant', 'references'], dtype='object'))

In [10]:
df.head()

Unnamed: 0_level_0,accession,type,variant_group,variant,references
accession,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
P16889.3,P16889.3,H2B,late_H2B_(Echinoidea),late_H2B_(Echinoidea),3697096


In [11]:
cs.data = pd.concat([cs.data, df]).fillna("")
cs.data.shape, list(cs.has_duplicates())

((565, 16), ['P16889.3'])

In [12]:
cs.data.tail()

Unnamed: 0,accession,type,variant_group,variant,doublet,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,references,sequence
NP_005309.1,NP_005309.1,H1,H1.0,H1.0_(Homo_sapiens)__???,,,3005.0,H1-0,9606.0,Homo sapiens,Chordata,Mammalia,Mammalia,,26689747,MTENSTSAPAAKPKRAKASKKSTDHPKYSDMIVAAIQAEKNRAGSS...
NP_005311.1,NP_005311.1,H1,H1.3,H1.3_(Homo_sapiens)__???,,,3007.0,H1-3,9606.0,Homo sapiens,Chordata,Mammalia,Mammalia,,26689747,MSETAPLAPTIPAPAEKTPVKKKAKKAGATAGKRKASGPPVSELIT...
NP_861453.1,NP_861453.1,H1,H1.7,H1.7_(Homo_sapiens)__???,,,341567.0,H1-7,9606.0,Homo sapiens,Chordata,Mammalia,Mammalia,,26689747,MEQALTGEAQSRWPRRGGSGAMAEAPGPSGESRGHSATQLPAEKTV...
NP_005312.1,NP_005312.1,H1,H1.4,H1.4_(Homo_sapiens)__???,,,3008.0,H1-4,9606.0,Homo sapiens,Chordata,Mammalia,Mammalia,,26689747,MSETAPAAPAAPAPAEKTPVKKKARKSAGAAKRKASGPPVSELITK...
P16889.3,P16889.3,H2B,late_H2B_(Echinoidea),late_H2B_(Echinoidea),,,,,,,,,,,3697096,


In [13]:
cs.data[cs.data["variant"] == "late_H2B_(Echinoidea)"].shape

(2, 16)

In [14]:
cs.update_accession_version()
cs.data = cs.data.set_index(cs.data.accession.values)

Email address is not specified.

To make use of NCBI's E-utilities, NCBI requires you to specify your
email address with each request.  As an example, if your email address
is A.N.Other@example.com, you can specify it as follows:
   from Bio import Entrez
   Entrez.email = 'A.N.Other@example.com'
In case of excessive usage of the E-utilities, NCBI will attempt to contact
a user at the email address provided before blocking access to the
E-utilities.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  updating_data['accession'] = new_accessions


In [15]:
cs.data[cs.data["variant"] == "late_H2B_(Echinoidea)"].shape

(2, 16)

In [16]:
curated_set.update_taxids(blank_data=True)

Fetched taxid from NCBI 7668
 changes to 7668
 changes to Strongylocentrotus purpuratus
 changes to Echinodermata
 changes to Echinoidea


In [17]:
cs.data[cs.data["variant"] == "late_H2B_(Echinoidea)"].shape

(2, 16)

In [19]:
cs.data[cs.data["variant"] == "late_H2B_(Echinoidea)"]

Unnamed: 0,accession,type,variant_group,variant,doublet,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,references,sequence
P16889.3,P16889.3,H2B,late_H2B_(Echinoidea),late_H2B_(Echinoidea),,,,,7668,Strongylocentrotus purpuratus,Echinodermata,Echinoidea,,,3697096,
P16888.2,P16888.2,H2B,late_H2B_(Echinoidea),late_H2B_(Echinoidea),,,,,7668,Strongylocentrotus purpuratus,Echinodermata,Echinoidea,,,3697096,MPAKAQPAGKKGSKKAKAPRPSGGKKRRRRRKESYGIYIYKVLKQV...


## Updating sequences for late_H2B_(Echinoidea)

In [20]:
cs.data[cs.data['sequence'] == '']

Unnamed: 0,accession,type,variant_group,variant,doublet,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,references,sequence
P16889.3,P16889.3,H2B,late_H2B_(Echinoidea),late_H2B_(Echinoidea),,,,,7668,Strongylocentrotus purpuratus,Echinodermata,Echinoidea,,,3697096,


In [21]:
cs.update_sequence(blank_data=True)

Downloading FASTA SeqRecords by ACCESSIONs from NCBI
Fetching 1 seqs


Email address is not specified.

To make use of NCBI's E-utilities, NCBI requires you to specify your
email address with each request.  As an example, if your email address
is A.N.Other@example.com, you can specify it as follows:
   from Bio import Entrez
   Entrez.email = 'A.N.Other@example.com'
In case of excessive usage of the E-utilities, NCBI will attempt to contact
a user at the email address provided before blocking access to the
E-utilities.


Sequence for P16889.3 changes from  to MPAKAQAAGKKGSKKAKAPKPSGDKKRRRKRKESYGIYIYKVLKQVHPDTGISSRAMSIMNSFVNDVFERIAAEASRLAHYNKKSTITSREVQTAVRLLLPGELAKHAVSEGTKAVTKYTTSK
Sequences updated: 1


In [22]:
cs.data[cs.data["variant"] == "late_H2B_(Echinoidea)"]

Unnamed: 0,accession,type,variant_group,variant,doublet,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,references,sequence
P16889.3,P16889.3,H2B,late_H2B_(Echinoidea),late_H2B_(Echinoidea),,,,,7668,Strongylocentrotus purpuratus,Echinodermata,Echinoidea,,,3697096,MPAKAQAAGKKGSKKAKAPKPSGDKKRRRKRKESYGIYIYKVLKQV...
P16888.2,P16888.2,H2B,late_H2B_(Echinoidea),late_H2B_(Echinoidea),,,,,7668,Strongylocentrotus purpuratus,Echinodermata,Echinoidea,,,3697096,MPAKAQPAGKKGSKKAKAPRPSGGKKRRRRRKESYGIYIYKVLKQV...


In [23]:
cs.save()

variant_group  self                  late_H2B
               other    late_H2B_(Echinoidea)
variant        self           late_H2B.3__???
               other    late_H2B_(Echinoidea)
Name: P16889.3, dtype: object
cp histones.csv backups/histones.csv-Mar2624081852
Previous data backuped to backups/histones.csv-Mar2624081852
Results saved to histones.csv
