In [7]:
import csv
import os
import subprocess
from contextlib import closing
from urllib.parse import urlencode

import numpy as np
import pandas as pd
import requests
from Bio import SeqIO
from Bio.Align.Applications import MuscleCommandline

In [2]:
# Search parameters for the BOLD public API `combined` endpoint.
geo = "Massachusetts"
taxon = "Insecta"
API = "http://www.boldsystems.org/index.php/API_Public/combined?"
# urlencode escapes spaces and reserved characters in the values, so a
# region such as "New York" still produces a well-formed query string.
tax_request = API + urlencode({"taxon": taxon, "geo": geo, "format": "tsv"})
print(tax_request)

http://www.boldsystems.org/index.php/API_Public/combined?taxon=Insecta&geo=Massachusetts&format=tsv


In [3]:
# Stream the API response to result.tsv line by line so the full payload
# never has to be held in memory.
with requests.get(tax_request, stream=True) as response:
    # Fail loudly on an HTTP error status instead of saving the error page
    # as if it were data; checked before the output file is created so a
    # failed request does not truncate an existing result.tsv.
    response.raise_for_status()
    with open("result.tsv", "wb") as file:
        for line in response.iter_lines():
            # iter_lines() yields each line without its terminator,
            # so write one back after every line.
            file.write(line)
            file.write(b"\n")

In [25]:
# Load the downloaded records. on_bad_lines="warn" replaces the
# error_bad_lines=False flag (deprecated in pandas 1.3, removed in 2.0):
# malformed rows are reported and skipped rather than raising.
test = pd.read_csv("result.tsv", delimiter="\t", engine="python", on_bad_lines="warn")
# Force every sequence to str so the per-character checks in later cells
# work on all rows (a missing value becomes the literal string "nan").
test["nucleotides"] = test["nucleotides"].astype(str)

In [26]:
# Dimensions (rows, columns) of the table as loaded, before any filtering.
test.shape

(3079, 80)

In [27]:
# Keep only the records whose marker_codes mention COI-5P; rows with a
# missing marker_codes value count as non-matching (na=False).
has_coi5p = test["marker_codes"].str.contains("COI-5P", na=False)
test = test.loc[has_coi5p]
test.shape

(2743, 80)

In [35]:
# Distinct per-sequence counts of alphabetic characters (non-letters are ignored).
test["nucleotides"].map(lambda seq: sum(1 for ch in seq if ch.isalpha())).unique()

In [32]:
# Minimum number of alphabetic characters a sequence must contain to be
# kept. Characters for which str.isalpha() is false are not counted
# (presumably alignment gaps -- confirm against the data source).
MIN_SEQUENCE_LENGTH = 350

# Count letters on the nucleotides column alone: a row-wise
# DataFrame.apply(axis=1) builds a Series for every row just to read
# one field, and the comparison already yields the boolean mask.
letter_counts = test["nucleotides"].map(
    lambda sequence: sum(char.isalpha() for char in sequence)
)
test = test[letter_counts >= MIN_SEQUENCE_LENGTH]
test.shape

(2647, 80)

In [33]:
# Distinct counts of alphabetic characters per sequence in the current table.
(
    test["nucleotides"]
    .apply(lambda seq: sum(map(str.isalpha, seq)))
    .unique()
)

array([658, 612, 407, 610, 874, 662, 619, 677, 603, 426, 419, 632, 679,
       556, 633, 577, 592, 621, 699, 628, 615, 571, 537, 627, 630, 476,
       637, 576, 602, 634, 546, 643, 429, 514, 578, 563, 570, 626, 647,
       636, 618, 393, 652, 641, 532, 600, 623, 620, 657, 625, 565, 650,
       639, 622, 551, 665, 646, 635, 648, 656, 557, 640, 527, 587, 644,
       573, 631, 585, 550, 614, 559, 589, 663, 509, 655, 414, 594, 698,
       682, 518, 465, 654, 642, 378, 416, 478, 390, 351, 544, 604, 361,
       555, 593, 678, 608, 575, 572, 696, 389, 653, 605, 403, 588, 638,
       566, 376, 613, 651, 624, 467, 466, 538, 695, 526, 516, 607, 553,
       541, 591, 430, 360, 692, 540, 580, 554, 410, 595, 534, 597, 498,
       601, 507, 489, 561, 606, 586, 645, 649, 543, 583, 379, 629, 617,
       529, 596, 599, 431, 542, 562, 683], dtype=int64)

In [36]:
# Distinct raw string lengths, counting every character (letters or not).
test["nucleotides"].map(len).unique()

array([658, 874, 662, 677, 648, 679, 699, 735, 649, 665, 663, 656, 698,
       682, 666, 678, 696, 605, 383, 695, 630, 692, 680, 655, 664, 683],
      dtype=int64)

In [8]:
# Export the processid, species_name and nucleotides columns to a
# tab-separated file, without the DataFrame index.
export_columns = ["processid", "species_name", "nucleotides"]
test.loc[:, export_columns].to_csv("check.tsv", sep="\t", index=False)

In [9]:
# Write one FASTA record per row: a ">processid" header line followed by
# the sequence on its own line.
with open("out_fasta.fasta", "w") as fasta_out:
    for process_id, sequence in zip(test["processid"], test["nucleotides"]):
        fasta_out.write(">{}\n".format(process_id))
        fasta_out.write(sequence)
        fasta_out.write("\n")


In [40]:

# Parse out_fasta.fasta and keep all of its records in a list.
# `alignment` is the parser's iterator and is exhausted by the line below.
alignment = SeqIO.parse("out_fasta.fasta", format="fasta")
records = [record for record in alignment]


In [8]:
# Run MUSCLE directly through subprocess instead of the Biopython
# command-line wrapper, which Biopython has declared obsolete in favour of
# building the command and invoking it via subprocess.
# TODO: wrap this in a function that tells the user when the process has
# finished and returns a value indicating success or an aborted run.
alignment_call = [
    os.path.realpath("niclassify/applications/muscle3.8.31_i86win32.exe"),
    "-in", os.path.realpath("out_fasta.fasta"),
    "-out", "aligned.fasta",
]
# check=True raises CalledProcessError on a non-zero exit status, so a
# failed alignment cannot go unnoticed; output is captured as text.
completed = subprocess.run(
    alignment_call,
    stdin=subprocess.DEVNULL,
    stdout=subprocess.PIPE,
    stderr=subprocess.PIPE,
    universal_newlines=True,
    check=True,
)
# Same (stdout, stderr) pair that calling the wrapper object returned.
call_results = (completed.stdout, completed.stderr)
print(call_results)

In [None]:
import csv
# Open the downloaded TSV in text mode and create a tab-delimited csv reader over it.
# NOTE(review): nothing in the visible code consumes `reader` yet.
with open("result.tsv", "r") as file:
    reader = csv.reader(file, delimiter="\t")