In [1]:
import csv
import json


def tid(tree_id: int) -> str:
        """
        Generate TreeID from tree_id in database
        Since postgresql serial is 4*8 bit, if id is too big,
        alter table and return raw number.
        TID format: 'T00A100000'
            first letter, 'T'
            2-4, digit or capital letter, 0-9 and A-Z
            5-10, digit
        Args:
            tree_id: tree_id, postgresql serial number

        Returns:
            tid: str
        """
        if tree_id is None:
             print(tree_id)
             return 'T00000000'
        max_n = min(36 ** 3 * 100_000, 2 ** (8 * 4 - 1))
        prefix = 'T'
        n = 1_000_00
        if tree_id >= max_n:
            return 'T' + str(tree_id)
        # 0-9 and A-Z
        base = 26 + 10

        a, b = divmod(tree_id, n)
        letters = ''
        while a > 0:
            a, digit = divmod(a, base)
            if digit < 10:
                letters = str(digit) + letters
            else:
                letters = chr(ord('A') + (digit - 10)) + letters
        return prefix + f'{letters:>03}' + f'{b:05d}'

In [None]:
# use sql to output table from treebase

In [None]:
"""psql
copy (
select to_json(t) from (
select tmp.name_txt as root, trees.study_id, study.doi,
	treefile.treefile_id, trees.tree_id, trees.tree_label, trees.tree_title, trees.tree_type, trees.tree_kind, 
	trees.root as root_id, trees.tree_quality, trees.upload_date, trees.tree_type_new
from trees inner join study on study.study_id=trees.study_id
inner join treefile on trees.tree_id=treefile.tree_id
left join (select * from ncbi_names where name_class='scientific name') as tmp on trees.root=tmp.tax_id
) as t)
to 'r:\\test.json';
"""

In [2]:
trees = list()
with open('r:\\test.json', 'r') as _:
    for line in _:
        # handle escape character from psql
        line2 = line.replace(r'\\', '\\')
        x = json.loads(line2)
        x['tree_file'] = tid(x['tree_id']) + '.nwk'
        trees.append(x)
print(trees[214:216])
json.dump(trees, open('r:\\tree.json', 'w'), indent=True)
print(len(trees))

[{'root': 'Trachelium', 'study_id': 709, 'doi': '10.1006/lich.2001.0325', 'treefile_id': 7047, 'tree_id': 401, 'tree_label': 'Fig. 3b, mtSSU+28S', 'tree_title': 'Coccotremataceae', 'tree_type': 'Single', 'tree_kind': 'Species Tree', 'root_id': 16464, 'tree_quality': 'Unrated', 'upload_date': None, 'tree_type_new': 'treebase', 'tree_file': 'T00000401.nwk'}, {'root': None, 'study_id': 724, 'doi': '10.1006/mpev.2000.0840', 'treefile_id': 7070, 'tree_id': 403, 'tree_label': 'Fig. 4', 'tree_title': 'Phalacrocoracidae', 'tree_type': 'Single', 'tree_kind': 'Species Tree', 'root_id': 16531, 'tree_quality': 'Unrated', 'upload_date': None, 'tree_type_new': 'treebase', 'tree_file': 'T00000403.nwk'}]
135502


In [None]:
"""psql
copy (
	select to_json(t) from 
	  (select tree_id,newick from treefile) as t)
to 'r:\\treefile.json';
"""

In [4]:
treefile = list()
with open('r://treefile.json', 'r') as _:
    for line in _:
        line2 = line.replace(r'\\', '\\')
        t = json.loads(line2)
        filename = tid(t['tree_id']) + '.nwk'
        with open(f'r://out/{filename}', 'w') as out_:
            out_.write(t['newick'])

In [None]:
"""psql
copy (
	select to_json(t) from (
select array_agg(treefile.tree_id) as tree_files, study.* from study
left join trees on study.study_id = trees.study_id
left join treefile on trees.tree_id = treefile.tree_id
group by study.study_id
order by study.study_id) as t)
to 'r:\\paper.raw';
"""

In [5]:
paper = list()
with open('r://paper.raw', 'r') as _:
    for line in _:
        line2 = line.replace(r'\\', '\\')
        t = json.loads(line2)
        t['tree_files'] = [tid(tf)+'.nwk' for tf in t['tree_files']]
        paper.append(t)
print(len(paper))        
                           



None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None


In [6]:
with open(f'r://paper.json', 'w') as out_:
    json.dump(paper, out_, indent=True)

In [None]:
"""
1. import to database
2. output merged final json (paper.json,tree.json) and newick treefiles
3. statistics
"""


In [8]:
with open(f'r://paper.json') as _:
    data2 = json.load(_)
header = data2.pop()
header['tree_files'] = len(header['tree_files'])
print(header)
with open(r'r://for_draw.csv', 'w', newline='') as out:
    writer = csv.DictWriter(out, fieldnames=list(header.keys()))
    writer.writeheader()
    writer.writerow(header)
    for record in data2:
        record['tree_files'] = len(record['tree_files'])
        writer.writerow(record)


{'tree_files': 1, 'study_id': 26168, 'pub_type': None, 'author': 'Charles Ross Lindsey, Andrew H. Knoll, Matthew D. Herron & Frank Rosenzweig', 'year': 2024, 'title': 'Fossil-calibrated molecular clock data enable reconstruction of steps leading to differentiated multicellularity and anisogamy in the Volvocine algae', 'journal': 'BMC Biology', 's_author': None, 's_title': None, 'place_pub': None, 'publisher': None, 'volume': None, 'number': None, 'pages': None, 'isbn': None, 'keywords': '', 'abstract': 'Throughout its nearly four-billion-year history, life has undergone evolutionary transitions in which simpler subunits have become integrated to form a more complex whole. Many of these transitions opened the door to innovations that resulted in increased biodiversity and/or organismal efficiency. The evolution of multicellularity from unicellular forms represents one such transition, one that paved the way for cellular differentiation, including differentiation of male and female gamet

In [None]:
# count journal

'''sql
SELECT COUNT(DISTINCT journal)
FROM study;
'''

