# In [2]:
import lxml
from lxml import etree
from lxml.html.clean import clean_html
import csv
class mysql_dialect(csv.unix_dialect):
    """CSV dialect derived from ``csv.unix_dialect`` with backslash escaping.

    Every setting is inherited from ``csv.unix_dialect`` except the two
    below: special characters inside a field are protected by an escape
    character instead of by doubling the quote character.
    """

    # A single backslash removes the special meaning of the next character.
    escapechar = "\\"
    # A quote inside a field is not represented by two consecutive quotes.
    doublequote = False


# Register under a name so readers can select it with dialect='mysql_dialect'.
csv.register_dialect("mysql_dialect", mysql_dialect)
    
def collect_articles(tree):
    """Gather the article codes of every ``издание`` element in *tree*.

    Returns a pair of equally long lists: the text of each element's
    ``код`` child, and the same values right-aligned to a width of five
    characters using ``0`` as the fill character.
    """
    raw_codes = [
        edition.find('код').text
        for edition in tree.getroot().iter('издание')
    ]
    padded_codes = []
    for code in raw_codes:
        # "0>5": fill with '0', right-align, minimum width 5.
        padded_codes.append("{:0>5}".format(code))
    return raw_codes, padded_codes
        
def save_tree(tree, fname='fixture.xml'):
    """Write *tree* to the file *fname* as pretty-printed, UTF-8 encoded XML."""
    write_options = {'encoding': 'utf-8', 'pretty_print': True}
    tree.write(fname, **write_options)
    
def open_tree(fname='oneCdata.xml'):
    """Parse the XML file *fname* with ``etree.parse`` and return the result."""
    parsed = etree.parse(fname)
    return parsed
    
def index_books(tree):
    """Map the ``код`` text of each ``издание`` element to the element itself.

    When several elements carry the same code, the one visited last wins.
    """
    by_code = {}
    for edition in tree.getroot().iter("издание"):
        code = edition.find('код').text
        by_code[code] = edition
    return by_code

def parse_site_data(hash1, hash2, book_index, processor):
    """Pass every row of ``cl_book.csv`` that matches a known book to *processor*.

    A row matches when its ``n_code`` value, stripped of surrounding
    whitespace, equals an entry of *hash2* or an entry of *hash1*. A match
    in *hash2* takes priority and is translated to the *hash1* entry at the
    same position (the first such position if the value occurs repeatedly).
    Rows that match neither list are ignored.

    Args:
        hash1: sequence of book codes; these are the keys of *book_index*.
        hash2: sequence parallel to *hash1* holding an alternative spelling
            of each code (``collect_articles`` produces the zero-padded form).
        book_index: mapping from a *hash1* code to the book object.
        processor: callable invoked as ``processor(book, row)`` for each
            matching row; *row* is the mapping yielded by ``csv.DictReader``.

    Raises:
        KeyError: if the file has no ``n_code`` column or a matched code is
            missing from *book_index*.
    """
    # Build both lookups once: list membership / list.index() inside the
    # row loop would rescan the lists for every CSV row.
    first_position = {}
    for position, alias in enumerate(hash2):
        # setdefault keeps the first position, like list.index() does.
        first_position.setdefault(alias, position)
    known_codes = set(hash1)

    # newline='' is what the csv module requires of file objects; without it
    # line breaks embedded in quoted fields are silently rewritten.
    with open('cl_book.csv', newline='') as f:
        for row in csv.DictReader(f, dialect='mysql_dialect'):
            article = row['n_code'].strip()
            position = first_position.get(article)
            if position is not None:
                code = hash1[position]
            elif article in known_codes:
                code = article
            else:
                continue
            processor(book_index[code], row)

def process_book(book, site_data):
    """Append one child element to *book* for each entry of *site_data*.

    Every key becomes the tag of a new element and its value the element's
    text. A ``None`` key cannot serve as a tag: it is reported on standard
    output together with its 1-based position and then skipped.
    """
    for position, (field, value) in enumerate(site_data.items(), 1):
        if field is not None:
            child = etree.Element(field)
            child.text = value
            book.append(child)
        else:
            print("error with book %s - %s" % (position, site_data))
        
def index_site_categories():
    """Load ``cl_catalog.csv`` and index its rows by their ``site_id`` column.

    Returns:
        A dict mapping each row's ``site_id`` value to the row itself (the
        column-name-to-value mapping yielded by ``csv.DictReader``). When a
        ``site_id`` occurs more than once, the last row wins.

    Raises:
        KeyError: if the file has no ``site_id`` column.
    """
    # newline='' is what the csv module requires of file objects; without it
    # line breaks embedded in quoted fields are silently rewritten.
    with open('cl_catalog.csv', newline='') as f:
        cat_reader = csv.DictReader(f, dialect='mysql_dialect')
        # Materialise the dict while the file is still open.
        return {row['site_id']: row for row in cat_reader}
            
                
def build_site_categories_url(cat_index):
    """Replace each category's ``site_url`` with its full slash-joined path.

    The path is made of the category's own ``site_url`` preceded by the
    ``site_url`` of each ancestor, outermost first. Ancestors are found by
    following ``site_parent_id`` through *cat_index* until the id is ``'0'``
    or is not a key of *cat_index*.

    Args:
        cat_index: mapping from category id to a mutable mapping with at
            least the keys ``site_url`` and ``site_parent_id``. The
            ``site_url`` values are modified in place.

    Returns:
        None.

    Note:
        Calling this twice on the same index expands the paths twice,
        because the second call reads the already expanded values.
    """
    # Snapshot the original single-segment values first. Reading site_url
    # from the live dicts while overwriting them would pick up ancestors'
    # already expanded paths and duplicate segments, depending on order.
    segments = {cat_id: cat['site_url'] for cat_id, cat in cat_index.items()}

    for cat_id, cat in cat_index.items():
        path = [segments[cat_id]]
        visited = {cat_id}
        parent_id = cat['site_parent_id']
        # Stop at the root marker, at a dangling parent reference, or when
        # the parent chain loops back on itself (which would never end).
        while (
            parent_id != '0'
            and parent_id in cat_index
            and parent_id not in visited
        ):
            visited.add(parent_id)
            path.append(segments[parent_id])
            parent_id = cat_index[parent_id]['site_parent_id']
        cat['site_url'] = '/'.join(reversed(path))
        
def inject_site_categories(book_index, cat_index):
    """Attach a ``site_categories`` element to every book that has a ``book_cid``.

    The text of the book's first ``book_cid`` child is read as a
    comma-separated list of category ids. For each non-empty id found in
    *cat_index* a ``site_category`` child is created whose ``url`` attribute
    is the category's ``site_url`` and whose text is its ``site_title``.
    The container is appended even when it ends up with no children.
    Books without a ``book_cid`` child are left untouched.

    Args:
        book_index: mapping whose values are the book elements to extend.
        cat_index: mapping from category id to a mapping with the keys
            ``site_url`` and ``site_title``.
    """
    for book in book_index.values():
        id_tag = book.find("book_cid")
        if id_tag is None:
            continue
        container = etree.Element("site_categories")
        # An empty <book_cid/> has text None; treat it as an empty id list
        # instead of failing on None.split().
        for raw_id in (id_tag.text or '').split(','):
            cat_id = raw_id.strip()
            if not cat_id:
                continue
            cat = cat_index.get(cat_id)
            if cat is None:
                continue
            el = etree.Element("site_category", url=cat['site_url'])
            el.text = cat['site_title']
            container.append(el)
        book.append(container)
            
def clean_site_annotations(tree):
    """Run ``clean_html`` over the text of every ``book_description`` element.

    Elements whose text is ``None`` or empty are left as they are.
    """
    for description in tree.getroot().iter('book_description'):
        markup = description.text
        if not markup:
            continue
        description.text = clean_html(markup)
                
            
def make_fixtures():
    """Merge the site CSV data into the source XML and save the result.

    Runs the whole pipeline with each helper's default file names: parse the
    XML, copy matching ``cl_book.csv`` rows into the books, attach category
    information from ``cl_catalog.csv``, clean the descriptions, and write
    the tree back out.
    """
    tree = open_tree()

    # Codes as they appear in the XML plus their zero-padded counterparts.
    codes, padded_codes = collect_articles(tree)
    books_by_code = index_books(tree)

    # Copy the columns of every matching site row into its book element.
    parse_site_data(codes, padded_codes, books_by_code, process_book)

    # Category paths must be expanded before they are attached to books.
    categories = index_site_categories()
    build_site_categories_url(categories)
    inject_site_categories(books_by_code, categories)

    clean_site_annotations(tree)
    save_tree(tree)

# In [3]:
# Run the pipeline. With the helpers' default arguments this reads
# oneCdata.xml, cl_book.csv and cl_catalog.csv and writes fixture.xml,
# all as relative paths (i.e. resolved against the working directory).
make_fixtures()