In [1]:
import re
import requests
import pandas as pd
import numpy as np
from time import sleep
from bs4 import BeautifulSoup
from datetime import date
from datetime import datetime
from dateutil import parser

In [2]:
regex = '"slug":"(.*?)".*?"link":"(.*?)".*?"title":{"rendered":"(.*?)"}.*?"content":{"rendered":"(.*?)"}'

ppp = 100
pages = round(754/100)

In [3]:
# https://github.com/matthewwithanm/python-markdownify
from bs4 import BeautifulSoup, NavigableString
import re
import six

convert_heading_re = re.compile(r'convert_h(\d+)')
line_beginning_re = re.compile(r'^', re.MULTILINE)
whitespace_re = re.compile(r'[\r\n\s\t ]+')

# Heading styles
ATX = 'atx'
ATX_CLOSED = 'atx_closed'
UNDERLINED = 'underlined'
SETEXT = UNDERLINED

def escape(text):
    if not text:
        return ''
    return text.replace('_', r'\_')


def chomp(text):
    """
    If the text in an inline tag like b, a, or em contains a leading or trailing
    space, strip the string and return a space as suffix of prefix, if needed.
    This function is used to prevent conversions like
        <b> foo</b> => ** foo**
    """
    prefix = ' ' if text and text[0] == ' ' else ''
    suffix = ' ' if text and text[-1] == ' ' else ''
    text = text.strip()
    return (prefix, suffix, text)


def _todict(obj):
    return dict((k, getattr(obj, k)) for k in dir(obj) if not k.startswith('_'))


class MarkdownConverter(object):
    class DefaultOptions:
        strip = None
        convert = None
        autolinks = True
        heading_style = UNDERLINED
        bullets = '*+-'  # An iterable of bullet types.

    class Options(DefaultOptions):
        pass

    def __init__(self, **options):
        # Create an options dictionary. Use DefaultOptions as a base so that
        # it doesn't have to be extended.
        self.options = _todict(self.DefaultOptions)
        self.options.update(_todict(self.Options))
        self.options.update(options)
        if self.options['strip'] is not None and self.options['convert'] is not None:
            raise ValueError('You may specify either tags to strip or tags to'
                             ' convert, but not both.')

    def convert(self, html):
        soup = BeautifulSoup(html, 'html.parser')
        return self.process_tag(soup, children_only=True)

    def process_tag(self, node, children_only=False):
        text = ''

        # Convert the children first
        for el in node.children:
            if isinstance(el, NavigableString):
                text += self.process_text(six.text_type(el))
            else:
                text += self.process_tag(el)

        if not children_only:
            convert_fn = getattr(self, 'convert_%s' % node.name, None)
            if convert_fn and self.should_convert_tag(node.name):
                text = convert_fn(node, text)

        return text

    def process_text(self, text):
        return escape(whitespace_re.sub(' ', text or ''))

    def __getattr__(self, attr):
        # Handle headings
        m = convert_heading_re.match(attr)
        if m:
            n = int(m.group(1))

            def convert_tag(el, text):
                return self.convert_hn(n, el, text)

            convert_tag.__name__ = 'convert_h%s' % n
            setattr(self, convert_tag.__name__, convert_tag)
            return convert_tag

        raise AttributeError(attr)

    def should_convert_tag(self, tag):
        tag = tag.lower()
        strip = self.options['strip']
        convert = self.options['convert']
        if strip is not None:
            return tag not in strip
        elif convert is not None:
            return tag in convert
        else:
            return True

    def indent(self, text, level):
        return line_beginning_re.sub('\t' * level, text) if text else ''

    def underline(self, text, pad_char):
        text = (text or '').rstrip()
        return '%s\n%s\n\n' % (text, pad_char * len(text)) if text else ''

    def convert_a(self, el, text):
        prefix, suffix, text = chomp(text)
        if not text:
            return ''
        href = el.get('href')
        title = el.get('title')
        if self.options['autolinks'] and text == href and not title:
            # Shortcut syntax
            return '<%s>' % href
        title_part = ' "%s"' % title.replace('"', r'\"') if title else ''
        return '%s[%s](%s%s)%s' % (prefix, text, href, title_part, suffix) if href else text

    def convert_b(self, el, text):
        return self.convert_strong(el, text)

    def convert_blockquote(self, el, text):
        return '\n' + line_beginning_re.sub('> ', text) if text else ''

    def convert_br(self, el, text):
        return '  \n'

    def convert_em(self, el, text):
        prefix, suffix, text = chomp(text)
        if not text:
            return ''
        return '%s*%s*%s' % (prefix, text, suffix)

    def convert_hn(self, n, el, text):
        style = self.options['heading_style']
        text = text.rstrip()
        if style == UNDERLINED and n <= 2:
            line = '=' if n == 1 else '-'
            return self.underline(text, line)
        hashes = '#' * n
        if style == ATX_CLOSED:
            return '%s %s %s\n\n' % (hashes, text, hashes)
        return '%s %s\n\n' % (hashes, text)

    def convert_i(self, el, text):
        return self.convert_em(el, text)

    def convert_list(self, el, text):
        nested = False
        while el:
            if el.name == 'li':
                nested = True
                break
            el = el.parent
        if nested:
            # remove trailing newline if nested
            return '\n' + self.indent(text, 1).rstrip()
        return '\n' + text + '\n'

    convert_ul = convert_list
    convert_ol = convert_list

    def convert_li(self, el, text):
        parent = el.parent
        if parent is not None and parent.name == 'ol':
            if parent.get("start"):
                start = int(parent.get("start"))
            else:
                start = 1
            bullet = '%s.' % (start + parent.index(el))
        else:
            depth = -1
            while el:
                if el.name == 'ul':
                    depth += 1
                el = el.parent
            bullets = self.options['bullets']
            bullet = bullets[depth % len(bullets)]
        return '%s %s\n' % (bullet, text or '')

    def convert_p(self, el, text):
        return '%s\n\n' % text if text else ''

    def convert_strong(self, el, text):
        prefix, suffix, text = chomp(text)
        if not text:
            return ''
        return '%s**%s**%s' % (prefix, text, suffix)

    def convert_img(self, el, text):
        alt = el.attrs.get('alt', None) or ''
        src = el.attrs.get('src', None) or ''
        title = el.attrs.get('title', None) or ''
        title_part = ' "%s"' % title.replace('"', r'\"') if title else ''
        return '![%s](%s%s)' % (alt, src, title_part)


def markdownify(html, **options):
    return MarkdownConverter(**options).convert(html)

In [194]:
author_xml = "<wp:author_login><!\[CDATA\[(.*?)\]\]>.*<wp:author_display_name><!\[CDATA\[(.*?)]]><\/wp:author_display_name>.*<\/wp:author>"

authors = {}

with open('/Users/kamranahmed/Developer/python/bsr-json-regex/posts/authors.xml', 'r') as xml:
    contents = xml.read()
    for auth_id, auth_display_name in re.findall(author_xml, contents):
        authors[auth_id] = auth_display_name

        
media_xml = "<item>[\s\S]*?<guid .*?>(.*?)<\/guid>[\s\S]*?<wp:post_parent>(.*?)<\/wp:post_parent>[\s\S]*?<\/item>"
        
media = {}
        
with open('/Users/kamranahmed/Developer/python/bsr-json-regex/posts/media.xml', 'r') as xml:
    contents = xml.read()
    for url, parent_id in re.findall(media_xml, contents):
        media[parent_id] = url        
        
regex_xml = "<item>[\s\S]*?<title>(.*?)<\/title>[\s\S]*?<link>(.*?)<\/link>[\s\S]*?<pubDate>(.*?)<\/pubDate>[\s\S]*?<dc:creator><!\[CDATA\[(.*?)\]\]><\/dc:creator>[\s\S]*?p=(.*?)<\/guid>[\s\S]*?<content:encoded><!\[CDATA\[([\s\S]*?)\]\][\s\S]*?<\/item>"

posts = []

with open('/Users/kamranahmed/Developer/python/bsr-json-regex/posts/export.xml', 'r') as xml:
    contents = xml.read()
    for post in re.findall(regex_xml, contents):
        posts.append({})
        ID = post[4]
        posts[-1]["id"] = ID
        posts[-1]["title"] = markdownify(post[0])
        posts[-1]["link"] = post[1]
        posts[-1]["date"] = parser.parse(post[2]).strftime('%d %B %Y %I:%M%p')
        posts[-1]["author"] = authors[post[3]]
        if ID in media:
            posts[-1]['featured_image'] = media[ID]
        else:
            posts[-1]['featured_image'] = ''
            
        newlines = re.sub('\n', '<br>', post[5])
        hlines = re.sub('<hr />', '-------', newlines)
        out = re.sub('<!-- .*? -->', '', hlines)
        final = re.sub('\[caption .*?\]', '', out)
        final = re.sub('\[\/caption\]', '', final)

        md = markdownify(final, heading_style=ATX).lstrip()
        final = re.sub('\n\s*\n', '<br><br>', md)
        
        posts[-1]["markdown"] = markdownify(final, heading_style=ATX).lstrip()
        
#         caption = re.sub('\[caption .*?\[\/caption]', '', post[5]
#         comments = re.sub('<!-- .*? -->', '', breaks)
#         hlines = re.sub('<hr />', '-------', comments)
#         md = markdownify(md, heading_style=ATX).lstrip()
#         md = newlines = re.sub(r'\n\s*\n ', '\n\n', hlines)
        
# #         newlines = re.sub('\n\s*\n', '', md)
        
# #         final = re.sub('\[caption .*?\[\/caption]', '', out)
# #         final = re.sub('\n\s*\n', '\n\n', final)

# #         posts[-1]["markdown"] = markdownify(hlines, heading_style=ATX).lstrip()
#         posts[-1]["markdown"] = md
        

In [195]:
# with open('/Users/kamranahmed/Developer/python/bsr-json-regex/md.txt', 'w+') as f:
#     f.write(posts['17781']['markdown'])
print(posts[-1]['markdown'])

***Part I in our COVID-19 series, Swabs and Spit. We take an inside look into how Berkeley scientists have set the bar for COVID-19 testing as the campus community heads back to work.***   
  
As biologists, most of our days are spent toiling in a tucked-away lab prying at core questions important to biology, such as: How do cells regulate their size? How do organisms respond to pathogens? Why do we age? To answer these questions, we have mastered molecular techniques such as PCR, a way to amplify small amounts of DNA in our samples and feed our curiosities by becoming experts about specific cellular pathways and model organisms.   
  
Rarely are we called upon to apply this expertise to solve a tangible, widespread problem with immediate implications for all of humanity. ## The Need for Testing  
  
In early spring, scientists at UC Berkeley started hearing that call. The COVID-19 pandemic highlighted a unique challenge that was emerging all around the world. We needed to increase tes

In [173]:
import json
y = json.dumps(posts, ensure_ascii=False, indent=4)

In [174]:
with open('/Users/kamranahmed/Developer/python/bsr-json-regex/testing.js', 'w', encoding='utf8') as f:
    f.write(y)

In [193]:
with open('/Users/kamranahmed/Developer/js/blog_strapi_gatsby/cms/config/functions/export.js', 'w', encoding='utf8') as json_file:
    json.dump(posts, json_file, ensure_ascii=False, indent=4)

In [166]:
test = re.sub("\[!\[\((.*?)\)\]\(.*?\)", r"![](\1)", "[![(http://3.bp.blogspot.com/-0pp9oQtCdX0/UV8BUtg\\_6\\_I/AAAAAAAAAMk/OVWkD\\_jegeM/s1600/goal.jpg)](http://wp.me/p1xHew-25U)")

In [167]:
test

'![](http://3.bp.blogspot.com/-0pp9oQtCdX0/UV8BUtg\\_6\\_I/AAAAAAAAAMk/OVWkD\\_jegeM/s1600/goal.jpg)'