Skip to content

Loading…

Fix decode bug, and change body_replace to re replace #35

Merged
merged 3 commits into from

2 participants

@cedricporter

If an img URL contains Chinese characters, an exception is thrown. It seems that img is already a unicode object, so I changed img.decode() to img.encode().

urlretrieve(urljoin(data['header']['link'],
            img.encode('utf-8')),
           get_attachment_path(img, i['uid']))

After this modification, it works on Linux.

I also think body_replace would be better if it used re (regular expressions).
I use WP-Syntax for highlighting, and WP-Syntax uses <pre lang=""></pre> to mark code blocks.
Without re, I can't migrate the syntax highlighting.

for key in body_replace:
    # body = body.replace(key, body_replace[key])
    body = re.sub(key, body_replace[key], body)

With this change, I can use regular-expression replacements in config.yaml:

body_replace: {
  '<pre.*?lang="(.*?)".*?>': '\n{% codeblock \1 lang:\1 %}\n',
  '<pre.*?>': '\n{% codeblock %}\n',
  '</pre>': '\n{% endcodeblock %}\n',

#    '[python]': '{% codeblock lang:python %}',
#    '[/python]': '{% endcodeblock %}',
}
@thomasf thomasf merged commit 7dd5c92 into thomasf:master
@thomasf
Owner

thanks

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
This page is out of date. Refresh to see the latest.
Showing with 13 additions and 4 deletions.
  1. +4 −0 config.yaml
  2. +4 −3 exitwp.py
  3. +5 −1 html2text.py
View
4 config.yaml
@@ -33,6 +33,10 @@ taxonomies:
# Replace certain patterns in body
# Simply replace the key with its value
body_replace: {
+ # '<pre.*?lang="(.*?)".*?>': '\n{% codeblock \1 lang:\1 %}\n',
+ # '<pre.*?>': '\n{% codeblock %}\n',
+ # '</pre>': '\n{% endcodeblock %}\n',
+
# '[python]': '{% codeblock lang:python %}',
# '[/python]': '{% endcodeblock %}',
}
View
7 exitwp.py
@@ -107,7 +107,8 @@ def gi(q, unicode_wrap=True):
body = gi('content:encoded')
for key in body_replace:
- body = body.replace(key, body_replace[key])
+ # body = body.replace(key, body_replace[key])
+ body = re.sub(key, body_replace[key], body)
img_srcs = []
if body is not None:
@@ -292,10 +293,10 @@ def get_attachment_path(src, dir, dir_prefix='a'):
for img in i['img_srcs']:
try:
urlretrieve(urljoin(data['header']['link'],
- img.decode('utf-8')),
+ img.encode('utf-8')),
get_attachment_path(img, i['uid']))
except:
- print "\n unable to download " + urljoin(data['header']['link'], img.decode('utf-8'))
+ print "\n unable to download " + urljoin(data['header']['link'], img.encode('utf-8'))
if out is not None:
def toyaml(data):
View
6 html2text.py
@@ -339,7 +339,11 @@ def previousIndex(self, attrs):
def drop_last(self, nLetters):
if not self.quiet:
self.outtext = self.outtext[:-nLetters]
-
+
+ def handle_comment(self, data):
+ if data == "more":
+ self.o("<!-- more -->")
+
def handle_emphasis(self, start, tag_style, parent_style):
"""handles various text emphases"""
tag_emphasis = google_text_emphasis(tag_style)
Something went wrong with that request. Please try again.