image downloading

some-programs · Apr 6, 2011 · 080486c · 080486c
1 parent d01dba8
commit 080486c
Show file tree

Hide file tree

Showing 3 changed files with 24 additions and 12 deletions.
diff --git a/README.rst b/README.rst
@@ -57,9 +57,13 @@ Known issues
 ============
 Near future improvements:
  * Target file names are sometimes less than optimal.
- * Image/attachment downloading not implemented.
+ * Rewriting of image/attachment links if they are downloaded
  * Meaningful translation/filtering of wikipedia publish statuses into something that is usable within a fairly standard jekyll setup.
 
+Things I want to do to learn writing better python code:
+ * Refactor code to use less nesting
+ * Refactor code to use more try/except tests instead of if statements
+
 Things that might be resolved later on if I find the time:
  * There will probably be issues when migrating non utf-8 encoded wordpress dump files (if they exist).
  * Integrate one or a few basic jekyll site templates to render complete working jekyll blog setups from wordpress exports.
diff --git a/config.yaml b/config.yaml
@@ -13,7 +13,7 @@ target_format: markdown
 date_format: '%Y-%m-%d %H:%M:%S'
 
 # Try to download and relocate all images locally to the blog.
-download_images: false
+download_images: False
 
 # Item types we don't want to import.
 item_type_filter: {attachment, nav_menu_item}

diff --git a/exitwp.py b/exitwp.py
@@ -10,7 +10,8 @@
 import yaml
 import tempfile
 from BeautifulSoup import BeautifulSoup
-from urlparse import urlparse
+from urlparse import urlparse, urljoin
+from urllib import urlretrieve
 
 
 '''
@@ -195,10 +196,10 @@ def get_item_uid(item, date_prefix=False, namespace=''):
             result=fn
         return result
 
-    def get_item_path(item, date_prefix=False, dir='', namespace=''):
+    def get_item_path(item, dir=''):
         full_dir=get_full_dir(dir)
         filename_parts=[full_dir,'/']
-        filename_parts.append(get_item_uid(item, date_prefix=date_prefix, namespace=namespace))
+        filename_parts.append(item['uid'])
         filename_parts.append('.')
         filename_parts.append(target_format)
         return ''.join(filename_parts)
@@ -222,10 +223,15 @@ def get_attachment_path(src, dir, dir_prefix='a'):
                 file_infix=file_infix+1
             files[src]=filename=maybe_filename
 
-        target_name=os.path.normpath(blog_dir+'/'+dir_prefix +'/' + dir+'/'+filename)
+        target_dir=os.path.normpath(blog_dir+'/'+dir_prefix +'/' + dir)
+        target_file=os.path.normpath(target_dir+'/'+filename)
+
+        if (not os.path.exists(target_dir)):
+            os.makedirs(target_dir)
+
         #if src not in attachments[dir]:
-        print target_name
-        return target_name
+        ##print target_name
+        return target_file
 
     #data['items']=[]
 
@@ -243,11 +249,12 @@ def get_attachment_path(src, dir, dir_prefix='a'):
         }
 
         if i['type'] == 'post':
-
-            fn=get_item_path(i, date_prefix=True, dir='_posts')
+            i['uid']=get_item_uid(i,date_prefix=True)
+            fn=get_item_path(i, dir='_posts')
             out=open_file(fn)
             yaml_header['layout']='post'
         elif i['type'] == 'page':
+            i['uid']=get_item_uid(i)
             fn=get_item_path(i)
             out=open_file(fn)
             yaml_header['layout']='page'
@@ -257,8 +264,9 @@ def get_attachment_path(src, dir, dir_prefix='a'):
             print "Unknown item type :: " +  i['type']
 
 
-        for a in i['img_srcs']:
-            get_attachment_path(a,fn)
+        if download_images:
+            for img in i['img_srcs']:
+                urlretrieve(urljoin(data['header']['link'],img.decode('utf-8')), get_attachment_path(img, i['uid']))
 
 
         if out is not None: