Skip to content
Browse files

splitter with progress dialog

  • Loading branch information...
1 parent 679f098 commit b94ce7c4f362168a866f391f0ef828845ec58553 tecoholic committed
Showing with 66 additions and 26 deletions.
  1. +2 −6 gui.py
  2. +64 −20 splitter.py
View
8 gui.py
@@ -11,7 +11,7 @@
from wx.lib.wordwrap import wordwrap
from searcher import *
-from splitter import split_xml
+from splitter import RunSplitter
from indexer import IndexDialog
class MainWindow(wx.Frame):
@@ -53,7 +53,7 @@ def __init__(self,parent, title):
# Set values
self.SearchBox.ShowCancelButton(True)
self.ResultBox.BeginFontSize(11)
-
+ self.RunSplitter = RunSplitter
#Events
self.Bind(wx.EVT_TEXT_ENTER, self.SearchIt,self.SearchBox)
self.Bind(wx.EVT_LISTBOX, self.ShowMeaning, self.WordList)
@@ -127,13 +127,9 @@ def ShowAbout(self, e):
400, wx.ClientDC(self))
wx.AboutBox(info)
- def RunSplitter(self, event):
- ''' The RunSplitter function runs the splitter.py function '''
- split_xml('wiki-files/tawiktionary-latest-pages-articles.xml.bz2')
def OpenIndexDialog(self, event):
''' The OpenIndexDialog function shows the IndexDialog from indexer.py as a modal dialog '''
- #create_index(2)
dia = IndexDialog(self, -1, "Indexer")
dia.ShowModal()
dia.Destroy()
View
84 splitter.py
@@ -11,11 +11,14 @@
import os
import bz2
+import wx
-def split_xml(filename):
- ''' The function gets the filename of wiktionary.xml.bz2 file as input
- and creates smallers chunks of it in a the diretory chunks
+def RunSplitter(event):
+ '''The function gets the filename of wiktionary.xml.bz2 file as input and
+ creates smaller chunks of it in the directory chunks.
'''
+ # Filename
+ filename = 'wiki-files/tawiktionary-20110518-pages-articles.xml.bz2'
# Check and create chunk directory
if not os.path.exists("chunks"):
os.mkdir("chunks")
@@ -27,25 +30,66 @@ def split_xml(filename):
"chunk-"+str(filecount)+
".xml.bz2")
chunkfile = bz2.BZ2File(chunkname(filecount), 'w')
- # Read line by line
- bzfile = bz2.BZ2File(filename)
- for line in bzfile:
- chunkfile.write(line)
- # the </page> determines new wiki page
- if '</page>' in line:
- pagecount += 1
- if pagecount > 1999:
- #print chunkname() # For Debugging
- chunkfile.close()
- pagecount = 0 # RESET pagecount
- filecount += 1 # increment filename
- chunkfile = bz2.BZ2File(chunkname(filecount), 'w')
try:
- chunkfile.close()
- except:
- print 'Files already close'
+ # Read line by line
+ bzfile = bz2.BZ2File(filename)
+ except Exception, ex:
+ go = False
+ msgdlg = wx.MessageDialog(None, str(ex),
+ 'Exception',
+ wx.OK | wx.ICON_ERROR
+ #wx.YES_NO | wx.NO_DEFAULT | wx.CANCEL | wx.ICON_INFORMATION
+ )
+ msgdlg.ShowModal()
+ msgdlg.Destroy()
+
+ st = os.stat(filename)
+ max = int(st.st_size/(1024*102.4))
+ dlg = wx.ProgressDialog("Splitter",
+ "The large file is being split. Kindly Wait!",
+ maximum=max,
+ parent=None,
+ style=wx.PD_CAN_ABORT |
+ wx.PD_APP_MODAL |
+ wx.PD_ELAPSED_TIME |
+ #wx.PD_ESTIMATED_TIME|
+ wx.PD_REMAINING_TIME )
+ go = True
+ while go:
+ for line in bzfile:
+ chunkfile.write(line)
+ # the </page> determines new wiki page
+ if '</page>' in line:
+ pagecount += 1
+ if pagecount > 1999:
+ #print chunkname() # For Debugging
+ chunkfile.close()
+ pagecount = 0 # RESET pagecount
+ filecount += 1 # increment filename
+ chunkfile = bz2.BZ2File(chunkname(filecount), 'w')
+ #update count here
+ cnksize = 0
+ cnkfiles = [fi for fi in os.listdir('chunks')
+ if os.path.isfile(os.path.join('chunks',fi))]
+ for fi in cnkfiles:
+ cnksize += os.stat(os.path.join('chunks',fi)).st_size
+ (go, skip) = dlg.Update(int(cnksize/(1024*102.4)))
+ try:
+ chunkfile.close()
+ except:
+ #print 'Files already close'
+ pass
+ go = False
+
+
+ dlg.Destroy()
+
+#------------------------------------------------------------------------------
+
+
if __name__ == '__main__':
# When the script is self run
- split_xml('wiki-files/tawiktionary-20110518-pages-articles.xml.bz2')
+ # split_xml('wiki-files/tawiktionary-20110518-pages-articles.xml.bz2')
+ print 'This file is not supposed to be run seperately!'

0 comments on commit b94ce7c

Please sign in to comment.
Something went wrong with that request. Please try again.