In [1]:
import os
import re
import time

In [2]:
# Input HTML files to convert. Both paths are relative, so they are looked up
# in the directory the notebook kernel is running from.
fpath_lab = 'labresults.html'
fpath_prob = 'problems.html'

In [3]:
class TableConvertor:
    '''
    Convert the table rows found in an HTML file into a CSV file.

    The HTML is scanned with regular expressions (no HTML parser is used):
    every <tr> element becomes one CSV line and every <th>/<td> inside it
    becomes one field. The CSV file is written to the current working
    directory and is named after the input file with its final extension
    replaced by '.csv'.
    '''
    def __init__(self, filePath):
        '''
        filePath: path of the HTML file to read. The file is not opened
        until convert() is called.
        '''
        self.__htmlFilePath = filePath

        # below are the regular expressions used in the convert function

        # Runs of whitespace (including newlines) and the &#160; entity.
        self.__CLEAN_RE = re.compile(r"\s+|&#160;")
        # A self-closing <tr/> or a <tr ...>...</tr> pair. The content uses
        # '.*?' (not '.+?') so an empty <tr></tr> cannot swallow the row
        # that follows it.
        self.__ROW_RE = re.compile(r"(?:<tr\s*/>)|(?:<tr\b[^>]*>(?P<row>.*?)</tr>)")
        # A self-closing <td/>/<th/> or a matching open/close pair. '.*?'
        # lets an empty <td></td> match as an empty cell instead of being
        # merged with the next cell's markup.
        self.__CELL_RE = re.compile(r"(?:<(th|td)\s*/>)|(?:<(?P<element>th|td)\b[^>]*>(?P<col>.*?)</(?P=element)>)")

    def __calcTime(function): # used for decorating the function for calculating the time taken
        '''
        Decorator for no-argument methods: run the method and return a
        message with the elapsed time. The wrapped method's own return
        value is discarded.
        '''
        def wrapper(self):
            # perf_counter is monotonic, so the measurement cannot be
            # distorted by system clock adjustments.
            start_time = time.perf_counter()
            function(self)
            end_time = time.perf_counter()
            return f'Time taken = {(end_time - start_time):0.6f} seconds'
        return wrapper

    @__calcTime
    def convert(self):
        '''
        Read the HTML file, write the CSV file and print a confirmation.

        Commas inside cell text are removed (fields are not quoted), and a
        cell with no content is written as a single space. Because of the
        timing decorator, calling this method returns the string
        'Time taken = <seconds> seconds'.

        Raises whatever open() raises if the input cannot be read or the
        output cannot be written.
        '''
        # Close the input file deterministically instead of relying on
        # garbage collection.
        with open(self.__htmlFilePath, 'r', encoding = 'utf-8') as readFile:
            html_content = readFile.read()

        # Replace every whitespace run (newlines included) and every &#160;
        # with one space, so each row sits on a single line for the regexes.
        html_content = self.__CLEAN_RE.sub(' ', html_content)

        # Drop the directory part and only the last extension, so
        # 'dir/a.b.html' becomes 'a.b.csv'.
        baseName = os.path.basename(self.__htmlFilePath)
        fileName = os.path.splitext(baseName)[0] + '.csv'

        with open(fileName, 'w', encoding = 'utf-8') as writeFile:

            for row in self.__ROW_RE.finditer(html_content): # each row of table data
                row_html = row.group('row')

                # Self-closing or empty rows carry no cells; skip them.
                if not row_html:
                    continue

                line = []
                for col in self.__CELL_RE.finditer(row_html): # each cell of the row
                    cell_text = col.group('col')
                    if cell_text:
                        # Commas would split the field, so strip them.
                        line.append(cell_text.replace(',', ''))
                    else:
                        # Empty or self-closing cell: keep the column position.
                        line.append(' ')

                writeFile.write(','.join(line)) # used for CSV format
                writeFile.write('\n')

            print('CSV file created')

In [4]:
# Convert the lab-results table. convert() writes the CSV into the working
# directory and returns the elapsed-time message displayed as the cell output.
a = TableConvertor(fpath_lab)
a.convert()

CSV file created


'Time taken = 0.005634 seconds'

In [5]:
# Convert the problems table the same way; the cell output is the
# elapsed-time message returned by convert().
b = TableConvertor(fpath_prob)
b.convert()

CSV file created


'Time taken = 0.004054 seconds'