In [1]:
import os
import re


In [2]:
# Cleaning instructions: an ordered list of (pattern, replacement) pairs that are applied
# one after another with `re.sub(..., flags=re.MULTILINE)`.
# WARNING: order matters -- later rules rely on the text produced by earlier ones.
clean_re = [
    (r'Return to top',                                  r''),                           # Unnecessary to keep
    (r'To convert from\tto\tMultiply by\n',             r''),                           # Unnecessary to keep
    (r'([^\d\n]+\n)',                                   r'# \1'),                       # Make any line without numbers a comment
    (r' *\t *',                                         r'\t'),                         # Remove space before and after tab
    (r'\°',                                             r'deg'),                        # Remove special characters
    (r'\′',                                             r'min'),                        # Remove special characters
    (r'\″',                                             r's'),                          # Remove special characters
    (r'å',                                              r'a'),                          # Remove special characters
    (r'ö',                                              r'o'),                          # Remove special characters
    (r'Å',                                              r'A'),                          # Remove special characters
    (r'γ',                                              r'gamma'),                      # Remove special characters
    (r'Ω',                                              r'ohm'),                        # Remove special characters
    (r'μ',                                              r'u'),                          # Remove special characters
    (r' · ',                                            r' '),                          # Replace multiplication dot
    (r' ?\([0-9./]*? in\)',                             r''),                           # Remove unnecessary inch conversions
    (r'\) \d{1,2}\t',                                   r')\t'),                        # Remove footnotes
    (r'(darcy) \d*',                                    r'\1'),                         # Handle special case footnote
    (r'(lambert) \d*',                                  r'\1'),                         # Handle special case footnote
    (r'\'s',                                            r's'),                          # Replace apostrophe s
    (r'.*divide.*\n',                                   r''),                           # Remove odd conversion
    # Alternation group, not a character class: only lines that start with "deg" or "kel" are dropped
    (r'^(?:deg|kel).*[Tt]\/.*\n',                       r''),                           # Remove non multiply temperature conversions
    (r'\(l. y.\)',                                      r'(ly)'),                       # Handle special symbol
    (r'mil(\s)',                                        r'mil (mil)\1'),                # Handle special symbol
    (r'(\s)volt(\s)',                                   r'\1volt (V)\2'),               # Handle special symbol
    (r'carat, metric',                                  r'carat'),                      # Handle special symbol
    (r'((hundredweight) \((\w*?), \d* lb\))',           r'\1 (\2 \3)'),                 # Handle special symbol
    (r'((\w*?) \((troy) or apothecary\) \(\w*?\))',     r'\1 (\3 \2)'),                 # Handle special symbol
    (r'((ton), (\w*?) \(\d* lb\))',                     r'\1 (\2 \3)'),                 # Handle special symbol
    (r'(pica) \((.*?)\)',                               r'\1 (\2) (\2 \1)'),            # Handle special symbol
    (r'(point) \((.*?)\)',                              r'\1 (\2) (\2 \1)'),            # Handle special symbol
    (r'cup \(U.S.\)',                                   r'cup (cup)'),                  # Handle special symbol
    (r'mile, nautical 20',                              r'nautical mile'),              # Handle special symbol
    (r'(kip \(1 kip) = (1000 lbf\))',                   r'\1 equals \2 (kip)'),         # Handle special symbol
    (r'(acre) (\(.*?\))\t',                             r'\1 \2 (\1)\t'),               # Handle special symbol
    (r'(acre-foot) (\(.*?\)) *\t',                      r'\1 \2 (acre ft)\t'),          # Handle special symbol
    (r'(fathom) (\(.*?\))\t',                           r'\1 \2 (\1)\t'),               # Handle special symbol
    (r'(faraday) (\(.*?\))\t',                          r'\1 \2 (\1)\t'),               # Handle special symbol
    (r'(tablespoon)\t',                                 r'\1 (Tbsp)\t'),                # Handle special symbol
    (r'(teaspoon)\t',                                   r'\1 (tsp)\t'),                 # Handle special symbol
    (r'(poundal second per square foot)\t',             r'\1 (poundal s / ft2)\t'),     # Handle special symbol
    (r'(poundal per square foot)',                      r'\1 (poundal / ft2)'),         # Handle special symbol
    (r'(horsepower \(550 ft lbf/s\))',                  r'\1 (hp)'),                    # Handle special symbol
    (r'(horsepower) \(([^\d]*?)\)\t',                   r'\1 (\2 hp)\t'),               # Handle special symbol
    (r'(centimeter of mercury) \((\d+ deg.)\)\t',       r'\1 (\2) (cmHg \2)\t'),        # Handle special symbol
    (r'(inch of mercury) \((\d+ deg.)\)\t',             r'\1 (\2) (inHg \2)\t'),        # Handle special symbol
    (r'(foot of water) \(([0-9.]+ deg.)\)\t',           r'\1 (\2) (ftH2O \2)\t'),       # Handle special symbol
    (r'(inch of water) \(([0-9.]+ deg.)\)\t',           r'\1 (\2) (inH2O \2)\t'),       # Handle special symbol
    (r'(centimeter of water) \(([0-9.]+ deg.)\)\t',     r'\1 (\2) (cmH2O \2)\t'),       # Handle special symbol
    (r'\((.*)\) \(Btu\)',                               r'(Btu \1)'),                   # Handle special symbol
    # Use the ASCII spelling "ohm" (same mapping as the 'Ω' rule above) so the symbol
    # survives the later ASCII-only filtering of each line
    (r'(ohm circular-mil \(mil\) per foot)',            r'\1 (ohm mil/ft)'),            # Handle special symbol
    (r'1015 BtuIT',                                     r'BtuIT 1015'),                 # Handle special symbol
    (r'(ton-force) \(2000 lbf\)',                       r'\1 (ton f)'),                 # Handle special symbol
    (r'(degree centigrade) 15',                         r'\1 (deg centigrade)'),        # Handle special symbol
    (r'(year \(365 days\))',                            r'\1 (year)'),                  # Handle special symbol
    (r'(year) (\(\w*?)\)',                              r'\1 \2 (\2 \1)'),              # Handle special symbol
    (r'(\w*?) (\(sidereal)\)',                          r'\1 \2 (\2 \1)'),              # Handle special symbol
    (r'((cord) \(128 ft3\))',                           r'\1 (\2)'),                    # Handle special symbol
    (r'(ton, (\w*?), per hour)\t',                      r'\1 (ton \2/h)\t'),            # Handle special symbol
    (r'(ton, (\w*?), per cubic yard)\t',                r'\1 (ton \2/yd3)\t'),          # Handle special symbol
    (r'(ton of TNT) \(energy equivalent\)',             r'\1 (tonTNT)'),                # Handle special symbol
    (r'((calorie[^()]*?), kilogram) \((nutrition)\)',   r'\1 (\3 \2 kg)'),              # Handle special symbol
    (r'((calorie) \((.*)\), kilogram) \((nutrition)\)', r'\1 (\4 \2 \3 kg)'),           # Handle special symbol
    (r'(perm.*) \((\d+ degC)\)',                        r'\1 \2 (\1 \2)'),              # Handle special symbol
    (r'(ton of refrigeration \(12 000 BtuIT/h\))',      r'\1 (tonREFRIG)'),             # Handle special symbol
    (r'\[(Pa s)-1\]',                                   r'/Pa/s'),                      # Handle special symbol
    (r'\(m-1\)',                                        r'(/m)'),                       # Handle special symbol
    (r'(knot) \((nautical mile per hour)\)',            r'\2 (\1)'),                    # Swap name and symbol
    (r'(\(lbf/lb\)) (\(thrust to mass ratio\))',        r'\2 \1'),                      # Swap name and symbol
    (r'\[(\w*?)/\((\w*?) (\w*?)\)\]',                   r'(\1/\2/\3)'),                 # Handle unique bracket
    (r'\[(\w*?)/\((\w*?) (\w*?) (\w*?)\)\]',            r'(\1/\2/\3/\4)'),              # Handle unique bracket
    (r'\[(\w*?) (\w*?)\/\((\w*?) (\w*?) (\w*?)\)\]',    r'(\1 \2/\3/\4/\5)'),           # Handle unique bracket
    (r'\[(\w*?) (\w*?) (\w*?)\/\((\w*?) (\w*?)\)\]',    r'(\1 \2 \3/\4/\5)'),           # Handle unique bracket
    (r'\(Gs, G\)',                                      r'(Gs)'),                       # Handle special symbol
    (r'therm \((.*)\)',                                 r'therm (\1_therm)'),           # Handle special symbol
    (r'^([^#()\t]+)\t(.*)$',                            r'\1 (\1)\t\2'),                # Create symbols for items without symbol
    (r'\(based on U\.S\. survey foot\) \((.*)\)',       r'(U.S. survey foot \1)'),      # Handle special based on U.S. survey foot symbols
    (r'\(U\.S\. survey\) \((.*)\)',                     r'(U.S. survey foot \1)'),      # Handle special based on U.S. survey foot symbols
    (r'\(U\.S\..*\) \((.*)\)',                          r'(U.S. \1)'),                  # Handle special U.S. units
    (r'\[Canadian and U\.K\..*\(Imperial\)\] \((.*)\)', r'(U.K. \1)'),                  # Handle special U.K. units
    (r' \)',                                            r')'),                          # Remove empty space in units
    (r'\( ',                                            r'('),                          # Remove empty space in units
    (r'\)\)',                                           r')'),                          # Remove double parens
    (r'(?<! )\(',                                       r' ('),                         # Add space before paren
    (r'(\d+\.?\d*) *(\d*)\tE',                          r'\1\2e'),                      # Join number groups and exponent into one float literal
    (r'(.*?)\t(.*)',                                    r'# \1 to \2'),                 # Create comment
    (r'(#.*)\t(.*)',                                    r' : \2, \1'),                  # Swap number and comment
    (r'(.*\(([^\(\)]*)\) to .* \((.*)\))',              r"'\g<2>2\3'\1"),               # Create variable names
    # The rules below are anchored at '^' and consume up to ' : ', so each pass changes at
    # most one character per line; they are repeated to handle several occurrences.
    (r'^(.*?)/(.*?) : ',                                r'\1p\2 : '),                   # Remove '/' from variables
    (r'^(.*?)/(.*?) : ',                                r'\1p\2 : '),                   # Remove '/' from variables
    (r'^(.*?)/(.*?) : ',                                r'\1p\2 : '),                   # Remove '/' from variables
    (r'^(.*?)/(.*?) : ',                                r'\1p\2 : '),                   # Remove '/' from variables
    (r'^(.*?)/(.*?) : ',                                r'\1p\2 : '),                   # Remove '/' from variables
    (r'^(.*?)/(.*?) : ',                                r'\1p\2 : '),                   # Remove '/' from variables
    (r'^(.*?)/(.*?) : ',                                r'\1p\2 : '),                   # Remove '/' from variables
    (r'^(.*?)[ ,](.*?) : ',                             r'\1\2 : '),                    # Remove ' ' and ',' from variables
    (r'^(.*?)[ ,](.*?) : ',                             r'\1\2 : '),                    # Remove ' ' and ',' from variables
    (r'^(.*?)[ ,](.*?) : ',                             r'\1\2 : '),                    # Remove ' ' and ',' from variables
    (r'^(.*?)[ ,](.*?) : ',                             r'\1\2 : '),                    # Remove ' ' and ',' from variables
    (r"^'U\.S\.surveyfoot : ",                          r"'US_srvy_ft : "),             # Clean variables
    (r"^'U\.S\.surveyfoot(.*) : ",                      r"'US_srvy_ft_\1 : "),          # Clean variables
    (r"^'U\.S\.(.*) : ",                                r"'US_\1 : "),                  # Clean variables
    (r"^'U\.K\.(.*) : ",                                r"'UK_\1 : "),                  # Clean variables
    (r'^(.*?)\.(.*?) : ',                               r'\1_\2 : '),                   # Remove '.' from variables
    (r'^(.*?)\.(.*?) : ',                               r'\1_\2 : '),                   # Remove '.' from variables
    (r'^(.*?)\.(.*?) : ',                               r'\1_\2 : '),                   # Remove '.' from variables
    (r'^(.*?)\.(.*?) : ',                               r'\1_\2 : '),                   # Remove '.' from variables
]


In [3]:
# Input: text dump of the tables in the "NIST Guide to the SI"
# (http://www.nist.gov/pml/pubs/sp811/appenb9.cfm)
si_file = 'SIUnitConversion.txt'

# Load the whole file into memory as a single string
with open(si_file, 'r') as source:
    txt = source.read()

# Run the cleaning rules in list order; every rule sees the output of the previous one.
# re.MULTILINE lets '^' and '$' in the rules anchor at each line of the text.
for rule in clean_re:
    regex, substitute = rule[0], rule[1]
    txt = re.sub(regex, substitute, txt, flags=re.MULTILINE)


In [4]:
# Split each cleaned line into columns at ':' and '#' and determine the max column size
def _to_ascii(text):
    """Return `text` with every non-ASCII character dropped.

    Accepts byte strings as well as unicode text, so the cell does not rely on
    `str.decode`, which only exists on Python 2.
    """
    if isinstance(text, bytes):
        return text.decode('ascii', 'ignore')
    return text.encode('ascii', 'ignore').decode('ascii')


cols = []
colmax = [0, 0, 0]
for line in txt.splitlines():
    cells = re.split(':|#', _to_ascii(line))
    if len(cells) == 4:
        # One separator too many: fold the fourth cell into the third
        cells = cells[:2] + [cells[2] + cells[3]]
    elif len(cells) == 2:
        # Only one separator on the line: the first cell becomes a '#' marker
        cells[0] = '#'
    if len(cells) == 3:
        # Only three-column rows take part in the column width computation
        lengths = [len(cell) for cell in cells]
        colmax = [max(cm, l) for cm, l in zip(colmax, lengths)]
    cols.append(cells)

# Reformat into string with columns aligned to ':' and '#'
statements = []
format_string = ''.join(['{{{}: <{}}} {}'.format(i, cm, sep) for i, (cm, sep) in enumerate(zip(colmax, [':', '#', ''])) if cm != 0])
for row in cols:
    if len(row) == 3:
        statements.append('    ' + format_string.format(*row))
    else:
        # Rows that are not three columns wide are emitted unaligned
        statements.append('    ' + ''.join(row))

# Combine lines with dictionary wrapper
statements = '\n'.join(['conversion_factors = {\n'] + statements + ['\n}'] )
# Remove trailing whitespace
statements = re.sub(' *\n', '\n', statements)
# Remove any double newlines
statements = re.sub('\n{2,}', '\n', statements)

# Add top comments and converter function.
# The generated `convert()` tests membership with `in` rather than `dict.has_key()`,
# which no longer exists on Python 3; `in` behaves the same on Python 2.
statements = """\"\"\"
SIUnitConversion.py

Provides SI defined unit conversion factors (in `conversion_factors` dictionary)
and a general converter function (`convert()`) to use them.

The conversion factors used are defined in [Appendix B.9.](http://www.nist.gov/pml/pubs/sp811/appenb9.cfm)
of the [NIST Guide to the SI](http://www.nist.gov/pml/pubs/sp811/).

\"\"\"

# SI Unit Conversion Factors as defined in Appendix B.9. of the "NIST Guide to the SI"
# Reference: http://www.nist.gov/pml/pubs/sp811/appenb9.cfm
""" + statements + """

def convert(value=0.0, from_unit='', to_unit=''):
    \"\"\"
    Function to convert a value (float) from one unit (symbol string) to another unit (symbol string).
    The conversion factors comes from the `conversion_factors` dictionary where the key is
    the combination of the from unit plus the to unit, and the value is a float of the conversion value.

    Example:
    In  [1]: # Convert 1 mile to x meter
             convert(1, 'mi', 'm')
    Out [1]: 1609.344
    \"\"\"

    factor_lookup = from_unit + '2' + to_unit
    result = None
    if factor_lookup in conversion_factors:
        factor = conversion_factors[factor_lookup]
        result = factor * value
    else:
        raise KeyError('This unit conversion is undefined. See `conversion_factors` dictionary for valid conversions.')
    return result
"""

# Remove trailing whitespace, again (this also strips whitespace-only lines in the template above)
statements = re.sub(' *\n', '\n', statements)


In [5]:
# Execute the generated source to ensure it is valid python.
# Note: this also defines `conversion_factors` and `convert` in the current namespace.
try:
    exec(statements)
except Exception as err:
    # Report what went wrong instead of hiding it; the file is still written below
    # so the generated source can be inspected.
    print('Not valid python; there must be a cleaning error. ({}: {})'.format(type(err).__name__, err))

# Write the module next to the input file, swapping only the file extension for '.py'
# (str.replace('txt', 'py') would also rewrite any other 'txt' inside the path).
with open(os.path.splitext(si_file)[0] + '.py', 'w') as f:
    f.write(statements)