Convert existing ASTER spectral library files to ECOSTRESS spectral library files
Owner: Susan Meerdink   
Creation Date: 10/9/17   
https://github.com/susanmeerdink/ASTER-Spectral-Library   
This file converts existing ASTER spectral library files to the new ECOSTRESS spectral library format. The existing ASTER files come in several different formats, including: 
1. No leading space and one space between wavelength and spectrum
2. Leading space and tab in between wavelength and spectrum
3. No leading space and tab in between wavelength and spectrum

In [1]:
# Import Functions
import numpy as np
import os
import glob
from shutil import copyfile

In [2]:
def _text_after_label(line):
    """
    Return the text that follows the first ':' of a 'Label: value' line, stripped.

    Only the first colon is treated as the label separator, so a value that
    itself contains ':' is returned whole. Raises IndexError when the line
    contains no ':' at all.
    """
    return line.split(':', 1)[1].strip()


def read_aster_file(filename):
    """
    Function for reading in existing ASTER spectral library file.
    This function formats the metadata fields into the new format, but
    does not format spectra.

    The first 26 lines of the file are treated as the metadata header and
    addressed by fixed position; every later line is split on whitespace and
    returned as spectra rows (blank lines come back as empty lists).

    Free-text fields (name, owner, origin, description, measurement, column
    and unit labels) keep everything after the first ':' of their line. The
    fields that are also used to build the new filename (type, class,
    subclass, particle size, sample number) keep only the text between the
    first and second ':'.

    Parameters:
    -----------
    1) filename: The filename for one ASTER spectral library text file

    Returns:
    --------
    1) arrayNewMeta: string array that contains 20 rows with each row a new metadata field pulled from aster file
    2) arrayNewSpec: list of rows, each row being the whitespace-separated strings of one spectra line
    3) newfilename: a string with the new filename
    4) ancillaryflag: a binary value with 0 with no ancillary file and 1 for ancillary file

    Raises:
    -------
    IndexError if the header is shorter than expected, a header line has no ':',
    or the class/subclass value is empty.
    ValueError if the first/last X value in the header is not a number.
    """
    # Templates for the 20 output metadata rows; values are appended below.
    # NOTE(review): 'Y Units:' has no trailing space, unlike the other labels - confirm this is intended.
    arrayNewMeta = ['Name: ', 'Type: ', 'Class: ', 'Subclass: ', 'Particle Size: ',
                    'Sample No.: ', 'Owner: ', 'Wavelength Range: ', 'Origin: ',
                    'Collection Date: N/A', 'Description: ', 'Measurement: ', 'First Column: ',
                    'Second Column: ', 'X Units: ', 'Y Units:', 'First X Value: ',
                    'Last X Value: ', 'Number of X Values: ', 'Additional Information: ']
    arrayOrig = []  # header lines of the original file, newline removed
    arrayNewSpec = []  # spectra rows, still as strings
    ancillaryflag = 0

    # The with-block releases the file handle even if reading fails part way
    with open(filename, 'r') as inFile:
        for numRow, line in enumerate(inFile):
            line = line.rstrip('\n')
            if numRow < 26:
                arrayOrig.append(line)
            else:
                arrayNewSpec.append(line.split())

    # Edit metadata to fit new formatting
    arrayNewMeta[0] += _text_after_label(arrayOrig[0])

    # Format Type Name
    typeLine = arrayOrig[1]
    if 'materials' in typeLine or 'Materials' in typeLine:
        typeName = 'manmade'
    elif 'Rocks' in typeLine or 'rocks' in typeLine:
        typeName = 'rock'
    elif 'soils' in typeLine or 'Soils' in typeLine:
        typeName = 'soil'
    else:
        typeName = typeLine.split(':')[1].strip().rstrip('(')
    arrayNewMeta[1] += typeName

    # Format Class Name: drop a trailing 's' (plural) except for 'Igneous'
    classname = arrayOrig[2].split(':')[1].strip().rstrip('(')
    if classname[-1] == 's' and classname != 'Igneous':
        classname = classname[:-1]
    arrayNewMeta[2] += classname

    # Format Subclass Name: drop a trailing 's' except for words that end in 's' in the singular
    keepTrailingS = ('arenaceous', 'argillaceous', 'gneiss', 'siliceous')
    subclassname = arrayOrig[3].split(':')[1].strip().rstrip('(')
    if subclassname[-1] == 's' and subclassname not in keepTrailingS:
        subclassname = subclassname[:-1]
    arrayNewMeta[3] += subclassname

    arrayNewMeta[4] += arrayOrig[4].split(':')[1].strip().rstrip('(')
    arrayNewMeta[6] += _text_after_label(arrayOrig[6])

    # Format Wavelength Range from the first and last X values in the header
    firstX = float(arrayOrig[21].split(':')[1])
    lastX = float(arrayOrig[22].split(':')[1])
    if firstX < 3 and lastX < 3:
        arrayNewMeta[7] += 'VSWIR'
    elif firstX > 2 and lastX > 2:
        arrayNewMeta[7] += 'TIR'
    else:
        arrayNewMeta[7] += 'All'

    # Measurement, First Column, Second Column, X Units, Y Units
    for newRow, origRow in zip(range(11, 16), range(16, 21)):
        arrayNewMeta[newRow] += _text_after_label(arrayOrig[origRow])

    # Format and add sample name
    samplename = arrayOrig[5].split(':')[1].replace(" ", "")  # Remove any spaces
    samplename = samplename.replace(".doc", "")  # Remove .doc from name
    samplename = samplename.replace(".txt", "")  # Remove .txt from name
    samplename = samplename.replace(".", "_")  # Remove any periods from name
    arrayNewMeta[5] += samplename

    # Create new filename: type.class.subclass.particlesize.range.sample.<first two
    # dot-separated parts of the original filename>.spectrum.txt, all lower case
    arrayName = os.path.basename(filename).split('.')
    nameParts = [arrayNewMeta[1].split(':')[1].replace(" ", "").rstrip('('),
                 arrayNewMeta[2].split(':')[1].replace(" ", "").rstrip('('),
                 arrayNewMeta[3].split(':')[1].replace(" ", "").rstrip('('),
                 arrayOrig[4].split(':')[1].replace(" ", "").rstrip('('),
                 arrayNewMeta[7].split(':')[1].replace(" ", ""),
                 samplename, arrayName[0], arrayName[1]]
    newfilename = ('.'.join(nameParts) + '.spectrum.txt').lower()

    originalNameNote = ' Original ASTER Spectral Library name was ' + os.path.basename(filename)
    # Some files have an additional line of metadata to ignore that is called Collected By:
    if 'Collected' in arrayOrig[9]:
        arrayNewMeta[8] += _text_after_label(arrayOrig[8])
        # Combine six rows of description into one, then append the Collected By line
        arrayNewMeta[10] = (arrayNewMeta[10] + _text_after_label(arrayOrig[10]) +
                            ''.join(arrayOrig[11:16]) + arrayOrig[9] + originalNameNote)
    else:
        # Combine the two origin lines into one
        arrayNewMeta[8] += _text_after_label(arrayOrig[8]) + arrayOrig[9]
        # Combine six rows of description into one. The first row is copied
        # verbatim (including whatever label it carries), so the
        # 'Description: ' template is not prepended here.
        arrayNewMeta[10] = ''.join(arrayOrig[10:16]) + originalNameNote

    # Some files do not have the appropriate ancillary data filename, replace with accurate filename
    ancillaryParts = arrayOrig[24].split(':')
    if len(ancillaryParts) < 2 or 'none' in ancillaryParts[1].lower():
        arrayNewMeta[19] += 'none'
    else:
        arrayNewMeta[19] += newfilename.replace('spectrum', 'ancillary')
        ancillaryflag = 1

    return arrayNewMeta, arrayNewSpec, newfilename, ancillaryflag

In [3]:
def _format_spec_value(text):
    """
    Format one wavelength/reflectance string as fixed-width text.

    The value is rounded to four decimal places and zero-padded on the right.
    Values of 10 or more are padded to 7 characters; values from 0 up to 10
    get a leading space followed by 6 characters. Anything else (negative
    values, NaN) becomes the literal '0.000'.
    """
    tempNum = round(float(text), 4)
    if tempNum >= 10:
        return str(tempNum).ljust(7, '0')
    if tempNum >= 0:
        return ' ' + str(tempNum).ljust(6, '0')
    return '0.000'


def format_spec(origSpec, meta):
    """
    This function formats the aster spectral library spectra into the ecostress
    spectral library format. Also updates the First X Value, Last X Value, and Number of X Values in metadata.

    Rows of origSpec that do not hold exactly two items (for example blank
    lines) are dropped before formatting.

    Parameters:
    -----------
    1) origSpec: list of rows of strings; a valid row has two items, the first being wavelength and the second reflectance
    2) meta: the metadata array of strings; rows 16, 17 and 18 are appended to in place

    Returns:
    --------
    1) arraySpec: a numpy array (rows x 2) of 7-character strings holding the formatted values;
       formatted text longer than 7 characters is truncated to fit
    2) meta: the metadata array of strings with updated values

    Raises:
    -------
    IndexError if no valid spectra rows are present.
    ValueError if a value in a valid row cannot be converted to a float.
    """
    # Remove any blank lines in original file read
    tempSpec = [row for row in origSpec if len(row) == 2]

    # Fixed-width text array; a unicode dtype keeps the elements as str so they
    # can be concatenated with the metadata labels and written to a text file
    arraySpec = np.empty((len(tempSpec), 2), dtype='U7')

    # Format from the filtered rows so row i of the output always matches
    # row i of the kept data, whatever was dropped above
    for i, row in enumerate(tempSpec):
        for c in (0, 1):  # wavelength, then reflectance
            arraySpec[i, c] = _format_spec_value(row[c])

    # Update Metadata
    meta[16] = meta[16] + str(arraySpec[0, 0])
    meta[17] = meta[17] + str(arraySpec[len(arraySpec) - 1, 0])
    meta[18] = meta[18] + str(len(arraySpec))
    return arraySpec, meta

In [4]:
def write_ancillary_file(filename, newfilename, outDir, meta):
    """
    Copy the ancillary file that belongs to a spectrum file into the output
    folder, prefixed with the first eleven metadata rows.

    The ancillary input is located by swapping 'spectrum' for 'ancillary' in
    the file-name part of filename (directory names are left untouched). If
    that file does not exist nothing is written and row 19 of meta is set to
    'Additional Information: None'.

    Parameters:
    -----------
    1) filename: Original filename to open ancillary data
    2) newfilename: New filename to save new ancillary data
    3) outDir: New output folder location; it is concatenated directly with the
       new name, so it must end with a path separator
    4) meta: metadata array

    Return:
    -------
    1) meta: The new metadata array with potentially line 19 changed
    """
    # Only rename within the basename so a folder that happens to contain
    # 'spectrum' in its name does not break the lookup
    srcDir, srcBase = os.path.split(filename)
    ancillaryIn = os.path.join(srcDir, srcBase.replace('spectrum', 'ancillary'))

    if not os.path.isfile(ancillaryIn):
        # NOTE(review): capitalised 'None' here, lower-case 'none' in read_aster_file - confirm which is wanted
        meta[19] = 'Additional Information: None'
        return meta

    # Read the source fully before creating the output, so a read failure
    # does not leave an empty output file behind
    with open(ancillaryIn, 'r') as inFile:
        lines = inFile.readlines()

    ancillaryOut = outDir + newfilename.replace('spectrum', 'ancillary')
    with open(ancillaryOut, 'w') as outFile:
        # Metadata rows 0-10 (Name through Description) head the ancillary file
        for row in meta[:11]:
            outFile.write(row + '\n')
        outFile.writelines(lines)

    return meta

In [5]:
def write_ecostress_file(currentDir, outDir, filename):
    """
    This function will process a single ASTER text file and output a single
    ECOSTRESS text file.

    Parameters:
    -----------
    1) currentDir: a file path for the folder that input file is located
    2) outDir: a file path for the folder that the output file will be located
    3) filename: The filename for one ASTER spectral library text file

    Both folder paths are concatenated directly with file names, so they must
    end with a path separator.

    Output File:
    ------------
    1) Outputs a text file containing the metadata rows, one empty line, and then
    the spectra as two tab-separated columns.
    See readme for formatting information.
    2) When the input declares ancillary data, write_ancillary_file is also called.
    """
    metaOrig, specOrig, newfilename, ancillaryflag = read_aster_file(currentDir + filename)
    spec, meta = format_spec(specOrig, metaOrig)
    if ancillaryflag == 1:
        # NOTE: only the bare filename is passed here, so the ancillary input is
        # looked up relative to the process working directory
        meta = write_ancillary_file(filename, newfilename, outDir, meta)

    # The with-block closes the output even if a write fails part way through
    with open(outDir + newfilename, 'w') as outFile:
        # Metadata rows, one per line
        for row in meta:
            outFile.write(row + '\n')

        outFile.write('\n')  # Add one empty line between metadata and spectra

        # Spectra rows: wavelength <tab> value
        for j in range(len(spec)):
            outFile.write(spec[j, 0] + '\t' + spec[j, 1] + '\n')

In [6]:
def convert_aster_files(currentDir, outDir):
    """
    This function finds all the ASTER text files in a folder and calls the
    write_ecostress_file to format them into ECOSTRESS text files.

    Every file matching '*.spectrum.txt' in currentDir is processed. A file
    that raises an error is copied into an 'Error Files' folder inside
    currentDir (created on first use), the error is printed, and processing
    continues with the next file. A summary is printed at the end.

    Note that this changes the process working directory to currentDir.

    Parameters:
    -----------
    1) currentDir: a file path for the folder that input file is located
    2) outDir: a file path for the folder that the output file will be located
    """
    # The glob pattern and the copy source below are relative paths, so the
    # working directory has to be the input folder
    os.chdir(currentDir)
    errorDir = os.path.join(currentDir, 'Error Files')
    count = 0
    error = 0

    # Find all the ASTER text files in directory
    for name in glob.glob("*.spectrum.txt"):
        try:
            write_ecostress_file(currentDir, outDir, name)
        except Exception as err:  # keep going on a bad file, but let Ctrl-C / SystemExit through
            if not os.path.exists(errorDir):
                os.makedirs(errorDir)
            copyfile(name, os.path.join(errorDir, name))
            print('Error processing %s' % name)
            print('    %s: %s' % (type(err).__name__, err))
            error += 1
        else:
            count += 1

    print("%i ASTER spectrum files formatted to ECOSTRESS files" % count)
    print("%i errors while formatting ASTER files" % error)

In [7]:
# Example Files
# Runs the conversion on the example inputs shipped with the repository.
# The paths are specific to the author's machine; each folder path ends with a
# separator because the functions above build file paths by plain string
# concatenation (e.g. currentDir + filename, outDir + newfilename).
directory = "C:\\Users\\Susan\\Documents\\GitHub\\ASTER-Spectral-Library\\"
inDir = directory + "Example Inputs\\"
outDir = directory + "Example Outputs\\"

convert_aster_files(inDir, outDir)

6 ASTER spectrum files formatted to ECOSTRESS files
0 errors while formatting ASTER files


In [8]:
# Processing ASTER Spectral Library Files
# Runs the conversion on the full set of ASTER files. This rebinds the
# directory, inDir and outDir names set by the example cell. As above, the
# paths are machine-specific and must end with a separator.
directory = "F:\\Dropbox\\Analysis\\ECOSTRESS Spectral Library\\"
inDir = directory + "ASTER Spectral Library Files\\"
outDir = directory + "ECOSTRESS Spectral Library Files\\"

convert_aster_files(inDir, outDir)

2440 ASTER spectrum files formatted to ECOSTRESS files
0 errors while formatting ASTER files


In [9]:
# # Running Error Files
# directory = "F:\\Dropbox\\Analysis\\ECOSTRESS Spectral Library\\"
# inDir = directory + "ASTER Spectral Library Files\\Error Files\\"
# outDir = directory + "ECOSTRESS Spectral Library Files\\"

# convert_aster_files(inDir, outDir)