This code will run through all ECOSTRESS files in a directory to check that they are correctly formatted and there are no errors in metadata.
Susan Meerdink
Created on: 10/20/17

In [1]:
# Import Functions
import os
import glob
import numpy as np

In [2]:
def check_format(filename):
    """
    This function will check a single file to make sure that it formatted correctly.
    Parameters:
    -----------
    1) filename: file to be read in and checked
    
    Outputs:
    --------
    1) error: integer that designates the first row that has an error. If not errors it will return -9999
        -9999: everything is ok with the file, no errors found
    2) message: a string saying what is wrong
    """
    inFile = open(filename, 'rb')  # Open ECOSTRESS file
    error = -9999
    message = 'Everything is correct'
    names = ['Name', 'Type', 'Class', ' ', ' ', 'Sample No.', 
             'Owner','Wavelength Range','Origin','Collection Date', 
             'Description', 'Measurement','First Column','Second Column',
             'X Units','Y Units', 'First X Value', 'Last X Value', 
             'Number of X Values', 'Additional Information', ' ']
    values = np.array([0, 0, 0])  # Array to hold values from metadata of first x value, last x value, and number of x values 
    wave = []  # array that will hold wavelengths
    lines = inFile.readlines()  # Read in file
    
    # Loop through metadata and check
    for i in range(0, len(lines)-1):
        # Check that metadata is in order and present
        if i in [0, 1, 2, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19]: 
            if names[i] not in lines[i]:
                error = i + 1
                message = 'Metadata field is not correct'
                break
        if i == 3:
            if 'Subclass' not in lines[i] and 'Genus' not in lines[i]:
                error = i + 1
                message = 'Does not have Subclass or Genus as metadata field'
                break
        if i == 4:
            if 'Particle Size' not in lines[i] and 'Species' not in lines[i]:
                error = i + 1
                message = 'Does not have Particle Size or Species as metadata field'
                break
        if i == 14:
            if 'micro' in lines[i] and float(lines[22].split()[0]) > 200:
                error = i + 1
                message = 'Wavelengths are not in micrometers'
        if i == 18:
            if len(lines) - 21 != int(lines[i].split(':')[1]):
                error = i + 1
                message = "Correct length of spectra is %i" %(len(lines) - 21)
                break
        if i == 21:  # Check to make sure first x value matches spectra
            if lines[i].split()[0].strip() not in lines[16].split(':')[1].strip() and lines[16].split(':')[1].strip() not in lines[i].split()[0].strip():
                error = i + 1
                message = 'The First X value does not match spectra first x value'
                break
        if i == len(lines):  # Check to make sure the last x value matches spectra
            if lines[len(lines)].split()[0].strip() not in lines[17].split(':')[1].strip() and lines[17].split(':')[1].strip() not in lines[len(lines)].split()[0].strip(): 
                error = i + 1
                message = 'The last X value does not match spectra last x value'
                break
        if i > 20:
            wave.append(float(lines[i].split()[0].strip()))
    
    # Check if there are duplicate wavelengths - this happens when the original files don't have enough decimal points.
    if len(wave) != len(set(wave)):
        error = 21
        message = 'Duplicate wavelengths present'
    
    # Check if wavelengths are sorted
    if sorted(wave) != wave and sorted(wave, reverse=True) != wave:
        error = 21
        message = 'Wavelengths are not in order'
        
    return error, message

In [3]:
def check_files(currentDir):
    """
    This function finds all the ASTER text files in a folder and calls the 
    write_ecostress_file to format them into ECOSTRESS text files.
    
    Parameters:
    -----------
    1) currentDir: a file path for the folder that input file is located
    """
    os.chdir(currentDir) # Set this to the current directory
    outfile = open('file_errors.csv', 'w')  # open file
    correct = 0
    error = 0
    
    # Find all the ASTER text files in directory
    for file in glob.glob("*.spectrum.txt"):
        # print("Processing %s" %file)
        if ".." in file:
            check = 0
            message = "Error in filename"
            outfile.write(file + ',' + str(check) + ',' + message + '\n')
            error += 1
        else:
            check, message = check_format(file)
            if check == -9999:
                correct += 1
            else:
                outfile.write(file + ',' + str(check) + ',' + message + '\n')
                error += 1
                continue
            
    outfile.close()
    print("%i ECOSTRESS files are correctly formatted" %correct)
    print("%i ECOSTRESS files have errors" %error)

In [4]:
# Example Files
directory = "C:\\Users\\Susan\\Documents\\GitHub\\ASTER-Spectral-Library\\"
inDir = directory + "Example Outputs\\"
check_files(inDir)

6 ECOSTRESS files are correctly formatted
0 ECOSTRESS files have errors


In [18]:
# Check ECOSTRESS Files
inDir = "F:\\Dropbox\Analysis\\ECOSTRESS Spectral Library\\ECOSTRESS Spectral Library Files\\"
check_files(inDir)

2880 ECOSTRESS files are correctly formatted
0 ECOSTRESS files have errors


In [6]:
# Get all Files
os.chdir(inDir)
allFiles = glob.glob("*.spectrum.txt")
allType = np.chararray([len(allFiles),1], itemsize=50)
allClass = np.chararray([len(allFiles),1], itemsize=50)
allSubclass = np.chararray([len(allFiles),1], itemsize=50)
allParticle = np.chararray([len(allFiles),1], itemsize=50)
allWave = np.chararray([len(allFiles),1], itemsize=50)
allSample = np.chararray([len(allFiles),1], itemsize=50)
allOwner = np.chararray([len(allFiles),1], itemsize=50)
allInstr = np.chararray([len(allFiles),1], itemsize=50)

for i in range(0, len(allFiles)):
    allType[i,0] = allFiles[i].split('.')[0]
    allClass[i] = allFiles[i].split('.')[1]
    allSubclass[i] = allFiles[i].split('.')[2]
    allParticle[i] = allFiles[i].split('.')[3]
    allWave[i] = allFiles[i].split('.')[4]
    allSample[i] = allFiles[i].split('.')[5]
    allOwner[i] = allFiles[i].split('.')[6]
    allInstr[i] = allFiles[i].split('.')[7]

u, indices = np.unique(allType, return_inverse=True)
print(u)

for i in range(0, len(u)):
    print('There are %i in the %s' %(len(u[0]), u[i]))
    print(np.unique(allClass[np.where(indices == i)]))
    print(np.unique(allSubclass[np.where(indices == i)]))
    print(np.unique(allParticle[np.where(indices == i)]))

['manmade' 'meteorites' 'mineral' 'rock' 'soil' 'water']
There are 7 in the manmade
['concrete' 'generalconstructionmaterial' 'reflectancetarget' 'road'
 'roofingmaterial']
['brick' 'cementcinderblock' 'cinder' 'constructionconcrete' 'glas'
 'marble' 'metal' 'none' 'paint' 'pavingasphalt' 'pavingconcrete'
 'roofingpaper' 'roofingshingle' 'rubber' 'shingle' 'tar' 'tile' 'wood']
['solid']
There are 7 in the meteorites
['achondrite' 'anomalou' 'carbonaceouschondrite' 'enstatitechondrite'
 'ordinarychondrite']
['aubrite' 'c4' 'ci' 'cm2' 'co' 'co3' 'cv3' 'diogenite' 'e6' 'eucrite' 'h3'
 'h4' 'h5' 'h6' 'howardite' 'l3' 'l5' 'l6' 'll6' 'none' 'shergottite'
 'urelite']
['fine']
There are 7 in the mineral
['arsenate' 'borate' 'carbonate' 'chloride' 'element' 'halide' 'hydroxide'
 'oxide' 'phosphate' 'silicate' 'sulfate' 'sulfide' 'tungstate']
['cyclosilicate' 'inosilicate' 'nesosilicate' 'none' 'phyllosilicate'
 'sorosilicate' 'tectosilicate']
['coarse' 'fine' 'medium' 'none' 'solid']
There are

In [7]:
for i in allFiles:
    if 'cycloilicate' in i:
        print(i)