# In[1]:
import pandas as pd

def _sort_indices(master):
    """
    Collapses a sorted list of row-index pairs into groups of connected rows.

    Exact duplicate pairs are dropped, and a pair whose first row equals the last row of the group before it is folded into that group, e.g. [[0, 1], [0, 1], [1, 2]] becomes [[0, 1, 2]].

    :master: list of [lower_row, upper_row] pairs, already sorted -- generated by _check_absorber_int(). Modified in place.

    Returns the same list object, now holding the merged groups.
    """

    groups = []
    previous = None

    for pair in master:

        # Input is sorted, so repeated pairs are always adjacent
        if pair == previous:
            continue
        previous = pair

        if groups and groups[-1][-1] == pair[0]:
            # This pair starts where the current group ends: extend the chain
            groups[-1].extend(pair[1:])
        else:
            # Copy so the caller's original pair objects are never mutated
            groups.append(list(pair))

    # Keep the in-place contract for callers that hold a reference to master
    master[:] = groups
    return master



def _check_interval(entry1, entry2, check_num, index_list, iteration1, iteration2):

    """
    Compares two interval values and records their row numbers when entry2 equals entry1 plus some offset in range(check_num). Used in _check_absorber_int()

    :entry1: value of lower row number to be checked

    :entry2: value of upper row number to be checked

    :check_num: integer -- number of offsets (0 up to check_num-1) added to entry1 when testing for a match with entry2

    :index_list: list in which index numbers will be stored so we know what to fix. Appended to in place; nothing is returned

    :iteration1: index (row) number of entry1

    :iteration2: index (row) number of entry2
    """

    # One fresh [iteration1, iteration2] pair is added per matching offset
    index_list.extend(
        [iteration1, iteration2]
        for offset in range(check_num)
        if entry1 + offset == entry2
    )

        
        
def _check_absorber_int(data, indices, check_num=3):

    """
    Loops through the row numbers stored in "indices" for every dataset and, whenever a row shares its lightray_index with the row directly above it, compares the interval bounds of those two rows with _check_interval(). The collected row pairs are sorted and merged with _sort_indices(). Returns a dictionary of row numbers corresponding to absorbers that need to be combined.

    :data: dictionary containing all pandas DataFrames to be checked. Each needs 'lightray_index', 'interval_start' and 'interval_end' columns

    :indices: dictionary generated in record_indices(). Maps each ion to the list of row numbers to be checked

    :check_num: integer -- maximum value used to determine whether the interval needs to be modified for the given set of indices. Default is 3.

    Returns a dictionary with one entry per dataset, keyed f'data{ds}_indexlist', holding the merged row groups (an empty list when nothing matched).
    """

    master = {}

    for ds in data:
        lightray = data[ds]['lightray_index']
        starts = data[ds]['interval_start']
        ends = data[ds]['interval_end']
        n_rows = len(lightray)
        index_list = []

        for problem_list in indices.values():

            for index in problem_list:

                # Row 0 has no row above it. Row numbers come from a single
                # dataset (see record_indices), so they may run past the end
                # of a shorter one -- assumes the default 0..n-1 row labels
                if index == 0 or index >= n_rows:
                    continue

                # Only rows on the same sightline are candidates for merging
                if lightray[index] != lightray[index-1]:
                    continue

                # Compare the bounds of the SAME two rows whose numbers get
                # recorded (index-1 and index)
                lower, upper = index-1, index
                for first in (starts[lower], ends[lower]):
                    for second in (starts[upper], ends[upper]):
                        _check_interval(first, second, check_num, index_list, lower, upper)

        # NOTE(review): ds is already a name such as "data0", so this key reads
        # "datadata0_indexlist"; left unchanged because callers may rely on it
        index_list.sort()
        master[f'data{ds}_indexlist'] = _sort_indices(index_list)

    return master



def _find_index_range(dataset, ion):

    """
    Collects the row numbers of a dataset whose 'name' entry equals the given ion. Returns them as a list, in increasing order, to be checked against the same ions in other datasets.

    :dataset: one of the datasets contained in the dictionary of pandas DataFrames

    :ion: string. Compared against each entry of the 'name' column
    """

    names = dataset['name']

    # Look entries up by row number, exactly as the rest of the module does
    return [row for row in range(len(names)) if names[row] == ion]



def record_indices(dic):
    """
    Finds the ions that appear in two consecutive datasets and records, for each such ion, the row numbers it occupies in the first dataset of the pair. The result is the dictionary of indices to be checked by _check_absorber_int()

    :dic: dictionary containing each dataset as a pandas DataFrame, keyed 'data0', 'data1', ... Each needs a 'name' column

    Returns a dictionary mapping ion name to a list of row numbers. If an ion is shared by more than one consecutive pair, the rows from the last such pair are kept.
    """

    # Ion names per dataset, de-duplicated but kept in order of first
    # appearance. Only membership is needed for the comparison below
    ions_by_dataset = {
        ds: dict.fromkeys(f'{name}' for name in dic[ds]['name'])
        for ds in dic
    }

    indices = {}

    # Compare each dataset with the next one, once, after every dataset has
    # been scanned
    for i in range(len(dic)-1):
        current, following = f'data{i}', f'data{i+1}'

        for ion in ions_by_dataset[current]:

            if ion in ions_by_dataset[following]:
                indices[ion] = _find_index_range(dic[current], ion)

    return indices
           
        
        
def check_abs_differences(filename_list, **kwargs):
    """
    Called by user. Loads datasets as pandas DataFrames and stores them in a dictionary for ease of indexing. If the datasets are not all the same length, an inconsistency in the number of absorbers for a given ion and lightray_index is assumed. record_indices() and _check_absorber_int() are called, ultimately returning a dictionary of row numbers relating to absorbers that need to be combined.

    :filename_list: list containing path and filename for each dataset. Files can have any name, but are referred to as "data{i}" throughout the rest of the process (where "i" is the filename's position in the list)

    :kwargs: keyword arguments to be passed to pandas.read_csv() for the purpose of loading the data

    Returns the dictionary produced by _check_absorber_int(), or an empty dictionary when every dataset has the same number of rows (nothing to combine).
    """

    dic = {
        f'data{i}': pd.read_csv(filename, **kwargs)
        for i, filename in enumerate(filename_list)
    }

    # Equal row counts everywhere are taken to mean no absorbers were split
    row_counts = {len(frame) for frame in dic.values()}
    if len(row_counts) <= 1:
        print('All datasets have the same number of rows; nothing to combine.')
        return {}

    indices = record_indices(dic)
    return _check_absorber_int(dic, indices)

# In[2]:
# Datasets to compare; check_abs_differences() refers to them as data0, data1, data2
filename_list = [
    '../test_sal/interval_tests/data/ionlist0data_.csv',
    '../test_sal/interval_tests/data/ionlist1data_.csv',
    '../test_sal/interval_tests/data/ionlist2data_.csv',
]
# The files are whitespace-delimited. sep=r'\s+' is the supported spelling of
# the deprecated delim_whitespace=True
kwargs = dict(sep=r'\s+')
thing = check_abs_differences(filename_list, **kwargs)