In [None]:
def excel_to_dataframe(excel_file):
    """Load an Excel file into a pandas DataFrame.

    Args:
        excel_file: Anything accepted by ``pd.read_excel`` (for example a
            path or a file-like object).

    Returns:
        The DataFrame produced by ``pd.read_excel`` with default settings.
    """
    return pd.read_excel(excel_file)


def find_ladders(df):
    """Locate the ladder intensity columns in the dataframe.

    Every known ladder label is checked; the previous implementation
    returned as soon as the first label was found, so the second ladder was
    never reported.

    Args:
        df: DataFrame whose columns are searched for ladder labels.

    Returns:
        A list of ``(label, position)`` tuples, one per ladder column that is
        present, in the order of the known labels. ``position`` is the value
        returned by ``df.columns.get_loc(label)``.

    Raises:
        ValueError: If none of the known ladder columns is present.
    """
    # Known ladder labels
    ladder_labels = ("Grey_Values_Ladder1", "Grey_Values_Ladder2")

    label_list = [
        (label, df.columns.get_loc(label))
        for label in ladder_labels
        if label in df.columns
    ]

    # At least one ladder lane is needed for the downstream fitting.
    if not label_list:
        expected = ", ".join(ladder_labels)
        raise ValueError(
            f"No ladder column found; expected at least one of: {expected}."
        )

    return label_list


def monoExp(x, a, k, b):
    """Evaluate the mono-exponential f(x) = a * exp(-k * x) + b.

    Args:
        x: Point(s) at which to evaluate; a scalar or a numpy array.
        a: Amplitude of the exponential term.
        k: Decay rate.
        b: Constant offset.

    Returns:
        The function value(s) at ``x``.
    """
    decay = np.exp(-k * x)
    return a * decay + b


def monoExpInverse(y, a, k, b):
    """Invert the mono-exponential y = a * exp(-k * x) + b for x.

    Args:
        y: Function value to invert (scalar).
        a: Amplitude of the exponential term.
        k: Decay rate.
        b: Constant offset.

    Returns:
        x = -ln((y - b) / a) / k.

    Raises:
        ValueError: If no real solution exists, i.e. ``a`` or ``k`` is zero
            or ``(y - b) / a`` is not strictly positive.
    """
    # With a == 0 or k == 0 the curve is constant, so it cannot be inverted;
    # report that the same way as any other unsolvable input instead of
    # letting a division by zero escape.
    if a == 0 or k == 0:
        raise ValueError("No real solution exists.")

    ratio = (y - b) / a
    # "not > 0" (rather than "<= 0") also rejects NaN.
    if not ratio > 0:
        raise ValueError("No real solution exists.")

    return -np.log(ratio) / k
    

def fitExp(xs, ys, report_R2 = False):
    """Fit ``monoExp`` (a * exp(-k * x) + b) to the given points.

    Args:
        xs: Distance values (the original notes ask for at least 4 points).
        ys: Intensity/size values, same length as ``xs``.
        report_R2: When truthy, also report the R-squared of the fit.

    Returns:
        ``[a, k, b]`` or, when ``report_R2`` is truthy, ``[a, k, b, R2]``.
    """
    # Work on float arrays so lists and other sequences behave the same way.
    xs = np.asarray(xs, dtype=float)
    ys = np.asarray(ys, dtype=float)

    # Perform the fit
    p0 = (60, 1, 30)  # start with values near those we expect
    params, _ = scipy.optimize.curve_fit(monoExp, xs, ys, p0, maxfev=5000)
    a, k, b = params

    variable_list = [a, k, b]

    if report_R2:
        # Determine quality of the fit: R2 = 1 - SS_res / SS_tot
        squared_residuals = np.square(ys - monoExp(xs, a, k, b))
        squared_diffs_from_mean = np.square(ys - np.mean(ys))
        r_squared = 1 - np.sum(squared_residuals) / np.sum(squared_diffs_from_mean)
        variable_list.append(r_squared)

    return variable_list


def fit_Exp_to_ladder(ladder_label, ladder_lane, df, points_per_fit, report_R2=False):
    """Fit overlapping exponential curves to a ladder lane and tabulate them.

    Peaks are detected in the baseline-corrected ladder intensities, each
    peak is paired with the corresponding entry of ``ladder_lane``, and an
    exponential (via ``fitExp``) is fitted to every window of
    ``points_per_fit`` consecutive peaks (windows advance one peak at a
    time, e.g. 140-116-81-71, 116-81-71-66, ...). Every distance sample
    between the first and the last peak is then assigned to one of the fits
    and converted to a size in nucleotides.

    NOTE(review): the assignment of inter-peak intervals to fits is
    hard-coded (see ``interval_groups``) and needs at least 12 detected
    peaks and at least 9 fits, i.e. it presumably targets a 12-band ladder
    with 4-point fits. Confirm the intended mapping before generalising it.

    Args:
        ladder_label: Which ladder to use, "Grey_Values_Ladder1" or
            "Grey_Values_Ladder2".
        ladder_lane: Nucleotide sizes of the ladder bands, one per detected
            peak, in peak order.
        df: DataFrame holding the ``<ladder_label>_baseline`` intensities
            and the ``LadderN_norm_distance`` column of the chosen ladder.
        points_per_fit: Number of consecutive peaks used for each fit.
        report_R2: When truthy, request R-squared from ``fitExp`` and add an
            'R-squared' column to the result.

    Returns:
        DataFrame with one row per distance sample and the columns
        'Distance', 'Size in nt', 'a', 'k', 'b' (plus 'R-squared' when
        ``report_R2`` is truthy).

    Raises:
        ValueError: For an unknown ``ladder_label``, a non-positive
            ``points_per_fit``, a mismatch between the number of detected
            peaks and ``ladder_lane``, or too few peaks/fits for the
            hard-coded interval grouping.
    """
    # Intensity and distance columns belonging to each supported ladder.
    ladder_columns = {
        "Grey_Values_Ladder1": ("Grey_Values_Ladder1_baseline", "Ladder1_norm_distance"),
        "Grey_Values_Ladder2": ("Grey_Values_Ladder2_baseline", "Ladder2_norm_distance"),
    }
    if ladder_label not in ladder_columns:
        raise ValueError(f"Unknown ladder label '{ladder_label}'.")
    if points_per_fit < 1:
        raise ValueError("points_per_fit must be at least 1.")

    baseline_column, distance_column = ladder_columns[ladder_label]
    grey_values_baseline = df[baseline_column]
    norm_ladder_distance = df[distance_column]

    # Find the peak positions of the ladder (only peaks higher than 600 count)
    peaks_, _ = find_peaks(grey_values_baseline, height=600)
    peak_distances = list(norm_ladder_distance.iloc[peaks_])
    ladder_sizes = list(ladder_lane)

    if len(ladder_sizes) != len(peak_distances):
        raise ValueError(
            f"Detected {len(peak_distances)} ladder peaks but ladder_lane "
            f"has {len(ladder_sizes)} entries."
        )

    # Which inter-peak intervals are evaluated with which fit: entry j lists
    # the intervals (interval i spans peak i to peak i + 1) that use fit j.
    interval_groups = (
        (0, 1), (2,), (3,), (4,), (5,), (6,), (7,), (8, 9), (10,),
    )
    required_peaks = 12  # intervals 0..10 need 12 peaks
    fit_count = len(peak_distances) - points_per_fit + 1
    if len(peak_distances) < required_peaks or fit_count < len(interval_groups):
        raise ValueError(
            f"Need at least {required_peaks} ladder peaks and "
            f"{len(interval_groups)} fits; got {len(peak_distances)} peaks "
            f"and {max(fit_count, 0)} fits."
        )

    # Fit one exponential per window of consecutive peaks
    # (x = normalised distance, y = size in nucleotides).
    fit_params = []
    for start in range(fit_count):
        stop = start + points_per_fit
        xs = np.array(peak_distances[start:stop])
        ys = np.array(ladder_sizes[start:stop])
        fit_params.append(list(fitExp(xs, ys, report_R2)))

    # Distance samples of every interval between neighbouring peaks,
    # both peaks included.
    interval_distances = [
        list(norm_ladder_distance.iloc[first:last + 1])
        for first, last in zip(peaks_[:-1], peaks_[1:])
    ]

    # Apply the fits to the distance samples
    rows = []
    for group_index, (members, params) in enumerate(zip(interval_groups, fit_params)):
        distances = [d for member in members for d in interval_distances[member]]
        if len(members) > 1:
            # Merged intervals share their boundary peak; keep it once.
            distances = list(dict.fromkeys(distances))
        if group_index != 0:
            # The first sample is the peak that already closed the previous group.
            distances = distances[1:]

        a, k, b = params[:3]
        for distance in distances:
            size_in_nt = a * np.exp(-k * distance) + b
            rows.append([distance, size_in_nt, *params])

    columns = ['Distance', 'Size in nt', 'a', 'k', 'b']
    if report_R2:
        columns.append('R-squared')

    return pd.DataFrame(rows, columns=columns)

    
def ladder_range_dist_selection(distance, ladder_df, ladder_label):
    """Look up the fit coefficients that apply to a given distance.

    Args:
        distance: Normalised distance to look up.
        ladder_df: DataFrame with the columns 'Distance', 'a', 'k', 'b' and,
            optionally, 'R-squared'.
        ladder_label: Ladder identifier; accepted for interface symmetry but
            not used for the lookup.

    Returns:
        ``(a, k, b, R2)`` taken from the row whose 'Distance' is closest to,
        but not above, ``distance``. ``R2`` is ``None`` when ``ladder_df``
        has no 'R-squared' column.

    Raises:
        ValueError: If ``distance`` lies outside the tabulated distances, or
            no row lies within 0.1 below it.
    """
    distances = ladder_df['Distance']

    if distance < distances.min() or distance > distances.max():
        raise ValueError("Target value is out of range!")

    # Rows at or just below the target (window of 0.1 distance units)
    in_window = (distances <= distance) & (distances > distance - 0.1)
    candidates = ladder_df[in_window]

    if candidates.empty:
        raise ValueError(f"No exact match for {distance}, and no suitable range found.")

    # Of all rows in the window, use the one closest to the target.
    closest_position = int(candidates['Distance'].to_numpy().argmax())
    result = candidates.iloc[closest_position].to_dict()
    return (result["a"], result["k"], result["b"], result.get("R-squared"))
    

def ladder_range_nucl_selection(nucleotide, ladder_df, ladder_label):
    """Look up the fit coefficients that apply to a given nucleotide size.

    Args:
        nucleotide: Size in nucleotides to look up.
        ladder_df: DataFrame with the columns 'Size in nt', 'a', 'k', 'b'
            and, optionally, 'R-squared'.
        ladder_label: Ladder identifier; accepted for interface symmetry but
            not used for the lookup.

    Returns:
        ``(a, k, b, R2)`` taken from the row whose 'Size in nt' is closest
        to, but not above, ``nucleotide``. ``R2`` is ``None`` when
        ``ladder_df`` has no 'R-squared' column.

    Raises:
        ValueError: If ``nucleotide`` lies outside the tabulated sizes, or
            no row lies within 1 nt below it.
    """
    sizes = ladder_df['Size in nt']

    if nucleotide < sizes.min() or nucleotide > sizes.max():
        raise ValueError("Nucleotide value is out of range!")

    # Rows at or just below the target (window of 1 nucleotide)
    in_window = (sizes <= nucleotide) & (sizes > nucleotide - 1)
    candidates = ladder_df[in_window]

    if candidates.empty:
        raise ValueError(f"No exact match for {nucleotide}, and no suitable range found.")

    # Of all rows in the window, use the one closest to the target.
    closest_position = int(candidates['Size in nt'].to_numpy().argmax())
    result = candidates.iloc[closest_position].to_dict()
    return (result["a"], result["k"], result["b"], result.get("R-squared"))
   
    
def nucleotides_given_distance(distance, ladder_label, ladder_df, whole_nucleotides = False):
    """Convert a distance into a size in nucleotides.

    Args:
        distance: Distance value to convert.
        ladder_label: Which ladder the coefficients belong to.
        ladder_df: Output of ``fit_Exp_to_ladder``.
        whole_nucleotides: When truthy, truncate the result to an ``int``;
            otherwise return the unrounded value.

    Returns:
        The size in nucleotides obtained by evaluating ``monoExp`` with the
        coefficients selected for ``distance``.
    """
    # Retrieve the coefficients of the ladder range that contains the distance
    # (R-squared is returned as well but not needed here).
    a, k, b, _ = ladder_range_dist_selection(distance, ladder_df, ladder_label)

    # Calculate the size in nucleotides
    nts = monoExp(distance, a, k, b)
    if whole_nucleotides:
        return int(nts)
    return nts

    
def distance_given_nucleotides(nucleotide, ladder_label, ladder_df):
    """Convert a size in nucleotides into a distance.

    Args:
        nucleotide: Size in nucleotides to convert.
        ladder_label: Which ladder the coefficients belong to.
        ladder_df: Output of ``fit_Exp_to_ladder``.

    Returns:
        The distance obtained by evaluating ``monoExpInverse`` with the
        coefficients selected for ``nucleotide``.
    """
    # Retrieve the coefficients of the ladder range that contains the size
    # (R-squared is returned as well but not needed here).
    a, k, b, _ = ladder_range_nucl_selection(nucleotide, ladder_df, ladder_label)

    return monoExpInverse(nucleotide, a, k, b)


In [None]:
# Brings everything together for the main function