**I. Data upload, preparation and normalization**

1. Importing packages and libraries.

In [None]:
import numpy as np
import scipy
import pandas as pd
import math
import matplotlib.pyplot as plt
from scipy.signal import find_peaks, argrelextrema, peak_widths
from scipy.optimize import curve_fit
from pybaselines import Baseline
from sklearn.metrics import auc
import cv2
# Tukey's test

In [None]:
cy5_excel = "data/1 T19G + U19U CY5.xlsx"

In [None]:
grey_values_list = ["Grey_Values_T0", "Grey_Values_T10s", "Grey_Values_T30s", "Grey_Values_T1M", "Grey_Values_T3M", "Grey_Values_T6M",
                   "Grey_Values_T30M", "Grey_Values_T100M", "Grey_Values_L1", "Grey_Values_L2"]

In [None]:
# Load the intensity profiles of all lanes (CY5 channel) and tag every
# grey-value column with a "_CY5" suffix. Columns that are not listed in
# grey_values_list (or listed names that are absent) are left untouched.
data_cy5 = pd.read_excel(cy5_excel).rename(
    columns={lane: lane + '_CY5' for lane in grey_values_list}
)

In [None]:
# Keep only the distance axis and one lane ("T6M"; presumably the 6-minute
# time point — confirm). filter() silently drops names that are missing.
data = data_cy5.filter(["Distance", "Grey_Values_T6M_CY5"], axis=1)

In [None]:
# Raw intensity profile of the selected lane.
fig, ax = plt.subplots(figsize=(14, 4))
ax.plot(data["Distance"], data["Grey_Values_T6M_CY5"], linewidth=2.5)
ax.set_xlabel("Distance")
ax.set_ylabel("Intensity Values")


2. Removing background.

In [None]:
# Fit a 3rd-order imodpoly baseline to the lane, subtract it and clamp
# negative residuals to zero.
baseline_fitter = Baseline(x_data=data["Distance"])
imodpoly = baseline_fitter.imodpoly(data["Grey_Values_T6M_CY5"], poly_order=3, num_std=0.8)
new_y = (data["Grey_Values_T6M_CY5"] - imodpoly[0]).clip(lower=0)
data["Grey_Values_T6M_CY5_baseline"] = new_y

In [None]:
# Lane profile after background removal.
fig, ax = plt.subplots(figsize=(14, 4))
ax.plot(data["Distance"], data["Grey_Values_T6M_CY5_baseline"], linewidth=2.5)
ax.set_xlabel("Distance")
ax.set_ylabel("Intensity Values")

3. Creating test data - adjusting the height of the peaks, removing most of the imperfections.

In [None]:
# Double the baseline-corrected signal and raise everything below 1200 to
# a flat floor of 1200.
new_y2 = (data["Grey_Values_T6M_CY5_baseline"] * 2).clip(lower=1200)
data["Grey_Values_T6M_CY5_baseline_2"] = new_y2

In [None]:
# Scaled and floored test profile.
fig, ax = plt.subplots(figsize=(14, 4))
ax.plot(data["Distance"], data["Grey_Values_T6M_CY5_baseline_2"], linewidth=2.5)
ax.set_xlabel("Distance")
ax.set_ylabel("Intensity Values")

In [None]:
# Second background removal: same imodpoly settings, applied to the scaled
# profile; negative residuals are clamped to zero.
baseline_fitter = Baseline(x_data=data["Distance"])
imodpoly3 = baseline_fitter.imodpoly(data["Grey_Values_T6M_CY5_baseline_2"], poly_order=3, num_std=0.8)
new_y3 = (data["Grey_Values_T6M_CY5_baseline_2"] - imodpoly3[0]).clip(lower=0)
data["Grey_Values_T6M_CY5_baseline_3"] = new_y3

In [None]:
# Final test profile used for the rest of the analysis.
fig, ax = plt.subplots(figsize=(14, 4))
ax.plot(data["Distance"], data["Grey_Values_T6M_CY5_baseline_3"], linewidth=2.5)
ax.set_xlabel("Distance")
ax.set_ylabel("Intensity Values")

4. Normalizing the distance in the test lane.

In [None]:
# Detect peaks higher than 1000 in the test profile and look up their
# positions in the first column of `data`.
peaks, values = find_peaks(data["Grey_Values_T6M_CY5_baseline_3"], height=1000)
peak_values = list(data.iloc[peaks, 0])
peak_data = peak_values
peaks_list = [peak_data]

In [None]:
# Markers (min and max values of peaks of each lane) are taken only from CY5 - CY3 tends to be more inconsistent.

# One [first_peak, last_peak] pair per lane.
min_max_list = [[lane_peaks[0], lane_peaks[-1]] for lane_peaks in peaks_list]
min_max_list

In [None]:
# Normalizing the distance
#
# Rescale the distance axis so that the first detected peak sits at 0 and
# the last detected peak at 1. Only a single test lane is stored in `data`,
# so exactly one (min, max) pair is expected.
if len(min_max_list) != 1:
    raise ValueError(
        f"expected exactly one (min, max) marker pair, got {len(min_max_list)}"
    )

first_peak, last_peak = min_max_list[0]
if last_peak == first_peak:
    # A single detected peak would otherwise yield a division by zero.
    raise ValueError("cannot normalize: first and last peak coincide")

norm_distance = (data["Distance"] - first_peak) / (last_peak - first_peak)
norm_list = [norm_distance]

data["Test_Data_norm_distance"] = norm_distance

In [None]:
data

5. Plotting the 0-1 normalized data.

In [None]:
# Test profile against the normalized distance (full range).
fig, ax = plt.subplots(figsize=(14, 4))
ax.plot(data["Test_Data_norm_distance"], data["Grey_Values_T6M_CY5_baseline_3"], linewidth=2.5)
ax.set_xlabel("Distance")
ax.set_ylabel("Intensity Values")

In [None]:
# Same profile, restricted to the 0-1 window between the marker peaks.
fig, ax = plt.subplots(figsize=(14, 4))
ax.plot(data["Test_Data_norm_distance"], data["Grey_Values_T6M_CY5_baseline_3"], linewidth=2.5)
ax.set_xlabel("Distance")
ax.set_ylabel("Intensity Values")
ax.set_xlim(0, 1)

**II. Ladder**

1. Finding the peak values. 

In [None]:
# Load the pre-normalized ladder profile; lines starting with '#' in the CSV
# are skipped.
data_ladder = pd.read_csv("Ladder1_CY5_norm_distance.csv", comment='#',sep=',')
# Append one extra sample after the last row.
# NOTE(review): the origin of these hard-coded values is not shown in this
# notebook — confirm where they come from.
data_ladder.loc[len(data_ladder)] = {"Ladder1_CY5_norm_distance" : 1.059032, "Grey_Values_L1_CY5_baseline" : 22.244741}
data_ladder

In [None]:
# Ladder lane 1 (CY5)

# The x column holds the normalized (dimensionless) distance, so the axis
# is labelled accordingly rather than in cm.
plt.figure(figsize=(15,6))
plot = plt.plot(data_ladder["Ladder1_CY5_norm_distance"], data_ladder['Grey_Values_L1_CY5_baseline'])
plt.xlabel("Normalized distance")
plt.ylabel("Grey Values")

plt.show()

In [None]:
# Finding ladder 1 peak values:
# peaks above 600 are detected and mapped to the first column of data_ladder.

peaks_l1, values_l1 = find_peaks(data_ladder['Grey_Values_L1_CY5_baseline'], height=600)
peak_values_l1 = list(data_ladder.iloc[peaks_l1, 0])
peak_values_l1

In [None]:
# Ladder profile with the detected peaks marked in red.
plt.figure(figsize=(15,6))
plot = plt.plot(data_ladder["Ladder1_CY5_norm_distance"], data_ladder['Grey_Values_L1_CY5_baseline'])
# find_peaks(height=...) stores the peak heights under "peak_heights";
# read that key explicitly instead of passing the whole dict view.
plt.scatter(peak_values_l1, values_l1["peak_heights"], color='red')
# The x axis is the normalized (dimensionless) distance, not cm.
plt.xlabel("Normalized distance")
plt.ylabel("Grey Values")
plt.show()

In [None]:
# Trim the ladder profile to the span from the first to the last detected
# peak (both inclusive).
ladder_window = slice(peaks_l1[0], peaks_l1[-1] + 1)

cut_x_data = data_ladder['Ladder1_CY5_norm_distance'][ladder_window]
cut_y_data = data_ladder['Grey_Values_L1_CY5_baseline'][ladder_window]

In [None]:
# Trimmed ladder profile with the detected peaks marked in red.
plt.figure(figsize=(15,6))
plot = plt.plot(cut_x_data, cut_y_data)
# Read the peak heights by key instead of passing the whole dict view.
plt.scatter(peak_values_l1, values_l1["peak_heights"], color='red')
# The x axis is the normalized (dimensionless) distance, not cm.
plt.xlabel("Normalized distance")
plt.ylabel("Grey Values")
plt.show()

2.1 Converting distance to nt. An exponential curve is fitted to overlapping windows of four consecutive ladder peaks, and each segment between peaks is converted with the equation of its window.

In [None]:
# Reference values paired row-by-row with the detected ladder peaks.
# Presumably the fragment sizes (nt) of the ladder bands, in the order the
# peaks appear along the lane — confirm against the ladder specification.
ladder_lane = [141, 116, 81, 71, 66, 60, 55, 51, 46, 41, 35, 28]

In [None]:
# Pair every ladder value with the normalized position of its peak.
# pd.concat pads a length mismatch with NaN silently, which would corrupt
# the fits below, so the counts are checked first.
if len(peak_values_l1) != len(ladder_lane):
    raise ValueError(
        f"detected {len(peak_values_l1)} ladder peaks but ladder_lane has "
        f"{len(ladder_lane)} entries"
    )

ladder_df = pd.DataFrame(ladder_lane, columns=["Ladder"])
values_df = pd.DataFrame(peak_values_l1, columns=["Norm. distance"])
curve_data = pd.concat([ladder_df, values_df], axis=1)
curve_data

In [None]:
def fitting_function(xs, ys, p0=(60, 1, 30), n_model_points=200):
    """Fit ``monoExp`` (``m * exp(-t * x) + b``) to the points ``(xs, ys)``.

    The coefficient of determination and the fitted equation are printed,
    and the data points are plotted together with the fitted curve.

    Parameters
    ----------
    xs, ys : array-like
        Independent and dependent values of the points to fit.
    p0 : sequence of 3 floats, optional
        Initial guess for ``(m, t, b)`` handed to ``curve_fit``.
    n_model_points : int, optional
        Number of samples used to draw the fitted curve.

    Returns
    -------
    list
        The fitted parameters ``[m, t, b]``.

    Raises
    ------
    RuntimeError
        Raised by ``curve_fit`` when the fit does not converge.
    """
    xs = np.asarray(xs)
    ys = np.asarray(ys)

    # perform the fit (curve_fit is imported directly at the top of the
    # notebook, so no attribute access through the bare scipy package)
    params, _covariance = curve_fit(monoExp, xs, ys, p0, maxfev=5000)
    m, t, b = params

    # determine quality of the fit
    squaredDiffs = np.square(ys - monoExp(xs, m, t, b))
    squaredDiffsFromMean = np.square(ys - np.mean(ys))
    rSquared = 1 - np.sum(squaredDiffs) / np.sum(squaredDiffsFromMean)
    print(f"R² = {rSquared}")

    # plot the results; linspace covers the whole fitted interval including
    # its end point, however narrow the window is
    plt.figure(figsize=(10,6))
    plt.plot(xs, ys, '.', label="data")
    model_x = np.linspace(np.min(xs), np.max(xs), n_model_points)
    plt.plot(model_x, monoExp(model_x, m, t, b), label="fitted")
    plt.legend()

    # inspect the parameters
    print(f"Y = {m} * e^(-{t} * x) + {b}")

    return [m, t, b]

def monoExp(x, m, t, b):
    """Mono-exponential model: ``m * exp(-t * x) + b``."""
    decay = np.exp(-t * x)
    return m * decay + b
    

In [None]:
# Convert curve_data to a plain list of rows, one
# [ladder value, normalized distance] pair per row (the column order of
# curve_data).
curve_data_list = curve_data.values.tolist()
curve_data_list

In [None]:
# Split the [ladder value, distance] rows into two parallel lists.
curve_data_list_nt = [ladder_value for ladder_value, _ in curve_data_list]
curve_data_list_dist = [distance for _, distance in curve_data_list]

In [None]:
# Sliding windows of `size` consecutive ladder points; neighbouring windows
# share `overlap` points.
overlap = 3
size = 4

window_starts = range(0, len(curve_data_list_nt) - overlap, size - overlap)

overlap_list_nt = [curve_data_list_nt[start:start + size] for start in window_starts]
overlap_list_dist = [curve_data_list_dist[start:start + size] for start in window_starts]

In [None]:
# Fit one exponential per window; distance is x, ladder value is y.
values_list = []
for window_dist, window_nt in zip(overlap_list_dist, overlap_list_nt):
    xs = np.array(window_dist)
    ys = np.array(window_nt)
    values_list.append(fitting_function(xs, ys))

In [None]:
# Index pairs of neighbouring ladder peaks ...
ranges_list = [list(pair) for pair in zip(peaks_l1[:-1], peaks_l1[1:])]

# ... and the ladder distances between each pair (both ends included).
# Positions are shifted by peaks_l1[0] because cut_x_data starts at the
# first peak.
data_ranges_list = [
    list(cut_x_data[start - peaks_l1[0]:stop - peaks_l1[0] + 1])
    for start, stop in ranges_list
]

In [None]:
# Dividing data into ranges and removing overlapping values
#
# The first two and the last two peak-to-peak ranges are merged (their
# shared boundary value is de-duplicated), so the number of ranges matches
# the number of sliding-window fits. Every list is copied: the source lists
# in data_ranges_list are never mutated, so re-running this cell gives the
# same result.
if len(data_ranges_list) < 4:
    raise ValueError("at least four peak-to-peak ranges are required")

merged_ranges = [
    list(dict.fromkeys(data_ranges_list[0] + data_ranges_list[1])),
    *(list(middle) for middle in data_ranges_list[2:-2]),
    list(dict.fromkeys(data_ranges_list[-2] + data_ranges_list[-1])),
]

# Neighbouring ranges share their boundary sample; drop it from the end of
# every range except the last one.
new_data_ranges_list = [current[:-1] for current in merged_ranges[:-1]] + [merged_ranges[-1]]

In [None]:
# Convert every ladder distance with the exponential fitted for its range:
# values_list[k] holds (m, t, b) of m * exp(-t * x) + b.
exp_x_list_4p = [
    values_list[range_index][0] * math.exp(-values_list[range_index][1] * distance)
    + values_list[range_index][2]
    for range_index, distances in enumerate(new_data_ranges_list)
    for distance in distances
]

exp_x_list_4p

**III. AUC**

1. Applying conversion to nt.

In [None]:
# Trim the test lane to the same sample positions as the ladder (first to
# last ladder peak, inclusive).
# NOTE(review): this reuses the ladder's peak positions on the test lane,
# which assumes both profiles share the same sampling — confirm.
test_window = slice(peaks_l1[0], peaks_l1[-1] + 1)

cut_test_distance_data = data["Test_Data_norm_distance"][test_window]
cut_test_grey_data = data["Grey_Values_T6M_CY5_baseline_3"][test_window]

In [None]:
# Trimmed test profile. The x axis is still the normalized distance here;
# the conversion to nt happens in the following cells.
plt.figure(figsize=(15,6))
plot = plt.plot(cut_test_distance_data, cut_test_grey_data)
plt.xlabel("Normalized distance")
plt.ylabel("Grey Values")
plt.show()

In [None]:
# Creating new data ranges for test data

# Test-lane distances between each pair of neighbouring ladder peaks (both
# ends included); positions are relative to the first ladder peak.
data_ranges_list_test = [
    list(cut_test_distance_data[start - peaks_l1[0]:stop - peaks_l1[0] + 1])
    for start, stop in ranges_list
]

In [None]:
# Dividing data into ranges and removing overlapping values
#
# The first two and the last two peak-to-peak ranges are merged (their
# shared boundary value is de-duplicated), so the number of ranges matches
# the number of sliding-window fits. Every list is copied: the source lists
# in data_ranges_list_test are never mutated, so re-running this cell gives
# the same result.
if len(data_ranges_list_test) < 4:
    raise ValueError("at least four peak-to-peak ranges are required")

merged_ranges_test = [
    list(dict.fromkeys(data_ranges_list_test[0] + data_ranges_list_test[1])),
    *(list(middle) for middle in data_ranges_list_test[2:-2]),
    list(dict.fromkeys(data_ranges_list_test[-2] + data_ranges_list_test[-1])),
]

# Neighbouring ranges share their boundary sample; drop it from the end of
# every range except the last one.
new_data_ranges_list_test = (
    [current[:-1] for current in merged_ranges_test[:-1]] + [merged_ranges_test[-1]]
)

In [None]:
# Convert every test-lane distance to nt with the exponential fitted for
# the range it falls into: values_list[k] holds (m, t, b) of
# m * exp(-t * x) + b. (Per-sample debug printing removed.)
exp_test_x_list = []

for range_index, distances in enumerate(new_data_ranges_list_test):
    m, t, b = values_list[range_index]
    for distance in distances:
        exp_test_x_list.append(m * math.exp(-t * distance) + b)

In [None]:
# Test profile against fragment size; the axis is reversed.
fig, ax = plt.subplots(figsize=(15, 6))
plot = ax.plot(exp_test_x_list, cut_test_grey_data)
ax.invert_xaxis()
ax.set_xlabel("Size in nt")
ax.set_ylabel("Grey Values")
plt.show()

In [None]:
# Put the converted x values next to the trimmed grey values.
# reset_index() keeps the old row labels as an extra "index" column.
values_ladder = pd.DataFrame({"Exp X": exp_test_x_list})
cut_data = pd.concat([values_ladder, cut_test_grey_data.reset_index()], axis=1)
cut_data

In [None]:
# Detect peaks higher than 1000 in the trimmed test profile and read their
# x values from the first column of cut_data ("Exp X").
peaks, values = find_peaks(cut_test_grey_data, height=1000)
peak_values = list(cut_data.iloc[peaks, 0])
peak_data = peak_values
peaks_list = [peak_data]

In [None]:
# Test profile with the detected peaks marked in red.
fig, ax = plt.subplots(figsize=(15, 6))
plot = ax.plot(exp_test_x_list, cut_test_grey_data)
ax.scatter(peak_values, values.values(), color='red')
ax.invert_xaxis()
ax.set_xlabel("Size in nt")
ax.set_ylabel("Grey Values")
plt.show()

In [None]:
# Measure every detected peak. Per the SciPy documentation, rel_height=1
# evaluates the width at the peak's lowest contour line (its base).
# left_ips / right_ips are fractional sample positions, not x values.
widths, width_heights, left_ips, right_ips = peak_widths(cut_test_grey_data, peaks, rel_height=1)

In [None]:
from scipy.interpolate import interp1d

def index_to_xdata(xdata, indices):
    """Map (fractional) sample positions onto the values of ``xdata``.

    ``xdata[k]`` is taken as the value at position ``k``; positions between
    two samples are interpolated linearly. Positions outside
    ``[0, len(xdata) - 1]`` raise ``ValueError`` (interp1d's default).
    """
    sample_positions = np.arange(len(xdata))
    return interp1d(sample_positions, xdata)(indices)

# Convert the fractional sample positions of the peak edges to x values.
left_ips1 = index_to_xdata(exp_test_x_list, left_ips)
right_ips1 = index_to_xdata(exp_test_x_list, right_ips)
# A width is a length, not a position, so it cannot be looked up like an
# index; in x units it is the distance between the two converted edges.
widths1 = np.abs(right_ips1 - left_ips1)

In [None]:
# Test profile with peaks (red dots) and their measured base widths
# (red horizontal bars).
fig, ax = plt.subplots(figsize=(15, 6))
plot = ax.plot(exp_test_x_list, cut_test_grey_data)
ax.scatter(peak_values, values.values(), color='red')
ax.hlines(width_heights, left_ips1, right_ips1, color='r')
ax.invert_xaxis()
ax.set_xlabel("Size in nt")
ax.set_ylabel("Grey Values")
plt.show()

2. Selecting intact DNA area.

In [None]:
# Truncate the fractional peak edges to whole sample positions so they can
# be used as slice bounds.
left_ips = [int(position) for position in left_ips]
right_ips = [int(position) for position in right_ips]


In [None]:
# Area under the curve between the edges of detected peak #1.
intact_window = slice(left_ips[1], right_ips[1])
intact_dna_area = auc(exp_test_x_list[intact_window], cut_test_grey_data[intact_window])

In [None]:
intact_dna_area

3. Selecting cut DNA I area.

In [None]:
# Area under the curve between the edges of detected peak #2.
cut_i_window = slice(left_ips[2], right_ips[2])
cut_dna_i_area = auc(exp_test_x_list[cut_i_window], cut_test_grey_data[cut_i_window])

In [None]:
cut_dna_i_area

4. Selecting cut DNA II area.

In [None]:
# Area under the curve between the edges of detected peak #3.
cut_ii_window = slice(left_ips[3], right_ips[3])
cut_dna_ii_area = auc(exp_test_x_list[cut_ii_window], cut_test_grey_data[cut_ii_window])

In [None]:
cut_dna_ii_area

In [None]:
# Test profile with the three integrated regions shaded:
# peak 1 yellow, peak 2 green, peak 3 blue.
plt.figure(figsize=(15,6))
plot = plt.plot(exp_test_x_list, cut_test_grey_data)
for peak_index, shade in ((1, 'yellow'), (2, 'green'), (3, 'blue')):
    band = slice(left_ips[peak_index], right_ips[peak_index])
    plt.fill_between(exp_test_x_list[band], cut_test_grey_data[band], color=shade, alpha=0.3)
ax = plt.gca().invert_xaxis()
plt.xlabel("Size in nt")
plt.ylabel("Grey Values")
plt.show()

5. Normalizing each area by the total area.

In [None]:
# Sum of the three integrated peak areas.
total_area = sum((intact_dna_area, cut_dna_i_area, cut_dna_ii_area))
total_area

In [None]:
# Fraction of the total area that belongs to the intact-DNA peak.
intact_norm = intact_dna_area / total_area
intact_norm_list = [intact_norm]
intact_norm_list

In [None]:
# Fraction of the total area that belongs to the cut-DNA I peak.
cut_dna_i_norm = cut_dna_i_area / total_area
cut_dna_i_norm_list = [cut_dna_i_norm]
cut_dna_i_norm_list

In [None]:
# Fraction of the total area that belongs to the cut-DNA II peak.
cut_dna_ii_norm = cut_dna_ii_area / total_area
cut_dna_ii_norm_list = [cut_dna_ii_norm]
cut_dna_ii_norm_list

In [None]:
# Assemble the result row: time point plus the three normalized areas.
# Only one lane is analysed here; a full series would use
# time_points = [0, 10, 30, 60, 180, 360, 1800, 6000].
time_points = [360]
time = pd.DataFrame({"Time_Points, s": time_points})
auc_int_dna = pd.DataFrame({"Intact DNA": intact_norm_list})
auc_cut_dna_i = pd.DataFrame({"Cut DNA I": cut_dna_i_norm_list})
auc_cut_dna_ii = pd.DataFrame({"Cut DNA II": cut_dna_ii_norm_list})
df = pd.concat([time, auc_int_dna, auc_cut_dna_i, auc_cut_dna_ii], axis=1)
df

In [None]:
# Write the results to disk. header=False omits the column names and
# index=False omits the row labels, so the file contains bare values only.
df.to_csv('analysis_results.csv', encoding='utf-8', index = False, header = False)