# Lunatic data extraction

* The code assumes the Lunatic report contains both "Concentration (mg/mL)" and wavelength-scan results.
* Please specify the input file path and the output folder path before running the code.

## Prepare coding environment

In [3]:
from pathlib import Path

import pandas as pd

## Prepare filepaths

In [5]:
# Location of the Lunatic report to process
lunatic_report = './example_data/2024-10-31_122352_401285_DPD__Mumps_IL_PreFHN_AEX_screen1 pH 6.5 plate1.xlsx'

# Folder that receives the extracted tables, and the two CSV files written into it
output_folder_path = './output/'
con_filepath = f'{output_folder_path}concentration.csv'
scan_filepath = f'{output_folder_path}wavelength_scan.csv'

## Clean data

In [7]:
df = pd.read_excel(lunatic_report)

# Locate the row whose 'Report' cell contains 'Plate ID'; every row above it is metadata
plate_id_rows = df[df['Report'].astype(str).str.contains('Plate ID', na=False)].index
if len(plate_id_rows) == 0:
    # Fail with an actionable message instead of an opaque IndexError from .index[0]
    raise ValueError(
        f"No row containing 'Plate ID' was found in the 'Report' column of {lunatic_report!r}"
    )
index = plate_id_rows[0]  # label of the first matching row

# Keep the 'Plate ID' row and everything below it
filtered_df = df.loc[index:]

# Promote the 'Plate ID' row to column headers, then remove it from the data
filtered_df.columns = filtered_df.iloc[0]
filtered_df = filtered_df.drop(index)             # the header row is no longer data
filtered_df = filtered_df.reset_index(drop=True)  # renumber the remaining rows from 0

## Create functions to extract data

In [9]:
# create a function to extract "Concentration (mg/mL)" from .csv files
def extract_con(df):
    # Extract columns 'Plate\nPosition' and 'Concentration\n(mg/ml)'
    reseted_df = df[['Plate\nPosition', 'Concentration\n(mg/ml)']]
    con_df = reseted_df.rename(columns={
        'Plate\nPosition': 'PlatePosition',
        'Concentration\n(mg/ml)': 'Concentration(mg/mL)',
    })
    
    pd.set_option('future.no_silent_downcasting', True) # future-proof the code related to how panda handles downcasting
    con_df = con_df.infer_objects(copy=False) # ensure the object columns are inferred correctly without silent downcasting
    
    con_df = con_df.fillna("N/A")

    return con_df

In [10]:
# Extract the wavelength-scan columns (by default the 11th through 231st) together with the plate position
def extract_scan(df, start=10, stop=231):
    r"""Return the wavelength-scan readings of a report, preceded by the plate position.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose second column (position 1) holds the plate position and
        whose columns at positions ``start`` to ``stop - 1`` hold the scan
        readings. The scan column headers must be strings.
    start, stop : int, optional
        Zero-based, half-open positional range of the scan columns. The
        defaults (10, 231) select the 11th through the 231st column.

    Returns
    -------
    pandas.DataFrame
        The plate-position column (renamed to 'PlatePosition' when its header
        is 'Plate\nPosition') followed by the scan columns, converted to
        numeric and relabelled with characters 2-4 of their original header.

    Raises
    ------
    ValueError
        If a scan cell cannot be parsed as a number by ``pd.to_numeric``.
    """
    # Positional slice: zero-based start, exclusive stop
    scan_df = df.iloc[:, start:stop].copy()
    scan_df.columns = scan_df.columns.str.slice(1, 4)  # keep the 2nd to 4th characters of each header
    scan_df = scan_df.apply(pd.to_numeric)             # make every scan column numeric

    # Take the plate position from the frame that was passed in, not from a
    # notebook-level variable, so the function works for any report frame.
    plate_position = df.iloc[:, 1]

    scan_df = pd.concat([plate_position, scan_df], axis=1)
    return scan_df.rename(columns={'Plate\nPosition': 'PlatePosition'})

## Extract data

In [12]:
# Build the concentration table and the wavelength-scan table from the cleaned report
con_df = extract_con(filtered_df)
scan_df = extract_scan(filtered_df)

## Save dataframes to .csv

In [14]:
# to_csv does not create missing directories, so make sure each destination folder exists first
for csv_path in (con_filepath, scan_filepath):
    Path(csv_path).parent.mkdir(parents=True, exist_ok=True)

# Write both tables without the DataFrame index column
con_df.to_csv(con_filepath, index=False)
scan_df.to_csv(scan_filepath, index=False)

print('Data extraction successful!')

Data extraction successful!
