In [1]:
import ast

import pandas as pd

def split_and_combine_csv(file_path, chunk_size, metadata_lines=6):
    """Load a CSV that begins with a metadata preamble into one DataFrame.

    The first ``metadata_lines`` lines of the file are skipped, the following
    line is used as the header, and every remaining row is read in chunks of
    ``chunk_size`` rows which are then concatenated.

    Parameters
    ----------
    file_path : path of the CSV file to read.
    chunk_size : number of rows pandas reads per chunk.
    metadata_lines : number of leading lines to skip before the header line.

    Returns
    -------
    pandas.DataFrame
        All data rows with a fresh 0..n-1 index. Column labels are the raw
        comma-separated pieces of the header line; no CSV unquoting is applied
        to them, so any quote characters in the header remain in the labels.
    """
    # The header line sits directly after the metadata preamble; fetch it by hand.
    with open(file_path, 'r') as source:
        for _skipped in range(metadata_lines):
            next(source)
        column_names = next(source).strip().split(',')

    # Data rows start one line after the header, hence metadata_lines + 1.
    reader = pd.read_csv(
        file_path,
        skiprows=metadata_lines + 1,
        chunksize=chunk_size,
        header=None,
        low_memory=False,
    )

    def _with_header(frame):
        # Label the chunk's columns with the header parsed above.
        frame.columns = column_names
        return frame

    return pd.concat([_with_header(frame) for frame in reader], ignore_index=True)

# Example usage: load the run output, reading 200000 rows per chunk.
combined_df = split_and_combine_csv(
    file_path='scenario_two_run.csv',
    chunk_size=200000,
)
combined_df

FileNotFoundError: [Errno 2] No such file or directory: 'scenario_three_run.csv'

In [2]:
# Display the combined frame again (bare last expression -> notebook rich repr).
combined_df

In [3]:
def _unquote_column(name):
    """Parse a column label as a Python literal, falling back to the label itself.

    A label whose text is a quoted string literal (e.g. the 14 characters
    ``"[run number]"`` including the quotes) becomes the plain string
    ``[run number]``. ``ast.literal_eval`` only accepts literals, so header text
    coming from the CSV file is never executed as code.

    Labels that are not valid literals (for example names that have already
    been unquoted) are returned unchanged, so re-running this cell does not
    raise on them.
    """
    try:
        return ast.literal_eval(name)
    except (ValueError, SyntaxError):
        return name


# Strip the literal quoting from every column label of the combined frame.
combined_df.columns = [_unquote_column(name) for name in combined_df.columns]

In [4]:
# Select the subset of columns used by the later cells.
# NOTE: 'probality-Of-Contamination' is written exactly as the label it must
# match in combined_df; do not "correct" the spelling here.
selected_columns = [
    '[run number]',
    'Detection-Limit',
    'global-percent',
    'global-logCFU',
    'global-sd',
    'initial_contaminated',
    'avocado-counter',
    'probality-Of-Contamination',
    'Contamination-Level',
    'SD',
    'initial_batch_contaminated',
    'pfactor',
    '[step]',
    'enter_processing_list',
]
good_data = combined_df[selected_columns]
good_data

In [5]:
# Positional rename: the i-th label below replaces the i-th column of
# good_data, so this list must keep the same length and order as the columns
# selected when good_data was built.
renamed_columns = [
    '[run number]',
    'Detection-Limit',
    'contaminated_surface_percent',
    'contaminated_surface_logCFU',
    'contaminated_surface_sd',
    'initial_contaminated_avocado_counter',
    'avocado-counter_total',
    'probality-Of-Contamination_for_avocado',
    'Contamination-Level_avocado_logCFU',
    'avocado_SD',
    'percentage_of_batch_initial_contaminated',
    'pfactor_for_surface_ratio',
    'steps',
    'enter_processing_list',
]
good_data.columns = renamed_columns

In [6]:
def split_to_columns(x):
    """Break a bracketed list-of-lists string into its inner record strings.

    The first two and last two characters of ``x`` are discarded and the rest
    is split on the ``"] ["`` separator. For ``"[[a b] [c d]]"`` this yields
    ``["a b", "c d"]``.
    """
    record_separator = "] ["
    inner = x[2:-2]
    return inner.split(record_separator)

def process_list(lst):
    """Return a new list holding the items of ``lst`` in iteration order."""
    return list(lst)

def split_string(x):
    """Split ``x`` on single space characters.

    Consecutive spaces are not collapsed: each extra space produces an empty
    string in the returned list.
    """
    delimiter = " "
    fields = x.split(delimiter)
    return fields

def combine_data_frame(x):
    """Append one named column per field of ``x.Output_packed``.

    Each entry of the ``Output_packed`` column is expected to be a sequence of
    eight values. They are spread, in order, into the eight columns named in
    ``unpacked_columns`` and placed to the right of the existing columns. The
    original ``Output_packed`` column is kept.
    """
    unpacked_columns = [
        "Process_ID",
        "Time_In_Seconds",
        "CFU",
        "logCFU",
        "Handler",
        "Processed_Crates",
        "Safety_Status",
        "Patch_Contamination",
    ]
    # Reuse x's index so the new columns line up row-for-row in the concat.
    unpacked = pd.DataFrame(
        x.Output_packed.tolist(),
        index=x.index,
        columns=unpacked_columns,
    )
    return pd.concat([x, unpacked], axis=1)

def data_filtering(df):
    """Expand the packed ``enter_processing_list`` strings into one row per record.

    Steps:
      1. keep rows whose ``enter_processing_list`` is a string longer than
         3 characters (non-string cells such as NaN are dropped instead of
         raising ``TypeError`` from ``len``);
      2. drop rows that repeat an already-seen ``enter_processing_list`` value;
      3. split each string into records with ``split_to_columns`` and explode
         to one row per record;
      4. split each record into fields with ``split_string`` and rename the
         column to ``Output_packed``;
      5. spread the fields into named columns with ``combine_data_frame`` and
         drop ``Patch_Contamination``.

    Every step builds a new frame via ``assign`` rather than assigning into a
    filtered slice, so ``df`` itself is not modified and pandas does not emit
    ``SettingWithCopyWarning``.

    Parameters
    ----------
    df : pandas.DataFrame with an ``enter_processing_list`` column.

    Returns
    -------
    pandas.DataFrame
        One row per record with a fresh 0..n-1 index, containing the original
        columns, ``Output_packed`` and the unpacked field columns except
        ``Patch_Contamination``.
    """
    has_records = df.enter_processing_list.apply(
        lambda value: isinstance(value, str) and len(value) > 3
    )
    unique_rows = df[has_records].drop_duplicates("enter_processing_list")

    # One list of record strings per row, then one row per record.
    exploded = unique_rows.assign(
        enter_processing_list=unique_rows.enter_processing_list.apply(split_to_columns)
    ).explode('enter_processing_list')

    # One list of fields per record; explode leaves duplicate index labels,
    # so the index is rebuilt before the fields are spread into columns.
    packed = (
        exploded
        .assign(enter_processing_list=exploded.enter_processing_list.apply(split_string))
        .rename(columns={'enter_processing_list': 'Output_packed'})
        .reset_index(drop=True)
    )

    return combine_data_frame(packed).drop("Patch_Contamination", axis=1)

# Run the expansion pipeline on the selected columns and display the result.
output = data_filtering(good_data)
output

In [7]:
# Number of expanded rows whose "[run number]" equals 1.
first_run_rows = output[output["[run number]"] == 1]
len(first_run_rows)

In [8]:
# Write the expanded results to a CSV file in the working directory.
# The row index is written as the first column (to_csv's default).
output.to_csv("experiment_two_results.csv")