# Class

In [62]:
import numpy as np

class DataExtraction:
    def __init__(self, dataframe):
        self.dataframe = dataframe

    def remove_data_missing(self):
        self.dataframe = self.dataframe[
            (self.dataframe['status'] == "Completed") &
            (self.dataframe['Code'].notnull())
        ].reset_index(drop=True)
        return self.dataframe

    def remove_unnecessary_data(self, analysis_columns):
        self.dataframe = self.dataframe[analysis_columns]
        return self.dataframe

    def filter_specimen(self, blood_specimens, tissue_specimens):
        self.dataframe = self.dataframe[
            (self.dataframe['type'].isin(blood_specimens)) |
            (self.dataframe['type'].isin(tissue_specimens))
        ]
        return self.dataframe

    def filter_diagnosis(self, non_small_cell_lung_cancer, lung_cancer, metastasis, squamous_cell_carcinoma):
        conditions = [
            (self.dataframe['Diag'].isin(non_small_cell_lung_cancer)),
            (self.dataframe['Diag'].isin(lung_cancer)),
            (self.dataframe['Diag'].isin(metastasis)),
            (self.dataframe['Diag'].isin(squamous_cell_carcinoma)),
        ]
        diagnosis_groups = [
            "Non small cell lung cancer",
            "Lung cancer",
            "Metastasis",
            "Squamous cell carcinoma"
        ]
        self.dataframe['diagnosis_group'] = np.select(conditions, diagnosis_groups, default='Other')
        self.dataframe = self.dataframe.drop(columns=['Diag'])
        return self.dataframe

    def create_patient_dictionary(self):
        code_dictionary = {}
        for i in range(self.dataframe.__len__()):
            try:
                code = self.dataframe['Code'][i]
            except KeyError:
                print(f"The code at row {i} was not found in the dataframe.")
                continue

            if code not in code_dictionary:
                code_dictionary[code] = {}
                code_dictionary[code]['count_times'] = 1
            else:
                code_dictionary[code]['count_times'] += 1

            time_n = "time_" + str(code_dictionary[code]['count_times'])
            code_dictionary[code][time_n] = {}
            time_n_templete = code_dictionary[code][time_n]

            time_n_templete['age'] = float(self.dataframe['Age'][i])
            time_n_templete['sex'] = self.dataframe['SEX'][i]
            time_n_templete['cost'] = float(self.dataframe['cost'][i])
            time_n_templete['specimen_type'] = self.dataframe['Specimen Type'][i]
            time_n_templete['source'] = self.dataframe['Source'][i]
            time_n_templete['tumor_percentage'] = float(self.dataframe['%tumor'][i]) * 100
            time_n_templete['diagnosis_group'] = self.dataframe['diagnosis_group'][i]
            time_n_templete['test'] = self.dataframe['test'][i]

        return code_dictionary

# Prototype

In [31]:
code_dictionary = {
    "code": {
        "times": int,
        "Time_X": {
            "age": int,
            "sex": str,
            "source": str,
            "tumor_percentage": float,
            "method": {dict},
            "result": {dict},
            "diagnosis": str
        }
    }
}

In [32]:
import pandas as pd

data_path = "/home/thanawat-nawan/NSCLC-OncogenicDriver_and_ResistanceAlterations-ClinicalOutcome/Data/NSCLC Data Collection revised 14.05.2568.xlsx"
total_patient = pd.read_excel(data_path, sheet_name=0)

In [33]:
# Removing missing data and separate blood and tissue specimens and
# Exact columns that used in analysis

blood_specimen_list = ["Blood, PAXgene", "Blood, Streck", "Blood, Roche", "Effusion, Pleural", "FNA"]
tissue_specimen_list = ["Tissue, FFPE", "Tissue, Fresh", "Cytological slide", "Slide, stained"]

analysis_columns = ["Code", "type", "Age", "SEX", "test", "Diag", "cost", "Specimen Type", "Source", "%tumor", "Method1", "Result1", "Method2", "Result2", "Method3", "Result3", "Method4", "Result4", "Method5", "Result5"]

# Grouping diagnosis into 4 groups: Non-small cell lung cancer (NSCLC), Lung cancer, Metastasis, Squamous cell carcinoma
non_small_cell_lung_cancer_list = [
    "Non-small cell lung cancer", "Non-small cell carcinoma", "non-small cell lung cancer", "non-small cell carcinoma", "Non small cell lung cancer", "Recurrent Non-small cell lung cancer", "Large cell lung carcinoma", "Large cell carcinoma"]
lung_cancer_list = ["Lung carcinoma", "Lung adenocarcinoma", "Lung cancer", "Lung pleomorphic carcinoma"]
metastasis_list = ["Metastatic adenocarcinoma", "Metastatic carcinoma", "Metastatic cancer", "Metastatic brain cancer"]
squamous_cell_carcinoma = ["Squamous cell carcinoma", "squamous cell carcinoma", "Squamous cell lung cancer", "Squamous cell Carcinoma"]

In [76]:
data = DataExtraction(total_patient)
data.remove_data_missing()
data.filter_specimen(blood_specimen_list, tissue_specimen_list)
data.remove_unnecessary_data(analysis_columns)
df = data.filter_diagnosis(non_small_cell_lung_cancer_list, lung_cancer_list, metastasis_list, squamous_cell_carcinoma)
data_dict = data.create_patient_dictionary()

The code at row 356 was not found in the dataframe.
The code at row 622 was not found in the dataframe.
The code at row 1391 was not found in the dataframe.
The code at row 1579 was not found in the dataframe.


In [78]:
# Get the patient ID (key of the outer dictionary)
patient_id = list(data_dict.keys()) # 'C2-826022'

# Get the data for 'time_1'
patient_data = data_dict[patient_id]['time_1']

# Create a list of dictionaries, where each dictionary represents a row
# Add the patient ID as a column
row_data = {**patient_data, 'patient_id': patient_id} # Combine dicts

# Create the DataFrame from a list containing this single dictionary
df = pd.DataFrame([row_data])

# If you want 'patient_id' to be the first column:
cols = ['patient_id'] + [col for col in df.columns if col != 'patient_id']
df = df[cols]

df

TypeError: unhashable type: 'list'

In [69]:
dict

{'C2-826022': {'count_times': 1,
  'time_1': {'age': 54.0,
   'sex': 'Female',
   'cost': 8890.0,
   'specimen_type': 'Blood',
   'source': 'Plasma',
   'tumor_percentage': nan,
   'diagnosis_group': 'Lung cancer',
   'test': 'EGFR mutation'}},
 'B9-925341': {'count_times': 2,
  'time_1': {'age': 60.0,
   'sex': 'Female',
   'cost': 3590.0,
   'specimen_type': 'Blood',
   'source': 'Plasma',
   'tumor_percentage': nan,
   'diagnosis_group': 'Non small cell lung cancer',
   'test': 'EGFR T790M mutation'},
  'time_2': {'age': 60.0,
   'sex': 'Female',
   'cost': 3590.0,
   'specimen_type': 'Blood',
   'source': 'Plasma',
   'tumor_percentage': nan,
   'diagnosis_group': 'Lung cancer',
   'test': 'EGFR T790M mutation'}},
 'B3-963190': {'count_times': 1,
  'time_1': {'age': 79.0,
   'sex': 'Female',
   'cost': 3590.0,
   'specimen_type': 'Blood',
   'source': 'Plasma',
   'tumor_percentage': nan,
   'diagnosis_group': 'Lung cancer',
   'test': 'EGFR T790M mutation'}},
 'C2-451069': {'count

In [66]:
pd.DataFrame.from_dict(dict, orient='index')

Unnamed: 0,count_times,time_1,time_2,time_3,time_4,time_5,time_6,time_7
C2-826022,1,"{'age': 54.0, 'sex': 'Female', 'cost': 8890.0,...",,,,,,
B9-925341,2,"{'age': 60.0, 'sex': 'Female', 'cost': 3590.0,...","{'age': 60.0, 'sex': 'Female', 'cost': 3590.0,...",,,,,
B3-963190,1,"{'age': 79.0, 'sex': 'Female', 'cost': 3590.0,...",,,,,,
C2-451069,1,"{'age': 71.0, 'sex': 'Female', 'cost': 7410.0,...",,,,,,
A8-991646,1,"{'age': 70.0, 'sex': 'Male', 'cost': 32208.0, ...",,,,,,
...,...,...,...,...,...,...,...,...
C1-629920,1,"{'age': 75.0, 'sex': 'Female', 'cost': 7410.0,...",,,,,,
B6-077270,1,"{'age': 60.0, 'sex': 'Male', 'cost': 7410.0, '...",,,,,,
C3-464948,1,"{'age': 49.0, 'sex': 'Male', 'cost': 7410.0, '...",,,,,,
C9-099390,1,"{'age': 69.0, 'sex': 'Female', 'cost': 7410.0,...",,,,,,


In [91]:
flattened_data = []

for patient_id, patient_info in data_dict.items():
    # Iterate through all 'time_X' entries for the current patient
    for key, value in patient_info.items():
        if key.startswith('time_'):
            # Create a new dictionary for each row
            row = {'patient_id': patient_id, 'time_entry': key}
            row.update(value) # Add all details from the 'time_X' dictionary
            flattened_data.append(row)

# Create the DataFrame from the list of flattened dictionaries
df = pd.DataFrame(flattened_data)

# Optional: Reorder columns to have 'patient_id' and 'time_entry' first
if 'patient_id' in df.columns and 'time_entry' in df.columns:
    cols = ['patient_id', 'time_entry'] + [col for col in df.columns if col not in ['patient_id', 'time_entry']]
    df = df[cols]

In [92]:
df

Unnamed: 0,patient_id,time_entry,age,sex,cost,specimen_type,source,tumor_percentage,diagnosis_group,test
0,C2-826022,time_1,54.0,Female,8890.0,Blood,Plasma,,Lung cancer,EGFR mutation
1,B9-925341,time_1,60.0,Female,3590.0,Blood,Plasma,,Non small cell lung cancer,EGFR T790M mutation
2,B9-925341,time_2,60.0,Female,3590.0,Blood,Plasma,,Lung cancer,EGFR T790M mutation
3,B3-963190,time_1,79.0,Female,3590.0,Blood,Plasma,,Lung cancer,EGFR T790M mutation
4,C2-451069,time_1,71.0,Female,7410.0,FFPE,Lung,30.0,Lung cancer,EGFR mutation
...,...,...,...,...,...,...,...,...,...,...
3494,C1-629920,time_1,75.0,Female,7410.0,FFPE,Lung,20.0,Lung cancer,EGFR mutation
3495,B6-077270,time_1,60.0,Male,7410.0,FFPE,Bronchus tissue,5.0,Lung cancer,EGFR mutation
3496,C3-464948,time_1,49.0,Male,7410.0,FFPE,Lymph node,70.0,Lung cancer,EGFR mutation
3497,C9-099390,time_1,69.0,Female,7410.0,FFPE,Pleural nodule,95.0,Non small cell lung cancer,EGFR mutation
