# Class

In [48]:
import numpy as np

class DataExtraction:
    """Step-by-step cleaning pipeline wrapped around one pandas DataFrame.

    Each step replaces ``self.dataframe`` with a new frame and also returns it,
    so the steps can be called one after another on the same instance. The
    steps read the columns ``status``, ``Code``, ``type`` and ``Diag``.
    None of the steps modifies the frame object that was passed in.
    """

    def __init__(self, dataframe):
        """Store the frame to be cleaned.

        Args:
            dataframe: pandas DataFrame holding the records to process.
        """
        self.dataframe = dataframe

    def remove_data_missing(self):
        """Keep rows whose ``status`` is "Completed" and whose ``Code`` is not null.

        Returns:
            The filtered DataFrame (also stored on ``self.dataframe``).
        """
        is_completed = self.dataframe['status'] == "Completed"
        has_code = self.dataframe['Code'].notnull()
        self.dataframe = self.dataframe[is_completed & has_code]
        return self.dataframe

    def remove_unnecessary_data(self, analysis_columns):
        """Restrict the frame to the given columns.

        Args:
            analysis_columns: list of column names to keep, in output order.

        Returns:
            The narrowed DataFrame (also stored on ``self.dataframe``).

        Raises:
            KeyError: if any requested column is absent from the frame.
        """
        self.dataframe = self.dataframe[analysis_columns]
        return self.dataframe

    def filter_specimen(self, blood_specimens, tissue_specimens):
        """Keep rows whose ``type`` appears in either specimen list.

        Args:
            blood_specimens: iterable of accepted ``type`` values.
            tissue_specimens: iterable of accepted ``type`` values.

        Returns:
            The filtered DataFrame (also stored on ``self.dataframe``).
        """
        specimen_type = self.dataframe['type']
        wanted = specimen_type.isin(blood_specimens) | specimen_type.isin(tissue_specimens)
        self.dataframe = self.dataframe[wanted]
        return self.dataframe

    def filter_diagnosis(self, non_small_cell_lung_cancer, lung_cancer, metastasis, squamous_cell_carcinoma):
        """Replace the ``Diag`` column with a coarse ``diagnosis_group`` column.

        A row is labelled by the first list that contains its ``Diag`` value,
        checked in the order of the parameters; rows matching none of the
        lists are labelled "Other".

        Args:
            non_small_cell_lung_cancer: ``Diag`` values mapped to
                "Non small cell lung cancer".
            lung_cancer: ``Diag`` values mapped to "Lung cancer".
            metastasis: ``Diag`` values mapped to "Metastasis".
            squamous_cell_carcinoma: ``Diag`` values mapped to
                "Squamous cell carcinoma".

        Returns:
            The DataFrame with ``diagnosis_group`` added and ``Diag`` dropped
            (also stored on ``self.dataframe``).
        """
        diagnosis = self.dataframe['Diag']
        conditions = [
            diagnosis.isin(non_small_cell_lung_cancer),
            diagnosis.isin(lung_cancer),
            diagnosis.isin(metastasis),
            diagnosis.isin(squamous_cell_carcinoma),
        ]
        diagnosis_groups = [
            "Non small cell lung cancer",
            "Lung cancer",
            "Metastasis",
            "Squamous cell carcinoma",
        ]
        labels = np.select(conditions, diagnosis_groups, default='Other')
        # assign() builds a new frame. Writing the column with
        # self.dataframe[...] = ... would either hit a filtered slice
        # (SettingWithCopyWarning) or silently add a column to the caller's frame.
        self.dataframe = self.dataframe.assign(diagnosis_group=labels).drop(columns=['Diag'])
        return self.dataframe

    def create_dictionary(self):
        """Return the schema template planned for the per-code dictionary.

        The template maps field names to the Python types they are meant to
        hold; it does not contain any data from the frame yet.

        Returns:
            dict: the nested schema template.
        """
        # TODO: populate one entry per value of self.dataframe['Code'].
        # Iterate the column's values directly (for code in self.dataframe['Code']):
        # the previous range(self.dataframe.__len__) passed a bound method to
        # range() and raised TypeError, and label lookup ['Code'][i] fails on
        # the gapped index that the filtering steps leave behind.
        code_dictionary = {
            "code": {
                "code": str,
                "times": int,
                "Time_X": {
                    "age": int,
                    "sex": str,
                    "source": str,
                    "tumor_percentage": float,
                    # NOTE(review): {dict} is a set containing the dict type,
                    # presumably a placeholder for a nested dict - confirm.
                    "method": {dict},
                    "result": {dict},
                    "diagnosis": str
                }
            }
        }
        return code_dictionary

In [49]:
class Utility:
    """Helpers for inspecting the contents of a pandas DataFrame."""

    def __init__(self, dataframe):
        """Store the frame to inspect and start with an empty result.

        Args:
            dataframe: pandas DataFrame to inspect.
        """
        self.dataframe = dataframe
        self.unique_item = {}

    def exact_unique_item(self):
        """Count how often each distinct value occurs in every object-dtype column.

        Values are counted by their string form. Missing values (NaN/None)
        are counted together under the key "nan value". Columns of any other
        dtype are skipped. The inspected DataFrame is left unmodified.

        Returns:
            dict: ``{column name: {value as str: count}}``; also stored on
            ``self.unique_item``.
        """
        for column in self.dataframe.columns:
            series = self.dataframe[column]
            if series.dtype != 'object':
                continue

            # Detect missing values before the string cast: after
            # astype('str') NaN becomes the text "nan" and can no longer be
            # recognised as missing.
            missing = series.isna()

            counts = {}
            # The cast is applied to a temporary only. Writing it back into
            # the frame would overwrite the caller's data with strings.
            for value in series[~missing].astype('str'):
                counts[value] = counts.get(value, 0) + 1

            missing_total = int(missing.sum())
            if missing_total:
                counts["nan value"] = missing_total

            self.unique_item[column] = counts
        return self.unique_item

# Prototype

In [50]:
import pandas as pd

# Source workbook for the prototype; only the first sheet (index 0) is read.
# NOTE(review): this is an absolute, machine-specific path, so the cell only
# runs where this exact directory exists.
data_path = (
    "/home/thanawat-nawan/NSCLC-OncogenicDriver_and_ResistanceAlterations-ClinicalOutcome/Data/NSCLC Data Collection revised 14.05.2568.xlsx"
)
total_patient = pd.read_excel(io=data_path, sheet_name=0)

In [51]:
# Inputs for the cleaning steps: the specimen types to keep (one list for
# blood, one for tissue) and the columns that are used in the analysis.

blood_specimen_list = [
    "Blood, PAXgene",
    "Blood, Streck",
    "Blood, Roche",
    "Effusion, Pleural",
    "FNA",
]
tissue_specimen_list = [
    "Tissue, FFPE",
    "Tissue, Fresh",
    "Cytological slide",
    "Slide, stained",
]

analysis_columns = [
    "Code", "type", "Age", "SEX", "test", "Diag", "cost",
    "Specimen Type", "Source", "%tumor",
    "Method1", "Result1",
    "Method2", "Result2",
    "Method3", "Result3",
    "Method4", "Result4",
    "Method5", "Result5",
]

# Diagnosis spellings grouped into 4 categories: non-small cell lung cancer
# (NSCLC), lung cancer, metastasis and squamous cell carcinoma.
non_small_cell_lung_cancer_list = [
    "Non-small cell lung cancer",
    "Non-small cell carcinoma",
    "non-small cell lung cancer",
    "non-small cell carcinoma",
    "Non small cell lung cancer",
    "Recurrent Non-small cell lung cancer",
    "Large cell lung carcinoma",
    "Large cell carcinoma",
]
lung_cancer_list = [
    "Lung carcinoma",
    "Lung adenocarcinoma",
    "Lung cancer",
    "Lung pleomorphic carcinoma",
]
metastasis_list = [
    "Metastatic adenocarcinoma",
    "Metastatic carcinoma",
    "Metastatic cancer",
    "Metastatic brain cancer",
]
squamous_cell_carcinoma = [
    "Squamous cell carcinoma",
    "squamous cell carcinoma",
    "Squamous cell lung cancer",
    "Squamous cell Carcinoma",
]

In [52]:
# Run the cleaning steps in order. The specimen filter and the diagnosis
# grouping read the "type" and "Diag" columns, both of which are listed in
# analysis_columns and therefore survive the column selection.
data = DataExtraction(dataframe=total_patient)
data.remove_data_missing()
data.filter_specimen(
    blood_specimens=blood_specimen_list,
    tissue_specimens=tissue_specimen_list,
)
data.remove_unnecessary_data(analysis_columns=analysis_columns)
# Last expression of the cell: the resulting frame is displayed as the output.
data.filter_diagnosis(
    non_small_cell_lung_cancer=non_small_cell_lung_cancer_list,
    lung_cancer=lung_cancer_list,
    metastasis=metastasis_list,
    squamous_cell_carcinoma=squamous_cell_carcinoma,
)

Unnamed: 0,Code,type,Age,SEX,test,cost,Specimen Type,Source,%tumor,Method1,Result1,Method2,Result2,Method3,Result3,Method4,Result4,Method5,Result5,diagnosis_group
0,C2-826022,"Blood, PAXgene",54.0,Female,EGFR mutation,8890.0,Blood,Plasma,,Cobas,Negative,,,,,,,,,Lung cancer
1,B9-925341,"Blood, PAXgene",60.0,Female,EGFR T790M mutation,3590.0,Blood,Plasma,,"Inhouse, Cobas",No T790M,,Ex 19 deletion,,,,,,,Non small cell lung cancer
2,B3-963190,"Blood, PAXgene",79.0,Female,EGFR T790M mutation,3590.0,Blood,Plasma,,"Inhouse, Cobas",No T790M,,Negative,,,,,,,Lung cancer
4,C2-451069,"Tissue, FFPE",71.0,Female,EGFR mutation,7410.0,FFPE,Lung,0.30,Cobas,Negative,,,,,,,,,Lung cancer
6,A8-991646,"Tissue, FFPE",70.0,Male,Lung cancer actionable mutation panel & Lung c...,32208.0,FFPE,Lung,0.60,NGS,Lung DNA - KRAS G12V AND EGFR amplification\n...,,,,,,,,,Lung cancer
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5613,C3-284027,"Tissue, FFPE",69.0,Male,Lung cancer actionable mutation panel & Lung c...,17000.0,FFPE,Left iliac bone,0.25,"Idylla, Cobas, NGS",ALK expression imbalance,,Negative,,Negative,,Negative,,,Non small cell lung cancer
5614,B0-031746,"Tissue, FFPE",78.0,Female,Lung cancer actionable mutation panel,17000.0,FFPE,Lung,0.10,NGS,Negative,,,,,,,,,Lung cancer
5615,C3-465606,"Tissue, Fresh",,Female,EGFR mutation,7410.0,Others,Lung,,Cobas,Negative,,,,,,,,,Lung cancer
5616,C2-779109,"Tissue, FFPE",40.0,Female,Lung cancer fusion,17000.0,FFPE,Lung nodule,0.60,Idylla,Negative,,,,,,,,,Lung cancer
