# Class

In [45]:
import numpy as np

class DataExtraction:
    def __init__(self, dataframe):
        self.dataframe = dataframe

    def remove_data_missing(self):
        self.dataframe = self.dataframe[
            (self.dataframe['status'] == "Completed") &
            (self.dataframe['Code'].notnull())
        ]
        return self.dataframe

    def remove_unnecessary_data(self, analysis_columns):
        self.dataframe = self.dataframe[analysis_columns]
        return self.dataframe

    def filter_specimen(self, blood_specimens, tissue_specimens):
        self.dataframe = self.dataframe[
            (self.dataframe['type'].isin(blood_specimens)) |
            (self.dataframe['type'].isin(tissue_specimens))
        ]
        return self.dataframe

    def filter_diagnosis(self, non_small_cell_lung_cancer, lung_cancer, metastasis, squamous_cell_carcinoma):
        conditions = [
            (self.dataframe['Diag'].isin(non_small_cell_lung_cancer)),
            (self.dataframe['Diag'].isin(lung_cancer)),
            (self.dataframe['Diag'].isin(metastasis)),
            (self.dataframe['Diag'].isin(squamous_cell_carcinoma)),
        ]
        diagnosis_groups = [
            "Non small cell lung cancer",
            "Lung cancer",
            "Metastasis",
            "Squamous cell carcinoma"
        ]
        self.dataframe['diagnosis_group'] = np.select(conditions, diagnosis_groups, default='Other')
        return self.dataframe

# Prototype

In [15]:
import pandas as pd

data_path = "/home/thanawat-nawan/NSCLC-OncogenicDriver_and_ResistanceAlterations-ClinicalOutcome/Data/NSCLC Data Collection revised 14.05.2568.xlsx"
total_patient = pd.read_excel(data_path, sheet_name=0)

In [42]:
# Removing missing data and separate blood and tissue specimens and
# Exact columns that used in analysis

blood_specimen_list = ["Blood, PAXgene", "Blood, Streck", "Blood, Roche", "Effusion, Pleural", "FNA"]
tissue_specimen_list = ["Tissue, FFPE", "Tissue, Fresh", "Cytological slide", "Slide, stained"]

analysis_columns = ["Code", "type", "Age", "SEX", "test", "Diag", "Disease", "cost", "Specimen Type", "Source", "%tumor", "Method1", "Result1", "Method2", "Result2", "Method3", "Result3", "Method4", "Result4", "Method5", "Result5"]

# Grouping diagnosis into 4 groups: Non-small cell lung cancer (NSCLC), Lung cancer, Metastasis, Squamous cell carcinoma
non_small_cell_lung_cancer_list = [
    "Non-small cell lung cancer", "Non-small cell carcinoma", "non-small cell lung cancer", "non-small cell carcinoma", "Non small cell lung cancer", "Recurrent Non-small cell lung cancer", "Large cell lung carcinoma", "Large cell carcinoma"]
lung_cancer_list = ["Lung carcinoma", "Lung adenocarcinoma", "Lung cancer", "Lung pleomorphic carcinoma"]
metastasis_list = ["Metastatic adenocarcinoma", "Metastatic carcinoma", "Metastatic cancer", "Metastatic brain cancer"]
squamous_cell_carcinoma = ["Squamous cell carcinoma", "squamous cell carcinoma", "Squamous cell lung cancer", "Squamous cell Carcinoma"]

In [47]:
data = DataExtraction(total_patient)
data.remove_data_missing()
data.filter_specimen(blood_specimen_list, tissue_specimen_list)
data.remove_unnecessary_data(analysis_columns)
data.filter_diagnosis(non_small_cell_lung_cancer_list, lung_cancer_list, metastasis_list, squamous_cell_carcinoma)

Unnamed: 0,Code,type,Age,SEX,test,Diag,Disease,cost,Specimen Type,Source,...,Result1,Method2,Result2,Method3,Result3,Method4,Result4,Method5,Result5,diagnosis_group
0,C2-826022,"Blood, PAXgene",54.0,Female,EGFR mutation,Lung carcinoma,,8890.0,Blood,Plasma,...,Negative,,,,,,,,,Lung cancer
1,B9-925341,"Blood, PAXgene",60.0,Female,EGFR T790M mutation,Non-small cell lung cancer,,3590.0,Blood,Plasma,...,No T790M,,Ex 19 deletion,,,,,,,Non small cell lung cancer
2,B3-963190,"Blood, PAXgene",79.0,Female,EGFR T790M mutation,Lung adenocarcinoma,,3590.0,Blood,Plasma,...,No T790M,,Negative,,,,,,,Lung cancer
4,C2-451069,"Tissue, FFPE",71.0,Female,EGFR mutation,Lung adenocarcinoma,,7410.0,FFPE,Lung,...,Negative,,,,,,,,,Lung cancer
6,A8-991646,"Tissue, FFPE",70.0,Male,Lung cancer actionable mutation panel & Lung c...,Lung adenocarcinoma,,32208.0,FFPE,Lung,...,Lung DNA - KRAS G12V AND EGFR amplification\n...,,,,,,,,,Lung cancer
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5613,C3-284027,"Tissue, FFPE",69.0,Male,Lung cancer actionable mutation panel & Lung c...,Non-small cell lung cancer,Lung cancer,17000.0,FFPE,Left iliac bone,...,ALK expression imbalance,,Negative,,Negative,,Negative,,,Non small cell lung cancer
5614,B0-031746,"Tissue, FFPE",78.0,Female,Lung cancer actionable mutation panel,Lung cancer,Lung cancer,17000.0,FFPE,Lung,...,Negative,,,,,,,,,Lung cancer
5615,C3-465606,"Tissue, Fresh",,Female,EGFR mutation,Lung cancer,Lung cancer,7410.0,Others,Lung,...,Negative,,,,,,,,,Lung cancer
5616,C2-779109,"Tissue, FFPE",40.0,Female,Lung cancer fusion,Lung cancer,Lung cancer,17000.0,FFPE,Lung nodule,...,Negative,,,,,,,,,Lung cancer


In [18]:
from exact_unique_item import *
unique = UniqueItem(total_patient, ['Diag'])
unique.count_unique_items()

{'Lung carcinoma': 16,
 'Non-small cell lung cancer': 1081,
 'Lung adenocarcinoma': 1702,
 'Lung cancer': 2337,
 'Metastatic adenocarcinoma': 32,
 'R/O non-small cell lung cancer': 13,
 'R/O Lung cancer': 54,
 'R/O lung cancer': 156,
 'Squamous cell carcinoma': 28,
 'Metastatic carcinoma': 12,
 'Lung nodule': 1,
 'bronchial cancer': 1,
 'Poorly differentiated carcinoma': 4,
 nan: 1,
 'Non-small cell carcinoma': 13,
 'squamous cell carcinoma': 2,
 'Bronchoalveolar carcinoma': 1,
 'Metastatic cancer': 3,
 'R/O malignant pleural effusion': 2,
 'Mediastinal lymphadenotathy': 1,
 'Malignant pleural effusion': 6,
 'non-small cell lung cancer': 8,
 'non-small cell carcinoma': 1,
 'R/O lung cancer ': 2,
 'Squamous cell lung cancer': 8,
 'Metastatic brain cancer': 1,
 'Squamous cell Carcinoma': 1,
 'Atypical cells': 1,
 'malignant pleural effusion': 1,
 'Non small cell lung cancer': 45,
 'R/O Non small cell lung cancer': 3,
 'Moderately differentiated adenocarcinoma': 1,
 'Large cell lung carci

In [19]:
tissue_result_patient

Unnamed: 0,Code,type,Age,SEX,test,Diag,Disease,cost,Specimen Type,Source,...,Method1,Result1,Method2,Result2,Method3,Result3,Method4,Result4,Method5,Result5
4,C2-451069,"Tissue, FFPE",71.0,Female,EGFR mutation,Lung adenocarcinoma,,7410.0,FFPE,Lung,...,Cobas,Negative,,,,,,,,
6,A8-991646,"Tissue, FFPE",70.0,Male,Lung cancer actionable mutation panel & Lung c...,Lung adenocarcinoma,,32208.0,FFPE,Lung,...,NGS,Lung DNA - KRAS G12V AND EGFR amplification\n...,,,,,,,,
12,C2-838144,"Tissue, FFPE",51.0,Male,EGFR mutation,Lung adenocarcinoma,,7410.0,FFPE,Pleural fluid,...,Cobas,Negative,,,,,,,,
13,B8-013971,"Tissue, FFPE",65.0,Female,EGFR mutation,Lung cancer,,7410.0,FFPE,Sternum,...,Cobas,Negative,,,,,,,,
19,B4-917135,"Tissue, FFPE",81.0,Male,EGFR T790M mutation,Lung cancer,,8890.0,FFPE,Bronchoalveolar lavage,...,"Inhouse, Cobas",No T790M,,Negative,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5613,C3-284027,"Tissue, FFPE",69.0,Male,Lung cancer actionable mutation panel & Lung c...,Non-small cell lung cancer,Lung cancer,17000.0,FFPE,Left iliac bone,...,"Idylla, Cobas, NGS",ALK expression imbalance,,Negative,,Negative,,Negative,,
5614,B0-031746,"Tissue, FFPE",78.0,Female,Lung cancer actionable mutation panel,Lung cancer,Lung cancer,17000.0,FFPE,Lung,...,NGS,Negative,,,,,,,,
5615,C3-465606,"Tissue, Fresh",,Female,EGFR mutation,Lung cancer,Lung cancer,7410.0,Others,Lung,...,Cobas,Negative,,,,,,,,
5616,C2-779109,"Tissue, FFPE",40.0,Female,Lung cancer fusion,Lung cancer,Lung cancer,17000.0,FFPE,Lung nodule,...,Idylla,Negative,,,,,,,,


In [20]:
patient_info_template = {
    "code": {"code": str,
     "age": int,
     "sex": str,
     "source": str,
     "tumor_percentage": float,
     "method": {dict},
     "result": {dict}}
}

In [21]:
for i in range(tissue_result_patient.__len__):
    try:
        pass
    except KeyError:


SyntaxError: incomplete input (1423844909.py, line 5)

In [12]:
result_template = {
    "code": str,
    "time": int,
    "molecular": {
        "method": str,
        "result": str
    }
}