# Class

In [26]:
import numpy as np

class DataExtraction:
    """Step-by-step filtering and labelling of a patient test dataframe.

    Every step replaces ``self.dataframe`` with a new frame and also returns it,
    so the steps can be run one after another on the same instance. The frame
    handed to the constructor is never modified in place.
    """

    def __init__(self, dataframe):
        """Store the dataframe that the other methods operate on."""
        self.dataframe = dataframe

    def remove_data_missing(self):
        """Keep rows whose 'status' is "Completed" and whose 'Code' is not null.

        Returns:
            The filtered dataframe (also stored on ``self.dataframe``).
        """
        is_completed = self.dataframe['status'] == "Completed"
        has_code = self.dataframe['Code'].notnull()
        self.dataframe = self.dataframe[is_completed & has_code]
        return self.dataframe

    def remove_unnecessary_data(self, analysis_columns):
        """Keep only the columns listed in ``analysis_columns``.

        Args:
            analysis_columns: list of column names to keep, in output order.

        Returns:
            The reduced dataframe (also stored on ``self.dataframe``).

        Raises:
            KeyError: if a requested column is not present.
        """
        self.dataframe = self.dataframe[analysis_columns]
        return self.dataframe

    def filter_specimen(self, blood_specimens, tissue_specimens):
        """Keep rows whose 'type' is one of the blood or tissue specimen types.

        Args:
            blood_specimens: 'type' values accepted as blood specimens.
            tissue_specimens: 'type' values accepted as tissue specimens.

        Returns:
            The filtered dataframe (also stored on ``self.dataframe``).
        """
        specimen_type = self.dataframe['type']
        is_blood = specimen_type.isin(blood_specimens)
        is_tissue = specimen_type.isin(tissue_specimens)
        self.dataframe = self.dataframe[is_blood | is_tissue]
        return self.dataframe

    def filter_diagnosis(self, non_small_cell_lung_cancer, lung_cancer, metastasis, squamous_cell_carcinoma):
        """Replace the 'Diag' column with a coarse 'diagnosis_group' column.

        Each argument is a collection of 'Diag' values belonging to that group.
        Groups are tested in parameter order and the first match wins; rows that
        match no group are labelled 'Other'.

        Returns:
            The dataframe with 'diagnosis_group' added and 'Diag' dropped
            (also stored on ``self.dataframe``).
        """
        diagnosis = self.dataframe['Diag']
        grouped_values = [
            ("Non small cell lung cancer", non_small_cell_lung_cancer),
            ("Lung cancer", lung_cancer),
            ("Metastasis", metastasis),
            ("Squamous cell carcinoma", squamous_cell_carcinoma),
        ]
        conditions = [diagnosis.isin(values).to_numpy() for _, values in grouped_values]
        diagnosis_groups = [label for label, _ in grouped_values]
        labels = np.select(conditions, diagnosis_groups, default='Other')
        # assign() builds a new frame instead of writing a column into the frame
        # we hold: that frame may be the caller's original object or a filtered
        # slice, and writing into it mutates the caller's data or triggers
        # chained-assignment warnings.
        self.dataframe = self.dataframe.assign(diagnosis_group=labels).drop(columns=['Diag'])
        return self.dataframe

    def create_dictionary(self):
        """Return the layout template for the per-code record dictionary.

        The template maps field names to the type each field is expected to
        hold; it contains no patient data.

        NOTE(review): this method looks unfinished. The previous body looped
        over ``range(self.dataframe.__len__)`` (a bound method, so it always
        raised TypeError), discarded every value it read and returned nothing.
        TODO: populate the dictionary from the rows; iterate over
        ``self.dataframe['Code']`` directly, because after filtering the index
        labels are no longer 0..n-1.

        Returns:
            dict: the nested template.
        """
        code_dictionary = {
            "code": {
                "code": str,
                "times": int,
                "Time_X": {
                    "age": int,
                    "sex": str,
                    "source": str,
                    "tumor_percentage": float,
                    # was ``{dict}``: a set containing the type, not the type itself
                    "method": dict,
                    "result": dict,
                    "diagnosis": str
                }
            }
        }
        return code_dictionary

In [42]:
class Utility:
    """Count how often each distinct value occurs in the object-dtype columns of a dataframe."""

    def __init__(self, dataframe):
        """Store the dataframe and start with an empty result mapping."""
        self.dataframe = dataframe
        self.unique_item = {}

    def exact_unique_item(self):
        """Count the occurrences of every value in each object-dtype column.

        Values are counted under their ``str()`` form. NaN entries are counted
        together under the key "nan value". Columns of any other dtype are
        skipped. The dataframe itself is left untouched: the earlier version
        cast each column to ``str`` in place, which turned NaN into the text
        'nan' before the NaN check ran and altered the caller's data.

        Returns:
            dict: ``{column name: {value: count}}``, also kept on
            ``self.unique_item``.
        """
        for column in self.dataframe.columns:
            if self.dataframe[column].dtype != 'object':
                continue
            counts = {}
            self.unique_item[column] = counts
            try:
                for value in self.dataframe[column]:
                    # NaN is the only value that compares unequal to itself; the
                    # check has to run on the raw value, before str() conversion.
                    key = "nan value" if value != value else str(value)
                    counts[key] = counts.get(key, 0) + 1
            except TypeError as e:
                # Values whose comparison cannot be reduced to a bool land here;
                # counts gathered so far for this column are kept.
                print(f"The column {column} has an error: {e}")
        return self.unique_item

# Prototype

In [2]:
import pandas as pd

# Absolute location of the source Excel workbook on the author's machine.
data_path = (
    "/home/thanawat-nawan/"
    "NSCLC-OncogenicDriver_and_ResistanceAlterations-ClinicalOutcome/"
    "Data/NSCLC Data Collection revised 14.05.2568.xlsx"
)
# Read the first worksheet of the workbook into a dataframe.
total_patient = pd.read_excel(io=data_path, sheet_name=0)

In [11]:
# Inputs for the cleaning steps: the specimen types to keep (blood and tissue)
# and the columns to extract for the analysis.

blood_specimen_list = [
    "Blood, PAXgene",
    "Blood, Streck",
    "Blood, Roche",
    "Effusion, Pleural",
    "FNA",
]
tissue_specimen_list = [
    "Tissue, FFPE",
    "Tissue, Fresh",
    "Cytological slide",
    "Slide, stained",
]

analysis_columns = [
    "Code", "type", "Age", "SEX", "test", "Diag", "cost",
    "Specimen Type", "Source", "%tumor",
    "Method1", "Result1",
    "Method2", "Result2",
    "Method3", "Result3",
    "Method4", "Result4",
    "Method5", "Result5",
]

# Diagnosis texts are grouped into 4 groups: Non-small cell lung cancer (NSCLC),
# Lung cancer, Metastasis and Squamous cell carcinoma. Each list holds the
# spellings that belong to one group.
non_small_cell_lung_cancer_list = [
    "Non-small cell lung cancer",
    "Non-small cell carcinoma",
    "non-small cell lung cancer",
    "non-small cell carcinoma",
    "Non small cell lung cancer",
    "Recurrent Non-small cell lung cancer",
    "Large cell lung carcinoma",
    "Large cell carcinoma",
]
lung_cancer_list = [
    "Lung carcinoma",
    "Lung adenocarcinoma",
    "Lung cancer",
    "Lung pleomorphic carcinoma",
]
metastasis_list = [
    "Metastatic adenocarcinoma",
    "Metastatic carcinoma",
    "Metastatic cancer",
    "Metastatic brain cancer",
]
squamous_cell_carcinoma = [
    "Squamous cell carcinoma",
    "squamous cell carcinoma",
    "Squamous cell lung cancer",
    "Squamous cell Carcinoma",
]

In [25]:
# Run the cleaning steps in order on the raw patient table. The column
# selection comes after the row filters because the first filter reads the
# 'status' column, which is not in analysis_columns.
data = DataExtraction(total_patient)
data.remove_data_missing()
data.filter_specimen(
    blood_specimens=blood_specimen_list,
    tissue_specimens=tissue_specimen_list,
)
data.remove_unnecessary_data(analysis_columns=analysis_columns)
already = data.filter_diagnosis(
    non_small_cell_lung_cancer=non_small_cell_lung_cancer_list,
    lung_cancer=lung_cancer_list,
    metastasis=metastasis_list,
    squamous_cell_carcinoma=squamous_cell_carcinoma,
)

In [13]:
# NOTE(review): wildcard import — UniqueItem is presumably provided by the
# exact_unique_item module; an explicit import would make that visible.
# Confirm no other names from the module are used elsewhere before narrowing it.
from exact_unique_item import *
# Built from the raw (unfiltered) patient table and the column list ['test'].
unique = UniqueItem(total_patient, ['test'])
# Last expression of the cell, so its return value is displayed; the output
# below appears to map each 'test' value to a count — TODO confirm against
# the UniqueItem implementation.
unique.count_unique_items()

{'EGFR mutation': 3756,
 'EGFR T790M mutation': 843,
 'Lung cancer actionable mutation panel & Lung cancer fusion ': 27,
 'EGFR mutation & BRAF mutation': 256,
 'EGFR mutation & T790M mutation': 45,
 'EGFR mutation ': 185,
 'Lung cancer actionable mutation panel': 89,
 'Comprehensive EGFR mutation panel': 40,
 'Lung cancer actionable mutation panel & Lung cancer fusion': 220,
 'KRAS reflex NRAS/BRAF mutation': 2,
 'EGFR mutation+Lung cancer actionable mutation panel (EGFR mutation)': 1,
 'EGFR mutation & Lung cancer actionable mutation panel & Lung cancer fusion': 1,
 'Lung cancer actionable mutation panel ': 3,
 'KRAS,NRAS and BRAF mutation': 1,
 'EGFR mutation and Lung cancer actionable mutation panel': 2,
 'EGFR T790M mutation ': 15,
 'Solid tumors actionable mutation panel': 1,
 'Lung cancer fusion': 92,
 'EGFR mutation & Lung cancer fusion': 13,
 'Comprehensive EGFR mutation panel & Lung cancer fusion': 19,
 'KRAS mutation': 4,
 'EGFR mutation & BRAF mutation & Lung cancer fusion'