In [1]:
import pandas as pd
import numpy as np
import re

In [24]:
df = pd.read_csv('../data_mbc/Enhanced_MetBreastBiomarkers.csv')

In [26]:
df.BiomarkerName.value_counts()

BiomarkerName
HER2      92871
PR        75687
ER        75330
BRCA      23544
PIK3CA    13917
PDL1       9402
Name: count, dtype: int64

In [27]:
df.groupby('BiomarkerName')['BiomarkerStatus'].value_counts()

BiomarkerName  BiomarkerStatus                              
BRCA           No BRCA mutation                                 19798
               Genetic Variant of Unknown Significance (VUS)     1232
               BRCA2 mutation identified                         1104
               BRCA1 mutation identified                          648
               Unsuccessful/indeterminate test                    497
               Unknown                                             95
               Results pending                                     67
               Genetic Variant Favor Polymorphism                  42
               Both BRCA1 and BRCA2 mutations identified           30
               BRCA mutation NOS                                   23
               Other                                                8
ER             Positive                                         56234
               Negative                                         18405
               Unknown       

In [29]:
df.query('BiomarkerName == "PDL1"').PercentStaining.value_counts()

PercentStaining
0%           2844
< 1%         1009
1%            556
2% - 4%       288
5% - 9%       215
10% - 19%     151
20% - 29%      76
30% - 39%      32
50% - 59%      19
40% - 49%      19
80% - 89%      16
90% - 99%      15
70% - 79%      13
100%           13
60% - 69%      11
Name: count, dtype: int64

In [36]:
horomone_receptor_mapping = {
    'Positive': 2,
    'Negative': 1
}

In [46]:
(
    df
    .query('BiomarkerName == "ER"')
    .assign(numeric_status = lambda x: x['BiomarkerStatus'].map(horomone_receptor_mapping).fillna(0))
    .sort_values(
        by=['PatientID', 'ResultDate', 'numeric_status'], 
        ascending=[True, False, False]) # Second False makes most recent ResultDate first and last False means highest numeric_status is first in ties
    .groupby('PatientID')
    .first()
    .reset_index()
    [['PatientID', 'EcogValue']]
                .rename(columns = {'EcogValue': 'ecog_index'})
                .assign(
                    ecog_index = lambda x: x['ecog_index'].astype(pd.CategoricalDtype(categories = [0, 1, 2, 3, 4, 5], ordered = True))
                    )
)

Unnamed: 0_level_0,BiomarkerName,CellType,SpecimenCollectedDate,SpecimenReceivedDate,ResultDate,BiomarkerStatus,SampleType,ExpressionLevel,TissueCollectionSite,TestType,LabName,Assay,DnaType,IHCClone,StainingIntensity,PercentStaining,CombinedPositiveScore,BiomarkerDetail,numeric_status
PatientID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
F00009312BB8F,ER,,2023-10-11,,2023-10-16,Positive,Tissue,,Metastatic site,,,,,,,100%,,,2.0
F000666B8096E,ER,,2011-12-21,,2011-12-27,Positive,Tissue,,Primary site,,,,,,,,,,2.0
F000731324052,ER,,2016-02-01,,2016-02-15,Indeterminate result,Tissue,,Metastatic site,,,,,,,,,,0.0
F00079DEEB85D,ER,,2016-01-25,,2016-02-08,Positive,Tissue,,Metastatic site,,,,,,,60% - 69.9%,,,2.0
F0007F3FA0A0D,ER,,2017-11-15,,2017-11-22,Negative,Tissue,,Primary site,,,,,,,0%,,,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
FFFF9AEA01E37,ER,,2018-06-22,,2018-06-26,Positive,Tissue,,Lymph node,,,,,,,90% - 99.9%,,,2.0
FFFFA7CC85A8F,ER,,2017-02-14,,2017-02-17,Positive,Tissue,,Metastatic site,,,,,,,,,,2.0
FFFFB5EE3A947,ER,,2022-12-06,,2022-12-12,Positive,Tissue,,Metastatic site,,,,,,,Unknown,,,2.0
FFFFB704C8C53,ER,,2019-11-27,,2019-12-19,Unknown,Tissue,,Lymph node,,,,,,,0%,,,0.0


In [50]:
df.query('PatientID == "FFFFA7CC85A8F"').query('BiomarkerName == "ER"')

Unnamed: 0,PatientID,BiomarkerName,CellType,SpecimenCollectedDate,SpecimenReceivedDate,ResultDate,BiomarkerStatus,SampleType,ExpressionLevel,TissueCollectionSite,TestType,LabName,Assay,DnaType,IHCClone,StainingIntensity,PercentStaining,CombinedPositiveScore,BiomarkerDetail
132742,FFFFA7CC85A8F,ER,,2010-11-17,,2010-12-27,Positive,Tissue,,Metastatic site,,,,,,,,,
132743,FFFFA7CC85A8F,ER,,2012-05-31,,2012-06-04,Positive,Tissue,,Metastatic site,,,,,,,,,
132744,FFFFA7CC85A8F,ER,,2017-02-14,,2017-02-17,Positive,Tissue,,Metastatic site,,,,,,,,,
132745,FFFFA7CC85A8F,ER,,2007-02-02,,2007-02-07,Positive,Tissue,,Primary site,,,,,,,,,
132746,FFFFA7CC85A8F,ER,,2008-01-29,,2008-01-30,Positive,Tissue,,Primary site,,,,,,,,,


In [51]:
(
df
.query('BiomarkerName == "ER"')
.groupby('PatientID')['BiomarkerStatus']
.agg(lambda x: 'positive' if any('Mutation positive' in val for val in x)
    else ('negative' if any('Mutation negative' in val for val in x)
        else 'unknown'))
.reset_index()
.rename(columns={'BiomarkerStatus': 
                 ER_status'})  # Rename for clarity
)

NameError: name 'biomarker' is not defined