# Read output csv and work on it

In [1]:
import pandas as pd
import numpy as np
pd.options.display.max_colwidth = 390

In [2]:
df=pd.read_csv("whole_list.csv")

In [3]:
df.drop('Unnamed: 0', axis=1,inplace=True)

In [4]:
df[['CSTUMSIZ','CSEXTEN','CSLYMPHN',
    'CSMETSDX','CS1SITE','CS2SITE',
    'CS3SITE','CS4SITE','CS5SITE',
    'CS6SITE','CS25SITE', 'CS15SITE',
    'CS7SITE']] = df[['CSTUMSIZ','CSEXTEN','CSLYMPHN',
                      'CSMETSDX','CS1SITE','CS2SITE',
                      'CS3SITE','CS4SITE','CS5SITE',
                      'CS6SITE','CS25SITE', 'CS15SITE',
                      'CS7SITE']].fillna(value=0)

In [5]:
df[['DAJCCT','DAJCCN','DAJCCM','DAJCCSTG']]=df[['DAJCCT','DAJCCN','DAJCCM','DAJCCSTG']].fillna(value=88)


In [6]:
df[['DSS1977S','DSS2000S']]=df[['DSS1977S','DSS2000S']].fillna(value=8) 

In [7]:
df[['DAJCCFL']]=df[['DAJCCFL']].fillna(value=0)

In [8]:
df[['DAJCC7T','DAJCC7N','DAJCC7M','DAJCC7STG']]=df[['DAJCC7T','DAJCC7N','DAJCC7M','DAJCC7STG']].fillna(value=888)

In [9]:
df[['CSMETSDXB_PUB',
    'CSMETSDXBR_PUB',
    'CSMETSDXLIV_PUB',
    'CSMETSDXLUNG_PUB']]=df[['CSMETSDXB_PUB',
                             'CSMETSDXBR_PUB',
                             'CSMETSDXLIV_PUB',
                             'CSMETSDXLUNG_PUB']].fillna(value=8)

In [10]:
df=df[df['CODPUB']==26000]

In [11]:
continuous_vars=['AGE_DX','CSTUMSIZ','CS3SITE', 'MALIGCOUNT','BENBORDCOUNT','srv_time_mon' ]

In [12]:
df1=df[continuous_vars]

# Transform and recode

## Continuous vars transformation

In [13]:
#AGE_DX
'''
000-130	Actual Age in Years
999	Unknown just deleted
'''
bins = [0,10,20,30,40,50,60,70,80,90,130,999]
bin_names = ['<10' ,'<20','<30','<40','<50' ,'<60','<70','<80','<90','<130','Unknown']


In [14]:
df['AGE_DX'] = pd.cut(df['AGE_DX'], bins, right = True, labels=bin_names)

In [15]:
df["AGE_DX"] = df["AGE_DX"].astype('object')

In [16]:
#CSTUMSIZ
'''
000 	Indicates no mass or no tumor found; for example, when a tumor of a stated primary site is not found, but the tumor has metastasized. 
001-988 	Exact size in millimeters 
989 	989 millimeters or larger 
990 	Microscopic focus or foci only; no size of focus is given 
991 	Described as less than 1 cm 
992 	Described as less than 2 cm 
993 	Described as less than 3 cm 
994 	Described as less than 4 cm 
995 	Described as less than 5 cm 
996-998 	Site-specific codes where needed 
999 	Unknown; size not stated; not stated in patient record 
888 	Not applicable 
'''
bins = [0,1,10,20,30,40,50,60,70,80,90,100,200,300,400,500,600,700,800,888,889,989,990,991,992,993,994,995,996,999,1000]
bin_names = ['no mass/tumor but meta','<10' ,'<20','<30','<40','<50' ,'<60','<70','<80','<90','<100','<200','<300','<400','<500',
             '<600','<700','<800','<888','Not applicable','<989', '989 millimeters or larger','Microscopic/no size',
             'less than 1 cm','less than 2 cm', 'less than 3 cm','less than 4 cm','less than 5 cm','Site-specific code',
             'Unknown']


In [17]:
df['CSTUMSIZ'] = pd.cut(df['CSTUMSIZ'], bins, right = False,labels=bin_names)

In [18]:
df["CSTUMSIZ"] = df["CSTUMSIZ"].astype('object')

In [19]:
#CS3SITE
'''
000	All ipsilateral axillary nodes examined negative
001-089	1 - 89 nodes positive 
(Exact number of nodes positive)
090	90 or more nodes positive
095	Positive aspiration of lymph node(s)
097	Positive nodes, number unspecified
098	No axillary nodes examined
099	Unknown if axillary nodes are positive
Not documented in patient record
988	Not applicable:  Information not collected for this case
(If this item is required by your standard setter, use of code 988 will result in an edit error.)
'''

bins = [-1,0,10,20,30,40,50,60,70,80,89,90,95,97,98,99,998]
bin_names = ['All nodes negative','<10' ,'<20','<30','<40','<50' ,'<60','<70','<80','<89', '90 or more positive',
             'Positive aspiration of lymph node(s)',
             "Positive nodes, number unspecified","No axillary nodes examined","Unknown if positive",
             "Not applicable"]


In [20]:
df['CS3SITE'] = pd.cut(df['CS3SITE'], bins, right = True,labels=bin_names)

In [21]:
df["CS3SITE"] = df["CS3SITE"].astype('object')

# ELABORATE ON TIMES

In [22]:
df['srv_time_mon']=df['srv_time_mon'].replace({9999: 9})


In [23]:
#srv_time_mon
'''
0	Complete dates are available and there are 0 days of survival 
1	Complete dates are available and there are more than 0 days of survival 
2	Incomplete dates are available and there could be zero days of follow-up 
3	Incomplete dates are available and there cannot be zero days of follow-up 
9	Unknown 

'''
# Need to make batches 10-12,12-18,18-24,24-36,36-48,48-60,60-72,72-84,84-96,more than 100
bins = [-1,0,1,2,3,9,10, 13, 19, 25, 37, 49,61,73,85,97,109,516]
bin_names = ['dates available / 0 days of survival','dates available / more than 0 days of survival',
             'Incomplete dates/ 0 days of follow-up',"Incomplete dates/ no 0 days of follow-up",
             '4-9',"Unknown",'<12' ,'<18','<24','<36','<48' ,'<60','<72','<84','<96','<108','>100']

In [24]:
df['srv_time_mon'] = pd.cut(df['srv_time_mon'], bins, labels=bin_names)

In [25]:
df["srv_time_mon"] = df["srv_time_mon"].astype('object')

In [26]:
#MALIGCOUNT
'''
Valid values: 00-98; 99 (unknown)
'''
replace={99:"Unknown"}
df.replace({"MALIGCOUNT": replace}, inplace=True)

In [27]:
#BENBORDCOUNT
'''
Valid values: 00-98; 99 (unknown)
'''
replace={99:"Unknown"}
df.replace({"BENBORDCOUNT": replace}, inplace=True)

df['BENBORDCOUNT'] = df['BENBORDCOUNT'].astype(str)

# Other Values transformation

In [28]:
#MAR_STAT
'''
1	Single (never married) 
2	Married (including common law) 
3	Separated 
4	Divorced 
5	Widowed 
6	Unmarried or domestic partner (same sex or opposite sex or unregistered) 
9	Unknown 
'''
df['MAR_STAT'] = df['MAR_STAT'].replace({1: 'Single', 2: 'Married',3:'Separated', 4:'Divorced',
                                     5:'Widowed', 6:'Unmarried',9:'Unknown'})


In [29]:
#RACE1V
'''
1	White 
2	Black 
3	American Indian, Aleutian, Alaskan Native or Eskimo (includes all indigenous populations of the Western hemisphere) 
4	Chinese 
5	Japanese 
6	Filipino 
7	Hawaiian 
8	Korean (Effective with 1/1/1988 dx) 
10	Vietnamese (Effective with 1/1/1988 dx) 
11	Laotian (Effective with 1/1/1988 dx) 
12	Hmong (Effective with 1/1/1988 dx) 
13	Kampuchean (including Khmer and Cambodian) (Effective with 1/1/1988 dx) 
14	Thai (Effective with 1/1/1994 dx) 
15	Asian Indian or Pakistani, NOS (Effective with 1/1/1988 dx) 
16	Asian Indian (Effective with 1/1/2010 dx) 
17	Pakistani (Effective with 1/1/2010 dx) 
20	Micronesian, NOS (Effective with 1/1/1991) 
21	Chamorran (Effective with 1/1/1991 dx) 
22	Guamanian, NOS (Effective with 1/1/1991 dx) 
25	Polynesian, NOS (Effective with 1/1/1991 dx) 
26	Tahitian (Effective with 1/1/1991 dx) 
27	Samoan (Effective with 1/1/1991 dx) 
28	Tongan (Effective with 1/1/1991 dx) 
30	Melanesian, NOS (Effective with 1/1/1991 dx) 
31	Fiji Islander (Effective with 1/1/1991 dx) 
32	New Guinean (Effective with 1/1/1991 dx) 
96	Other Asian, including Asian, NOS and Oriental, NOS (Effective with 1/1/1991 dx) 
97	Pacific Islander, NOS (Effective with 1/1/1991 dx) 
98	Other 
99	Unknown
''' 
df['RACE1V'] = df['RACE1V'].replace({1: 'White', 2: 'Black',3:'Indigenous', 4:'Chinese',5:'Japanese', 6:'Filipino',
                                 7:'Hawaiian',8: 'Korean',10: 'Vietnamese',11:'Laotian', 12:'Hmong',13:'Kampuchean',
                                 14:'Thai',15:'Asian Indian or Pakistani',16: 'Asian Indian', 17: 'Pakistani',
                                 20:'Micronesian', 21:'Chamorran',22:'Guamanian',25:'Polynesian',26:'Tahitian',
                                 27: 'Samoan',28: 'Tongan',30:'Melanesian',31:'Fiji_Islander',32:'New_Guinean',
                                 96:'Other Asian',97:'Pacific_Islander',98: 'Other', 99: 'Unknown'})


In [30]:
#SEX
'''
1	Male
2	Female
'''

df['SEX'] = df['SEX'].replace({1: 'Male', 2: 'Female'})

In [31]:
#PRIMSITE
'''
     C50.0    Nipple
     C50.1    Central portion of breast
     C50.2    Upper-inner quadrant of breast
     C50.3    Lower-inner quadrant of breast
     C50.4    Upper-outer quadrant of breast
     C50.5    Lower-outer quadrant of breast
     C50.6    Axillary tail of breast
     C50.8    Overlapping lesion of breast
     C50.9    Breast, NOS
'''

df['PRIMSITE'] = df['PRIMSITE'].replace({'C500': 'Nipple', 'C501': 'Central portion','C502': 'Upper-inner quadrant',
                                     'C503': 'Lower-inner quadrant','C504': 'Upper-outer quadrant',
                                     'C505': 'Lower-outer quadrant','C506': 'Axillary tail',
                                     'C508': 'Overlapping lesion', 'C509': 'Breast, NOS'})

In [32]:
#LATERAL
'''
0 Not a paired site
1 Right: origin of primary
2 Left: origin of primary
3 Only one side involved, right or left origin unspecified
4 Bilateral involvement, lateral origin unknown; stated to be single primary
    • Both ovaries involved simultaneously, single histology
    • Bilateral retinoblastomas
    • Bilateral Wilms’s tumors
5 Paired site: midline tumor
9 Paired site, but no information concerning laterality; midline tumor
'''
df['LATERAL'] = df['LATERAL'].replace({0: 'Not_a_paired_site', 1: 'Right',2:'Left', 3:'Only_one_side',4:'Bilateral involvement',
                                   5:'Paired site: midline_tumor',9:'Paired site_no_info'})

In [33]:
df['HISTO2V'].value_counts()

8500    162204
8520     19852
8010     13745
8140     11548
8522     11168
8000      8219
8530      5714
8501      2670
8480      2107
8510      1967
8141      1231
8541      1213
8503       532
8211       450
8401       416
8521       382
8201       358
8050       311
9020       280
8490       243
8230       211
8481       201
8543       170
8540       165
8260       162
8070       160
9120       158
8020       150
8021       134
8504       128
         ...  
8314         3
8440         3
8854         3
8850         3
9220         3
8251         3
8940         2
8470         2
9473         2
8190         2
8811         2
8852         2
8471         2
8350         2
9130         2
8804         2
8963         1
8320         1
8895         1
9170         1
9181         1
8261         1
9150         1
9100         1
8933         1
8045         1
8430         1
8400         1
8900         1
8143         1
Name: HISTO2V, Length: 98, dtype: int64

In [34]:
#HISTO2V
'''
8000	Neoplasms
8050	Squamous cell neoplasms
8090	Basal cell neoplasms
8120	Transitional cell papillomas and carcinomas
8140	Adenomas and adenocarcinomas
8390	Adnexal and skin appendage neoplasms
8430	Mucoepidermoid neoplasms
8440	Cystic, mucinous and serous neoplasms
8500	Ductal and lobular neoplasms
8550	Acinar cell neoplasms
8560	Complex epithelial neoplasms
8580	Thymic epithelial neoplasms
8590	Specialized gonadal neoplasms
8680	Paragangliomas and glomus tumors
8720	Nevi and melanomas
8800	Soft tissue tumors and sarcomas, NOS
8810	Fibromatous neoplasms
8840	Myxomatous neoplasms
8850	Lipomatous neoplasms
8890	Myomatous neoplasms
8930	Complex mixed and stromal neoplasms
9000	Fibroepithelial neoplasms
9040	Synovial-like neoplasms
9050	Mesothelial neoplasms
9060	Germ cell neoplasms
9100	Trophoblastic neoplasms
9110	Mesonephromas
9120	Blood vessel tumors
9170	Lymphatic vessel tumors
9180	Osseous and chondromatous neoplasms
9250	Giant cell tumors
9260	Miscellaneous bone tumors
9270	Odontogenic tumors
9350	Miscellaneous tumors
9380	Gliomas
9490	Neuroepitheliomatous neoplasms
9530	Meningiomas
9540	Nerve sheath tumors
9580	Granular cell tumors and alveolar soft part sarcomas
9590	Hodgkin and nonHodgkin lymphomas
9730	Plasma cell tumors
9740	Mast cell tumors
9750	Neoplasms of histiocytes and accessory lymphoid cells
9760	Immunoproliferative diseases
9800	Leukemias
9950	Chronic myeloproliferative disorders
9970	Other hematologic disorders
9980	Myelodysplastic syndromes
'''
bins=[8000,8050,8090,8120,8140,8390,8430,8440,8500,8550,8560,8580,8590,
      8680,8720,8800,8810,8840,8850,8890,8930,9000,9040,9050,9060,9100,
      9110,9120,9170,9180,9250,9260,9270,9350,9380,9490,9530,9540,9580,
      9590,9730,9740,9750,9760,9800,9950,9970,9980,10000]
bin_names=['Neoplasms','Squamous cell neoplasms','Basal cell neoplasms',
      'Transitional cell papillomas and carcinomas',
      'Adenomas and adenocarcinomas',
      'Adnexal and skin appendage neoplasms','Mucoepidermoid neoplasms',
      'Cystic, mucinous and serous neoplasms',
      'Ductal and lobular neoplasms','Acinar cell neoplasms',
      'Complex epithelial neoplasms','Thymic epithelial neoplasms',
      'Specialized gonadal neoplasms',
      'Paragangliomas and glomus tumors','Nevi and melanomas',
      'Soft tissue tumors and sarcomas, NOS','Fibromatous neoplasms',
      'Myxomatous neoplasms','Lipomatous neoplasms','Myomatous neoplasms',
      'Complex mixed and stromal neoplasms','Fibroepithelial neoplasms',
      'Synovial-like neoplasms','Mesothelial neoplasms',
      'Germ cell neoplasms','Trophoblastic neoplasms','Mesonephromas',
      'Blood vessel tumors','Lymphatic vessel tumors',
      'Osseous and chondromatous neoplasms','Giant cell tumors',
      'Miscellaneous bone tumors','Odontogenic tumors',
      'Miscellaneous tumors','Gliomas','Neuroepitheliomatous neoplasms',
      'Meningiomas','Nerve sheath tumors',
      'Granular cell tumors and alveolar soft part sarcomas',
      'Hodgkin and nonHodgkin lymphomas','Plasma cell tumors',
      'Mast cell tumors',
      'Neoplasms of histiocytes and accessory lymphoid cells',
      'Immunoproliferative diseases','Leukemias',
      'Chronic myeloproliferative disorders',
      'Other hematologic disorders','Myelodysplastic syndromes']





In [35]:
df['HISTO2V'] = pd.cut(df['HISTO2V'], bins, right = False, labels=bin_names)

In [36]:
df["HISTO2V"] = df["HISTO2V"].astype('object')

In [37]:
df["HISTO2V"].value_counts()

Ductal and lobular neoplasms                   206250
Neoplasms                                       22692
Adenomas and adenocarcinomas                    14218
Cystic, mucinous and serous neoplasms            2558
Squamous cell neoplasms                           521
Adnexal and skin appendage neoplasms              417
Fibroepithelial neoplasms                         280
Complex epithelial neoplasms                      237
Complex mixed and stromal neoplasms               163
Blood vessel tumors                               161
Soft tissue tumors and sarcomas, NOS               88
Fibromatous neoplasms                              23
Osseous and chondromatous neoplasms                17
Myomatous neoplasms                                14
Lipomatous neoplasms                                8
Acinar cell neoplasms                               6
Transitional cell papillomas and carcinomas         4
Gliomas                                             2
Trophoblastic neoplasms     

In [38]:
#BEHO2V
'''
0	Benign (Reportable for intracranial and CNS sites only) 
1	Uncertain whether benign or malignant, borderline malignancy, low malignant potential, and uncertain malignant potential (Reportable for intracranial and CNS sites only) 
2	Carcinoma in situ; intraepithelial; noninfiltrating; noninvasive 
3	Malignant, primary site (invasive) 
'''
df['BEHO2V'] = df['BEHO2V'].replace({0: 'Benign', 1: 'Uncertain',2:'Carcinoma in situ', 3:'Malignant, primary site'})


In [39]:
#HISTO3V
'''
8000	Neoplasms
8050	Squamous cell neoplasms
8090	Basal cell neoplasms
8120	Transitional cell papillomas and carcinomas
8140	Adenomas and adenocarcinomas
8390	Adnexal and skin appendage neoplasms
8430	Mucoepidermoid neoplasms
8440	Cystic, mucinous and serous neoplasms
8500	Ductal and lobular neoplasms
8550	Acinar cell neoplasms
8560	Complex epithelial neoplasms
8580	Thymic epithelial neoplasms
8590	Specialized gonadal neoplasms
8680	Paragangliomas and glomus tumors
8720	Nevi and melanomas
8800	Soft tissue tumors and sarcomas, NOS
8810	Fibromatous neoplasms
8840	Myxomatous neoplasms
8850	Lipomatous neoplasms
8890	Myomatous neoplasms
8930	Complex mixed and stromal neoplasms
9000	Fibroepithelial neoplasms
9040	Synovial-like neoplasms
9050	Mesothelial neoplasms
9060	Germ cell neoplasms
9100	Trophoblastic neoplasms
9110	Mesonephromas
9120	Blood vessel tumors
9170	Lymphatic vessel tumors
9180	Osseous and chondromatous neoplasms
9250	Giant cell tumors
9260	Miscellaneous bone tumors
9270	Odontogenic tumors
9350	Miscellaneous tumors
9380	Gliomas
9490	Neuroepitheliomatous neoplasms
9530	Meningiomas
9540	Nerve sheath tumors
9580	Granular cell tumors and alveolar soft part sarcomas
9590	Hodgkin and nonHodgkin lymphomas
9730	Plasma cell tumors
9740	Mast cell tumors
9750	Neoplasms of histiocytes and accessory lymphoid cells
9760	Immunoproliferative diseases
9800	Leukemias
9950	Chronic myeloproliferative disorders
9970	Other hematologic disorders
9980	Myelodysplastic syndromes
'''
bins=[8000,8050,8090,8120,8140,8390,8430,8440,8500,8550,8560,8580,8590,
      8680,8720,8800,8810,8840,8850,8890,8930,9000,9040,9050,9060,9100,
      9110,9120,9170,9180,9250,9260,9270,9350,9380,9490,9530,9540,9580,
      9590,9730,9740,9750,9760,9800,9950,9970,9980,10000]
bin_names=['Neoplasms','Squamous cell neoplasms','Basal cell neoplasms',
      'Transitional cell papillomas and carcinomas',
      'Adenomas and adenocarcinomas',
      'Adnexal and skin appendage neoplasms','Mucoepidermoid neoplasms',
      'Cystic, mucinous and serous neoplasms',
      'Ductal and lobular neoplasms','Acinar cell neoplasms',
      'Complex epithelial neoplasms','Thymic epithelial neoplasms',
      'Specialized gonadal neoplasms',
      'Paragangliomas and glomus tumors','Nevi and melanomas',
      'Soft tissue tumors and sarcomas, NOS','Fibromatous neoplasms',
      'Myxomatous neoplasms','Lipomatous neoplasms','Myomatous neoplasms',
      'Complex mixed and stromal neoplasms','Fibroepithelial neoplasms',
      'Synovial-like neoplasms','Mesothelial neoplasms',
      'Germ cell neoplasms','Trophoblastic neoplasms','Mesonephromas',
      'Blood vessel tumors','Lymphatic vessel tumors',
      'Osseous and chondromatous neoplasms','Giant cell tumors',
      'Miscellaneous bone tumors','Odontogenic tumors',
      'Miscellaneous tumors','Gliomas','Neuroepitheliomatous neoplasms',
      'Meningiomas','Nerve sheath tumors',
      'Granular cell tumors and alveolar soft part sarcomas',
      'Hodgkin and nonHodgkin lymphomas','Plasma cell tumors',
      'Mast cell tumors',
      'Neoplasms of histiocytes and accessory lymphoid cells',
      'Immunoproliferative diseases','Leukemias',
      'Chronic myeloproliferative disorders',
      'Other hematologic disorders','Myelodysplastic syndromes']


In [40]:
df['HISTO3V'] = pd.cut(df['HISTO3V'], bins, right = False, labels=bin_names)

In [41]:
df["HISTO3V"] = df["HISTO3V"].astype('object')

In [42]:
#BEHO3V
'''
0	Benign (Reportable for intracranial and CNS sites only) 
1	Uncertain whether benign or malignant, borderline malignancy, low malignant potential, and uncertain malignant potential (Reportable for intracranial and CNS sites only) 
2	Carcinoma in situ; intraepithelial; noninfiltrating; noninvasive 
3	Malignant, primary site (invasive) 
'''
df['BEHO3V'] = df['BEHO3V'].replace({0: 'Benign', 1: 'Uncertain',2:'Carcinoma in situ', 3:'Malignant, primary site'})

In [43]:
#GRADE
'''
1	Grade I; grade i; grade 1; well differentiated; differentiated, NOS 
2	Grade II; grade ii; grade 2; moderately differentiated; moderately differentiated; intermediate differentiation 
3	Grade III; grade iii; grade 3; poorly differentiated; differentiated 
4	Grade IV; grade iv; grade 4; undifferentiated; anaplastic 
5	T-cell; T-precursor 
6	B-cell; Pre-B; B-Precursor 
7	Null cell; Non T-non B; 
8	N K cell (natural killer cell) 
9	cell type not determined, not stated or not applicable 
'''

df['GRADE'] = df['GRADE'].replace({1: 'well differentiated', 2: 'moderately differentiated',3:'poorly differentiated',
                               4:'undifferentiated, primary site',5: 'T-cell', 6: 'B-cell',7:'Null cell',
                               8:'N K cell',9: 'cell type not determined'})

In [44]:
#CS1SITE
'''
000	OBSOLETE DATA CONVERTED V0203
See code 998

Test not done (test not ordered and not performed)
010	Positive/elevated
020	Negative/normal; within normal limits
030	Borderline; undetermined whether positive or negative
080	OBSOLETE DATA CONVERTED V0203
See code 997

Ordered, but results not in chart
988	Not applicable:  Information not collected for this case
(If this item is required by your standard setter, use of code 988 will result in an edit error.)
996	Test ordered, results not interpretable
997	Test ordered, results not in chart
998	Test not done (test not ordered and not performed)
999	Unknown or no information
Not documented in patient record
'''
df['CS1SITE']= df['CS1SITE'].replace({0:'Test not done',10:'Positive/elevated/amplified',20:'Negative/normal/not amplified',
                                  30:"Borderline/equivocal/ndeterminate",80:"Ordered, not in chart", 988:"Not applicable",
                                  996:"Test ordered, not interpretable",997:"Test ordered, not in chart",998:"Test(s) not done",
                                  999:"Unknown or no information"})

In [45]:
#CS2SITE
'''
000	OBSOLETE DATA CONVERTED V0203
See code 998

Test not done (test was not ordered and was not performed)
010	Positive/elevated
020	Negative/normal; within normal limits
030	Borderline; undetermined whether positive or negative
080	OBSOLETE DATA CONVERTED V0203
See code 997

Ordered, but results not in chart
988	Not applicable:  Information not collected for this case
(If this item is required by your standard setter, use of code 988 will result in an edit error.)
996	Test ordered, results not interpretable
997	Test ordered, results not in chart
998	Test not done (test not ordered and not performed)
999	Unknown or no information
Not documented in patient record
'''
df['CS2SITE']= df['CS2SITE'].replace({0:'Test not done',10:'Positive/elevated/amplified',20:'Negative/normal/not amplified',
                                  30:"Borderline/equivocal/ndeterminate",80:"Ordered, not in chart", 988:"Not applicable",
                                  996:"Test ordered, not interpretable",997:"Test ordered, not in chart",998:"Test(s) not done",
                                  999:"Unknown or no information"})



In [46]:
df["CS3SITE"].value_counts()

All nodes negative                      182195
No axillary nodes examined               29402
<10                                      22094
Unknown if positive                       5144
<20                                       4348
Positive aspiration of lymph node(s)      2383
<30                                       1002
Positive nodes, number unspecified         827
<40                                        201
<50                                         37
<60                                         16
90 or more positive                          5
<80                                          4
<70                                          2
<89                                          2
Name: CS3SITE, dtype: int64

In [47]:
#CS4SITE
'''
000	Regional lymph nodes negative on routine hematoxylin and eosin (H and E), no immunohistochemistry (IHC)  
OR unknown if tested for isolated tumor cells (ITCs) by IHC studies 
Nodes clinically negative, not examined pathologically
001	Regional lymph nodes negative on routine H and E, IHC studies done, negative for tumor
002	Regional lymph nodes negative on routine H and E, IHC studies done, positive for ITCs 
(Tumor cell clusters not greater than 0.2 millimeter (mm))
009	Regional lymph nodes negative on routine H and E, positive for tumor detected by IHC, size of tumor cell clusters or metastases not stated

Stated as N0(i+) with no further information on regional lymph nodes
888	OBSOLETE DATA CONVERTED V0200 
See code 987

Not applicable: CS Lymph Nodes not coded 000
987	Not applicable: CS Lymph Nodes not coded 000
988	Not applicable:  Information not collected for this case
(If this item is required by your standard setter, use of code 988 will result in an edit error.)
'''
df['CS4SITE']= df['CS4SITE'].replace({0:'nodes negative/no immunohistochemistry',1:'nodes negative/negative for tumor',
                                  2:'nodes negative/positive for ITCs',
                                  9:"nodes negative/positive for tumor",888:"Not applicable/not coded", 988:"Not applicable/not coded",
                                  987:"Not applicable/not coded"})


In [48]:
#CS5SITE
'''
000	Regional lymph nodes negative on routine hematoxylin and eosin (H and E), no RT-PCR molecular (MOL) studies done 
OR unknown if RT-PCR studies done
Nodes clinically negative, not examined pathologically
001	Regional lymph nodes negative on routine H and E, RT-PCR MOL studies done, negative for tumor
002	Regional lymph nodes negative on routine H and E, RT-PCR MOL studies done, positive for tumor
888	OBSOLETE DATA CONVERTED V0200  
See code 987

Not applicable
CS Lymph Nodes not coded 000
987	Not applicable:  CS Lymph Nodes not coded 000
988	Not applicable:  Information not collected for this case
(If this item is required by your standard setter, use of code 988 will result in an edit error.)
'''

df['CS5SITE']= df['CS5SITE'].replace({0:'nodes negative/no RT-PCR',1:'nodes negative/negative for tumor',
                                  2:'nodes negative/positive for tumor',888:"Not applicable/not coded", 
                                  988:"Not applicable/not coded",987:"Not applicable/not coded"})

In [49]:
#CS6SITE
'''
000	Entire tumor reported as invasive 
(No in situ component reported)
010	Entire tumor reported as in situ 
(No invasive component reported)
020	Invasive and in situ components present, size of invasive component stated and coded in CS Tumor Size
030	Invasive and in situ components present, size of entire tumor coded in CS Tumor Size because size of invasive component not stated
AND in situ described as minimal (less than 25%)
040	Invasive and in situ components present, size of entire tumor coded in CS Tumor Size because size of invasive component not stated
AND in situ described as extensive (25% or more)
050	Invasive and in situ components present, size of entire tumor coded in CS Tumor Size because size of invasive component not stated
AND proportions of in situ and invasive not known
060	Invasive and in situ components present, unknown size of tumor (CS Tumor Size coded 999)
888	OBSOLETE DATA CONVERTED V0200 
See code 987

Unknown if invasive and in situ components present, unknown if tumor size represents mixed tumor or a "pure" tumor. (See Note 2.)
Clinical tumor size coded.
987	Unknown if invasive and in situ components present, unknown if tumor size represents mixed tumor or a "pure" tumor. (See Note 2.)
Clinical tumor size coded.
988	Not applicable:  Information not collected for this case
(If this item is required by your standard setter, use of code 988 will result in an edit error.)
'''

df['CS6SITE']= df['CS6SITE'].replace({0:'tumor invasive',10:'tumor in situ',20:'Invasive and in situ, coded in CS Tumor Size',
                                  30:"Invasive and in situ (less than 25%)",40:"Invasive and in situ (25% or more)",
                                  50:"Invasive and in situ (not known)",60:"Invasive and in situ (unknown size of tumor)",
                                  888:"Unknown if invasive and in situ",987:"Unknown if invasive and in situ",
                                  988:"Not applicable"})

In [50]:
#CS25SITE
'''
988	Not applicable:  Site-specific factor not defined
'''
'''
Plus lots of NA - WIll be deleted
'''
df.drop('CS25SITE', axis=1,inplace=True)

In [51]:
#DAJCCT
'''
99 	TX 
00 	T0 
01 	Ta 
05 	Tis 
06 	Tispu (Urethra only) 
07 	Tispd (Urethra only) 
10 	T1 
11 	T1mic 
19 	T1 NOS 
12 	T1a 
13 	T1a1 
14 	T1a2 
15 	T1b 
16 	T1b1 
17 	T1b2 
18 	T1c 
20 	T2 
29 	T2 NOS 
21 	T2a 
22 	T2b 
23 	T2c 
30 	T3 
39 	T3 NOS 
31 	T3a 
32 	T3b 
33 	T3c 
40 	T4 
49 	T4 NOS 
41 	T4a 
42 	T4b 
43 	T4c 
44 	T4d 
80 	T1a NOS 
81 	T1b NOS 
88 	Not applicable 
'''
df['DAJCCT'] = df['DAJCCT'].replace({99: 'TX', 0: 'T0',1:'Ta',5:'Tis',6: 'Tispu', 7: 'Tispd',10:'T1',
                                 11: 'T1mic', 19: 'T1 NOS',12:'T1a',13:'T1a1',14: 'T1a2', 15: 'T1b',16:'T1b1',
                                 17: 'T1b2', 18: 'T1c',20:'T2',29:'T2 NOS',21: 'T2a', 22: 'T2b',23:'T2c',
                                 30: 'T3', 39: 'T3 NOS',31:'T3a',32:'T3b',33: 'T3c', 40: 'T4',49:'T4 NOS',
                                 41: 'T4a', 42: 'T4b',43:'T4c',44:'T4d',80: 'T1a NOS', 81: 'T1b NOS',88:'Not applicable'})


In [52]:
#DAJCCN
'''
99 	NX 
00 	N0 
01 	N0(i-) 
02 	N0(i+) 
03 	N0(mol-) 
04 	N0(mol+) 
10 	N1 
19 	N1 NOS 
11 	N1a 
12 	N1b 
13 	N1c 
18 	N1mi 
20 	N2 
29 	N2 NOS 
21 	N2a 
22 	N2b 
23 	N2c 
30 	N3 
39 	N3 NOS 
31 	N3a 
32 	N3b 
33 	N3c 
88 	NoN applicable 
'''

df['DAJCCN'] = df['DAJCCN'].replace({99: 'NX', 0: 'N0',1:'N0(i-)',2:'N0(i+)',3: 'N0(mol-)', 4: 'N0(mol+)',10:'N1',
                                 19: 'N1 NOS',11: 'N1a', 12:'N1b',13:'N1c',18: 'N1mi', 20: 'N2',29:'N2 NOS',
                                 21: 'N2a', 22: 'N2b',23:'N2c',30:'N3',39:'N3 NOS', 31: 'N3a',32:'N3b',
                                 33: 'N3c', 88:'Not applicable'})

In [53]:
#DAJCCM
'''
99	MX 
0	M0 
10	M1 
11	M1a 
12	M1b 
13	M1c 
19	M1 NOS 
88	Not applicable 
'''

df['DAJCCM'] = df['DAJCCM'].replace({99: 'MX', 0: 'M0',10:'M1',11:'M1a',12: 'M1b', 13: 'M1c',19:'M1 NOS',
                                 88:'Not applicable'})

In [54]:
#DAJCCSTG
'''
33 	Stage IIB 
34 	Stage IIC 
35 	Stage IIEA (lymphoma only) 
36 	Stage IIEB (lymphoma only) 
37 	Stage IIE (lymphoma only) 
38 	Stage IISA (lymphoma only) 
39 	Stage IISB (lymphoma only) 
40 	Stage IIS (lymphoma only) 
41 	Stage IIESA (lymphoma only) 
42 	Stage IIESB (lymphoma only) 
43 	Stage IIES (lymphoma only) 
50 	Stage III 
51 	Stage III NOS 
52 	Stage IIIA 
53 	Stage IIIB 
54 	Stage IIIC 
55 	Stage IIIEA (lymphoma only) 
56 	Stage IIIEB (lymphoma only) 
57 	Stage IIIE (lymphoma only) 
58 	Stage IIISA (lymphoma only) 
59 	Stage IIISB (lymphoma only) 
60 	Stage IIIS (lymphoma only) 
61 	Stage IIIESA (lymphoma only) 
62 	Stage IIIESB (lymphoma only) 
63 	Stage IIIES (lymphoma only) 
70 	Stage IV 
71 	Stage IV NOS 
72 	Stage IVA 
73 	Stage IVB 
74 	Stage IVC 
88 	Not applicable 
90 	Stage Occult 
99 	Stage Unknown 
'''





replace={0:'Stage 0',1:'Stage 0a',2:'Stage 0is',10:'Stage I',11:'Stage I NOS',
         12:'Stage IA',13:'Stage IA1',14:'Stage IA2',15:'Stage IB',16:'Stage IB1',
         17:'Stage IB2',18:'Stage IC',19:'Stage IS',23:'Stage ISA',24:'Stage ISB',
         20:'Stage IEA',21:'Stage IEB',22:'Stage IE',30:'Stage II',31:'Stage II NOS',
         32:'Stage IIA', 33:'Stage IIB',34:'Stage IIC',35:'Stage IIEA',36 :'Stage IIEB',
         37:'Stage IIE',38:'Stage IISA',39 :'Stage IISB',40 :'Stage IIS',
         41:'Stage IIESA',42:'Stage IIESB',43: 'Stage IIES',50 :'Stage III',
         51:'Stage III NOS',52:'Stage IIIA ',53:'Stage IIIB',54 :'Stage IIIC',
         55:'Stage IIIEA',56 :'Stage IIIEB',57 :'Stage IIIE',58:'Stage IIISA',
         59:'Stage IIISB',60:'Stage IIIS',61:'Stage IIIESA',62:'Stage IIIESB',
         63:'Stage IIIES',70:'Stage IV',71:'Stage IV NOS',72:'Stage IVA',73:'Stage IVB',
         74:'Stage IVC',88:'Not applicable',90:'Stage Occult',99:'Stage Unknown'}
df.replace({"DAJCCSTG": replace}, inplace=True)

In [55]:
#DSS1977S
'''
0	In Situ 
1	Localized 
2	Regional, direct extension 
3	Regional, lymph nodes only 
4	Regional, extension and nodes 
5	Regional, NOS 
7	Distant 
8	Not applicable 
9	Unknown/Unstaged 
'''



df['DSS1977S']= df['DSS1977S'].replace({0:'In Situ',1:'Localized',2:'Regional, direct extension',
                                    3 :'Regional, lymph nodes only',4:'Regional, extension and nodes',
                                    5:'Regional, NOS',7 :'Distant',8 :'Not applicable',9:'Unknown/Unstaged'})

In [56]:
#DSS2000S
'''
0	In Situ 
1	Localized 
2	Regional, direct extension 
3	Regional, lymph nodes only 
4	Regional, extension and nodes 
5	Regional, NOS 
7	Distant 
8	Not applicable 
9	Unknown/Unstaged 
'''

df['DSS2000S']= df['DSS2000S'].replace({0:'In Situ',1:'Localized',2:'Regional, direct extension',
                                    3 :'Regional, lymph nodes only',4:'Regional, extension and nodes',
                                    5:'Regional, NOS',7 :'Distant',8 :'Not applicable',9:'Unknown/Unstaged'})

In [57]:

#DAJCCFL             
'''
1	AJCC Sixth Edition derived from Collaborative Staging Manual and Coding Instructions, Version 1.0 
2	AJCC Sixth Edition derived from EOD (prior to 2004) 
Blank 	Not derived 
'''
df['DAJCCFL']= df['DAJCCFL'].replace({0:'Not derived',1:'Derived from Collaborative Staging Manual and Coding Instructions',
                                  2:'derived from EOD'})

In [58]:
df['DAJCCFL'].value_counts()

Not derived                                                          168249
Derived from Collaborative Staging Manual and Coding Instructions     79413
Name: DAJCCFL, dtype: int64

In [59]:
#NO_SURG
'''
0 	Surgery performed 
1	Surgery not recommended 
2	Contraindicated due to other conditions; Autopsy Only case (1973-2002) 
5 	Patient died before recommended surgery 
6 	Unknown reason for no surgery 
7	Patient or patient's guardian refused 
8 	Recommended, unknown if done 
9 	Unknown if surgery performed; Death Certificate Only case; Autopsy only case (2003+) 
'''

df['NO_SURG']= df['NO_SURG'].replace({0:'Surgery performed',1:'Surgery not recommended',2:'Contraindicated/Autopsy',
                                  5 :'Died before surgery',6:'Unknown reason for no surgery',
                                  7:'Refused',8 :'Recommended, unknown if done',9 :'Unknown if performed/ Death Certificate/Autopsy'})

In [60]:
#cs0204schema
'''
001 	AdnexaUterineOther 
002 	AdrenalGland 
003 	AmpullaVater 
004 	Anus 
005 	Appendix 
006 	BileDuctsDistal 
007 	BileDuctsIntraHepat 
008 	BileDuctsPerihilar 
009 	BiliaryOther 
010 	Bladder 
011 	Bone 
012 	Brain 
013 	Breast 
014 	BuccalMucosa 
015 	CarcinoidAppendix 
016 	Cervix 
017 	CNSOther 
018 	Colon 
019 	Conjunctiva 
020 	CorpusAdenosarcoma 
021 	CorpusCarcinoma 
022 	CorpusSarcoma 
023 	CysticDuct 
024 	DigestiveOther 
025 	EndocrineOther 
026 	EpiglottisAnterior 
027 	Esophagus 
028 	EsophagusGEJunction 
029 	EyeOther 
030 	FallopianTube 
031 	FloorMouth 
032 	Gallbladder 
033 	GenitalFemaleOther 
034 	GenitalMaleOther 
035 	GISTAppendix 
036 	GISTColon 
037 	GISTEsophagus 
038 	GISTPeritoneum 
039 	GISTRectum 
040 	GISTSmallIntestine 
041 	GISTStomach 
042 	GumLower 
043 	GumOther 
044 	GumUpper 
045 	HeartMediastinum 
046 	HemeRetic 
047 	Hypopharynx 
048 	IllDefinedOther 
049 	IntracranialGland 
050 	KaposiSarcoma 
051 	KidneyParenchyma 
052 	KidneyRenalPelvis 
053 	LacrimalGland 
054 	LacrimalSac 
055 	LarynxGlottic 
056 	LarynxOther 
057 	LarynxSubglottic 
058 	LarynxSupraglottic 
059 	LipLower 
060 	LipOther 
061 	LipUpper 
062 	Liver 
063 	Lung 
064 	Lymphoma 
065 	LymphomaOcularAdnexa 
066 	MelanomaBuccalMucosa 
067 	MelanomaChoroid 
068 	MelanomaCiliaryBody 
069 	MelanomaConjunctiva 
070 	MelanomaEpiglottisAnterior 
071 	MelanomaEyeOther 
072 	MelanomaFloorMouth 
073 	MelanomaGumLower 
074 	MelanomaGumOther 
075 	MelanomaGumUpper 
076 	MelanomaHypopharynx 
077 	MelanomaIris 
078 	MelanomaLarynxGlottic 
079 	MelanomaLarynxOther 
080 	MelanomaLarynxSubglottic 
081 	MelanomaLarynxSupraglottic 
082 	MelanomaLipLower 
083 	MelanomaLipOther 
084 	MelanomaLipUpper 
085 	MelanomaMouthOther 
086 	MelanomaNasalCavity 
087 	MelanomaNasopharynx 
088 	MelanomaOropharynx 
089 	MelanomaPalateHard 
090 	MelanomaPalateSoft 
091 	MelanomaPharynxOther 
092 	MelanomaSinusEthmoid 
093 	MelanomaSinusMaxillary 
094 	MelanomaSinusOther 
095 	MelanomaSkin 
096 	MelanomaTongueAnterior 
097 	MelanomaTongueBase 
098 	MerkelCellPenis 
099 	MerkelCellScrotum 
100 	MerkelCellSkin 
101 	MerkelCellVulva 
102 	MiddleEar 
103 	MouthOther 
104 	MycosisFungoides 
105 	MyelomaPlasmaCellDisorder 
106 	NasalCavity 
107 	Nasopharynx 
108 	NETAmpulla 
109 	NETColon 
110 	NETRectum 
111 	NETSmallIntestine 
112 	NETStomach 
113 	Orbit 
114 	Oropharynx 
115 	Ovary 
116 	PalateHard 
117 	PalateSoft 
118 	PancreasBodyTail 
119 	PancreasHead 
120 	PancreasOther 
121 	ParotidGland 
122 	Penis 
123 	Peritoneum 
124 	PeritoneumFemaleGen 
125 	PharyngealTonsil 
126 	PharynxOther 
127 	Placenta 
128 	Pleura 
129 	Prostate 
130 	Rectum 
131 	RespiratoryOther 
132 	Retinoblastoma 
133 	Retroperitoneum 
134 	SalivaryGlandOther 
135 	Scrotum 
136 	SinusEthmoid 
137 	SinusMaxillary 
138 	SinusOther 
139 	Skin 
140 	SkinEyelid 
141 	SmallIntestine 
142 	SoftTissue 
143 	Stomach 
144 	SubmandibularGland 
145 	Testis 
146 	Thyroid 
147 	TongueAnterior 
148 	TongueBase 
149 	Trachea 
150 	Urethra 
151 	UrinaryOther 
152 	Vagina 
153 	Vulva 
'''
'''
013    1631572
Name: cs0204schema, dtype: int64 - WILL BE DELETED
'''

df=df.drop(['cs0204schema'], axis=1)

In [61]:
#HST_STGA
'''
0	In situ — A noninvasive neoplasm; a tumor which has not penetrated the basement membrane nor extended beyond the epithelial tissue. Some synonyms are intraepithelial (confined to epithelial tissue), noninvasive and noninfiltrating. 
1	Localized — An invasive neoplasm confined entirely to the organ of origin. It may include intraluminal extension where specified. For example for colon, intraluminal extension limited to immediately contiguous segments of the large bowel is localized, if no lymph nodes are involved. Localized may exclude invasion of the serosa because of the poor survival of the patient once the serosa is invaded. 
2	Regional — A neoplasm that has extended 1) beyond the limits of the organ of origin directly into surrounding organs or tissues; 2) into regional lymph nodes by way of the lymphatic system; or 3) by a combination of extension and regional lymph nodes. 
4	Distant — A neoplasm that has spread to parts of the body remote from the primary tumor either by direct extension or by discontinuous metastasis (e.g., implantation or seeding) to distant organs, issues, or via the lymphatic system to distant lymph nodes. 
8	Localized/Regional – Only used for Prostate cases. 
9	Unstaged — Information is not sufficient to assign a stage. 
'''

df['HST_STGA']= df['HST_STGA'].replace({0:'In situ - noninvasive neoplasm',1:'Localized - invasive neoplasm',
                             2:'Regional — neoplasm that has extended',4 :'Distant — neoplasm that has spread',
                             8:'Localized/Regional – Only used for Prostate cases',9:'Unstaged'})

In [62]:
#FIRSTPRM
'''
0	no
1	yes
'''
df['FIRSTPRM']= df['FIRSTPRM'].replace({0:'no',1:'yes'})

In [63]:
df['FIRSTPRM'].value_counts()

yes    206484
no      41178
Name: FIRSTPRM, dtype: int64

In [64]:
#CODPUB              
'''
20010	Lip, Oral/Pharynx, Malignant
20020	Tongue, Oral/Pharynx, Malignant
20030	Salivary Gland, Oral/Pharynx, Malignant
20040	Floor of Mouth, Oral/Pharynx, Malignant
20050	Gum and Other Mouth, Oral/Pharynx, Malignant
20060	Nasopharynx, Oral/Pharynx, Malignant
20070	Tonsil, Oral/Pharynx, Malignant
20080	Oropharynx, Oral/Pharynx, Malignant
20090	Hypopharynx, Oral/Pharynx, Malignant
20100	Other Oral Cavity and Pharynx, Oral/Pharynx, Malignant
21010	Esophagus, Digestive, Malignant
21020	Stomach, Digestive, Malignant
21030	Small Intestine, Digestive, Malignant
21040	Colon excluding Rectum, Digestive, Malignant
21050	Rectum and Rectosigmoid Junction, Digestive, Malignant
21060	Anus, Anal Canal and Anorectum, Digestive, Malignant
21071	Liver, Digestive, Malignant
21072	Intrahepatic Bile Duct, Digestive, Malignant
21080	Gallbladder, Digestive, Malignant
21090	Other Biliary, Digestive, Malignant
21100	Pancreas, Digestive, Malignant
21110	Retroperitoneum, Digestive, Malignant
21120	Peritoneum, Omentum and Mesentery, Digestive, Malignant
21130	Other Digestive Organs, Digestive, Malignant
22010	Nose, Nasal Cavity and Middle Ear, Respiratory, Malignant
22020	Larynx, Respiratory, Malignant
22030	Lung and Bronchus, Respiratory, Malignant
22050	Pleura, Respiratory, Malignant
22060	Trachea, Mediastinum and Other Respiratory Organs, Respiratory, Malignant
23000	Bones and Joints,  Malignant
24000	Soft Tissue including Heart , Malignant
25010	Skin & Melanoma , Skin, Malignant
25020	Non-Melanoma , Skin, Malignant
26000	Breast, Malignant
27010	Cervix Uteri, Female genitalia, Malignant
27020	Corpus Uteri, Female genitalia, Malignant
27030	Uterus, NOS, Female genitalia, Malignant
27040	Ovary, Female genitalia, Malignant
27050	Vagina, Female genitalia, Malignant
27060	Vulva, Female genitalia, Malignant
27070	Other Female Genital Organs, Female genitalia, Malignant
28010	Prostate, Male genitalia, Malignant
28020	Testis, Male genitalia, Malignant
28030	Penis, Male genitalia, Malignant
28040	Other Male Genital Organs, Male genitalia, Malignant
29010	Urinary Bladder, Urinary, Malignant
29020	Kidney and Renal Pelvis, Urinary, Malignant
29030	Ureter, Urinary, Malignant
29040	Other Urinary Organs, Urinary, Malignant
30000	Eye and Orbit, Malignant
31010	Brain and Other Nervous System, Malignant
32010	Thyroid, Endocrine, Malignant
32020	Other Endocrine including thymus, Endocrine, Malignant
33010	Hodgkin Lymphoma, Lymphoma, Malignant
33040	Non-Hodgkin Lymphoma, Lymphoma, Malignant
34000	Myeloma, Malignant
35011	Acute Lymphocytic Leukemia, Leukemia, Malignant
35012	Chronic Lymphocytic Leukemia, Leukemia, Malignant
35013	Other Lymphocytic Leukemia, Leukemia, Malignant
35021	Acute myeloid, Leukemia, Malignant
35031	Acute Monocytic Leukemia, Leukemia, Malignant
35022	Chronic Myeloid Leukemia, Leukemia, Malignant
35023	Other Myeloid/Monocytic Leukemia, Leukemia, Malignant
35041	Other Acute Leukemia, Leukemia, Malignant
35043	Aleukemic, subleukemic and NOS, Leukemia, Malignant
36010	Mesothelioma (ICD-10 only) +, Malignant
36020	Kaposi Sarcoma (ICD-10 only) +, Malignant
37000	Miscellaneous Malignant Cancer, Malignant
38000	In situ, benign or unknown behavior neoplasm, Non-Cancer
50000	Tuberculosis, Non-Cancer
50010	Syphilis, Non-Cancer
50040	Human Immunodeficiency Virus (HIV) (1987+), Non-Cancer
50030	Septicemia, Non-Cancer
50040 	Other Infectious and Parasitic Diseases, Non-Cancer
50050	Diabetes Mellitus, Non-Cancer
50051	Alzheimers (ICD-9 and 10 only), Non-Cancer
50060	Diseases of Heart, Non-Cancer
50070	Hypertension without Heart Disease, Non-Cancer
50080	Cerebrovascular Diseases, Non-Cancer
50090	Atherosclerosis, Non-Cancer
50100	Aortic Aneurysm and Dissection, Non-Cancer
50110	Other Diseases of Arteries, Arterioles, Capillaries, Non-Cancer
50120	Pneumonia and Influenza, Non-Cancer
50130	Chronic Obstructive Pulmonary Disease and Allied Cond, Non-Cancer
50140	Stomach and Duodenal Ulcers, Non-Cancer
50150	Chronic Liver Disease and Cirrhosis, Non-Cancer
50160	Nephritis, Nephrotic Syndrome and Nephrosis, Non-Cancer
50170	Complications of Pregnancy, Childbirth, Puerperium, Non-Cancer
50180	Congenital Anomalies, Non-Cancer
50190	Certain Conditions Originating in Perinatal Period, Non-Cancer
50200	Symptoms, Signs and Ill-Defined Conditions, Non-Cancer
50210	Accidents and Adverse Effects, Non-Cancer
50220	Suicide and Self-Inflicted Injury, Non-Cancer
50230	Homicide and Legal Intervention, Non-Cancer
50300	Other Cause of Death, Non-Cancer
0	Alive
41000 	Not Available/no COD
99999 	Unknown
'''
list1=[20010,20020,20030,20040,20050,20060,20070,20080,20090,20100,21010,21020,21030,21040,21050,21060,
       21071,21072,21080,21090,21100,21110,21120,21130,22010,22020,22030,22050,22060,23000,24000,25010,
       25020,26000,27010,27020,27030,27040,27050,27060,27070,28010,28020,28030,28040,29010,29020,29030,
       29040,30000,31010,32010,32020,33010,33040,34000,35011,35012,35013,35021,35031,35022,35023,35041,
       35043,36010,36020,37000,38000,50000,50010,50040,50030,50040,50050,50051,50060,50070,50080,50090,
       50100,50110,50120,50130,50140,50150,50160,50170,50180,50190,50200,50210,50220,50230,50300,0,41000,
       99999]
list2=['Lip, Oral/Pharynx, Malignant','Tongue, Oral/Pharynx, Malignant','Salivary Gland, Oral/Pharynx, Malignant',
       'Floor of Mouth, Oral/Pharynx, Malignant','Gum and Other Mouth, Oral/Pharynx, Malignant',
       'Nasopharynx, Oral/Pharynx, Malignant','Tonsil, Oral/Pharynx, Malignant','Oropharynx, Oral/Pharynx, Malignant',
       'Hypopharynx, Oral/Pharynx, Malignant','Other Oral Cavity and Pharynx, Oral/Pharynx, Malignant',
       'Esophagus, Digestive, Malignant','Stomach, Digestive, Malignant','Small Intestine, Digestive, Malignant',
       'Colon excluding Rectum, Digestive, Malignant','Rectum and Rectosigmoid Junction, Digestive, Malignant',
       'Anus, Anal Canal and Anorectum, Digestive, Malignant','Liver, Digestive, Malignant',
       'Intrahepatic Bile Duct, Digestive, Malignant','Gallbladder, Digestive, Malignant',
       'Other Biliary, Digestive, Malignant','Pancreas, Digestive, Malignant','Retroperitoneum, Digestive, Malignant',
       'Peritoneum, Omentum and Mesentery, Digestive, Malignant','Other Digestive Organs, Digestive, Malignant',
       'Nose, Nasal Cavity and Middle Ear, Respiratory, Malignant','Larynx, Respiratory, Malignant',
       'Lung and Bronchus, Respiratory, Malignant','Pleura, Respiratory, Malignant',
       'Trachea, Mediastinum and Other Respiratory Organs, Respiratory, Malignant','Bones and Joints,  Malignant',
       'Soft Tissue including Heart , Malignant','Skin & Melanoma , Skin, Malignant','Non-Melanoma , Skin, Malignant',
       'Breast, Malignant','Cervix Uteri, Female genitalia, Malignant','Corpus Uteri, Female genitalia, Malignant',
       'Uterus, NOS, Female genitalia, Malignant','Ovary, Female genitalia, Malignant','Vagina, Female genitalia, Malignant',
       'Vulva, Female genitalia, Malignant','Other Female Genital Organs, Female genitalia, Malignant',
       'Prostate, Male genitalia, Malignant','Testis, Male genitalia, Malignant','Penis, Male genitalia, Malignant',
       'Other Male Genital Organs, Male genitalia, Malignant','Urinary Bladder, Urinary, Malignant',
       'Kidney and Renal Pelvis, Urinary, Malignant','Ureter, Urinary, Malignant','Other Urinary Organs, Urinary, Malignant',
       'Eye and Orbit, Malignant','Brain and Other Nervous System, Malignant','Thyroid, Endocrine, Malignant',
       'Other Endocrine including thymus, Endocrine, Malignant','Hodgkin Lymphoma, Lymphoma, Malignant',
       'Non-Hodgkin Lymphoma, Lymphoma, Malignant','Myeloma, Malignant','Acute Lymphocytic Leukemia, Leukemia, Malignant',
       'Chronic Lymphocytic Leukemia, Leukemia, Malignant','Other Lymphocytic Leukemia, Leukemia, Malignant',
       'Acute myeloid, Leukemia, Malignant','Acute Monocytic Leukemia, Leukemia, Malignant',
       'Chronic Myeloid Leukemia, Leukemia, Malignant','Other Myeloid/Monocytic Leukemia, Leukemia, Malignant',
       'Other Acute Leukemia, Leukemia, Malignant','Aleukemic, subleukemic and NOS, Leukemia, Malignant',
       'Mesothelioma (ICD-10 only) +, Malignant','Kaposi Sarcoma (ICD-10 only) +, Malignant',
       'Miscellaneous Malignant Cancer, Malignant','In situ, benign or unknown behavior neoplasm, Non-Cancer',
       'Tuberculosis, Non-Cancer','Syphilis, Non-Cancer','Human Immunodeficiency Virus (HIV) (1987+), Non-Cancer',
       'Septicemia, Non-Cancer','Other Infectious and Parasitic Diseases, Non-Cancer','Diabetes Mellitus, Non-Cancer',
       'Alzheimers (ICD-9 and 10 only), Non-Cancer',
       'Diseases of Heart, Non-Cancer','Hypertension without Heart Disease, Non-Cancer',
       'Cerebrovascular Diseases, Non-Cancer','Atherosclerosis, Non-Cancer','Aortic Aneurysm and Dissection, Non-Cancer',
       'Other Diseases of Arteries, Arterioles, Capillaries, Non-Cancer','Pneumonia and Influenza, Non-Cancer',
       'Chronic Obstructive Pulmonary Disease and Allied Cond, Non-Cancer','Stomach and Duodenal Ulcers, Non-Cancer',
       'Chronic Liver Disease and Cirrhosis, Non-Cancer','Nephritis, Nephrotic Syndrome and Nephrosis, Non-Cancer',
       'Complications of Pregnancy, Childbirth, Puerperium, Non-Cancer','Congenital Anomalies, Non-Cancer',
       'Certain Conditions Originating in Perinatal Period, Non-Cancer',
       'Symptoms, Signs and Ill-Defined Conditions, Non-Cancer','Accidents and Adverse Effects, Non-Cancer',
       'Suicide and Self-Inflicted Injury, Non-Cancer','Homicide and Legal Intervention, Non-Cancer',
       'Other Cause of Death, Non-Cancer','Alive','Not Available/no COD','Unknown']


In [65]:
recode=dict(zip(list1, list2))

In [66]:
df['CODPUB']= df['CODPUB'].replace(recode)

In [67]:
# Replace Non cancer cases with "Non-Cancer"
df['CODPUB'] = df['CODPUB'].replace({'In situ, benign or unknown behavior neoplasm, Non-Cancer':'Non-Cancer',
                                     'Tuberculosis, Non-Cancer':'Non-Cancer','Syphilis, Non-Cancer':'Non-Cancer',
                                     'Human Immunodeficiency Virus (HIV) (1987+), Non-Cancer':'Non-Cancer',
                                     'Septicemia, Non-Cancer':'Non-Cancer',
                                     'Other Infectious and Parasitic Diseases, Non-Cancer':'Non-Cancer',
                                     'Diabetes Mellitus, Non-Cancer':'Non-Cancer',
                                     'Alzheimers (ICD-9 and 10 only), Non-Cancer':'Non-Cancer',
                                     'Diseases of Heart, Non-Cancer':'Non-Cancer',
                                     'Hypertension without Heart Disease, Non-Cancer':'Non-Cancer',
                                     'Cerebrovascular Diseases, Non-Cancer':'Non-Cancer',
                                     'Atherosclerosis, Non-Cancer':'Non-Cancer',
                                     'Aortic Aneurysm and Dissection, Non-Cancer':'Non-Cancer',
                                     'Other Diseases of Arteries, Arterioles, Capillaries, Non-Cancer':'Non-Cancer',
                                     'Pneumonia and Influenza, Non-Cancer':'Non-Cancer',
                                     'Chronic Obstructive Pulmonary Disease and Allied Cond, Non-Cancer':'Non-Cancer',
                                     'Stomach and Duodenal Ulcers, Non-Cancer':'Non-Cancer',
                                     'Chronic Liver Disease and Cirrhosis, Non-Cancer':'Non-Cancer',
                                     'Nephritis, Nephrotic Syndrome and Nephrosis, Non-Cancer':'Non-Cancer',
                                     'Complications of Pregnancy, Childbirth, Puerperium, Non-Cancer':'Non-Cancer',
                                     'Congenital Anomalies, Non-Cancer':'Non-Cancer',
                                     'Certain Conditions Originating in Perinatal Period, Non-Cancer':'Non-Cancer',
                                     'Symptoms, Signs and Ill-Defined Conditions, Non-Cancer':'Non-Cancer',
                                     'Accidents and Adverse Effects, Non-Cancer':'Non-Cancer',
                                     'Suicide and Self-Inflicted Injury, Non-Cancer':'Non-Cancer',
                                     'Homicide and Legal Intervention, Non-Cancer':'Non-Cancer',
                                     'Other Cause of Death, Non-Cancer':'Non-Cancer'})

In [68]:
#STAT_REC
'''
1	Alive
4	Dead
'''
df['STAT_REC']= df['STAT_REC'].replace({1:'Alive',4:'Dead'})

# DEPENDANT VARIABLE

In [69]:
#VSRTSADX
'''
0	Alive or dead of other cause 
1	Dead 
9	N/A not first tumor 
'''

df['VSRTSADX']= df['VSRTSADX'].replace({9:0,8:0})

In [96]:
df['VSRTSADX']=df['VSRTSADX'].astype(str)

In [70]:
df['VSRTSADX'].value_counts()

1    206470
0     41192
Name: VSRTSADX, dtype: int64

In [71]:
#ODTHCLASS
'''
0	Alive or dead due to cancer 
1	Dead 
9	N/A not first tumor 
8 - Dead (missing/unknown COD)
'''
df['ODTHCLASS']= df['ODTHCLASS'].replace({0:'Alive or dead due to cancer',1:'Dead',9:'N/A not first tumor',
                                         8:"Dead"})

In [72]:
df['ODTHCLASS'].value_counts()

Alive or dead due to cancer    206470
N/A not first tumor             41154
Dead                               38
Name: ODTHCLASS, dtype: int64

In [73]:
#CS15SITE
'''
010	Positive/elevated; amplified
020	Negative/normal; within normal limits; not amplified
030	Borderline; equivocal; indeterminate; undetermined whether positive or negative
988	Not applicable: Information not collected for this case
(If this information is required by your standard setter, use of code 988 may result in an edit error.)
997	Test(s) ordered, results not in chart
998	Test(s) not done (test(s) not ordered and not performed)
999	Unknown or no information
Not documented in patient record
'''
df['CS15SITE']= df['CS15SITE'].replace({0:'Not documented',10:'Positive/elevated/amplified',20:'Negative/normal, not amplified',
                                    30:"Borderline/equivocal/ndeterminate",988:"Not applicable",997:"Test(s) ordered",
                                    998:"Test(s) not done",999:"Unknown or no information"})


In [74]:
df['CS15SITE'].value_counts()

Not documented                       220947
Negative/normal, not amplified        16506
Positive/elevated/amplified            3823
Unknown or no information              3191
Test(s) not done                       1922
Borderline/equivocal/ndeterminate       589
Not applicable                          404
Test(s) ordered                         280
Name: CS15SITE, dtype: int64

In [75]:
#DAJCC7T
'''
999 	TX 
000 	T0 
010 	Ta 
050 	Tis 
060 	Tispu (Urethra only) 
070 	Tispd (Urethra only) 
100 	T1 
110 	T1mic 
199 	T1 NOS 
120 	T1a 
130 	T1a1 
140 	T1a2 
150 	T1b 
160 	T1b1 
170 	T1b2 
180 	T1c 
181 	T1d 
200 	T2 
299 	T2 NOS 
210 	T2a 
211 	T2a1 
212 	T2a2 
213 	T2a NOS 
220 	T2b 
230 	T2c 
240 	T2d 
300 	T3 
399 	T3 NOS 
310 	T3a 
320 	T3b 
330 	T3c 
400 	T4 
499 	T4 NOS 
410 	T4a 
420 	T4b 
430 	T4c 
440 	T4d 
450 	T4e 
800 	T1a NOS 
810 	T1b NOS 
888 	Not applicable 

'''
replace={0:"Not applicable", 999: 'TX', 0: 'T0',10:'Ta',50:'Tis',60: 'Tispu', 70: 'Tispd',100:'T1',
                                 110: 'T1mic', 199: 'T1 NOS',120:'T1a',130:'T1a1',140: 'T1a2', 150: 'T1b',160:'T1b1',
                                 170: 'T1b2', 180: 'T1c',181:"T1d", 200:'T2',299:'T2 NOS',210: 'T2a',211:"T2a1",
                                 212:"T2a2",213:"T2a NOS",220: 'T2b',230:'T2c',240:"T2d", 300: 'T3', 399: 'T3 NOS',
                                 310:'T3a',320:'T3b',330: 'T3c', 400: 'T4',499:'T4 NOS',410: 'T4a', 420: 'T4b',
                                 430:'T4c',440:'T4d',450:"T4e", 800: 'T1a NOS', 810: 'T1b NOS',88:'Not applicable',888:'Not applicable'}
df.replace({"DAJCC7T": replace}, inplace=True)

In [76]:
#DAJCC7N
'''
999 	NX 
000 	N0 
010 	N0(i-) 
020 	N0(i+) 
030 	N0(mol-) 
040 	N0(mol+) 
100 	N1 
199 	N1 NOS 
110 	N1a 
120 	N1b 
130 	N1c 
180 	N1mi 
200 	N2 
299 	N2 NOS 
210 	N2a 
220 	N2b 
230 	N2c 
300 	N3 
399 	N3 NOS 
310 	N3a 
320 	N3b 
330 	N3c 
400 	N4 
888 	Not applicable 

'''

df['DAJCC7N'] = df['DAJCC7N'].replace({999: 'NX', 0: 'N0',10:'N0(i-)',20:'N0(i+)',30: 'N0(mol-)', 40: 'N0(mol+)',100:'N1',
                                 199: 'N1 NOS',110: 'N1a', 120:'N1b',130:'N1c',180: 'N1mi', 200: 'N2',299:'N2 NOS',
                                 210: 'N2a', 220: 'N2b',230:'N2c',300:'N3',399:'N3 NOS', 310: 'N3a',320:'N3b',
                                 330: 'N3c',400:"N4", 888:'Not applicable'})

In [77]:
#DAJCC7M
'''
999 	MX 
000 	M0 
010 	M0(i+) 
100 	M1 
110 	M1a 
120 	M1b 
130 	M1c 
140 	M1d 
150 	M1e 
199 	M1 NOS 
888 	Not applicable 
'''
df['DAJCC7M'] = df['DAJCC7M'].replace({999: 'MX', 0: 'M0',10:"M0(i+)", 100:'M1',110:'M1a',120: 'M1b', 130: 'M1c',
                                   140:"M1d",150:"M1e", 199:'M1 NOS',888:'Not applicable'})


In [78]:
#DAJCC7STG
'''
0	Stage 0 
010 	Stage 0a 
020 	Stage 0is 
100 	Stage I 
110 	Stage I NOS 
120 	Stage IA 
130 	Stage IA1 
140 	Stage IA2 
121 	Stage IA NOS 
150 	Stage IB 
160 	Stage IB1 
170 	Stage IB2 
151 	Stage IB NOS 
180 	Stage IC 
190 	Stage IS 
230 	Stage ISA (lymphoma only) 
240 	Stage ISB (lymphoma only) 
200 	Stage IEA (lymphoma only) 
210 	Stage IEB (lymphoma only) 
220 	Stage IE (lymphoma only) 
300 	Stage II 
310 	Stage II NOS 
320 	Stage IIA 
321 	Stage IIA NOS 
322 	Stage IIA1 
323 	Stage IIA NOS 
330 	Stage IIB 
340 	Stage IIC 
350 	Stage IIEA (lymphoma only) 
360 	Stage IIEB (lymphoma only) 
370 	Stage IIE (lymphoma only) 
380 	Stage IISA (lymphoma only) 
390 	Stage IISB (lymphoma only) 
400 	Stage IIS (lymphoma only) 
410 	Stage IIESA (lymphoma only) 
420 	Stage IIESB (lymphoma only) 
430 	Stage IIES (lymphoma only) 
500 	Stage III 
510 	Stage III NOS 
520 	Stage IIIA 
530 	Stage IIIB 
540 	Stage IIIC 
541 	Stage IIIC1 
542 	Stage IIIC2 
550 	Stage IIIEA (lymphoma only) 
560 	Stage IIIEB (lymphoma only) 
570 	Stage IIIE (lymphoma only) 
580 	Stage IIISA (lymphoma only) 
590 	Stage IIISB (lymphoma only) 
600 	Stage IIIS (lymphoma only) 
610 	Stage IIIESA (lymphoma only) 
620 	Stage IIIESB (lymphoma only) 
630 	Stage IIIES (lymphoma only) 
700 	Stage IV 
710 	Stage IV NOS 
720 	Stage IVA 
730 	Stage IVB 
740 	Stage IVC 
888 	Not applicable 
900 	Stage Occult 
999 	Stage Unknown 
'''

replace={0:'Stage 0',10:'Stage 0a',20:'Stage 0is',100:'Stage I',110:'Stage I NOS',
         120:'Stage IA',130:'Stage IA1',140:'Stage IA2',121:'Stage IA NOS', 150:'Stage IB',160:'Stage IB1',
         170:'Stage IB2',151 :'Stage IB NOS', 180:'Stage IC',190:'Stage IS',230:'Stage ISA',240:'Stage ISB',
         200:'Stage IEA',210:'Stage IEB',220:'Stage IE',300:'Stage II',310:'Stage II NOS',
         320:'Stage IIA',321:'Stage IIA NOS',322:'Stage IIA1',323:'Stage IIA NOS', 330:'Stage IIB',
         340:'Stage IIC',350:'Stage IIEA',360 :'Stage IIEB',
         370:'Stage IIE',380:'Stage IISA',390:'Stage IISB',400 :'Stage IIS',
         410:'Stage IIESA',420:'Stage IIESB',430: 'Stage IIES',500 :'Stage III',
         510:'Stage III NOS',520:'Stage IIIA ',530:'Stage IIIB',540 :'Stage IIIC',
         541 :'Stage IIIC1',542 :'Stage IIIC2',550:'Stage IIIEA',560 :'Stage IIIEB',570 :'Stage IIIE',580:'Stage IIISA',
         590:'Stage IIISB',600:'Stage IIIS',610:'Stage IIIESA',620:'Stage IIIESB',
         630:'Stage IIIES',700:'Stage IV',710:'Stage IV NOS',720:'Stage IVA',730:'Stage IVB',
         740:'Stage IVC',888:'Not applicable',900:'Stage Occult',999:'Stage Unknown'}
df.replace({"DAJCC7STG": replace}, inplace=True)

In [79]:
#CS7SITE
'''
030	Score of 3
040	Score of 4
050	Score of 5
060	Score of 6
070	Score of 7
080	Score of 8
090	Score of 9
110	Low Grade, Bloom-Richardson (BR) grade 1, score not given
120	Medium (Intermediate) Grade, BR grade 2, score not given
130	High Grade, BR grade 3, score not given
988	Not applicable: Information not collected for this case
(If this information is required by your standard setter, use of code 988 may result in an edit error.)
998	No histologic examination of primary site
999	Neither BR grade nor BR score given
Unknown or no information
Not documented in patient record
'''

df['CS7SITE'] = df['CS7SITE'].replace({0: 'Unknown',30:"Score of 3",40:'Score of 4',50:'Score of 5',60:'Score of 6',
                                   70:'Score of 7',80:'Score of 8',90:'Score of 9',110:'Low Grade',
                                   120:'Medium Grade',130:'High Grade',988:'Not applicable',
                                   998:'No histologic examination',999:'Unknown'})

In [80]:
#brst_sub
'''
1	Her2+/HR+ 
2	Her2+/HR- 
3	Her2-/HR+ 
4	Triple Negative 
5	Unknown 
9	Not 2010+ Breast 

'''
df['brst_sub'] = df['brst_sub'].replace({1:'Her2+/HR+',2:'Her2+/HR-',3:'Her2-/HR+',4:'Triple Negative',
                                     5:'Unknown',9:'Not 2010+ Breast'})

In [81]:
#CSMETSDXB_PUB
'''
0	None; no bone metastases 
1	Yes 
8	Not applicable 
9	Unknown; not documented in patient record 

'''
df['CSMETSDXB_PUB'] = df['CSMETSDXB_PUB'].replace({0:'None',1:'Yes',8:'Not applicable',9:'Unknown'})

In [82]:
#CSMETSDXBR_PUB
'''
0	None; no bone metastases 
1	Yes 
8	Not applicable 
9	Unknown; not documented in patient record 

'''
df['CSMETSDXBR_PUB'] = df['CSMETSDXBR_PUB'].replace({0:'None',1:'Yes',8:'Not applicable',9:'Unknown'})

In [83]:
#CSMETSDXLIV_PUB
'''
0	None; no bone metastases 
1	Yes 
8	Not applicable 
9	Unknown; not documented in patient record 

'''
df['CSMETSDXLIV_PUB'] = df['CSMETSDXLIV_PUB'].replace({0:'None',1:'Yes',8:'Not applicable',9:'Unknown'})

In [84]:
#CSMETSDXLUNG_PUB
'''
0	None; no bone metastases 
1	Yes 
8	Not applicable 
9	Unknown; not documented in patient record
'''
df['CSMETSDXLUNG_PUB'] = df['CSMETSDXLUNG_PUB'].replace({0:'None',1:'Yes',8:'Not applicable',9:'Unknown'})

In [85]:
#CSEXTEN             
'''
0	In situ
50	Paget disease of nipple  no tumor
70	Paget Disease disease of nipple  no carcinoma
100	Confined to breast tissue and fat including nipple and/or areola
110	T1mi
120	 T1a
130	T1b 
140	T1c
170	T1 [NOS] 
180	 T2 
190	T3 
200	Invasion of subcutaneous tissue
300	Attachment or fixation to pectoral muscle(s) or underlying tissue
380	T4 
390	T4a 
400	Invasion chest wall
410	T4a
510	Extensive skin involvement
512	Extensive skin involvement
514	Extensive skin involvement
516	Extensive skin involvement breast more then 33%
518	Extensive skin involvement breast  33%-50%
519	Extensive skin involvement
520	Extensive skin involvement breast  more 50%
575	Extensive skin involvement
580	Extensive skin involvement breast  % not stated
585	Extensive skin involvement
590	T4b
605	T4b
610	Invasion chest wall +Extensive skin involvement
612	Extensive skin involvement
613	Extensive skin involvement
615	Extensive skin involvement
620	Extensive skin involvement
680	T4c
710	Diagnosis of inflammatory carcinoma
715	Diagnosis of inflammatory carcinoma
720	Diagnosis of inflammatory carcinoma
725	Diagnosis of inflammatory carcinoma
730	Diagnosis of inflammatory carcinoma
750	Diagnosis of inflammatory carcinoma
780	T4d
790	T4 -NOS
950	No evidence of primary tumor
999	Unknown
'''
df['CSEXTEN']= df['CSEXTEN'].replace({0 : 'In situ Lobular neoplasia',50 : 'Paget disease of nipple  no tumor',
                                      70 : 'Paget Disease disease of nipple  no carcinoma',
                                      100 : 'Confined to breast tissue and fat including nipple and/or areola',
                                      110 : 'T1mi',120 : ' T1a',130 : 'T1b ',140 : 'T1c',170 : 'T1 [NOS]',
                                      180 : ' T2 ',190 : 'T3 ',200 : 'Invasion of subcutaneous tissue',
                                      300 : 'Attachment or fixation to pectoral muscle(s) or underlying tissue',
                                      380 : 'T4 ',390 : 'T4a ',400 : 'Invasion chest wall',410 : 'T4a',
                                      510 : 'Extensive skin involvement',512 : 'Extensive skin involvement',
                                      514 : 'Extensive skin involvement',
                                      516 : 'Extensive skin involvement breast more then 33%',
                                      518 : 'Extensive skin involvement breast  33%-50%',519 : 'Extensive skin involvement',
                                      520 : 'Extensive skin involvement breast  more 50%',
                                      575 : 'Extensive skin involvement',
                                      580 : 'Extensive skin involvement breast  % not stated',
                                      585 : 'Extensive skin involvement',590 : 'T4b',
                                      610 : "Diagnosis of inflammatory carcinoma",605 : 'T4b',
                                      610 : 'Invasion chest wall +Extensive skin involvement',
                                      612 : 'Extensive skin involvement',613 : 'Extensive skin involvement',
                                      615 : 'Extensive skin involvement',620 : 'Extensive skin involvement',
                                      680 : 'T4c',710 : 'Diagnosis of inflammatory carcinoma',
                                      715 : 'Diagnosis of inflammatory carcinoma',
                                      720 : 'Diagnosis of inflammatory carcinoma',
                                      725 : 'Diagnosis of inflammatory carcinoma',
                                      730 : 'Diagnosis of inflammatory carcinoma',
                                      750 : 'Diagnosis of inflammatory carcinoma',
                                      780 : 'T4d',790 : 'T4 -NOS',950 : 'No evidence of primary tumor',999 : 'Unknown',}) 



In [86]:
#CSLYMPHN            
'''
0 : 'No regional lymph node involvement ',
50 : 'Evaluated pathologically:',
130 : 'Evaluated pathologically:',
150 : 'Evaluated pathologically:',
155 : 'Evaluated pathologically:',
250 : 'Evaluated pathologically:',
255 : 'Evaluated clinically:',
257 : 'Evaluated clinically:',
258 : 'Evaluated pathologically:',
260 : 'N1 [NOS] ',
280 : 'N2 NOS ',
290 : 'N2 NOS - clinical',
300 : 'N2 NOS pathological',
500 : 'Fixed/matted ipsilateral axillary nodes NOS',
510 : 'Evaluated clinically:',
520 : 'Evaluated pathologically:',
600 : 'Axillary/regional lymph node(s)',
610 : 'Evaluated clinically:',
620 : 'Evaluated pathologically:',
630 : 'Stated as N2 [NOS] with no other information on regional lymph nodes',
710 : 'Evaluated pathologically:',
720 : 'Evaluated pathologically:',
730 : 'Evaluated pathologically:',
735 : 'Evaluated clinically:',
740 : 'Internal mammary node(s)',
745 : 'Internal mammary node(s)',
748 : ' N2b',
750 : 'Infraclavicular lymph node(s)',
755 : 'N3a',
760 : 'Internal mammary node(s)',
763 : 'Internal mammary node(s)',
764 : 'Internal mammary node(s)',
765 : 'Internal mammary node(s)',
768 : 'N3b',
770 : 'UNKNOWN if positive',
780 : 'UNKNOWN if positive Infraclavicular lymph nodes',
790 : 'N3',
800 : 'Supraclavicular node(s), ipsilateral',
805 : 'N3c ',
810 : 'Evaluated clinically:',
815 : 'Evaluated pathologically:',
820 : 'Stated as N3, NOS with no other information on regional lymph nodes',
999 : 'Unknown; regional lymph nodes not stated',

'''

df['CSLYMPHN']= df['CSLYMPHN'].replace ({0 : 'No regional lymph node involvement ',50 : 'Evaluated pathologically:',
                                         130 : 'Evaluated pathologically:',150 : 'Evaluated pathologically:',
                                         155 : 'Evaluated pathologically:',250 : 'Evaluated pathologically:',
                                         255 : 'Evaluated clinically:',257 : 'Evaluated clinically:',
                                         258 : 'Evaluated pathologically:',260 : 'N1 [NOS] ',280 : 'N2 NOS ',
                                         290 : 'N2 NOS - clinical',300 : 'N2 NOS pathological',
                                         500 : 'Fixed/matted ipsilateral axillary nodes NOS',
                                         510 : 'Evaluated clinically:',520 : 'Evaluated pathologically:',
                                         600 : 'Axillary/regional lymph node(s)',610 : 'Evaluated clinically:',
                                         620 : 'Evaluated pathologically:',
                                         630 : 'Stated as N2 [NOS] with no other information on regional lymph nodes',
                                         710 : 'Evaluated pathologically:',720 : 'Evaluated pathologically:',
                                         730 : 'Evaluated pathologically:',735 : 'Evaluated clinically:',
                                         740 : 'Internal mammary node(s)',745 : 'Internal mammary node(s)',748 : 'N2b',
                                         750 : 'Infraclavicular lymph node(s)',755 : 'N3a',760 : 'Internal mammary node(s)',
                                         763 : 'Internal mammary node(s)',764 : 'Internal mammary node(s)',
                                         765 : 'Internal mammary node(s)',768 : 'N3b',770 : 'UNKNOWN if positive',
                                         780 : 'UNKNOWN if positive Infraclavicular lymph nodes',790 : 'N3',
                                         800 : 'Supraclavicular node(s), ipsilateral',805 : 'N3c ',
                                         810 : 'Evaluated clinically:',815 : 'Evaluated pathologically:',
                                         820 : 'Stated as N3, NOS with no other information on regional lymph nodes',
                                         999 : 'Unknown; regional lymph nodes not stated'})

In [87]:
#CSMETSDX
'''
0 : 'No distant metastasis',
5 : 'No clinical or radiographic evidence of distant metastasis',
7 : 'M0(i+) ',
10 : 'Distant lymph node(s):',
40 : 'Distant metastasis except distant lymph node(s)',
42 : 'Further contiguous extension:',
44 : 'Metastasis:',
50 : 'Metastasis + Distant lymph node(s):',
60 : 'Distant metastasis, NOS',
99 : 'Unknown',
'''
df['CSMETSDX']=df['CSMETSDX'].replace({0 : 'No distant metastasis',
                                       5 : 'No clinical or radiographic evidence of distant metastasis',
                                       7 : 'M0(i+) ',10 : 'Distant lymph node(s):',
                                       40 : 'Distant metastasis except distant lymph node(s)',
                                       42 : 'Further contiguous extension:',44 : 'Metastasis:',
                                       50 : 'Metastasis + Distant lymph node(s):',60 : 'Distant metastasis, NOS',
                                       99 : 'Unknown'})

In [98]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 247662 entries, 11 to 1631571
Data columns (total 47 columns):
MAR_STAT            247662 non-null object
RACE1V              247662 non-null object
SEX                 247662 non-null object
AGE_DX              247662 non-null object
PRIMSITE            247662 non-null object
LATERAL             247662 non-null object
HISTO2V             247662 non-null object
BEHO2V              247662 non-null object
HISTO3V             247662 non-null object
BEHO3V              247662 non-null object
GRADE               247662 non-null object
CSTUMSIZ            247662 non-null object
CSEXTEN             247662 non-null object
CSLYMPHN            247662 non-null object
CSMETSDX            247662 non-null object
CS1SITE             247662 non-null object
CS2SITE             247662 non-null object
CS3SITE             247662 non-null object
CS4SITE             247662 non-null object
CS5SITE             247662 non-null object
CS6SITE             247662 

In [97]:
def category_count(df,feature):
    score=df[feature].value_counts().count()
    print("Number of Categories per feature {:16} is {:.0f}".format(feature, score))
for feature in df.columns:
    category_count(df,feature)

Number of Categories per feature MAR_STAT         is 7
Number of Categories per feature RACE1V           is 30
Number of Categories per feature SEX              is 2
Number of Categories per feature AGE_DX           is 10
Number of Categories per feature PRIMSITE         is 9
Number of Categories per feature LATERAL          is 5
Number of Categories per feature HISTO2V          is 21
Number of Categories per feature BEHO2V           is 3
Number of Categories per feature HISTO3V          is 21
Number of Categories per feature BEHO3V           is 2
Number of Categories per feature GRADE            is 5
Number of Categories per feature CSTUMSIZ         is 29
Number of Categories per feature CSEXTEN          is 29
Number of Categories per feature CSLYMPHN         is 19
Number of Categories per feature CSMETSDX         is 10
Number of Categories per feature CS1SITE          is 8
Number of Categories per feature CS2SITE          is 8
Number of Categories per feature CS3SITE          is 15
N

In [93]:
drop=['CODPUB','STAT_REC']
df.drop(drop,axis=1,inplace=True)

In [99]:
df.head()

Unnamed: 0,MAR_STAT,RACE1V,SEX,AGE_DX,PRIMSITE,LATERAL,HISTO2V,BEHO2V,HISTO3V,BEHO3V,...,DAJCC7M,DAJCC7STG,CS7SITE,brst_sub,CSMETSDXB_PUB,CSMETSDXBR_PUB,CSMETSDXLIV_PUB,CSMETSDXLUNG_PUB,MALIGCOUNT,BENBORDCOUNT
11,Widowed,White,Female,<80,Overlapping lesion,Left,Ductal and lobular neoplasms,"Malignant, primary site",Ductal and lobular neoplasms,"Malignant, primary site",...,Not applicable,Not applicable,Unknown,Not 2010+ Breast,Not applicable,Not applicable,Not applicable,Not applicable,2,0
16,Married,White,Female,<60,Upper-outer quadrant,Right,Ductal and lobular neoplasms,"Malignant, primary site",Ductal and lobular neoplasms,"Malignant, primary site",...,Not applicable,Not applicable,Unknown,Not 2010+ Breast,Not applicable,Not applicable,Not applicable,Not applicable,2,0
18,Separated,White,Female,<80,Upper-outer quadrant,Left,Ductal and lobular neoplasms,"Malignant, primary site",Ductal and lobular neoplasms,"Malignant, primary site",...,Not applicable,Not applicable,Unknown,Not 2010+ Breast,Not applicable,Not applicable,Not applicable,Not applicable,2,0
30,Widowed,White,Female,<60,"Breast, NOS",Right,Ductal and lobular neoplasms,"Malignant, primary site",Ductal and lobular neoplasms,"Malignant, primary site",...,Not applicable,Not applicable,Unknown,Not 2010+ Breast,Not applicable,Not applicable,Not applicable,Not applicable,2,0
32,Married,White,Female,<70,Upper-outer quadrant,Right,Ductal and lobular neoplasms,"Malignant, primary site",Ductal and lobular neoplasms,"Malignant, primary site",...,Not applicable,Not applicable,Unknown,Not 2010+ Breast,Not applicable,Not applicable,Not applicable,Not applicable,2,0


In [117]:
df=df.join(df1, rsuffix='_cont')

In [118]:
df.to_csv('whole_list_categorical.csv')

# Data Preprocessing 

### Onehot encode

In [100]:
df=pd.get_dummies(df,drop_first=True)

In [101]:
df=df.join(df1)

In [102]:
df.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
Int64Index: 247662 entries, 11 to 1631571
Columns: 422 entries, MAR_STAT_Married to srv_time_mon
dtypes: float64(2), int64(4), uint8(416)
memory usage: 121.5 MB


In [103]:
df.head()

Unnamed: 0,MAR_STAT_Married,MAR_STAT_Separated,MAR_STAT_Single,MAR_STAT_Unknown,MAR_STAT_Unmarried,MAR_STAT_Widowed,RACE1V_Asian Indian or Pakistani,RACE1V_Black,RACE1V_Chamorran,RACE1V_Chinese,...,MALIGCOUNT_Unknown,BENBORDCOUNT_1,BENBORDCOUNT_2,BENBORDCOUNT_3,AGE_DX,CSTUMSIZ,CS3SITE,MALIGCOUNT,BENBORDCOUNT,srv_time_mon
11,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,76,0.0,0.0,2,0,30
16,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,58,0.0,0.0,2,0,18
18,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,78,0.0,0.0,2,0,59
30,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,59,0.0,0.0,2,0,54
32,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,63,0.0,0.0,2,0,121


# Multiple linear regression for the whole df

In [104]:
X=df.iloc[:,:-1].values
y=df.iloc[:,421].values

In [105]:
from sklearn.cross_validation import train_test_split
X_train,X_test, y_train, y_test= train_test_split(X,y,test_size=0.20,random_state=42)



In [106]:
from sklearn.linear_model import LinearRegression
regressor= LinearRegression()
regressor.fit(X_train,y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [107]:
y_pred=regressor.predict(X_test)

In [108]:
from sklearn.metrics import r2_score
r2_score(y_test,y_pred)

0.7139363526769968

# Multiple linear regression for the cont vars

In [109]:
df1.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 247662 entries, 11 to 1631571
Data columns (total 6 columns):
AGE_DX          247662 non-null int64
CSTUMSIZ        247662 non-null float64
CS3SITE         247662 non-null float64
MALIGCOUNT      247662 non-null int64
BENBORDCOUNT    247662 non-null int64
srv_time_mon    247662 non-null int64
dtypes: float64(2), int64(4)
memory usage: 23.2 MB


In [110]:
X=df1.iloc[:,:-1].values
y=df1.iloc[:,5].values

In [111]:
from sklearn.cross_validation import train_test_split
X_train,X_test, y_train, y_test= train_test_split(X,y,test_size=0.20,random_state=42)

In [112]:
from sklearn.linear_model import LinearRegression
regressor= LinearRegression()
regressor.fit(X_train,y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [113]:
y_pred=regressor.predict(X_test)

In [114]:
from sklearn.metrics import r2_score
r2_score(y_test,y_pred)

0.05565996352037117

## training a supervised regression learner on a subset of the data with one feature removed, and then score how well that model can predict the removed feature.

In [115]:
from sklearn.linear_model import LinearRegression
def predict_feature(feature):
        # TODO: Make a copy of the DataFrame, using the 'drop' function to drop the given feature
    new_data = df1.drop([feature],axis=1)

    # TODO: Split the data into training and testing sets using the given feature as the target
    X_train, X_test, y_train, y_test = train_test_split(new_data,df1[feature],test_size=0.25,random_state=101)

    # TODO: Create a decision tree regressor and fit it to the training set
    regressor = LinearRegression().fit(X_train,y_train)

    # TODO: Report the score of the prediction using the testing set
    score = regressor.score(X_test,y_test)

    print("The score for feature {:16} is {:+.5f}".format(feature, score))

for feature in df1.columns.values:
    predict_feature(feature)

The score for feature AGE_DX           is +0.04832
The score for feature CSTUMSIZ         is +0.36656
The score for feature CS3SITE          is +0.35778
The score for feature MALIGCOUNT       is +0.00921
The score for feature BENBORDCOUNT     is +0.00096
The score for feature srv_time_mon     is +0.05943


In [116]:
from sklearn.tree import DecisionTreeRegressor
def predict_feature(feature):
        # TODO: Make a copy of the DataFrame, using the 'drop' function to drop the given feature
    new_data = df1.drop([feature],axis=1)

    # TODO: Split the data into training and testing sets using the given feature as the target
    X_train, X_test, y_train, y_test = train_test_split(new_data,df1[feature],test_size=0.25,random_state=101)

    # TODO: Create a decision tree regressor and fit it to the training set
    regressor = DecisionTreeRegressor(random_state=101).fit(X_train,y_train)

    # TODO: Report the score of the prediction using the testing set
    score = regressor.score(X_test,y_test)

    print("The score for feature {:16} is {:+.5f}".format(feature, score))

for feature in df1.columns.values:
    predict_feature(feature)

The score for feature AGE_DX           is -0.07832
The score for feature CSTUMSIZ         is +0.13273
The score for feature CS3SITE          is +0.23868
The score for feature MALIGCOUNT       is -0.46719
The score for feature BENBORDCOUNT     is -1.17360
The score for feature srv_time_mon     is +0.23216
