In [2]:
import pandas as pd
import numpy as np
from math import isnan
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import Imputer

# Let pandas print up to 5000 characters per line and show up to 90 columns.
for option, value in (('display.width', 5000), ('display.max_columns', 90)):
    pd.set_option(option, value)

In [4]:
# Case list (whitespace-separated). CASE_NO is the part of CaseID before the
# first '-', e.g. 'A1888-Tp2' -> 'A1888'; the follow-up table also has a
# CASE_NO column.
tcga_case = pd.read_csv('./tcga_caseID_171.lst', delim_whitespace=True)
tcga_case['CASE_NO'] = tcga_case['CaseID'].map(lambda case_id: case_id.split('-', 1)[0])

# Research follow-up table (tab-separated); report its (rows, columns).
in1 = pd.read_csv('./RESEARCH_FOLLOW_UP.dsv', sep='\t', low_memory=False)
print(in1.shape)

(21481, 48)


In [431]:
# Show every column name of the follow-up table.
print(in1.columns.tolist())

['FOLLOW_UP_NO', 'CASE_NO', 'FOLLOW_UP_DATE', 'ORDER_FOR_CASE', 'MONTH_PAST_SURGERY', 'PATIENT_ALIVE', 'DATE_OF_DEATH', 'CAUSED_BY_TUMOR', 'CAUSE_OF_DEATH_ICD', 'LAST_FOLLOW_UP_DATE', 'RELAPSE_FREE', 'RELAPSE_FREE_SINCE', 'LOCAL_RELAPSE', 'LOCALIZATION', 'METASTATIC', 'TUMOR_MARKER', 'TUMOR_MARKER_PATHOLOGICAL', 'RESPONSE', 'GENERAL_CONDITION', 'WEIGHT', 'WEIGHT_FIXED', 'LOSS_OF_WEIGHT', 'LOSS_OF_WEIGHT_IN_MONTH', 'GAIN_OF_WEIGHT', 'GAIN_OF_WEIGHT_IN_MONTH', 'BODY_HEIGHT', 'BODY_MASS_INDEX', 'BODY_SURFACE_AREA', 'APPETITE', 'VEGETARIAN', 'VEGETARIAN_SINCE', 'MEAT_CONSUMPTION_FREQ', 'MENOPAUSE', 'SLEEP', 'NATURAL_DEFECATION', 'PRETERNATURAL_ANUS', 'PRETERNATURAL_ANUS_FREQUENCY', 'LAXATIVE_FREQUENCY', 'NOCTURIA', 'HIV', 'HEPATITIS', 'INQUIRY_TYPE', 'LIQUID_DIET', 'CHEMOTHERAPY', 'RADIATION', 'HORMONETHERAPY', 'ALTERNATIVE_THERAPY', 'CHEMOTHERAPY_TYPE']


In [12]:
# Keep only the follow-up columns of interest.
in2 = in1[['FOLLOW_UP_DATE', 'CASE_NO', 'PATIENT_ALIVE', 'RELAPSE_FREE',
           'METASTATIC', 'RESPONSE', 'CHEMOTHERAPY', 'RADIATION']]

# Join the case list to its follow-up rows. The key is named explicitly so the
# join cannot silently change if the two tables ever share another column name.
FU_case171 = pd.merge(tcga_case, in2, how='inner', on='CASE_NO')

# A case can have several follow-up rows, so count the distinct case numbers.
uniq_cases = set(FU_case171['CASE_NO'])

# Single-argument print(...) produces the same text on Python 2 and Python 3.
print("\nTotal followup instances avaliable : %d" % FU_case171.shape[0])
print("\nUnique intersection between FollowUp  clinical data and cases_171 is : %d" % len(uniq_cases))
print("\nEach case may have more than one follow-up data.")
FU_case171.head(15)


Total followup instances avaliable : 428

Unique intersection between FollowUp  clinical data and cases_171 is : 142

Each case may have more than one follow-up data.


Unnamed: 0,TCGA,CaseID,CASE_NO,FOLLOW_UP_DATE,PATIENT_ALIVE,RELAPSE_FREE,METASTATIC,RESPONSE,CHEMOTHERAPY,RADIATION
0,TCGA-AA-3526,A1888-Tp2,A1888,07.01.2008,y,y,n,,n,n
1,TCGA-AA-3526,A1888-Tp2,A1888,12.01.2009,y,y,n,,n,n
2,TCGA-AA-3526,A1888-Tp2,A1888,27.07.2010,y,y,n,complete response (CR),n,n
3,TCGA-AA-3831,B1164-Tp1,B1164,01.04.2010,y,y,n,,n,n
4,TCGA-AA-3831,B1164-Tp1,B1164,18.03.2013,y,,,,,
5,TCGA-AA-A01C,A2733,A2733,28.09.2010,y,y,n,complete response (CR),n,n
6,TCGA-AA-A01C,A2733,A2733,16.11.2010,n,u,u,unknown,n,n
7,TCGA-AA-3549,A2327-Tp1,A2327,29.04.2010,y,y,n,,n,n
8,TCGA-AA-3549,A2327-Tp1,A2327,23.08.2011,n,n,y,progressive disease (PD),y,y
9,TCGA-AA-3696,A2779-Tp2,A2779,25.12.2009,n,u,u,unknown,u,u


In [14]:
# Inspect which distinct values (NaN included) each clinical column takes.
cols = ['PATIENT_ALIVE', 'RELAPSE_FREE', 'METASTATIC', 'RESPONSE', 'CHEMOTHERAPY', 'RADIATION']
for column in cols:
    print("\n\n" + column)
    print(set(FU_case171[column]))



PATIENT_ALIVE
set(['y', 'n'])


RELAPSE_FREE
set(['y', nan, 'u', 'n'])


METASTATIC
set([nan, 'y', 'u', 'n'])


RESPONSE
set([nan, 'progressive disease (PD)', 'complete response (CR)', 'unknown', 'partial response (PR)', 'stable disease (SD)'])


CHEMOTHERAPY
set([nan, 'y', 'u', 'n'])


RADIATION
set([nan, 'y', 'u', 'n'])


In [32]:
# Normalise missing answers in the clinical columns: both the code 'u' and NaN
# become the string 'unknown'. Only the clinical columns are touched, so the
# identifier and date columns are never rewritten.
clinical_cols = ['PATIENT_ALIVE', 'RELAPSE_FREE', 'METASTATIC',
                 'RESPONSE', 'CHEMOTHERAPY', 'RADIATION']
FU_case171[clinical_cols] = (FU_case171[clinical_cols]
                             .replace('u', 'unknown')
                             .fillna('unknown'))

# Sanity check: no missing value is left in the clinical columns.
assert not FU_case171[clinical_cols].isnull().values.any()

# Collect the values of the multiple occurrences of each CASE_NO into one list
# per case, for every clinical column. Rows keep their order within a case.
followups_by_case = FU_case171.groupby('CASE_NO')

PA_series = followups_by_case['PATIENT_ALIVE'].apply(list)
RF_series = followups_by_case['RELAPSE_FREE'].apply(list)
MS_series = followups_by_case['METASTATIC'].apply(list)
RP_series = followups_by_case['RESPONSE'].apply(list)
CM_series = followups_by_case['CHEMOTHERAPY'].apply(list)
RD_series = followups_by_case['RADIATION'].apply(list)

In [33]:
# Example: each case maps to the list of its PATIENT_ALIVE values,
# one entry per follow-up row of that case.
PA_series.head()

CASE_NO
A1023          [y, y, y, y, y, y, y]
A1064    [y, y, y, y, y, y, y, y, y]
A1101                            [y]
A114                       [y, y, n]
A1143                            [y]
Name: PATIENT_ALIVE, dtype: object

In [34]:
# Per-case list of RELAPSE_FREE values; 'unknown' marks a follow-up whose
# answer was 'u' or missing.
RF_series

CASE_NO
A1023    [y, y, y, y, unknown, unknown, y]
A1064    [y, y, y, y, y, y, y, y, unknown]
A1101                                  [y]
A114                             [y, y, y]
A1143                                  [y]
A117                                [y, y]
A1193          [y, y, y, y, y, unknown, y]
A1230             [y, y, y, y, y, unknown]
A1234                                  [y]
A1349       [y, y, y, y, y, y, unknown, y]
A1436       [unknown, y, y, y, y, unknown]
A1455                               [y, y]
A1635                               [y, y]
A1649                                  [y]
A1664                                  [y]
A1670                               [y, y]
A1705                         [y, unknown]
A1717                         [n, n, n, n]
A1747                         [y, n, y, y]
A1763                      [y, y, unknown]
A1776       [unknown, unknown, y, y, y, y]
A1803                [y, unknown, y, y, y]
A1823                         [y, unknown]
A18

In [35]:
# Replace each case's list of follow-up values by one value. The rule depends
# on the column:
#   RESPONSE, PATIENT_ALIVE              -> last value that is not 'unknown'
#                                           (case left out if all are unknown)
#   CHEMOTHERAPY, RADIATION, METASTATIC  -> 'y' if any follow-up is 'y', else 'n'
#   RELAPSE_FREE                         -> 'y' only if every follow-up is 'y', else 'n'
#   the last four columns                -> 'u' when every follow-up is 'unknown'
#
# The helpers are mapped over the grouped Series instead of growing empty
# Series inside a loop: this avoids treating filter() as a list (it is a lazy,
# always-truthy iterator on Python 3), avoids Series.iteritems (absent from
# current pandas) and leaves the grouped input Series unmodified.
#
# NOTE(review): "last" means the last row of the case in FU_case171 order;
# nothing here sorts by FOLLOW_UP_DATE - confirm the input is chronological.
# NOTE(review): for RELAPSE_FREE a mix of 'y' and 'unknown' yields 'n' -
# confirm that unknown follow-ups are meant to count against relapse-free.


def _last_known(values):
    """Return the last entry that is not 'unknown', or NaN if there is none."""
    known = [v for v in values if v != 'unknown']
    return known[-1] if known else np.nan


def _all_unknown(values):
    """Return True when every entry is 'unknown'."""
    return all(v == 'unknown' for v in values)


def _any_yes(values):
    """Return 'u' if all entries are unknown, else 'y' if any is 'y', else 'n'."""
    if _all_unknown(values):
        return 'u'
    return 'y' if 'y' in values else 'n'


def _all_yes(values):
    """Return 'u' if all entries are unknown, else 'y' if all are 'y', else 'n'."""
    if _all_unknown(values):
        return 'u'
    return 'y' if all(v == 'y' for v in values) else 'n'


# Cases without any known value are dropped for these two columns.
RP_s1 = RP_series.map(_last_known).dropna()
PA_s1 = PA_series.map(_last_known).dropna()

CM_s1 = CM_series.map(_any_yes)
RD_s1 = RD_series.map(_any_yes)
MS_s1 = MS_series.map(_any_yes)
RF_s1 = RF_series.map(_all_yes)

print("Response : %d \nPatient_alive : %d \nMetastasis : %d \nRelapse Free : %d" % (len(RP_s1), len(PA_s1), len(MS_s1), len(RF_s1)))
print("Chemotherapy : %d \nRadiation : %d " % (len(CM_s1), len(RD_s1)))

Response : 101 
Patient_alive : 142 
Metastasis : 142 
Relapse Free : 142
Chemotherapy : 142 
Radiation : 142 


In [36]:
# Put the five per-case columns side by side. All five are indexed by case
# number, and concat(axis=1) aligns them on that index.
all_CV = pd.concat([PA_s1, MS_s1, RF_s1, CM_s1, RD_s1], axis=1)

# Turn the case-number index into a column and label the header.
all_CV = all_CV.reset_index()
all_CV.columns = ['CASE_NO', 'Patient_alive', 'Metastasis', 'Relapse_free', 'Chemotherapy', 'Radiation']

# 'u' (all follow-ups unknown) becomes a proper missing value.
all_CV = all_CV.replace(to_replace='u', value=np.nan)

# Single-argument print(...) produces the same text on Python 2 and Python 3.
print("\nShape : %s \n" % (all_CV.shape,))

# Percentage of missing values per column.
print("NaN distribution :\n")
print(all_CV.isnull().sum(axis=0)*100/all_CV.shape[0])

cols = [u'Patient_alive', u'Metastasis', u'Relapse_free', u'Chemotherapy', u'Radiation']

# Percentage of cases per category; NaN rows are not counted by value_counts,
# so a column's percentages need not add up to 100.
for x in cols:
    print("\nDistribution for #### %s" % x)
    print(all_CV[x].value_counts()/all_CV.shape[0]*100)

all_CV.head(5)


Shape : (142, 6) 

NaN distribution :

CASE_NO           0.000000
Patient_alive     0.000000
Metastasis       10.563380
Relapse_free     10.563380
Chemotherapy      6.338028
Radiation         6.338028
dtype: float64

Distribution for #### Patient_alive
y    73.239437
n    26.760563
Name: Patient_alive, dtype: float64

Distribution for #### Metastasis
n    77.464789
y    11.971831
Name: Metastasis, dtype: float64

Distribution for #### Relapse_free
n    47.887324
y    41.549296
Name: Relapse_free, dtype: float64

Distribution for #### Chemotherapy
n    63.380282
y    30.281690
Name: Chemotherapy, dtype: float64

Distribution for #### Radiation
n    88.028169
y     5.633803
Name: Radiation, dtype: float64


Unnamed: 0,CASE_NO,Patient_alive,Metastasis,Relapse_free,Chemotherapy,Radiation
0,A1023,y,n,n,n,n
1,A1064,y,n,n,n,n
2,A1101,y,n,y,y,n
3,A114,n,n,y,y,n
4,A1143,y,n,y,n,n


In [37]:
# Drop columns Radiation and Metastasis, as 80% of cases fall under one category
# (when NaN is counted with the most frequent value).
# .copy() makes this an independent frame, so the assignments below are not
# chained writes into a slice of all_CV.
RFU_3cv_142cases = all_CV[['CASE_NO', u'Patient_alive', u'Relapse_free', u'Chemotherapy']].copy()

# Missing Relapse_free is filled with 'n' explicitly; the result is assigned
# back instead of calling fillna(inplace=True) on a column selection, which
# triggers SettingWithCopy warnings and has no effect under copy-on-write.
RFU_3cv_142cases['Relapse_free'] = RFU_3cv_142cases['Relapse_free'].fillna('n')

# Every remaining NaN takes the most frequent value of its own column.
RFU_3cv_142cases = RFU_3cv_142cases.apply(lambda col: col.fillna(col.value_counts().index[0]))

# Check the distribution of NaN (percentage per column).
print(RFU_3cv_142cases.isnull().sum(axis=0)*100/RFU_3cv_142cases.shape[0])

# Categorical distribution (percentage of cases per category).
cols = [u'Patient_alive', u'Relapse_free', u'Chemotherapy']
for x in cols:
    print("\nDistribution for #### %s" % x)
    print(RFU_3cv_142cases[x].value_counts()/RFU_3cv_142cases.shape[0]*100)

RFU_3cv_142cases.head(10)

CASE_NO          0.0
Patient_alive    0.0
Relapse_free     0.0
Chemotherapy     0.0
dtype: float64

Distribution for #### Patient_alive
y    73.239437
n    26.760563
Name: Patient_alive, dtype: float64

Distribution for #### Relapse_free
n    58.450704
y    41.549296
Name: Relapse_free, dtype: float64

Distribution for #### Chemotherapy
n    69.71831
y    30.28169
Name: Chemotherapy, dtype: float64


Unnamed: 0,CASE_NO,Patient_alive,Relapse_free,Chemotherapy
0,A1023,y,n,n
1,A1064,y,n,n
2,A1101,y,y,y
3,A114,n,y,y
4,A1143,y,y,n
5,A117,y,y,n
6,A1193,y,n,n
7,A1230,y,n,n
8,A1234,y,y,n
9,A1349,y,n,n


In [427]:
# Turn each categorical covariate listed in `cols` into integer labels.
# LabelEncoder numbers the sorted class labels, so 'n' -> 0 and 'y' -> 1.
for column in cols:
    encoder = LabelEncoder()
    as_text = RFU_3cv_142cases[column].astype('str')
    RFU_3cv_142cases[column] = encoder.fit_transform(as_text)

RFU_3cv_142cases.head(10)

Unnamed: 0,CASE_NO,Patient_alive,Relapse_free,Chemotherapy
0,A1023,1,0,0
1,A1064,1,0,0
2,A1101,1,1,1
3,A114,0,1,1
4,A1143,1,1,0
5,A117,1,1,0
6,A1193,1,0,0
7,A1230,1,0,0
8,A1234,1,1,0
9,A1349,1,0,0


In [428]:
# Number of cases per response category (latest known response of each case).
print(RP_s1.value_counts())

complete response (CR)      76
progressive disease (PD)    15
partial response (PR)        8
stable disease (SD)          2
dtype: int64


In [429]:
# One row per case with its latest known response.
RP_df = RP_s1.to_frame().reset_index()
RP_df.columns = ['CASE_NO', 'Response']

# Combine 'progressive disease (PD)' and 'stable disease (SD)' into one
# category and convert all categories to numeric labels. Values outside this
# table are passed through unchanged. The result is assigned back rather than
# using replace(inplace=True) on a column selection, which is a chained write
# that has no effect under copy-on-write.
response_codes = {
    'progressive disease (PD)': 0,
    'stable disease (SD)': 0,
    'complete response (CR)': 1,
    'partial response (PR)': 2,
}
RP_df['Response'] = RP_df['Response'].map(lambda label: response_codes.get(label, label))

# Keep only cases that have a response; the join key is named explicitly.
RFU_4cv_101cases = pd.merge(RFU_3cv_142cases, RP_df, how='inner', on='CASE_NO')

print(RFU_4cv_101cases.shape)
print(RFU_4cv_101cases.isnull().sum(axis=0)*100/RFU_4cv_101cases.shape[0])

(101, 5)
CASE_NO          0.0
Patient_alive    0.0
Relapse_free     0.0
Chemotherapy     0.0
Response         0.0
dtype: float64


In [430]:
# Preview the merged table: the three encoded covariates plus the Response code.
RFU_4cv_101cases.head(10)

Unnamed: 0,CASE_NO,Patient_alive,Relapse_free,Chemotherapy,Response
0,A1023,1,0,0,1
1,A1064,1,0,0,1
2,A1101,1,1,1,2
3,A114,0,1,1,0
4,A1193,1,0,0,2
5,A1230,1,0,0,1
6,A1349,1,0,0,2
7,A1436,0,0,0,1
8,A1635,1,1,1,1
9,A1649,1,1,1,0


In [440]:
#RFU_3cv_142cases.to_csv('./COAD_clinical_matrix/RFU_3cv_142cases.csv', sep='\t',index=False)
#RFU_4cv_101cases.to_csv('./COAD_clinical_matrix/RFU_4cv_101cases.csv', sep='\t',index=False)

In [383]:
# Keep cases with at least 5 non-missing values among the 6 columns of all_CV,
# i.e. only 1 clinical value can be NaN at a time for each row.
out = all_CV.dropna(axis=0, thresh=5)
print(out.shape)

# all_CV carries no response column, so attach the per-case latest known
# response (RP_s1, indexed by case number) under the name 'RP'.
# NOTE(review): assumes 'RP' was meant to be the collapsed RESPONSE value - confirm.
out = out.join(RP_s1.rename('RP'), on='CASE_NO')

deceased = out['Patient_alive'] == 'n'

# Responses of deceased cases that were relapse-free at every follow-up
# (printed, because only the last expression of a cell is displayed).
print(out.loc[(out['Relapse_free'] == 'y') & deceased, 'RP'].value_counts())

# Responses of all deceased cases.
out.loc[deceased, 'RP'].value_counts()

progressive disease (PD)    10
complete response (CR)       9
stable disease (SD)          2
partial response (PR)        1
Name: RP, dtype: int64

In [None]:
#### ROUGH
# Scratch cell: experiments with removing 'unknown' entries from one case and
# with outer-merging the per-case columns into a single frame.

# Rebuild the per-case RESPONSE lists from FU_case171.
RP_series = FU_case171.groupby('CASE_NO')['RESPONSE'].apply(list)
to_remove = ['u', 'unknown']
# NOTE(review): whichever entry of to_remove is found, the literal "unknown"
# is what gets removed, and list.remove drops only the first occurrence.
for i in to_remove:
    if i in RP_series['A1064'] : RP_series['A1064'].remove("unknown")
# list.remove returns None, so j is None; the call raises ValueError when the
# list holds no 'unknown' at this point.
j=RP_series['A1064'].remove('unknown')

# One single-column frame per collapsed series.
PA_df = PA_s1.to_frame()
MS_df = MS_s1.to_frame()
RF_df = RF_s1.to_frame()

# Move the case-number index into a column named 'ID' and name the value column.
PA_df.reset_index(inplace=True); PA_df.columns=['ID','PA']
MS_df.reset_index(inplace=True); MS_df.columns=['ID','MS']
RF_df.reset_index(inplace=True); RF_df.columns=['ID','RF']


# No `on=` is given, so each merge joins on the column names the two frames share.
# NOTE(review): the frames built in this cell use 'ID' as key, while RP_df is given
# the columns ['CASE_NO', 'Response'] where it is created in this notebook; with no
# shared column name the first merge cannot work - confirm RP_df's columns.
RP_PA = pd.merge(RP_df, PA_df, how='outer')
RP_PA_MS = pd.merge(RP_PA, MS_df, how='outer')
RP_PA_MS_RF = pd.merge(RP_PA_MS, RF_df, how='outer')
# These two bare expressions are not the last statement of the cell, so their
# values are not displayed.
RP_PA_MS_RF.shape
RP_PA_MS_RF.tail()

# thresh=3 keeps rows that have at least 3 non-NaN values.
out = RP_PA_MS_RF.dropna(axis=0, thresh = 3) # only 1 clinical can have NaN at a time for each row