In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

import sys
!{sys.executable} -m pip install pandas-profiling
!{sys.executable} -m pip install pandasql 
from pandasql import sqldf
import pandas_profiling

###############################################################################################
# Process Clinical data
###############################################################################################

# Parse the clinical file once. `patient_data` keeps the untouched raw frame;
# `patient` is derived from it (set_index returns a new frame, so the raw
# data is not modified and no second read of the same file is needed).
patient_data = pd.read_csv('patient_data.csv')

# Rows of the raw file are keyed by 'track_name'; after transposing, each
# 'track_name' value becomes a column and each original column becomes a row.
# The first transposed row is discarded and the old column labels are
# exposed as the 'ID' column.
patient = (
    patient_data
    .set_index('track_name')
    .T
    .iloc[1:]
    .reset_index()
    .rename(columns={'index': 'ID'})
)

# Normalise text case and convert the numeric attributes from strings.
patient['ID'] = patient['ID'].str.upper()
for numeric_col in ('Diagnosis Age', 'Overall Survival (Months)', 'Mutation Count'):
    patient[numeric_col] = pd.to_numeric(patient[numeric_col])
patient['Sex'] = patient['Sex'].str.lower()

# Drop patients whose '# Samples per Patient' is the string '2'
# (values are still strings at this point, hence the string comparison).
single_sample = patient['# Samples per Patient'] != '2'
patient = patient[single_sample].reset_index(drop=True)

def label(x):
    """Map a stage code string to 'T1', 'T2' or 'T3' by substring match.

    The markers are tried in that order and the first one contained in ``x``
    is returned; if none is contained, 'T4' is returned.
    """
    for stage in ('T1', 'T2', 'T3'):
        if stage in x:
            return stage
    # No T1/T2/T3 marker found: fall back to T4.
    return 'T4'
# Derive the coarse 'Stage' value from the detailed stage code via label().
patient['Stage'] = patient['American Joint Committee on Cancer Tumor Stage Code'].apply(label)

# Remove the columns that are not carried into the combined dataframe.
patient = patient.drop(
    columns=[
        'Profiled in Mutations',
        'Profiled in Putative copy-number alterations from GISTIC',
        'Patient\'s Vital Status',
        '# Samples per Patient',
        'Cancer Type',
        'Informed consent verified',
        'Neoplasm Histologic Type Name',
    ]
)

###############################################################################################
# Process Tumor mRNA gene expression data
###############################################################################################

# Parse the expression file once. `mrna_data` keeps the raw frame; `mrna` is
# built from it without in-place mutation (dropna/drop return new frames).
mrna_data = pd.read_csv('mrna_data.txt', sep='\t')

# Keep only columns with at least one non-missing value, then remove the
# STUDY_ID column.
mrna = (
    mrna_data
    .dropna(thresh=1, axis=1)
    .drop(columns=['STUDY_ID'])
)

def id_process(x):
    """Normalise a sample ID string.

    Takes ``x[:len(x) - 3]`` (i.e. drops the trailing three characters for
    IDs longer than three characters), removes every underscore and returns
    the result in upper case.
    """
    trimmed = x[:len(x) - 3]
    return trimmed.replace("_", "").upper()
# Rewrite every mrna sample ID with id_process (passed directly, no lambda).
mrna['SAMPLE_ID'] = mrna['SAMPLE_ID'].apply(id_process)

###############################################################################################
# Process Tumor Mutation Sequencing data
###############################################################################################

# Parse the sequencing file once. `seq_data` keeps the raw frame; `seq` is
# built from it without in-place mutation (each step returns a new frame).
seq_data = pd.read_csv('seq_data.txt', sep='\t')

# Drop columns that are entirely missing, fill the remaining gaps with 0,
# and remove the STUDY_ID column.
seq = (
    seq_data
    .dropna(how='all', axis=1)
    .fillna(0)
    .drop(columns=['STUDY_ID'])
)

# Rewrite every seq sample ID with id_process, as is done for mrna.
seq['SAMPLE_ID'] = seq['SAMPLE_ID'].apply(id_process)





In [2]:
###############################################################################################
# Process combined dataframe
###############################################################################################

def pysqldf(q):
    """Run the SQL string ``q`` through ``sqldf`` with this module's globals.

    Returns whatever ``sqldf`` returns for the query.
    """
    namespace = globals()
    return sqldf(q, namespace)

# Combine the three processed tables on the patient/sample identifier:
# patient.ID is matched against SAMPLE_ID in both mrna and seq. Plain JOINs
# keep only IDs that appear in all three tables, and SELECT * carries every
# column of each table (including both SAMPLE_ID columns) into the result.
q = '''
SELECT *
FROM patient
JOIN mrna
ON patient.ID = mrna.SAMPLE_ID
JOIN seq
ON patient.ID = seq.SAMPLE_ID
'''
# pysqldf hands globals() to sqldf together with the query.
df = pysqldf(q)



In [3]:
# Preview a bounded number of rows instead of rendering the whole merged frame.
df.head(10)

Unnamed: 0,ID,American Joint Committee on Cancer Tumor Stage Code,Diagnosis Age,Overall Survival (Months),Neoplasm Histologic Grade,Race Category,Overall Survival Status,Sex,Mutation Count,Stage,...,STARD6,NTRK2,FOXP2,KCNJ1,MS4A3,OR51B4,GALNT9,KCNK3,PCDHB1,LRRTM3
0,CG-A3-3387,T1a,49,20.27,G2,WHITE,LIVING,male,70,T1,...,0,0,0,0,0,0,0,0,0,0
1,CG-A3-3374,T1b,51,43.17,G2,BLACK OR AFRICAN AMERICAN,LIVING,female,934,T1,...,0,0,0,0,0,0,0,0,0,0
2,CG-A3-3363,T2,50,10.48,G2,ASIAN,LIVING,male,1392,T2,...,0,0,0,0,0,0,0,0,0,0
3,CG-CZ-5460,T3b,55,94.38,G2,WHITE,LIVING,male,52,T3,...,0,0,0,0,0,0,0,0,0,0
4,CG-BP-4176,T1b,64,64.22,G2,WHITE,LIVING,male,85,T1,...,0,0,0,0,0,0,0,0,0,0
5,CG-CJ-6032,T2,63,119.55,G3,WHITE,LIVING,female,42,T2,...,0,0,0,0,0,0,0,0,0,0
6,CG-BP-5192,T1a,59,23.46,G2,WHITE,LIVING,male,46,T1,...,0,0,0,0,0,0,0,0,0,0
7,CG-CW-5585,T3b,51,85.71,G2,WHITE,LIVING,male,41,T3,...,0,0,0,0,0,0,0,0,0,0
8,CG-B0-5691,T1a,66,112.71,G3,WHITE,LIVING,female,55,T1,...,0,0,0,0,0,0,0,0,0,0
9,CG-AK-3453,T2,58,83.15,G2,WHITE,LIVING,female,350,T2,...,0,0,0,0,0,0,0,0,0,0


In [4]:
# Remove the SAMPLE_ID column(s) brought in by the join; 'ID' already holds the key.
df = df.drop(columns=['SAMPLE_ID'])

In [5]:
# Write the combined frame (row index included) to df.csv in the working directory.
df.to_csv('df.csv')

In [6]:
# Columns of df that contain at least one missing value ...
col_with_null = df.loc[:, df.isnull().any(axis=0)]
# ... restricted to the rows in which one of those columns is missing.
nulls = col_with_null[col_with_null.isnull().any(axis=1)]
# Index labels of the affected rows; reused by later cells.
ind = nulls.index

# Select by label: `ind` holds index labels (not positions), and naming the
# columns avoids depending on their position in the frame.
df.loc[ind, ['Neoplasm Histologic Grade', 'Race Category']]

Unnamed: 0,Neoplasm Histologic Grade,Race Category
30,G2,
64,,BLACK OR AFRICAN AMERICAN
88,G2,
158,G3,
179,G2,
196,,BLACK OR AFRICAN AMERICAN
275,,WHITE
276,G2,
349,G3,
483,G2,


In [7]:
from sklearn.neighbors import KNeighborsClassifier

In [8]:
# Feature frame used for imputation: the first 90 columns of df, minus the
# columns listed below.
fitting = df.iloc[:, :90].drop(
    columns=[
        'ID',
        'American Joint Committee on Cancer Tumor Stage Code',
        'Overall Survival Status',
        'Sex',
        'Stage',
    ]
)

In [9]:
# Impute missing 'Race Category' values with a k-nearest-neighbours
# classifier fitted on the rows that have no missing value.
race_fit = fitting.drop(columns=['Neoplasm Histologic Grade'])
null_index = race_fit.isnull().any(axis=1)
row_with_null = race_fit[null_index]
# Invert the boolean mask with `~`: unary minus on a boolean Series is
# rejected by current NumPy/pandas ("use the `~` operator instead").
row_no_null = race_fit[~null_index]

clf = KNeighborsClassifier(n_neighbors=3, weights='distance')
x = row_no_null.drop(columns=['Race Category'])
y = row_no_null['Race Category']
clf.fit(x, y)
x_test = row_with_null.drop(columns=['Race Category'])
imputes = clf.predict(x_test)

# `fitting` was derived from df by column operations only, so null_index
# carries df's row index and addresses the same rows there.
df.loc[null_index, 'Race Category'] = imputes

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


In [11]:
# Impute missing 'Neoplasm Histologic Grade' values with a k-nearest-neighbours
# classifier fitted on the rows that have no missing value.
# NOTE: the variable is still called `race_fit` (kept for compatibility with
# the rest of the notebook) although it now holds the grade-fitting frame.
race_fit = fitting.drop(columns=['Race Category'])
null_index = race_fit.isnull().any(axis=1)
row_with_null = race_fit[null_index]
# Invert the boolean mask with `~`: unary minus on a boolean Series is
# rejected by current NumPy/pandas ("use the `~` operator instead").
row_no_null = race_fit[~null_index]

clf = KNeighborsClassifier(n_neighbors=3, weights='distance')
x = row_no_null.drop(columns=['Neoplasm Histologic Grade'])
y = row_no_null['Neoplasm Histologic Grade']
clf.fit(x, y)
x_test = row_with_null.drop(columns=['Neoplasm Histologic Grade'])
imputes = clf.predict(x_test)

# `fitting` was derived from df by column operations only, so null_index
# carries df's row index and addresses the same rows there.
df.loc[null_index, 'Neoplasm Histologic Grade'] = imputes

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


In [12]:
# Show the previously-missing rows after imputation. Select by label: `ind`
# holds index labels, and naming the columns avoids magic column positions.
df.loc[ind, ['Neoplasm Histologic Grade', 'Race Category']]

Unnamed: 0,Neoplasm Histologic Grade,Race Category
30,G2,WHITE
64,G1,BLACK OR AFRICAN AMERICAN
88,G2,WHITE
158,G3,WHITE
179,G2,WHITE
196,G2,BLACK OR AFRICAN AMERICAN
275,G3,WHITE
276,G2,WHITE
349,G3,WHITE
483,G2,WHITE


In [17]:
# Same rows in `fitting`, which is not written to by the imputation cells.
# Select by label: `ind` holds index labels, and naming the columns avoids
# magic column positions.
fitting.loc[ind, ['Neoplasm Histologic Grade', 'Race Category']]

Unnamed: 0,Neoplasm Histologic Grade,Race Category
30,G2,
64,,BLACK OR AFRICAN AMERICAN
88,G2,
158,G3,
179,G2,
196,,BLACK OR AFRICAN AMERICAN
275,,WHITE
276,G2,
349,G3,
483,G2,


In [24]:
# Total number of missing cells in `fitting` (per-column counts, then summed).
fitting.isnull().sum().sum()

10

In [23]:
# Total number of missing cells in `df` (per-column counts, then summed).
df.isnull().sum().sum()

0