In [27]:
import numpy as np 
import pydicom as dicom
import os 
import matplotlib.pyplot as plt 
import pandas as pd 
import nrrd
from scipy.stats import chi2_contingency
from scipy.stats import mannwhitneyu
from statistics import median
from statistics import mean
from statistics import stdev
from tabulate import tabulate

In [2]:
DWI_path= "/Users/marktran21/Documents/MD Anderson/Data/DWI_Dicom"
patient_id = ['2609170', '2467000', '2459017', '2419611', '2413785', '2551521', '1049397', '2591335', '2541392', '2593206', '2386449', '2455275', '2139028', '2377620', '2510278', '862797', '2403028','2604993', '2467929']
sex=[]
age=[]
gtv=[0]*19
for id in patient_id:
    scan_path= DWI_path+'/'+id+'/W1'
    scan=dicom.read_file(scan_path+'/'+os.listdir(scan_path)[0])
    sex.append(scan.PatientSex)
    age.append(scan.PatientAge)

In [13]:
characteristics=pd.DataFrame(list(zip(patient_id, sex, age,gtv)),columns=['id','sex','age','GTV'])

In [14]:
characteristics

Unnamed: 0,id,sex,age,GTV
0,2609170,M,065Y,0
1,2467000,M,054Y,0
2,2459017,M,058Y,0
3,2419611,M,051Y,0
4,2413785,M,039Y,0
5,2551521,M,051Y,0
6,1049397,M,056Y,0
7,2591335,M,060Y,0
8,2541392,F,079Y,0
9,2593206,M,080Y,0


In [30]:
characteristics.to_csv('/Users/marktran21/Documents/MD Anderson/Data/characteristics.csv')

In [20]:
characteristics= pd.read_csv('/Users/marktran21/Documents/MD Anderson/Data/characteristics.csv')

In [23]:
characteristics

Unnamed: 0.1,Unnamed: 0,id,sex,age,GTV
0,0,Train,M,65,1.208
1,1,Train,M,54,10.432
2,2,Test,M,58,8.008
3,3,Train,M,51,10.352
4,4,Train,M,39,0.74
5,5,Train,M,51,2.773
6,6,Train,M,56,1.308
7,7,Train,M,60,2.256
8,8,Train,F,79,3.359
9,9,Train,M,80,5.762


In [22]:
for i in range(19):
    if characteristics.id[i]== 2377620:
        characteristics.id[i]="Test"
    elif characteristics.id[i]== 2459017:
        characteristics.id[i]="Test"
    else:
        characteristics.id[i]="Train"

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)


In [24]:
#Find the p-value for sex
train_M = len(characteristics.loc[(characteristics.id=="Train") & (characteristics.sex=="M")])
train_F = len(characteristics.loc[(characteristics.id=="Train") & (characteristics.sex=="F")])
test_M = len(characteristics.loc[(characteristics.id=="Test") & (characteristics.sex=="M")])
test_F = len(characteristics.loc[(characteristics.id=="Test") & (characteristics.sex=="F")])
cohort_M = len(characteristics.loc[characteristics.sex=="M"])
cohort_F = len(characteristics.loc[characteristics.sex=="F"])

stat, p_sex, dof, expected = chi2_contingency([[train_M, train_F], [test_M, test_F]])

In [25]:
def findrange(val_list):
    min_val = min(val_list)
    max_val = max(val_list)

    return (min_val, max_val)

In [32]:
#Find the p-value for Age
train_age = list(characteristics[characteristics.id=="Train"].age)
test_age = list(characteristics[characteristics.id=="Test"].age)
cohort_age = list(characteristics.age)

train_age_med = median(train_age)
test_age_med = median(test_age)
cohort_age_med = median(cohort_age)

train_age_range = findrange(train_age)
test_age_range = findrange(test_age)
cohort_age_range = findrange(cohort_age)

train_age_stdev = round(stdev(train_age),2)
test_age_stdev = round(stdev(test_age),2)
cohort_age_stdev = round(stdev(cohort_age),2)

stat, p_age = mannwhitneyu(train_age, test_age)

In [33]:
#Find the p-value for GTV
train_gtv = list(characteristics[characteristics.id=="Train"].GTV)
test_gtv = list(characteristics[characteristics.id=="Test"].GTV)
cohort_gtv = list(characteristics.GTV)

train_gtv_mean = round(mean(train_gtv),2)
test_gtv_mean = round(mean(test_gtv),2)
cohort_gtv_mean = round(mean(cohort_gtv),2)

train_gtv_stdev = round(stdev(train_gtv),2)
test_gtv_stdev = round(stdev(test_gtv),2)
cohort_gtv_stdev = round(stdev(cohort_gtv),2)

stat, p_gtv = mannwhitneyu(train_gtv, test_gtv)

In [34]:
#Make the table
table = [['Sex', '', '', '', round(p_sex,2)], ['Male', cohort_M, train_M, test_M, ''], ['Female', cohort_F, train_F, test_F, ''],
        ['Age', '', '', '', round(p_age,2)], ['Median', cohort_age_med, train_age_med, test_age_med, ''], 
        ['Range', cohort_age_range, train_age_range, test_age_range, ''],['Stdev', cohort_age_stdev, train_age_stdev, test_age_stdev, ''],
        ['Primary GTV (cm3)', '', '', '', round(p_gtv,2)],['Mean', cohort_gtv_mean, train_gtv_mean, test_gtv_mean, ''],['Stdev', cohort_gtv_stdev, train_gtv_stdev, test_gtv_stdev, '']]

charac_table = pd.DataFrame(table, columns=['Characteristic', 'Entire Cohort', 'Training-Validation Cohort', 'Testing Cohort', 'p-value'])


In [35]:
charac_table

Unnamed: 0,Characteristic,Entire Cohort,Training-Validation Cohort,Testing Cohort,p-value
0,Sex,,,,1.0
1,Male,17,15,2,
2,Female,2,2,0,
3,Age,,,,0.51
4,Median,61,61,68.5,
5,Range,"(39, 80)","(39, 80)","(58, 79)",
6,Stdev,10.74,10.44,14.85,
7,Primary GTV (cm3),,,,1.0
8,Mean,4.67,4.68,4.61,
9,Stdev,5.42,5.62,4.81,


In [36]:
charac_table.to_csv('/Users/marktran21/Documents/MD Anderson/Data/characteristics_table.csv')