In [85]:
import os
import re

import openslide
import tensorflow as tf
import tensorflow.python.platform
from tensorflow.python.platform import gfile
import numpy as np
import pandas as pd
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.svm import SVC, LinearSVC
import matplotlib.pyplot as plt
%matplotlib inline
import cPickle as pickle 

from PIL import Image


from boto.s3.connection import S3Connection
from boto.s3.key import Key

# pull in local code
import sys
sys.path.insert(0, '/Users/zgreyn/Documents/galvanize/projects/vegf/src')
import slideutils
import create_tiles


In [9]:
# create a pandas data frame where each row is a tile
df = pd.DataFrame()
df['key'] = np.nan
df['slide'] = np.nan
df['width'] = 64
df['height'] = 64
df['x'] = np.nan
df['y'] = np.nan

In [41]:
def populate_dataframe(slide, slide_name, df, tile_size=(64,64), level=0):
    '''
    This function breaks an openslide slide into tiles, discards empty tiles (just blank slide area),
    and records the location of each tile.  It only exists because I didn't originally save the locations...
    '''

    dimensions = slide.dimensions # (width, height)
    
    i = 0
    for x in xrange(0, dimensions[0], tile_size[0]):
        for y in xrange(0, dimensions[1], tile_size[1]):
            
            # create the tile (read_region takes location (upper left pixel), level, and size)
            tile = slide.read_region((x,y), level, tile_size)

            # don't look at blank tiles
            if not create_tiles.is_empty(tile):
                key = 'tiles/slide_' + slide_name + '_' + '{:06d}'.format(i) + '.tiff'
                record = pd.DataFrame([[key, slide_name, tile_size[0], tile_size[1], x, y]], columns=['key','slide','width','height','x','y'])
                df = df.append(record)
                
            # update the tile num
            i+=1

            if i%1000==0:
                print i

    return df

In [16]:
# get all the slides from the bucket
slide_names = []
for key in bucket.list():
    if key.name.endswith('.svs'):
        slide_names.append(key.name)

temp_slide_file = '../data/temp.svs'

In [44]:
# open each slide
for s3_key in slide_names:

    # save the file to a temporary local file
    k = bucket.get_key(s3_key)
    k.get_contents_to_filename(temp_slide_file)

    # open it with openslide
    slide = openslide.OpenSlide(temp_slide_file)

    # break the slide into tiles and save the metadata in the dataframe
    df = populate_dataframe(slide, s3_key[:-4], df)

1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
13000
14000
15000
16000
17000
18000
19000
20000
21000
22000
23000
24000
25000
26000
27000
28000
29000
30000
31000
32000
33000
34000
35000
36000
37000
38000
39000
40000
41000
42000
43000
44000
45000
46000
47000
48000
49000
50000
51000
52000
53000
54000
55000
56000
57000
58000
59000
60000
61000
62000
63000
64000
65000
66000
67000
68000
69000
70000
71000
72000
73000
74000
75000
76000
77000
78000
79000
80000
81000
82000
83000
84000
85000
86000
87000
88000
89000
90000
91000
92000
93000
94000
95000
96000
97000
98000
99000
100000
101000
102000
103000
104000
105000
106000
107000
108000
109000
110000
111000
112000
113000
114000
115000
116000
117000
118000
119000
120000
121000
122000
123000
124000
125000
126000
127000
128000
129000
130000
131000
132000
133000
134000
135000
136000
137000
138000
139000
140000
141000
142000
143000
144000
145000
146000
147000
148000
149000
150000
151000
152000
153000
154000
155000
156000
157000
158000
15

In [45]:
df.head()

Unnamed: 0,key,slide,width,height,x,y
0,something,other,64,64,0.0,0.0
0,tiles/slide_70229_000000.tiff,70229,64,64,0.0,0.0
0,tiles/slide_70229_000001.tiff,70229,64,64,0.0,64.0
0,tiles/slide_70229_000002.tiff,70229,64,64,0.0,128.0
0,tiles/slide_70229_000003.tiff,70229,64,64,0.0,192.0


In [48]:
pickle.dump(df, open( "../data/tiles_df.pkl", "wb" ))

In [62]:
df = df.drop(df.index[[0]])
df = df.reset_index()
df.drop(['index'], axis=1, inplace=True)

In [79]:
df.head()

Unnamed: 0,key,slide,width,height,x,y
0,tiles/slide_70229_000000.tiff,70229,64,64,0.0,0.0
1,tiles/slide_70229_000001.tiff,70229,64,64,0.0,64.0
2,tiles/slide_70229_000002.tiff,70229,64,64,0.0,128.0
3,tiles/slide_70229_000003.tiff,70229,64,64,0.0,192.0
4,tiles/slide_70229_000004.tiff,70229,64,64,0.0,256.0


In [64]:
pickle.dump(df, open( "../data/tiles_df.pkl", "wb" ))

In [66]:
k = Key(bucket)
k.key = 'tiles_df.pkl'
k.set_contents_from_filename('../data/tiles_df.pkl')

129239389

In [70]:
# the data frame has all of the tile metadata in it; verify that we can retrieve a tile easily
import random
key = bucket.get_key(df.iloc[random.randint(0,df.shape[0])]['key'])
key.get_contents_to_filename('../data/temp.tiff')

### Now, we have all the metadata in a dataframe and all of the tiles in S3.  We can go ahead and augment the data frame with the output of the neural net for each tile.

In [71]:
# For each tile, download it, pass it through the neural net (all layers except the last one), and put the features in the data frame.

In [89]:
'''
Inspired by: 
KERNIX blog - Image classification with a pre-trained deep neural network 
http://www.kernix.com/blog/image-classification-with-a-pre-trained-deep-neural-network_p11
'''


def create_graph():
    '''
    In TensorFlow, a graph describes the computations to be done, 
    which are then executed in sessions.  Credit: KERNIX blog (see above)
    '''
    model_dir = '../model/imagenet'
    with gfile.FastGFile(os.path.join(
            model_dir, 'classify_image_graph_def.pb'), 'rb') as f:
        graph_def = tf.GraphDef()
        graph_def.ParseFromString(f.read())
        _ = tf.import_graph_def(graph_def, name='')


def extract_features(df):
    '''
    augment a pandas df containing image file S3 keys (as 'key') with
    a pandas series of features
    '''

    temp_tiff = '../data/temp.tiff'
    temp_jpg = '../data/temp.jpg'
    nb_features = 2048
    features = np.empty((df.shape[0], nb_features))

    create_graph()

    with tf.Session() as sess:

        next_to_last_tensor = sess.graph.get_tensor_by_name('pool_3:0')

        for i,k in enumerate(df['key']):
            if (i % 100 == 0):
                print('Processing %s... (%s of %s)' % (k, i, df.shape[0]))
                    
            # save the file locally
            key = bucket.get_key(k, validate=False)
            key.get_contents_to_filename(temp_tiff)
            
            # convert tiff to jpg
            try:
                im = Image.open(temp_tiff)
                im.thumbnail(im.size)
                im.save(temp_jpg, "JPEG", quality=100)
            except Exception, e:
                print e

            image_data = gfile.FastGFile(temp_jpg, 'rb').read()

            predictions = sess.run(next_to_last_tensor,
                                    {'DecodeJpeg/contents:0': image_data})
            features[i, :] = np.squeeze(predictions)

    return features

In [119]:
list(df.slide.unique())

[u'70229',
 u'70230',
 u'70231',
 u'70232',
 u'70233',
 u'70234',
 u'70235',
 u'70236',
 u'70237']

In [97]:
df[df['slide'] == '70232']

Unnamed: 0,key,slide,width,height,x,y
633045,tiles/slide_70232_000167.tiff,70232,64,64,0.0,10688.0
633046,tiles/slide_70232_000335.tiff,70232,64,64,64.0,10688.0
633047,tiles/slide_70232_000498.tiff,70232,64,64,128.0,10368.0
633048,tiles/slide_70232_000503.tiff,70232,64,64,128.0,10688.0
633049,tiles/slide_70232_000671.tiff,70232,64,64,192.0,10688.0
633050,tiles/slide_70232_000839.tiff,70232,64,64,256.0,10688.0
633051,tiles/slide_70232_001007.tiff,70232,64,64,320.0,10688.0
633052,tiles/slide_70232_001175.tiff,70232,64,64,384.0,10688.0
633053,tiles/slide_70232_001343.tiff,70232,64,64,448.0,10688.0
633054,tiles/slide_70232_001511.tiff,70232,64,64,512.0,10688.0


In [98]:
df_70232 = df[df['slide'] == '70232']
features_70232 = extract_features(df_70232)

Processing tiles/slide_70232_000167.tiff... (0 of 3723)
Processing tiles/slide_70232_006827.tiff... (100 of 3723)
Processing tiles/slide_70232_008663.tiff... (200 of 3723)
Processing tiles/slide_70232_011164.tiff... (300 of 3723)
Processing tiles/slide_70232_011813.tiff... (400 of 3723)
Processing tiles/slide_70232_012171.tiff... (500 of 3723)
Processing tiles/slide_70232_012512.tiff... (600 of 3723)
Processing tiles/slide_70232_012717.tiff... (700 of 3723)
Processing tiles/slide_70232_012998.tiff... (800 of 3723)
Processing tiles/slide_70232_013188.tiff... (900 of 3723)
Processing tiles/slide_70232_013379.tiff... (1000 of 3723)
Processing tiles/slide_70232_013669.tiff... (1100 of 3723)
Processing tiles/slide_70232_013869.tiff... (1200 of 3723)
Processing tiles/slide_70232_014168.tiff... (1300 of 3723)
Processing tiles/slide_70232_014366.tiff... (1400 of 3723)
Processing tiles/slide_70232_014662.tiff... (1500 of 3723)
Processing tiles/slide_70232_014859.tiff... (1600 of 3723)
Processin

In [101]:
features_70232.shape

(3723, 2048)

In [110]:
df_70232 = pd.concat([df_70232, pd.DataFrame(data=features_70232, index=df_70232.index)], axis=1, join_axes=[df_70232.index])

In [114]:
pickle.dump(df_70232, open( "../data/df_70232.pkl", "wb" ))

In [120]:
slides = [
 '70231',
 #'70232',
 '70233',
 '70234',
 '70235',
 '70236',
 '70237']

for s in slides:
    df_s = df[df['slide'] == s]
    features_s = extract_features(df_s)
    df_s = pd.concat([df_s, pd.DataFrame(data=features_s, index=df_s.index)], axis=1, join_axes=[df_s.index])
    f_name = "../data/df_"+s+".pkl"
    pickle.dump(df_s, open(f_name, "wb" ))

Processing tiles/slide_70231_000056.tiff... (0 of 12082)
Processing tiles/slide_70231_004651.tiff... (100 of 12082)
Processing tiles/slide_70231_004838.tiff... (200 of 12082)
Processing tiles/slide_70231_005120.tiff... (300 of 12082)
Processing tiles/slide_70231_005314.tiff... (400 of 12082)
Processing tiles/slide_70231_005586.tiff... (500 of 12082)
Processing tiles/slide_70231_005767.tiff... (600 of 12082)
Processing tiles/slide_70231_005962.tiff... (700 of 12082)
Processing tiles/slide_70231_006211.tiff... (800 of 12082)
Processing tiles/slide_70231_006392.tiff... (900 of 12082)
Processing tiles/slide_70231_006647.tiff... (1000 of 12082)
Processing tiles/slide_70231_006818.tiff... (1100 of 12082)
Processing tiles/slide_70231_006983.tiff... (1200 of 12082)
Processing tiles/slide_70231_007127.tiff... (1300 of 12082)
Processing tiles/slide_70231_007275.tiff... (1400 of 12082)
Processing tiles/slide_70231_007425.tiff... (1500 of 12082)
Processing tiles/slide_70231_007574.tiff... (1600 of