In [38]:
import os
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"   # see issue #152
os.environ["CUDA_VISIBLE_DEVICES"]="1"

In [39]:
from tensorflow.python.client import device_lib
print (device_lib.list_local_devices())

[name: "/device:CPU:0"
device_type: "CPU"
memory_limit: 268435456
locality {
}
incarnation: 5872283738812741676
xla_global_id: -1
, name: "/device:GPU:0"
device_type: "GPU"
memory_limit: 23551737856
locality {
  bus_id: 1
  links {
  }
}
incarnation: 7843232950300974549
physical_device_desc: "device: 0, name: NVIDIA GeForce RTX 3090, pci bus id: 0000:41:00.0, compute capability: 8.6"
xla_global_id: 416903419
]


2023-10-31 21:36:07.921620: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1616] Created device /device:GPU:0 with 22460 MB memory:  -> device: 0, name: NVIDIA GeForce RTX 3090, pci bus id: 0000:41:00.0, compute capability: 8.6


In [40]:
import time
import os
#import necessary packages
#import tensorflow_hub as hub
import tensorflow as tf
import pickle
from keras import backend as K
import numpy as np
from sklearn_extra.cluster import KMedoids
from tensorflow import keras
from tensorflow.keras.layers import Concatenate, Dense, Input, LSTM, Embedding, Dropout, Activation, GRU, Flatten
from datetime import datetime
from scipy.spatial import distance_matrix
import sys

In [41]:
import keras.backend as K
import operator


In [59]:

from tensorflow.keras import layers, Model, regularizers

def make_variables(tf_name, k1, k2, initializer):
    """Build a trainable float32 ``tf.Variable`` of shape ``[k1, k2]``.

    Args:
        tf_name: name given to the created variable.
        k1: size of the first dimension.
        k2: size of the second dimension.
        initializer: callable invoked as ``initializer(shape=..., dtype=...)``
            that produces the initial value.

    Returns:
        The newly created trainable variable.
    """
    initial_value = initializer(shape=[k1, k2], dtype=tf.float32)
    return tf.Variable(initial_value, trainable=True, name=tf_name)

#prototype layer
class prototypeLayer(keras.layers.Layer):
    """Layer holding trainable prototype vectors and measuring distances to them.

    The prototypes are stored as one ``[k_protos, vect_size]`` variable that is
    initialised from ``k_cents``.
    """

    def __init__(self, k_protos, vect_size, k_cents):
        """
        Args:
            k_protos: number of prototype vectors.
            vect_size: dimensionality of each prototype.
            k_cents: initial prototype values, fed to ``tf.constant_initializer``.
        """
        super().__init__(name='proto_layer')
        self.n_protos = k_protos
        self.vect_size = vect_size
        self.prototypes = make_variables(
            "prototypes", k_protos, vect_size,
            initializer=tf.constant_initializer(k_cents))

    @tf.function
    def call(self, inputs):
        """Return squared Euclidean distances from every input vector to every prototype.

        Args:
            inputs: tensor that is expanded at axis 2 and broadcast against the
                prototypes; PrototypeCNN passes ``[1, batch, vect_size]``.

        Returns:
            ``(distances, prototypes)`` where ``distances`` is the sum of squared
            differences over axis 3, and ``prototypes`` is the layer's variable.
        """
        expanded = tf.expand_dims(inputs, 2)
        # Both operands are materialised at [dim0, dim1, n_protos, vect_size].
        target_shape = [tf.shape(expanded)[0], tf.shape(expanded)[1],
                        self.n_protos, self.vect_size]
        tiled_inputs = tf.broadcast_to(expanded, target_shape)
        tiled_protos = tf.broadcast_to(self.prototypes, target_shape)
        difference = tiled_inputs - tiled_protos
        distances = tf.reduce_sum(difference * difference, axis=3)

        return distances, self.prototypes

#distance layer: to convert the full distance matrix to sparse similarity matrix
class distanceLayer(keras.layers.Layer):
    """Turns a full distance matrix into a sparse similarity matrix.

    A very sharp softmax over the negated distances (scaled by ``beta``) acts as
    an indicator of the closest prototype; it is multiplied by an exponentially
    decaying similarity ``e ** (-a * distance)``.
    """

    def __init__(self):
        super().__init__(name='distance_layer')
        self.a = 0.1       # decay rate of the similarity kernel
        self.beta = 1e6    # softmax sharpness

    def e_func(self, x, e=2.7182818284590452353602874713527):
        """Return ``e ** (-(a * x))`` element-wise."""
        exponent = -(self.a * x)
        return tf.math.pow(e, exponent)

    @tf.function
    def call(self, full_distances):
        """Map distances to similarities masked by the sharp softmax over the last axis."""
        nearest_mask = tf.nn.softmax(-full_distances * self.beta)
        # The small constant is added to the similarity before masking.
        similarity = self.e_func(full_distances) + 1e-8
        return nearest_mask * similarity
    
    
class PrototypeCNN(Model):
    """
    A prototype-based CNN for text classification.

    Token ids pass through an embedding layer and one Conv2D + max-pooling block
    per filter size. The pooled features are concatenated and flattened into a
    sentence vector of size ``num_filters * len(filter_sizes)``. That vector is
    compared with the prototypes held by the prototype layer, the distances are
    converted to a sparse similarity vector, and a softmax dense layer produces
    the class probabilities.

    The prototype layer is not created by ``__init__``; call
    ``init_prototypelayer(k_cents)`` before using ``call``, ``full_distance`` or
    ``one_hot_distance``. ``embed`` works without it.
    """
    def __init__(self, sequence_length, num_classes, vocab_size, pretrained_embeddings, word_idx_map,\
                 embedding_size, filter_sizes, num_filters, l2_reg_lambda, dropout_keep_prob, k_protos, vect_size):
        """
        Args:
            sequence_length: number of tokens per input row; sizes the pooling window.
            num_classes: number of output classes.
            vocab_size: number of rows in the embedding matrix.
            pretrained_embeddings: initial embedding weights.
            word_idx_map: accepted for interface compatibility; not used here.
            embedding_size: embedding dimensionality.
            filter_sizes: iterable of convolution heights (in tokens).
            num_filters: number of filters per filter size.
            l2_reg_lambda: L2 regularisation factor for the output layer kernel.
            dropout_keep_prob: probability of KEEPING a unit (not the drop rate).
            k_protos: number of prototypes.
            vect_size: prototype dimensionality; must match the sentence vector
                size, i.e. ``num_filters * len(filter_sizes)``.
        """
        super(PrototypeCNN, self).__init__()
        self.k_protos = k_protos
        self.vect_size = vect_size
        # Unused placeholders kept (including the original spelling) for compatibility.
        self.full_distences = None
        self.full_onehot_distances = None
        self.embedding = tf.keras.layers.Embedding(input_dim=vocab_size,
                                                   output_dim=embedding_size,
                                                   weights=[pretrained_embeddings],
                                                   trainable=True)  # fine-tune the embeddings during training

        self.convs = []
        for filter_size in filter_sizes:
            conv_block = tf.keras.Sequential([
                layers.Conv2D(num_filters, (filter_size, embedding_size),
                              padding='valid', activation='relu'),
                # Pool over every valid convolution position.
                layers.MaxPooling2D(pool_size=(sequence_length - filter_size + 1, 1),
                                    strides=(1, 1), padding='valid')])
            self.convs.append(conv_block)

        self.flatten = layers.Flatten()
        self.distance_layer = distanceLayer()
        # Keras' Dropout takes the fraction to DROP, so convert from keep probability.
        self.dropout = layers.Dropout(1.0 - dropout_keep_prob)
        self.fc = layers.Dense(num_classes,
                               kernel_regularizer=regularizers.l2(l2_reg_lambda),
                               activation='softmax')

    def init_prototypelayer(self, k_cents):
        """Create the prototype layer, initialising the prototypes from ``k_cents``."""
        self.proto_layer = prototypeLayer(self.k_protos, self.vect_size, k_cents)

    def _require_proto_layer(self):
        """Return the prototype layer, or raise a descriptive error if it is missing."""
        proto_layer = getattr(self, 'proto_layer', None)
        if proto_layer is None:
            raise AttributeError(
                "proto_layer is not initialised; call init_prototypelayer(k_cents) first")
        return proto_layer

    def call(self, x):
        """Return class probabilities (softmax output) for a batch of token-id rows."""
        dist_hot_vect = self.one_hot_distance(x)
        x = self.dropout(dist_hot_vect)
        x = self.fc(x)
        # Remove the leading axis added before the prototype layer.
        return tf.squeeze(x, axis=0)

    def embed(self, x):
        """Return the flattened sentence vectors for a batch of token-id rows."""
        x = self.embedding(x)
        # Add a channel axis so the 2-D convolutions can be applied.
        x = tf.expand_dims(x, -1)

        pooled_outputs = [conv(x) for conv in self.convs]

        # Combine all the pooled features
        x = tf.concat(pooled_outputs, axis=-1)
        return self.flatten(x)

    def full_distance(self, x):
        """Return the distances from each sentence vector to every prototype."""
        proto_layer = self._require_proto_layer()
        # The prototype layer is fed the batch behind a leading axis of size 1.
        features = tf.expand_dims(self.embed(x), axis=0)
        full_distances, _ = proto_layer(features)
        return full_distances

    def one_hot_distance(self, x):
        """Return the sparse similarity vectors produced by the distance layer."""
        return self.distance_layer(self.full_distance(x))
    

In [60]:
#this method simply projects each prototype onto the closest sentence vector in
#sample_sent_vects
def projection(sample_sentences,sample_sent_vects,data_size=10000):
    """Project every prototype onto its nearest sample sentence vector.

    Only the first ``data_size`` vectors are considered, and only those whose
    sentence has between 5 and 100 characters (inclusive).

    Args:
        sample_sentences: sentences, index-aligned with ``sample_sent_vects``.
        sample_sent_vects: sentence vectors (one row per sentence).
        data_size: number of leading vectors to search.

    Returns:
        A list with one entry per prototype: the row of ``sample_sent_vects``
        closest (Euclidean distance) to that prototype.

    Raises:
        IndexError: if no sentence passes the length filter.
    """
    prototypes = ProtoCNN.proto_layer.prototypes
    window = sample_sent_vects[:data_size]

    # The length filter does not depend on the prototype, so apply it once.
    candidates = [i for i in range(len(window))
                  if 5 <= len(sample_sentences[i]) <= 100]
    if not candidates:
        raise IndexError("no sample sentence has between 5 and 100 characters")
    candidate_vects = np.asarray(window)[candidates]

    new_protos = []
    for p_count, p in enumerate(prototypes):
        print('[db] p_count = ', p_count)
        dists = np.linalg.norm(candidate_vects - np.asarray(p), axis=1)
        # argmin returns the first minimum, matching a stable sort's first element.
        closest = candidates[int(np.argmin(dists))]
        new_protos.append(sample_sent_vects[closest])

    return new_protos

In [61]:
#show the list of prototypes
def showPrototypes(sample_sentences,sample_sent_vects, sample_y, k_protos=10,printOutput=False, k_closest_sents = 20, data_size=150000):
    """List, for each prototype, the closest sample sentences.

    Only the first ``data_size`` vectors are searched, and only sentences with
    at least 30 characters whose label ``sample_y[i][1]`` is non-zero.

    Args:
        sample_sentences: sentences, index-aligned with ``sample_sent_vects``.
        sample_sent_vects: sentence vectors (one row per sentence).
        sample_y: labels; ``sample_y[i][1]`` is read for filtering and reporting.
        k_protos: number of prototypes to report (capped at the number available).
        printOutput: accepted for backward compatibility; the ten closest
            sentences per prototype are always printed.
        k_closest_sents: number of closest sentences to keep per prototype
            (capped at the number of candidates).
        data_size: number of leading vectors to search.

    Returns:
        dict mapping prototype index to a list of
        ``(sentence.strip(), distance, sample_y[i][1])`` tuples, closest first.
    """
    prototypes = ProtoCNN.proto_layer.prototypes.numpy()
    window = sample_sent_vects[:data_size]

    # The filter is prototype-independent, so it is evaluated once.
    candidates = [i for i in range(len(window))
                  if len(sample_sentences[i]) >= 30 and sample_y[i][1] != 0]
    candidate_vects = np.asarray(window)[candidates]

    mappedPrototypes = {}
    print("Prototypes: ")
    for l in range(min(k_protos, len(prototypes))):
        dists = np.linalg.norm(candidate_vects - prototypes[l], axis=1)
        # Stable sort keeps the lower sentence index first on ties.
        order = np.argsort(dists, kind="stable")[:k_closest_sents]
        print(l)
        mappedPrototypes[l] = []
        for k, pos in enumerate(order):
            i = candidates[pos]
            score = dists[pos]
            mappedPrototypes[l].append((sample_sentences[i].strip(), score, sample_y[i][1]))
            if k < 10:
                print((i, score), sample_sentences[i], sample_y[i][1])

    return mappedPrototypes

In [62]:
#method to generate the number of closest sentences to each prototype
def protoFreq(self,sample_sent_vect):
    """Print how many sentence vectors fall closest to each prototype.

    Args:
        self: object exposing an iterable ``prototypes`` attribute.
        sample_sent_vect: iterable of sentence vectors.

    Prints the raw ``{prototype_index: count}`` mapping and the same pairs
    sorted by descending count. Returns nothing.
    """
    counts = {}
    for vec in sample_sent_vect:
        gaps = [np.linalg.norm(vec - proto) for proto in self.prototypes]
        # Make sure every prototype has an entry, even with zero hits.
        for idx in range(len(gaps)):
            counts.setdefault(idx, 0)
        nearest = sorted(range(len(gaps)), key=gaps.__getitem__)[0]
        counts[nearest] += 1
    print("Prototype freq = ", counts)
    by_count = sorted(counts.items(), key=operator.itemgetter(1), reverse=True)
    print("sorted :", by_count)

#re-train the model with new pruned prototype



In [63]:
def pruningTrain(self,new_k_protos,x_train,y_train,x_test,y_test):
    """Rebuild the model with the first ``new_k_protos`` prototypes and retrain it.

    Args:
        self: object exposing ``prototypes``, ``createModel`` and ``train``.
        new_k_protos: number of leading prototypes to keep.
        x_train, y_train, x_test, y_test: forwarded unchanged to ``self.train``.
    """
    kept = self.prototypes[:new_k_protos]
    k_cents = []
    for proto in kept:
        k_cents.append(proto.numpy())
    self.createModel(k_cents=k_cents, k_protos=new_k_protos)
    self.train(x_train, y_train, x_test, y_test)

# generate the sentence value for each prototype
# and 10 closest sentences to it


In [64]:
def showTrajectory(self,input,sample_sentences,sample_vect):
    """Score each input by the sentiment of its nearest prototype sentence.

    Args:
        self: object exposing ``mappedPrototypes``, ``showPrototypes`` and
            ``embed``. NOTE(review): no such class is visible in this notebook;
            ``mappedPrototypes`` values are expected to support ``.strip()``.
        input: inputs handed to ``self.embed``.
        sample_sentences, sample_vect: forwarded to ``self.showPrototypes`` when
            ``mappedPrototypes`` is empty.

    Returns:
        One score per input, ``0.5 + compound / 2`` where ``compound`` is the
        VADER compound polarity of the closest prototype sentence.
    """
    if not len(self.mappedPrototypes):
        self.showPrototypes(sample_sentences,sample_vect,printOutput=False)

    prototypes = []
    for key in self.mappedPrototypes:
        prototypes.append(self.mappedPrototypes[key].strip())

    vP = self.embed(prototypes)
    vS = self.embed(input)

    # Distance from every input vector to every prototype vector.
    gaps = {}
    for row, _ in enumerate(vS):
        gaps[row] = {col: np.linalg.norm(vS[row] - proto) for col, proto in enumerate(vP)}

    mappedProtos = []
    for row, _ in enumerate(vS):
        ranked = sorted(gaps[row].items(), key=operator.itemgetter(1))
        mappedProtos.append(prototypes[ranked[0][0]])

    #for small dataset, we use a pretrained sentiment model. We can use any
    #model for sentiment scores
    from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
    analyzer = SentimentIntensityAnalyzer()
    print("[db] mappedProtos ", mappedProtos)
    return [0.5 + analyzer.polarity_scores(text)['compound'] / 2 for text in mappedProtos]

In [65]:
# Fraction of the training split held out as the dev set.
dev_sample_percentage = .1


# Model Hyperparameters
embedding_dim = 300          # passed to the model as embedding_size
filter_sizes ="3,4,5"        # comma-separated; split into ints when the model is built
num_filters = 128            # filters per filter size
dropout_keep_prob = 0.5
l2_reg_lambda = 0.5
max_l =100                   # sequence length; reassigned by later cells
# Training parameters
batch_size = 4096
num_epochs = 100             # NOTE(review): the visible fit() call uses a literal epochs value instead
evaluate_every = 100         # not referenced in the visible cells
checkpoint_everyt = 100      # not referenced in the visible cells
num_checkpoints = 5          # not referenced in the visible cells

# Misc Parameters
allow_soft_placement = True  # not referenced in the visible cells
log_device_placement = False # not referenced in the visible cells

# Data preprocessing

In [66]:
# Each run writes into its own timestamped directory under ./runs.
timestamp = str(int(time.time()))

out_dir = os.path.abspath(os.path.join(os.path.curdir, "runs", timestamp))
print("output directory: ", out_dir)
os.makedirs(out_dir, exist_ok=True)
# Data Preparation
# ==================================================

# Load data

print("loading data...")
# NOTE: unpickling executes arbitrary code; only load this file from a trusted source.
with open("./mainbalancedpickle.p", "rb") as pickle_file:
    x = pickle.load(pickle_file)
# The pickle holds, in order: reviews, two embedding matrices, the word->index
# map, the vocabulary and the maximum sentence length.
revs, W, W2, word_idx_map, vocab, max_l = x[0], x[1], x[2], x[3], x[4], x[5]
print("data loaded!")


output directory:  /big/xw384/schoolwork/NLP+DEEP LEARNING/Project/CASCADE/src/runs/1698788246
loading data...
data loaded!


In [67]:
def encode(text, max_l= 100, vocab=None):
    """Convert sentences into a fixed-width matrix of integer token ids.

    Each sentence is split on whitespace, every word is looked up in the
    vocabulary, and the resulting id sequence is truncated to ``max_l`` tokens
    or right-padded with 0.

    Args:
        text: sequence of sentences (strings).
        max_l: number of columns in the output.
        vocab: word -> index mapping; defaults to the module-level
            ``word_idx_map`` when omitted.

    Returns:
        An integer array of shape ``(len(text), max_l)``.

    Raises:
        KeyError: if a word is missing from the vocabulary.
    """
    if vocab is None:
        vocab = word_idx_map

    # Pre-filled with the padding id, so only real tokens need to be written.
    encoded_X = np.zeros((len(text), max_l), dtype=np.int64)
    for row, sentence in enumerate(text):
        ids = [vocab[word] for word in sentence.split()][:max_l]
        encoded_X[row, :len(ids)] = ids

    return encoded_X

In [68]:

# Fixed sequence length used for encoding.
max_l = 100

# Training texts/labels and test texts/labels, filled from `revs` below.
x_text, y = [], []
test_x, test_y = [], []

# Reviews flagged with split == 1 form the training set; the rest are test data.
for i in range(len(revs)):
    rev = revs[i]
    if rev['split'] == 1:
        texts, labels = x_text, y
    else:
        texts, labels = test_x, test_y
    texts.append(rev['text'])
    labels.append(rev['label'])

y = np.asarray(y)
y_test = np.asarray(test_y)

# Encode both splits into padded token-id matrices.
x = encode(x_text)
x_test = encode(test_x)

In [69]:
train_data = list(zip(x_text,y))

In [70]:
# Shuffle the training rows with one shared permutation.
shuffle_indices = np.random.permutation(np.arange(len(y)))
x_shuffled, y_shuffled = x[shuffle_indices], y[shuffle_indices]

# Split train/dev set: the last `dev_sample_percentage` of the shuffled rows
# becomes the dev set.
# TODO: This is very crude, should use cross-validation
dev_sample_index = -int(dev_sample_percentage * float(len(y)))

x_train = np.asarray(x_shuffled[:dev_sample_index])
x_dev = np.asarray(x_shuffled[dev_sample_index:])
y_train = np.asarray(y_shuffled[:dev_sample_index])
y_dev = np.asarray(y_shuffled[dev_sample_index:])
print("Train/Dev split: {:d}/{:d}".format(len(y_train), len(y_dev)))

# Register "@" under index 0 and build the reverse (index -> word) lookup.
word_idx_map["@"] = 0
rev_dict = dict((idx, word) for word, idx in word_idx_map.items())

Train/Dev split: 139232/15470


In [71]:
k_protos, vect_size = 10, 384

In [72]:
ProtoCNN = PrototypeCNN(sequence_length=max_l,
    num_classes=len(y_train[0]),
    vocab_size=len(W),
    pretrained_embeddings = W,
    word_idx_map = word_idx_map,
    embedding_size=embedding_dim,
    filter_sizes=list(map(int, filter_sizes.split(","))),
    num_filters=num_filters,
    l2_reg_lambda=l2_reg_lambda,
    dropout_keep_prob = dropout_keep_prob,
    k_protos = k_protos,
    vect_size = vect_size)

In [73]:
id_word_map = {}

In [74]:
for word, idx in word_idx_map.items():
    id_word_map[idx] = word

In [76]:
y = ProtoCNN.embed(encode(x_text[:2]))

In [77]:
import random

In [79]:
# Draw a random subset for clustering WITHOUT shuffling x_text in place:
# x_text is index-aligned with the encoded matrix `x` and the labels built
# from it earlier, and an in-place shuffle would silently break that alignment.
sample_sentences = random.sample(x_text, min(15000, len(x_text)))
encoded_sample_sentences = encode(sample_sentences)
sample_sent_vects =[]

# Sentence vectors of the sampled sentences, used to initialise the prototypes.
sample_sent_vect = ProtoCNN.embed(encoded_sample_sentences)


In [80]:
sample_sent_vect.shape

TensorShape([15000, 384])

In [81]:
k_protos = 10
kmedoids = KMedoids(n_clusters=k_protos, random_state=0).fit(sample_sent_vect)
k_cents = kmedoids.cluster_centers_
print(k_cents.shape)

(10, 384)


In [82]:
ProtoCNN.init_prototypelayer(k_cents)

In [83]:
y= ProtoCNN(x_train[:2])

2023-10-31 21:38:09.691218: I tensorflow/stream_executor/cuda/cuda_blas.cc:1614] TensorFloat-32 will be used for the matrix multiplication. This will only be logged once.


In [84]:
y

<tf.Tensor: shape=(2, 2), dtype=float32, numpy=
array([[0.54959184, 0.45040822],
       [0.64327854, 0.35672143]], dtype=float32)>

# Model training and testing

In [85]:

timestamp = str(int(time.time()))
# Output directory for models and summaries
out_dir = os.path.abspath(os.path.join(os.path.curdir, "runs", timestamp))
print("Writing to {}\n".format(out_dir))

# Checkpoint directory. Tensorflow assumes this directory already exists so we need to create it
checkpoint_dir = os.path.abspath(os.path.join(out_dir, "checkpoints"))
if not os.path.exists(checkpoint_dir):
    os.makedirs(checkpoint_dir)
    
checkpoint_prefix = os.path.join(checkpoint_dir, "model")

Writing to /big/xw384/schoolwork/NLP+DEEP LEARNING/Project/CASCADE/src/runs/1698788683



In [86]:
x_train.shape

(139232, 100)

In [87]:
y_train.shape

(139232, 2)

In [88]:
#ProtoCNN = tf.keras.models.load_model(os.path.join(out_dir,"my_weights-finetune.pt"))

In [89]:
#We use the Adam optimizer with a learning rate of 0.0001 (lower than the Keras default of 0.001).
#Change this value based on your preference
#out_dir = "/big/xw384/schoolwork/NLP+DEEP LEARNING/Project/CASCADE/src/runs/1686708033"
opt = tf.keras.optimizers.Adam(learning_rate=.0001)
ProtoCNN.compile(optimizer=opt, loss='categorical_crossentropy',metrics=['accuracy'])

In [90]:
#loaded_object = pickle.load(open(os.path.join(out_dir,"optimizer.pt"), 'rb'))
#ProtoCNN.optimizer.set_weights(loaded_object)

In [91]:

i = 0

maxEvalRes = 0

# Keep only the weights of the epoch with the lowest validation loss.
checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath = checkpoint_prefix,  # File prefix inside the checkpoint directory
    save_weights_only=True,  # Save only the model weights
    monitor='val_loss',  # Monitor the validation loss for saving the best weights
    save_best_only=True,  # Save only the best weights based on the monitored metric
    verbose=1  # Print a message when a checkpoint is saved
)
# The callback must be passed to fit(); otherwise no checkpoint is ever written.
ProtoCNN.fit(x_train,y_train, batch_size = batch_size, epochs=2000, verbose=1,
             validation_data= (x_dev, y_dev), callbacks=[checkpoint_callback])

       

Epoch 1/2000
Epoch 2/2000
Epoch 3/2000
Epoch 4/2000
Epoch 5/2000
Epoch 6/2000
Epoch 7/2000
Epoch 8/2000
Epoch 9/2000
Epoch 10/2000
Epoch 11/2000
Epoch 12/2000
Epoch 13/2000
Epoch 14/2000
Epoch 15/2000
Epoch 16/2000
Epoch 17/2000
Epoch 18/2000
Epoch 19/2000
Epoch 20/2000
Epoch 21/2000
Epoch 22/2000
Epoch 23/2000
Epoch 24/2000
Epoch 25/2000
Epoch 26/2000
Epoch 27/2000
Epoch 28/2000
Epoch 29/2000
Epoch 30/2000
Epoch 31/2000
Epoch 32/2000
Epoch 33/2000
Epoch 34/2000
Epoch 35/2000
Epoch 36/2000
Epoch 37/2000
Epoch 38/2000
Epoch 39/2000
Epoch 40/2000
Epoch 41/2000
Epoch 42/2000
Epoch 43/2000
Epoch 44/2000
Epoch 45/2000
Epoch 46/2000
Epoch 47/2000
Epoch 48/2000
Epoch 49/2000
Epoch 50/2000
Epoch 51/2000
Epoch 52/2000
Epoch 53/2000
Epoch 54/2000
Epoch 55/2000
Epoch 56/2000

KeyboardInterrupt: 

In [None]:
#pickle.dump(opt.get_weights(), open(os.path.join(out_dir, 'optimizer.pt'), 'wb+'))

In [202]:
ProtoCNN.save_weights(os.path.join(out_dir,"my_weights-finetune.model"))

In [201]:
ProtoCNN.save(os.path.join(out_dir,"my_weights-finetune.pt"))

INFO:tensorflow:Assets written to: /big/xw384/schoolwork/NLP+DEEP LEARNING/Project/CASCADE/src/runs/1686708033/my_weights-finetune.pt/assets


In [61]:
ProtoCNN.load_weights(os.path.join(out_dir,"my_weights-finetune.model"))

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x7fd1655769d0>

In [1]:
ProtoCNN.load_weights(os.path.join("src/runs/10_31/best_classifier.ckpt"))

NameError: name 'ProtoCNN' is not defined

In [62]:
def dev_step(x_batch, y_batch):
    """
    Evaluates model on a dev set

    Args:
        x_batch: batch of encoded sentences.
        y_batch: matching one-hot labels.

    Returns:
        ``(loss, correct_predictions)``: the per-example categorical
        cross-entropy and a boolean tensor marking the correctly classified
        examples.
    """
    # The model's last layer already applies a softmax, so predict() returns
    # probabilities; applying softmax again would distort the loss.
    probabilities = ProtoCNN.predict(x_batch)

    loss = tf.keras.losses.categorical_crossentropy(y_batch, probabilities)

    predictions = tf.argmax(probabilities, 1)
    correct_predictions = tf.equal(predictions, tf.argmax(y_batch, 1))

    return loss, correct_predictions
    

    

In [63]:
correct_predictions_test = None

In [64]:
from tqdm import tqdm

In [65]:

# Create testing dataset
test_loader = tf.data.Dataset.from_tensor_slices((x_test, y_test))


In [66]:
# Collect the per-batch results locally so that re-running this cell starts
# from scratch instead of appending to results left over from a previous run.
batch_results = []
for x_batch, y_batch in tqdm(test_loader.batch(4096)):
    test_loss, correct_predictions = dev_step(x_batch, y_batch)
    batch_results.append(correct_predictions)

correct_predictions_test = tf.concat(batch_results, axis=0)

test_accuracy = tf.reduce_mean(tf.cast(correct_predictions_test, tf.float32))
print("test accuracy {}".format(test_accuracy))

100%|██████████| 16/16 [00:04<00:00,  3.20it/s]

test accuracy 0.6460427641868591





In [87]:
sarc_comments = [ rev['text'] for rev in revs if rev['label'][1]==1 ]

In [151]:
sarc_comments[500:600]

['looks like its time to convince the chineese that siberian tiger bones improve boners',
 'shutting down the government and the military at the same time !',
 "yay , now i do n't have to pay taxes !",
 'a modern day shakespeare',
 'god im done with this sub fuckin circle jerk shit',
 'looks like a fun format',
 'this film is a serious game changer !',
 "well , if you do n't have anything to hide then there is nothing to fear",
 'admissions are great , i get to skip the whole investigation and just ban you !',
 'as we all know , apple are the only ones who can innovate',
 'hahahaha handicapped people omg lol',
 "it 's worth every penny to bring jeeezus back",
 'i hear mien kamph is a very popular book',
 "well i'm glad to know that the government still considers shutting down voluntary marketplaces an essential service during the shutdown",
 'trade geno and sid for miller sounds like a great trade',
 'wow , never saw that coming',
 "if english was good enough for jesus , then it 's goo

In [166]:
#test giving a prediction value to an input
testS = ["i guess no one at google 's ever been on a plane and wanted to listen to their music library",
         "it 's like windows phone 7 and that worked great",
        'religion must have the answer',
        'until a republican does it'
        ]


In [167]:
x= encode(testS )

In [168]:
ProtoCNN.predict(x)

array([[0.5124252 , 0.48757482],
       [0.49226758, 0.50773245],
       [0.4995483 , 0.5004517 ],
       [0.51824856, 0.48175144]], dtype=float32)

In [169]:
ProtoCNN.full_distance(x)

<tf.Tensor: shape=(1, 4, 10), dtype=float32, numpy=
array([[[ 2.7111988 ,  2.8435276 ,  2.8580358 ,  2.4625027 ,
          2.8837852 ,  3.2298265 ,  2.8851266 ,  2.779249  ,
          2.7834678 ,  2.695945  ],
        [13.555579  , 13.56708   , 13.608713  , 13.089714  ,
         13.671051  , 14.761423  , 14.198931  , 13.693032  ,
         13.776615  , 13.339384  ],
        [ 8.140151  ,  8.293659  ,  8.2329035 ,  7.875981  ,
          8.249426  ,  8.662914  ,  8.463434  ,  8.189972  ,
          8.333049  ,  7.9773016 ],
        [ 0.99792427,  1.0499238 ,  1.040509  ,  0.7297432 ,
          1.0358859 ,  1.2475713 ,  1.0940286 ,  1.0371547 ,
          1.0157193 ,  0.9604418 ]]], dtype=float32)>

In [170]:
ProtoCNN.one_hot_distance(x)

<tf.Tensor: shape=(1, 4, 10), dtype=float32, numpy=
array([[[0.        , 0.        , 0.        , 0.78172654, 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.        ],
        [0.        , 0.        , 0.        , 0.27009773, 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.        ],
        [0.        , 0.        , 0.        , 0.4549362 , 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.        ],
        [0.        , 0.        , 0.        , 0.9296246 , 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.        ]]],
      dtype=float32)>

# Show prototypes

In [107]:
#choose which data to sample
#compute vector values of sentences
import random

random.shuffle(train_data)
sample_pairs = train_data[:150000]

# Unzip the SAMPLED pairs, so sentences, labels and vectors all share one index space.
sample_sentences, sample_y = zip(*sample_pairs)
encoded_sample_sentences = encode(sample_sentences)

# Embed in chunks to bound memory use; the loop adapts to the sample size.
embed_chunk_size = 50000
sample_sent_vects =[]
for chunk_start in range(0, len(encoded_sample_sentences), embed_chunk_size):
    chunk = encoded_sample_sentences[chunk_start:chunk_start + embed_chunk_size]
    sample_sent_vects.append(ProtoCNN.embed(chunk))

sample_sent_vect = tf.concat(sample_sent_vects, axis=0)

In [108]:
mapped_prototype = showPrototypes(sample_sentences,sample_sent_vect,sample_y, k_protos=10,printOutput=False, k_closest_sents = 100)

Prototypes: 
0
(44384, 0.59639615) i hear there is even an uk one 1
(119485, 0.5967583) these kids are going places in life 1
(9329, 0.5996673) then you find out it was a land line 1
(69175, 0.60089) that seems like someone i wanna hang out with 1
(122428, 0.6031556) robin has been in every fe game since the first one 1
(76009, 0.60461885) obby nothing gets me hot like o o 1
(122435, 0.604776) she really seems to be enjoying it 1
(38294, 0.60514015) both of them combined or each of them \? 1
(55014, 0.6053043) whose the girl on the bottom left \? 1
(117943, 0.60653824) who let her leave the house line that 1
1
(119485, 0.6018789) these kids are going places in life 1
(44384, 0.608037) i hear there is even an uk one 1
(69175, 0.61627233) that seems like someone i wanna hang out with 1
(122428, 0.617497) robin has been in every fe game since the first one 1
(76009, 0.62080157) obby nothing gets me hot like o o 1
(122435, 0.621308) she really seems to be enjoying it 1
(9329, 0.621637) the

In [109]:
mapped_prototype[3]

[('these kids are going places in life', 0.3135772, 1),
 ('i hear there is even an uk one', 0.3172326, 1),
 ('who let her leave the house line that', 0.32124335, 1),
 ('that seems like someone i wanna hang out with', 0.32230243, 1),
 ("you could n't make up a title like that", 0.3228807, 1),
 ("someone 's moving house and home", 0.32551184, 1),
 ("he looks exactly like what i 'd imagine someone like this to look like",
  0.32663426,
  1),
 ('robin has been in every fe game since the first one', 0.32718822, 1),
 ('obby nothing gets me hot like o o', 0.32777935, 1),
 ('something for meth or something like that', 0.32828987, 1),
 ('i just love that it takes the video that long to get to the point',
  0.3287512,
  1),
 ("that looks more like it 's next to point", 0.32890695, 1),
 ('she really seems to be enjoying it', 0.33070856, 1),
 ('the one with the bear , i think', 0.33257347, 1),
 ('i think i had a seizure while reading this', 0.33272243, 1),
 ('funniest thing ive ever seen in my ent

In [110]:
mapped_prototype_150000 = mapped_prototype 

In [125]:
def find_similar_sentences(sent, mapped_prototype, top_k=10):
    """Return the prototype entries whose sentences are closest to ``sent``.

    Args:
        sent: one encoded sentence (a single row of token ids).
        mapped_prototype: list of tuples whose first element is a sentence,
            e.g. one value of the dict returned by ``showPrototypes``.
        top_k: maximum number of entries to return.

    Returns:
        Up to ``top_k`` entries of ``mapped_prototype``, ordered by increasing
        Euclidean distance between their sentence vector and that of ``sent``.
        An empty list is returned when ``mapped_prototype`` is empty.
    """
    if len(mapped_prototype) == 0:
        return []

    # embed() works on batches, so wrap the single sentence in a batch of one.
    sentence_embed = ProtoCNN.embed(np.expand_dims(sent, 0))

    protos = [entry[0] for entry in mapped_prototype]
    proto_embed = ProtoCNN.embed(encode(protos))
    distances = [(index, np.linalg.norm(embed - sentence_embed))
                 for index, embed in enumerate(proto_embed)]
    ranked = sorted(distances, key=lambda pair: pair[1])

    return [mapped_prototype[index] for index, _ in ranked[:top_k]]

In [150]:
find_similar_sentences(x[0], mapped_prototype[3])

[('better let as many of them into europe as possible', 0.3523066, 1),
 ('onlinebots living up to their name', 0.3604018, 1),
 ('everything happens for a reason', 0.3676815, 1),
 ("you could n't make up a title like that", 0.3228807, 1),
 ('why not put the full thing on it', 0.34032157, 1),
 ('really terrible stuff in there', 0.35132548, 1),
 ('you know i think this guy has a chance of making it', 0.34213576, 1),
 ('i came here to cringe not to think', 0.3506609, 1),
 ('deserves to crash with a shirt like that', 0.35434902, 1),
 ('your not supposed to mention that', 0.36335245, 1)]

In [149]:
find_similar_sentences(x[1], mapped_prototype[3])

[("i 've been thinking about this all day", 0.35759634, 1),
 ('by doing the same thing i do every night and day nothing', 0.3557903, 1),
 ('now that looks like a president i could have a beer with', 0.3606794, 1),
 ('this person is going to go far in life', 0.33753684, 1),
 ('a superior phone , like say , a galaxy would have been able to take that',
  0.36363792,
  1),
 ('burning man became terrible exactly the year after i went that one time',
  0.35728908,
  1),
 ('i cant stay and work here for ever', 0.35292253, 1),
 ('better let as many of them into europe as possible', 0.3523066, 1),
 ('sounds like jesus himself said this', 0.3607124, 1),
 ('should have shot him or strung him up from a tree just in case',
  0.35523936,
  1)]

In [147]:
find_similar_sentences(x[2], mapped_prototype[3])

[('can i request one for my school', 0.35782492, 1),
 ('your not supposed to mention that', 0.36335245, 1),
 ('i cant stay and work here for ever', 0.35292253, 1),
 ('sad thing is , i can actually belive this', 0.34167826, 1),
 ("you go first and then i 'll think about it", 0.36763626, 1),
 ('funniest thing ive ever seen in my entire life', 0.33317474, 1),
 ('you know i think this guy has a chance of making it', 0.34213576, 1),
 ('i just love that it takes the video that long to get to the point',
  0.3287512,
  1),
 ('everything happens for a reason', 0.3676815, 1),
 ("i do n't think i 'd take my salt any other way", 0.35186923, 1)]

In [174]:
testS[3]

'until a republican does it'

In [171]:
find_similar_sentences(x[3], mapped_prototype[3])

[('right that almost the same thing', 0.35840473, 1),
 ('while i read this during a shit', 0.34476843, 1),
 ('your not supposed to mention that', 0.36335245, 1),
 ('just the fucking way i like it', 0.3665198, 1),
 ("you could n't make up a title like that", 0.3228807, 1),
 ('sad thing is , i can actually belive this', 0.34167826, 1),
 ('something for meth or something like that', 0.32828987, 1),
 ('she really seems to be enjoying it', 0.33070856, 1),
 ('see what happens when you do spinning shit', 0.36211467, 1),
 ('that was me , i got high and started drawing on shit', 0.344531, 1)]

In [326]:
distances[1][0]

1

In [318]:
mapped_prototype[3][63]

("i 've been thinking about this all day", 0.35759634, 1)

In [294]:
proto3_embed list_c.index(max_val)

<tf.Tensor: shape=(100, 384), dtype=float32, numpy=
array([[0.        , 0.        , 0.        , ..., 0.01747742, 0.        ,
        0.01237033],
       [0.        , 0.        , 0.        , ..., 0.01747742, 0.        ,
        0.01237033],
       [0.        , 0.        , 0.        , ..., 0.01747742, 0.        ,
        0.01237033],
       ...,
       [0.        , 0.        , 0.        , ..., 0.01747742, 0.        ,
        0.01237033],
       [0.        , 0.        , 0.        , ..., 0.01747742, 0.        ,
        0.01237033],
       [0.        , 0.        , 0.        , ..., 0.01747742, 0.        ,
        0.05296943]], dtype=float32)>

In [319]:
mapped_prototype[3]

[('these kids are going places in life', 0.3135772, 1),
 ('i hear there is even an uk one', 0.3172326, 1),
 ('who let her leave the house line that', 0.32124335, 1),
 ('that seems like someone i wanna hang out with', 0.32230243, 1),
 ("you could n't make up a title like that", 0.3228807, 1),
 ("someone 's moving house and home", 0.32551184, 1),
 ("he looks exactly like what i 'd imagine someone like this to look like",
  0.32663426,
  1),
 ('robin has been in every fe game since the first one', 0.32718822, 1),
 ('obby nothing gets me hot like o o', 0.32777935, 1),
 ('something for meth or something like that', 0.32828987, 1),
 ('i just love that it takes the video that long to get to the point',
  0.3287512,
  1),
 ("that looks more like it 's next to point", 0.32890695, 1),
 ('she really seems to be enjoying it', 0.33070856, 1),
 ('the one with the bear , i think', 0.33257347, 1),
 ('i think i had a seizure while reading this', 0.33272243, 1),
 ('funniest thing ive ever seen in my ent

In [279]:
mapped_prototype[0]

[('shrek , especially the first one', 0.5935609, 0),
 ("holy shit it 's already been a year \\?", 0.6024625, 0),
 ('brazilian in britain , probably both', 0.6050657, 0),
 ('the rifle from a regular store and my sister painted it', 0.60792977, 0),
 ('do you mind giving me the demo of that \\?', 0.60851675, 0),
 ('none of them can stop the time', 0.6088037, 0),
 ('anyone have a video of the incident \\?', 0.60930187, 0),
 ('one of the names is visible towards the bottom', 0.6107298, 0),
 ('give portland some love for once', 0.61119914, 0),
 ('make it twice as thick with a battery that lasts twice as long',
  0.6117877,
  0),
 ('he just got on a list by searching for that', 0.61233556, 0),
 ('16 is a worryingly high number', 0.6130593, 0),
 ("what 's the source for this \\?", 0.61373526, 0),
 ('hope you enjoy it other people are paying for it', 0.61425763, 0),
 ('did you even watch the video \\?', 0.6157763, 0),
 ('they were not going to gain anything anyway', 0.6159876, 0),
 ('the us cou