In [2]:
#!/usr/bin/env python
import pickle
import tensorflow as tf
import numpy as np
import os
import time
import datetime
import data_helpers
import sys
import os
import csv
from time import sleep
import pickle, argparse
from sklearn_extra.cluster import KMedoids
from tensorflow.keras import layers, Model, regularizers
from tensorflow import keras 
from transformers import BertTokenizer, TFBertModel
from tqdm import tqdm
import pandas as pd


def make_variables(tf_name, k1, k2, initializer):
    """Create a trainable float32 ``tf.Variable`` of shape ``[k1, k2]``.

    ``initializer`` is invoked as ``initializer(shape=[k1, k2], dtype=tf.float32)``
    and its result is used as the variable's initial value; ``tf_name`` becomes
    the variable's name.
    """
    initial_value = initializer(shape=[k1, k2], dtype=tf.float32)
    variable = tf.Variable(initial_value, trainable=True, name=tf_name)
    return variable

class DataLoader:
    """Mini-batch iterator over a sequence of records with (at least) four fields.

    Iterating yields one 4-tuple per batch: four parallel tuples holding the
    first, second, third and fourth field of every record in that batch.
    Any fields beyond the fourth are ignored.
    """

    def __init__(self, data, batch_size=200, shuffle=True):
        """Store the records, the batch size and whether to shuffle on each pass."""
        self.data = data
        self.batch_size = batch_size
        self.shuffle = shuffle

    def __len__(self):
        """Return the number of batches per pass (a final partial batch counts)."""
        return int(np.ceil(len(self.data) / self.batch_size))

    def __iter__(self):
        """Yield the batches of one pass; yields nothing when there are no records."""
        # Guard: with no records the previous implementation still tried to
        # build one (empty) batch and crashed on `output[0]`, even though
        # `len()` already reported zero batches.
        if len(self.data) == 0:
            return

        # Going through a DataFrame turns the records into rows of a single
        # array that can be re-ordered with an index array below.
        records = pd.DataFrame(self.data).to_numpy()
        data_size = len(records)

        if self.shuffle:
            # Uses the global numpy RNG, so `np.random.seed` controls the order.
            records = records[np.random.permutation(np.arange(data_size))]

        for start_index in range(0, data_size, self.batch_size):
            rows = records[start_index:start_index + self.batch_size]
            fields = list(zip(*rows))
            yield fields[0], fields[1], fields[2], fields[3]
       

            
#prototype layer
class prototypeLayer(keras.layers.Layer):
    """Holds ``k_protos`` trainable prototype vectors and measures distances to them.

    The prototypes are a ``[k_protos, vect_size]`` variable initialised from
    ``k_cents`` (the notebook passes k-medoids cluster centres).
    """

    def __init__(self, k_protos, vect_size, k_cents):
        super(prototypeLayer, self).__init__(name='proto_layer')
        self.n_protos = k_protos
        self.vect_size = vect_size
        self.prototypes = make_variables("prototypes", k_protos, vect_size,
                                         initializer=tf.constant_initializer(k_cents))

    @tf.function
    def call(self, inputs):
        """Return ``(distances, prototypes)``.

        ``inputs`` is expected to be rank 3 with ``vect_size`` as its last axis
        (TextCNN passes ``[1, batch, vect_size]``). ``distances`` has one entry
        per (input vector, prototype) pair: the squared Euclidean distance.
        """
        # [d0, d1, vect] -> [d0, d1, 1, vect] so the new axis can pair with
        # the prototype axis of the [n_protos, vect] variable.
        expanded = tf.expand_dims(inputs, 2)

        # Implicit broadcasting yields the same [d0, d1, n_protos, vect]
        # difference as explicitly tiling both operands with tf.broadcast_to,
        # without materialising the two tiled tensors first.
        diff = expanded - self.prototypes
        distances = tf.reduce_sum(diff * diff, axis=3)

        return distances, self.prototypes

#distance layer: to convert the full distance matrix to sparse similarity matrix
class distanceLayer(keras.layers.Layer):
    """Maps prototype distances to similarities via ``e ** (-a * distance)``."""

    def __init__(self):
        super().__init__(name='distance_layer')
        self.a = 0.1
        # Only referenced by an earlier variant of `call` that multiplied the
        # similarities by softmax(-distances * beta); kept as an attribute.
        self.beta = 1e6

    def e_func(self, x, e=2.7182818284590452353602874713527):
        """Return ``e`` raised to ``-(self.a * x)``, element-wise."""
        scaled = self.a * x
        return tf.math.pow(e, -scaled)

    @tf.function
    def call(self, full_distances):
        """Turn distances into similarities and drop the leading axis.

        A small constant (1e-8) is added to every similarity, then axis 0 is
        squeezed away (it must therefore have size 1).
        """
        similarities = self.e_func(full_distances) + 1e-8
        return tf.squeeze(similarities, axis=0)


class TextCNN(tf.keras.Model):
    """
    Prototype-based text classifier on top of a caller-supplied encoder.

    Pipeline, as implemented below:
      1. raw strings -> ``tokenizer`` -> first output of ``bert_model``
         (the last hidden state for a Hugging Face ``TFBertModel``);
      2. one Conv2D + max-pool block per filter size, concatenated into a
         sentence vector of ``num_filters * len(filter_sizes)`` values;
      3. squared distances to the learned prototypes, mapped to similarities;
      4. similarities concatenated with frozen author and topic embeddings;
      5. Dense(400, relu) -> dropout -> softmax over ``num_classes``.

    ``init_prototypelayer`` must be called before the model itself is called.
    """

    def __init__(
        self, sequence_length, num_classes, tokenizer, bert_model, user_embeddings, topic_embeddings, embedding_size, filter_sizes, num_filters, l2_reg_lambda, dropout_keep_prob, k_protos, vect_size):
        """Build all sub-layers except the prototype layer.

        ``user_embeddings`` / ``topic_embeddings`` are 2-D weight matrices used
        to initialise non-trainable lookup tables. ``embedding_size`` is the
        width of the convolution kernels and should equal the encoder's hidden
        size so every pooled feature map collapses to a single position.
        ``dropout_keep_prob`` is a keep probability (drop rate is ``1 - keep``).
        """
        super(TextCNN, self).__init__()
        self.max_l = sequence_length
        self.l2_reg_lambda = l2_reg_lambda
        l2_regularizer = tf.keras.regularizers.l2(l2_reg_lambda)
        # Text encoder
        self.tokenizer = tokenizer
        self.embedding = bert_model
        self.num_filters = num_filters
        self.filters_sizes = filter_sizes
        self.k_protos = k_protos
        self.vect_size = vect_size

        # Frozen lookup tables for author and topic ids.
        self.user_embedding = tf.keras.layers.Embedding(input_dim=user_embeddings.shape[0], output_dim=user_embeddings.shape[1], weights=[user_embeddings], trainable=False)
        self.topic_embedding = tf.keras.layers.Embedding(input_dim=topic_embeddings.shape[0], output_dim=topic_embeddings.shape[1], weights=[topic_embeddings], trainable=False)
        self.distance_layer = distanceLayer()

        # One convolution + max-over-time block per filter size.
        self.conv_layers = []
        for filter_size in filter_sizes:
            conv_block = tf.keras.Sequential([
                layers.Conv2D(num_filters, (filter_size, embedding_size),
                              padding='valid', activation='relu'),
                layers.MaxPooling2D(pool_size=(sequence_length - filter_size + 1, 1),
                                    strides=(1,1), padding='valid')])
            self.conv_layers.append(conv_block)
        self.concat_layer = tf.keras.layers.Concatenate()
        self.user_topic_dense = tf.keras.layers.Dense(400, activation='relu')
        self.dropout = tf.keras.layers.Dropout(1 - dropout_keep_prob)
        # Final dense layer with L2 regularization
        self.final_dense = tf.keras.layers.Dense(num_classes, activation="softmax", kernel_regularizer=l2_regularizer)

    def init_prototypelayer(self, k_cents):
        """Create the prototype layer, initialising its prototypes from ``k_cents``."""
        self.proto_layer = prototypeLayer(self.k_protos, self.vect_size, k_cents)

    def call(self, inputs):
        """Return class probabilities for ``(texts, author_ids, topic_ids)``.

        Raises:
            AttributeError: if ``init_prototypelayer`` has not been called yet.
        """
        input_content, input_author, input_topic = inputs

        # Fail early with an actionable message instead of an opaque
        # "has no attribute 'proto_layer'" after the encoder has already run.
        proto_layer = getattr(self, "proto_layer", None)
        if proto_layer is None:
            raise AttributeError(
                "TextCNN.proto_layer is not set; call init_prototypelayer(k_cents) "
                "before calling the model")

        # Same sentence-vector pipeline that `embed` exposes; sharing it keeps
        # the vectors used for prototype initialisation and for classification
        # from drifting apart.
        sentence_vectors = self.embed(input_content)

        # The prototype layer works on a rank-3 input; distanceLayer squeezes
        # the added leading axis away again.
        full_distances, protos = proto_layer(tf.expand_dims(sentence_vectors, axis=0))
        dist_vect = self.distance_layer(full_distances)

        user_embeddings = self.user_embedding(input_author)
        topic_embeddings = self.topic_embedding(input_topic)

        combined_vectors = self.concat_layer([dist_vect, user_embeddings, topic_embeddings])
        combined_vector_final = self.user_topic_dense(combined_vectors)
        combined_vector_final = self.dropout(combined_vector_final)

        scores = self.final_dense(combined_vector_final)
        return scores

    def embed(self, x):
        """Encode raw texts into ``[batch, num_filters * len(filter_sizes)]`` vectors."""
        # Tokenise with fixed-length padding so the pooling window configured
        # in __init__ covers the whole convolution output.
        tokens = self.tokenizer(x, padding = "max_length", max_length=self.max_l, return_tensors ="tf",truncation = True )
        hidden = self.embedding(input_ids = tokens["input_ids"], attention_mask = tokens["attention_mask"], output_hidden_states =True)[0]
        # Add a trailing channel axis for Conv2D.
        hidden = tf.expand_dims(hidden, -1)

        pooled_outputs = [conv(hidden) for conv in self.conv_layers]

        num_filters_total = self.num_filters * len(self.filters_sizes)

        # Each pooled output is expected to be [batch, 1, 1, num_filters], so
        # joining on the channel axis and flattening gives one row per text.
        h_pool = tf.concat(pooled_outputs, axis=3)
        h_pool_flat = tf.reshape(h_pool, [-1, num_filters_total])

        return h_pool_flat

    def compute_accuracy(self, input_y, scores):
        """Return the fraction of rows whose argmax of ``scores`` matches that of ``input_y``."""
        predictions = tf.argmax(scores, 1, name="predictions")
        correct_predictions = tf.equal(predictions, tf.argmax(input_y, 1))
        accuracy = tf.reduce_mean(tf.cast(correct_predictions, "float"), name="accuracy")
        return accuracy


In [3]:
np.random.seed(10)

In [4]:
  
print("loading data...")
# NOTE: pickle.load can execute arbitrary code; only load pickles you trust.
with open("./mainbalancedpickle.p", "rb") as pickle_file:
    x = pickle.load(pickle_file)
revs, W, W2, word_idx_map, vocab, max_l = x[0], x[1], x[2], x[3], x[4], x[5]
print("data loaded!")# Load data

print('loading wgcca embeddings...')
wgcca_embeddings = np.load('./../users/user_embeddings/user_gcca_embeddings.npz')
print('wgcca embeddings loaded')

# Row 0 is reserved for authors that have no learned embedding ("unknown").
ids = np.concatenate((np.array(["unknown"]), wgcca_embeddings['ids']), axis=0)
user_embeddings = wgcca_embeddings['G']
# The random "unknown" vector takes its width from the loaded matrix instead of
# a hard-coded 100, so the concatenation below cannot go out of sync.
unknown_vector = np.random.normal(size=(1, user_embeddings.shape[1]))
user_embeddings = np.concatenate((unknown_vector, user_embeddings), axis=0)
user_embeddings = user_embeddings.astype(dtype='float32')

# author id -> row index in `user_embeddings`
wgcca_dict = {user_id: row for row, user_id in enumerate(ids)}

# Each CSV row: topic id followed by that topic's embedding values.
topic_embeddings = []
topic_ids = []
with open("./../discourse/discourse_features/discourse.csv") as discourse_file:
    csv_reader = csv.reader(discourse_file)
    for line in csv_reader:
        topic_ids.append(line[0])
        topic_embeddings.append(line[1:])
topic_embeddings = np.asarray(topic_embeddings)
topic_embeddings_size = len(topic_embeddings[0])
topic_embeddings = topic_embeddings.astype(dtype='float32')
print("topic emb size: ",topic_embeddings_size)

# topic id -> row index in `topic_embeddings`
topics_dict = {topic_id: row for row, topic_id in enumerate(topic_ids)}

# Overrides the max length stored in the pickle.
max_l = 100


def _quoted_id(mapping, raw_key):
    """Look up ``'"' + raw_key + '"'`` in ``mapping``; return 0 when it is absent.

    Keys in both lookup tables are wrapped in double quotes. A missing key, or
    a ``raw_key`` that cannot be concatenated with a string, falls back to
    row 0 -- this covers what the previous bare ``except:`` was relied on for
    without also hiding unrelated errors such as KeyboardInterrupt.
    """
    try:
        return mapping['"' + raw_key + '"']
    except (KeyError, TypeError):
        return 0


x_text = []
author_text_id = []
topic_text_id = []
comment_id = []
y = []

test_x = []
test_topic = []
test_author = []
test_y = []
text2id={}
# NOTE(review): row 0 of `user_embeddings` is the dedicated "unknown" row, but
# `topic_embeddings` has no such row, so an unknown topic maps to the first
# real topic -- confirm this is intended.
for rev in revs:
    text2id[rev["text"]] = rev["id"]
    author_row = _quoted_id(wgcca_dict, rev['author'])
    topic_row = _quoted_id(topics_dict, rev['topic'])
    if rev['split']==1:
        # training split
        x_text.append(rev['text'])
        author_text_id.append(author_row)
        topic_text_id.append(topic_row)
        y.append(rev['label'])
        comment_id.append(rev["id"])
    else:
        # held-out test split
        test_x.append(rev['text'])
        test_author.append(author_row)
        test_topic.append(topic_row)
        test_y.append(rev['label'])


y_test = test_y

loading data...
data loaded!
loading wgcca embeddings...
wgcca embeddings loaded
topic emb size:  100


In [5]:
sarc_train_file =  "my_train_balanced.csv"
sarc_test_file = "my_test_balanced.csv"

In [6]:
all_training = pd.read_csv("../data/my_train_balanced.csv",header=None)

In [7]:
all_training.head(4)

Unnamed: 0,0,1,2
0,c07fd66,['7uaac'],1
1,c07fjge,['7uaac'],0
2,c07f3md,['7u896'],1
3,c07f3ls,['7u896'],0


In [8]:
all_training.columns = ["comment","post","label"]

In [9]:
all_training[all_training["comment"]=="c07fd66"]

Unnamed: 0,comment,post,label
0,c07fd66,['7uaac'],1


In [10]:
import json

In [4]:
# Read the raw comments (id -> record) and close the file handle afterwards.
with open("../data/comments.json") as comments_file:
    comments = json.load(comments_file)

In [7]:
# Reverse index: comment text -> comment id. When several comments share the
# same text, the id seen last wins.
c2n = {entry["text"]: comment_key for comment_key, entry in comments.items()}

In [9]:
c2n["bf3 had the best menu layout , level progression , and unlock system out of any other bf game imo the battlepack system in bf4 was a half assed attempt at packaging something that should never be packaged for the sake of making it feel special just give me my freaking unlocks and be done with it , i got on battlelog about 6 months ago to play some bf4 for the first time in ages , i had over 70 battlepacks to open and it took me almost ten minutes to open them all"]

KeyError: 'bf3 had the best menu layout , level progression , and unlock system out of any other bf game imo the battlepack system in bf4 was a half assed attempt at packaging something that should never be packaged for the sake of making it feel special just give me my freaking unlocks and be done with it , i got on battlelog about 6 months ago to play some bf4 for the first time in ages , i had over 70 battlepacks to open and it took me almost ten minutes to open them all'

In [12]:
# Dev Sample Percentage
# Fraction of the training split that is held out as the dev set.
dev_sample_percentage = 0.1

# Model Hyperparameters
# Passed to TextCNN as `embedding_size` (the Conv2D kernel width); presumably
# the hidden size of the BERT encoder in use -- TODO confirm.
embedding_dim = 768
filter_sizes = [3, 4, 5]  # one Conv2D + max-pool block is built per size
num_filters = 128  # output channels of each convolution block
dropout_keep_prob = 0.5  # keep probability; TextCNN uses Dropout(1 - dropout_keep_prob)
l2_reg_lambda = 0.5  # L2 factor applied to the final Dense layer's kernel

# Training parameters
batch_size = 60  # batch size handed to DataLoader
num_epochs = 4000  # not referenced by the cells visible in this notebook

In [13]:
topic_train = np.asarray(topic_text_id)
topic_test = np.asarray(test_topic)
author_train = np.asarray(author_text_id)
author_test = np.asarray(test_author)
id_train = np.asarray(comment_id)

# One permutation is applied to every parallel array so rows stay aligned.
shuffle_indices = np.random.permutation(np.arange(len(y)))
x_shuffled = np.asarray(x_text)[shuffle_indices]
y_shuffled = np.asarray(y)[shuffle_indices]
topic_train_shuffled = topic_train[shuffle_indices]
author_train_shuffled = author_train[shuffle_indices]
# This array was used below but never defined, which raised NameError on a
# fresh kernel.
id_train_shuffled = id_train[shuffle_indices]

# Split train/dev set
# TODO: This is very crude, should use cross-validation

# Negative index: the last `dev_sample_percentage` of the shuffled rows become dev.
dev_sample_index = -1 * int(dev_sample_percentage * float(len(y)))
x_train, x_dev = x_shuffled[:dev_sample_index], x_shuffled[dev_sample_index:]
topic_train, topic_dev = topic_train_shuffled[:dev_sample_index], topic_train_shuffled[dev_sample_index:]
author_train, author_dev = author_train_shuffled[:dev_sample_index], author_train_shuffled[dev_sample_index:]
id_train, id_dev = id_train_shuffled[:dev_sample_index], id_train_shuffled[dev_sample_index:]
y_train, y_dev = y_shuffled[:dev_sample_index], y_shuffled[dev_sample_index:]

print("Train/Dev split: {:d}/{:d}".format(len(y_train), len(y_dev)))

# Model
# ==================================================

bert_model_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(bert_model_name)
bert_model = TFBertModel.from_pretrained(bert_model_name)

# The encoder is used as a frozen feature extractor.
for layer in bert_model.layers:
    layer.trainable = False

# The prototype dimension must equal the length of the sentence vector that
# TextCNN.embed produces, i.e. num_filters * len(filter_sizes) (= 384 here).
k_protos, vect_size = 15, num_filters * len(filter_sizes)

ProtoCNN = TextCNN(
    sequence_length=max_l,
    num_classes=len(y_train[0]) ,
    tokenizer = tokenizer,
    bert_model = bert_model,
    user_embeddings = user_embeddings,
    topic_embeddings = topic_embeddings,
    embedding_size=embedding_dim,
    filter_sizes=list(map(int, filter_sizes)),
    num_filters=num_filters,
    l2_reg_lambda=l2_reg_lambda,
    dropout_keep_prob = dropout_keep_prob,
    k_protos = k_protos,
    vect_size = vect_size)

# Embed up to 15000 training sentences, 50 at a time, to seed the prototypes.
# Stepping over the actual sample length avoids sending empty batches to the
# tokenizer when fewer than 15000 sentences are available.
sample_sentences = x_text[:15000]
embed_batch_size = 50
sample_sentences_vects = []
for start in range(0, len(sample_sentences), embed_batch_size):
    batch = sample_sentences[start:start + embed_batch_size]
    vect = ProtoCNN.embed(batch)
    sample_sentences_vects.append(vect.numpy())

sample_sentences_vect = np.concatenate(sample_sentences_vects, axis=0)

# k-medoids centres are actual sample vectors; they become the initial prototypes.
kmedoids = KMedoids(n_clusters=k_protos, random_state=0).fit(sample_sentences_vect)
k_cents = kmedoids.cluster_centers_

ProtoCNN.init_prototypelayer(k_cents)

# One forward pass on two examples so every layer creates its variables
# (presumably so that a later load_weights can restore them -- confirm).
predictions = ProtoCNN([x_train[:2].tolist(), author_train[:2], topic_train[:2]])



Train/Dev split: 139232/15470


2024-01-01 03:58:21.393176: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-01-01 03:58:21.870753: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1616] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 22460 MB memory:  -> device: 0, name: NVIDIA GeForce RTX 3090, pci bus id: 0000:25:00.0, compute capability: 8.6
2024-01-01 03:58:21.871627: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1616] Created device /job:localhost/replica:0/task:0/device:GPU:1 with 1286 MB memory:  -> device: 1, name: NVIDIA GeForce RTX 3090, pci bus id: 0000:41:00.0, compute capability: 8.6
2024-01-01 03:58:21.872306: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1616] Created device /job:localhost/replica:0/task:0/d

In [141]:
ProtoCNN.load_weights("runs/bert_cascade_proto_div_loss-0.8/best_classifier.ckpt")

<tensorflow.python.checkpoint.checkpoint.CheckpointLoadStatus at 0x7f5ff7069660>

In [110]:
train_loader = DataLoader(list(zip(x_train, author_train, topic_train, y_train)), batch_size=batch_size, shuffle=False)

In [142]:
# Embed every training batch with the loaded model and collect the labels in
# the same order, so index k in both lists refers to the same sentence.
sample_sentences_vects, sample_labels = [], []
for i, inputs in enumerate(train_loader):
    x_batch, author_batch, topic_batch, y_batch = inputs
    vect = ProtoCNN.embed(x_batch)
    sample_sentences_vects.append(vect.numpy())
    batch_labels = [label.tolist() for label in y_batch]
    sample_labels.extend(batch_labels)
    

   
       

In [143]:
sample_sent_vect = np.concatenate(sample_sentences_vects, axis=0)

In [46]:
def showPrototypes(sample_sentences,sample_sent_vects, sample_y, k_protos=10,printOutput=False, k_closest_sents = 20):
    """Map each prototype to its nearest sentences among eligible candidates.

    Reads the prototypes from the notebook-level ``ProtoCNN`` model. A sentence
    is a candidate when it is at least 30 characters long and the second entry
    of its label (``sample_y[i][1]``) is non-zero. Only the first 150000
    vectors are considered.

    Args:
        sample_sentences: sentences, index-aligned with ``sample_sent_vects``.
        sample_sent_vects: one vector per sentence, same width as a prototype.
        sample_y: per-sentence labels; index 1 is read.
        k_protos: number of prototypes to report (capped at how many exist).
        printOutput: unused; kept for call compatibility.
        k_closest_sents: maximum number of sentences kept per prototype.

    Returns:
        dict mapping prototype index -> list of
        ``(stripped sentence, distance, sample_y[i][1])`` sorted by distance.
        The list is shorter than ``k_closest_sents`` when fewer candidates exist.
    """
    prototypes = ProtoCNN.proto_layer.prototypes.numpy()
    data_size = 150000
    # Use the vectors that were passed in; the previous version silently read
    # a notebook global named `sample_sent_vect` instead of this parameter.
    candidate_vects = sample_sent_vects[:data_size]

    d_pos = {}
    for p_count, p in enumerate(prototypes):
        d_pos[p_count] = {
            i: np.linalg.norm(vect - p)
            for i, vect in enumerate(candidate_vects)
            if len(sample_sentences[i]) >= 30 and sample_y[i][1] != 0
        }

    mappedPrototypes = {}
    print("Prototypes: ")
    for l in range(min(k_protos, len(prototypes))):
        # A plain key function avoids depending on `operator`, which this
        # notebook only imports in a later cell.
        sorted_d = sorted(d_pos[l].items(), key=lambda item: item[1])
        print(l)
        mappedPrototypes[l] = []
        # Slicing (rather than indexing up to k_closest_sents) tolerates
        # prototypes with fewer candidates than requested.
        for k, (i, score) in enumerate(sorted_d[:k_closest_sents]):
            mappedPrototypes[l].append((sample_sentences[i].strip(), score, sample_y[i][1]))
            if k<10:
                print(sorted_d[k], sample_sentences[i],sample_y[i][1])

    return mappedPrototypes

In [58]:
import operator

In [48]:
comments2id = comments["d1nhsrd"]

In [49]:
all_training[all_training["comment"]=="c07fd66"]

Unnamed: 0,comment,post,label
0,c07fd66,['7uaac'],1


In [50]:
mapped_prototype = showPrototypes(sample_sentences,sample_sent_vect, sample_labels, k_protos=10,printOutput=False, k_closest_sents = 100)

Prototypes: 
0
(4836, 0.9561245) afk or negative attitude though more of afk but since the firewall issues happened i only report for afk if the person leaves mid game due to rage or leaving without saying why 1
(966, 0.96359783) joke 's on you i'm not having any children 1
(5470, 0.97721153) i do n't think that word means what you think it means 1
(8501, 0.97721153) there 's a special place in hell for her 1
(9618, 0.97721153) finally the leafs win something brings a tear to my eye 1
(5786, 1.0457137) stop waging unnecessary wars in the middle east then its always weird when the government bitches about the government , as if they dont have all the power to change policy 1
(2359, 1.0516869) they should raise the price of alcohol to lower alcohol abuse too ! 1
(9293, 1.0768752) how to lose gays to conservatives once republicans realize they lost the marriage fight 1
(7336, 1.0916734) old money is stuck in the past , paypal is so progressive 1
(11082, 1.0945343) i'm sure he 's disappoin

In [130]:
def showPrototypes(ProtoCNN, sample_sentences,sample_sent_vects,k_protos=15,printOutput=False):
    """Print, for each prototype, the closest sentence of at least 20 characters.

    NOTE: this notebook defines another ``showPrototypes`` with a different
    signature; whichever cell ran last is the one in effect.

    Args:
        ProtoCNN: model exposing ``proto_layer.prototypes`` (with ``.numpy()``).
        sample_sentences: sentences, index-aligned with ``sample_sent_vects``.
        sample_sent_vects: one vector per sentence, same width as a prototype.
        k_protos: number of prototypes to report (capped at how many exist).
        printOutput: unused; kept for call compatibility.

    Returns:
        dict mapping prototype index -> stripped closest sentence. Prototypes
        with no eligible sentence are omitted. (Previously nothing was
        returned; the mapping was only printed.)
    """
    prototypes = ProtoCNN.proto_layer.prototypes.numpy()

    # Distance from every eligible sentence vector to every prototype.
    d_pos = {}
    for p_count, p in enumerate(prototypes):
        print('p_count = ', p_count)
        d_pos[p_count] = {
            i: np.linalg.norm(vect - p)
            for i, vect in enumerate(sample_sent_vects)
            if len(sample_sentences[i]) >= 20
        }
        print('count = ', len(d_pos[p_count]))

    mappedPrototypes = {}
    print("Prototypes: ")
    for l in range(min(k_protos, len(prototypes))):
        if not d_pos[l]:
            # No sentence passed the length filter; nothing to show.
            continue
        # Only the single nearest sentence is reported. The earlier 10-step
        # loop re-assigned this same sentence on every iteration and raised
        # IndexError when fewer than 10 sentences were eligible.
        closest_index = min(d_pos[l].items(), key=lambda item: item[1])[0]
        mappedPrototypes[l] = sample_sentences[closest_index].strip()
        print(mappedPrototypes[l])

    return mappedPrototypes

In [114]:
def pw_distance(A):
    """Return the matrix of pairwise squared Euclidean distances between rows of ``A``.

    Uses the expansion ``|a_i - a_j|^2 = |a_i|^2 - 2 * a_i . a_j + |a_j|^2``.
    """
    # Column vector of squared row norms.
    squared_norms = tf.reshape(tf.reduce_sum(A * A, 1), [-1, 1])
    # Matrix of row-by-row dot products.
    gram = tf.matmul(A, tf.transpose(A))
    return squared_norms - 2 * gram + tf.transpose(squared_norms)

In [115]:
protos

array([[ 0.04658692,  0.08148573,  0.07718311, ...,  0.08493915,
         0.09445967,  0.09795462],
       [ 0.025462  ,  0.05717431,  0.05421162, ...,  0.05526998,
         0.05259991,  0.13021095],
       [ 0.01885531,  0.06329141,  0.0172288 , ...,  0.04997756,
         0.0773843 ,  0.04898995],
       ...,
       [-0.00148784,  0.03370793,  0.07756134, ...,  0.00104265,
         0.0702292 ,  0.06720568],
       [ 0.00819621,  0.05842676,  0.04430661, ...,  0.08267815,
         0.02784036,  0.15792556],
       [ 0.05129506,  0.02122571,  0.03022781, ...,  0.03197355,
         0.0905187 ,  0.08947792]], dtype=float32)

In [145]:
protos = ProtoCNN.proto_layer.prototypes.numpy()

In [146]:
protos.shape

(15, 384)

In [147]:
# Smallest squared distance between two different prototypes.
d = pw_distance(protos)
# Identity mask: adding max(d) on the diagonal keeps each prototype's (near
# zero) distance to itself from winning the minimum below.
diag_ones = tf.eye(k_protos, dtype=tf.float32)
d1 = d + diag_ones * tf.reduce_max(d)
d2 = tf.reduce_min(d1, axis=1)
min_d2_dist = tf.reduce_min(d2)

In [149]:
min_d2_dist 

<tf.Tensor: shape=(), dtype=float32, numpy=0.288455>

In [120]:
np.set_printoptions(suppress = True)

In [131]:
# Distance from every training sentence of at least 20 characters to every
# prototype: d_pos[prototype index][sentence index] -> Euclidean distance.
prototypes = ProtoCNN.proto_layer.prototypes.numpy()
d_pos = {}
for p_count, p in enumerate(prototypes):
    print('p_count = ', p_count)
    d_pos[p_count] = {
        i: np.linalg.norm(sample_sent_vect[i] - p)
        for i, sentence in enumerate(x_train)
        if len(sentence) >= 20
    }
    s_count = len(d_pos[p_count])
    print('count = ', s_count)


p_count =  0
count =  118697
p_count =  1
count =  118697
p_count =  2
count =  118697
p_count =  3
count =  118697
p_count =  4
count =  118697
p_count =  5
count =  118697
p_count =  6
count =  118697
p_count =  7
count =  118697
p_count =  8
count =  118697
p_count =  9
count =  118697
p_count =  10
count =  118697
p_count =  11
count =  118697
p_count =  12
count =  118697
p_count =  13
count =  118697
p_count =  14
count =  118697


In [140]:
#show the list of prototypes
showPrototypes(ProtoCNN, x_train,sample_sent_vect)

p_count =  0
count =  118697
p_count =  1
count =  118697
p_count =  2
count =  118697
p_count =  3
count =  118697
p_count =  4
count =  118697
p_count =  5
count =  118697
p_count =  6
count =  118697
p_count =  7
count =  118697
p_count =  8
count =  118697
p_count =  9
count =  118697
p_count =  10
count =  118697
p_count =  11
count =  118697
p_count =  12
count =  118697
p_count =  13
count =  118697
p_count =  14
count =  118697
Prototypes: 
because applying braking forces through your chain is the most efficient way to slow down your wheels
yes , infowars is the place to find solutions to all our problems
good thing his trebuchet jammed
omg that lady should be kicked out for breastfeeding in public !
so that 's where they make xboxs
because that will only make them more accepting !
cool now we can spend more time playing with books instead of reading them
well , they certainly got their goal of appearing on those subreddits
oh great , i packed mandzukic and kouyate earlier toda

In [87]:
id_train[604]

'cu2g2p8'

In [88]:
x_train[604]

'pretty sure it was basically mushu from mulan \\?'

In [81]:
all_training[all_training["comment"]=="c07fd66"]

Unnamed: 0,comment,post,label
0,c07fd66,['7uaac'],1


In [86]:
comments["cu217s2"]

{'text': "I only fear failing on the bench press, but that's why I only BP in a power rack.",
 'author': 'Thojos',
 'score': 9,
 'ups': 9,
 'downs': 0,
 'created_utc': 1439499693,
 'date': '2015-08',
 'subreddit': 'bodybuilding'}

In [2]:
import json

In [3]:
comments

NameError: name 'comments' is not defined