In [1]:
import sys, time, pickle, torch
sys.path.insert(0, '../Models')
sys.path.insert(0, '../Utils')
sys.path.insert(0, '../Preprocess')
import numpy as np
import pandas as pd
from ann import NeuralNet
from utils import my_completely_random_sample, conv_input_attri_to_word_attri,\
    collect_info_for_metric, save_info, random_state
from preprocess_funcs import preprocess_pipelined, prepare_text, prepare_text_view_friendly
from nltk.tokenize import word_tokenize

[nltk_data] Downloading package stopwords to /home/user/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/user/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
model = torch.load('../Models/ann_model.pt')

In [3]:
Train_df = pd.read_csv('../Data/Processed/train_df.csv')
Test_df = pd.read_csv('../Data/Processed/test_df.csv')
Y_train = Train_df['sentiment']
Y_test = Test_df['sentiment']
Y_train_tensor = torch.tensor(Y_train, dtype=torch.float).unsqueeze(dim=1)
Y_test_tensor = torch.tensor(Y_test, dtype=torch.float).unsqueeze(dim=1)


In [4]:
reviews_raw, targets, indices = my_completely_random_sample(Train_df, num_sample=50)

In [5]:
reviews_raw[0]

'This musical was not quite what I expected, foremost being there weren\'t many scenes between Brando and Sinatra. As it was based on a Damon Runyon story, I expected irony and surprise, of which there was one really good one - when we find that Sinatra\'s gang has used the Salvation Army office for their crap game while Brando was in Havana with Simmons. If course it comes at the right moment too, when Brando brings her back. I really didn\'t expect much from Brando as a singer, but he surprised me. He wasn\'t great but he was just fine in the role. His big number in the sewer, however, with the rest of Sinatra\'s boys was the only place I felt Brando\'s voice was weak. He just didn\'t have the power the grand climax demanded. Overall I found the scenes between Brando and Simmons to be filled with electricity, something I didn\'t think would happen when we first see Simmons by herself, and later when we\'re introduced to Brando in the restaurant with Sinatra trying to pull a fast one 

In [6]:
#define some containers to save some info
model_out_list, attr_list = [], []

In [7]:
from captum.attr import Lime
from captum.attr import visualization 

In [8]:
lime = Lime(forward_func=model.forward)

In [11]:
def generate_record(raw_review, target):
    data, data_vec, input_tfidf = preprocess_pipelined([raw_review]) #preprocess_pipelined expects a lits of strings        
    input_model = input_tfidf.to_dense()
    with torch.no_grad():
        pred_prob =  model.forward(input_model).detach().item()
        pred_class = 'Pos' if pred_prob > 0.5 else 'Neg' 
        true_class = 'Pos' if target > 0.5 else 'Neg' 
        attribution = lime.attribute(input_model, n_samples=5000, return_input_shape=True, show_progress=True)
        attr_score = torch.sum(attribution)    
        attr_class = 'Pos' if attr_score > 0.5 else 'Neg'
        raw_input = prepare_text_view_friendly(raw_review) #word_tokenize(raw_review) #raw_review.split()
        convergence_score = 0
        word_attributions = conv_input_attri_to_word_attri(attribution, raw_review)
        #         print(f'word attributions {word_attributions}')
        #         print(f'pred_prob {pred_prob}')
        #         print(f'pred_class {pred_class}')
        #         print(f'true_class {true_class}')
        #         print(f'attribution {attribution}')
        #         print(f'attr_class {attr_class}')
        #         print(f'attr_score {attr_score}')
        print(f'raw_input {raw_input}')

        
        #collect info for metrics later
        collect_info_for_metric(model_out_list, pred_prob, attr_list, attribution)
        
    
        visual_record = visualization.VisualizationDataRecord(word_attributions=word_attributions,
                                                         pred_prob=pred_prob,
                                                         pred_class=pred_class,
                                                         true_class=true_class,
                                                         attr_class=attr_class,
                                                         attr_score=attr_score,
                                                         raw_input=raw_input,
                                                         convergence_score=convergence_score)
        
        
        return visual_record
      
    

In [12]:
for i, (rev_raw, target) in enumerate(zip(reviews_raw, targets)):
    visual_record = generate_record(rev_raw, target)
    print(visualization.visualize_text([visual_record])) #list because visualize_text expects a list of visual_records
    break

Lime attribution: 100%|████████████████████████████████████████████████████████████| 5000/5000 [00:10<00:00, 469.20it/s]


words attr: {'great': tensor(0.5021), 'simmons': tensor(0.8505), 'sinatra': tensor(1.)}
raw_input musical quite expected foremost n t many scenes brando sinatra based damon runyon story expected irony surprise one really good one find sinatra s gang used salvation army office crap game brando havana simmons course comes right moment brando brings back really n t expect much brando singer surprised n t great fine role big number sewer however rest sinatra s boys place felt brando s voice weak n t power grand climax demanded overall found scenes brando simmons filled electricity something n t think would happen first see simmons later re introduced brando restaurant sinatra trying pull fast one n t brando goes office story came life br br frank sinatra hand flat even vocal performances vivian blaine never heard guess played role broadway seemed slow proceedings scenes sinatra obvious also songs felt weakest terms advancing story character top goldwyn girls numbers seemed shoe horned glit

True Label,Predicted Label,Attribution Label,Attribution Score,Word Importance
Pos,Pos (0.92),Neg,0.09,m u s i c a l q u i t e e x p e c t e d f o r e m o s t n t m a n y s c e n e s b r a n d o s i n a t r a b a s e d d a m o n r u n y o n s t o r y e x p e c t e d i r o n y s u r p r i s e o n e r e a l l y g o o d o n e f i n d s i n a t r a s g a n g u s e d s a l v a t i o n a r m y o f f i c e c r a p g a m e b r a n d o h a v a n a s i m m o n s c o u r s e c o m e s r i g h t m o m e n t b r a n d o b r i n g s b a c k r e a l l y n t e x p e c t m u c h b r a n d o s i n g e r s u r p r i s e d n t g r e a t f i n e r o l e b i g n u m b e r s e w e r h o w e v e r r e s t s i n a t r a s b o y s p l a c e f e l t b r a n d o s v o i c e w e a k n t p o w e r g r a n d c l i m a x d e m a n d e d o v e r a l l f o u n d s c e n e s b r a n d o s i m m o n s f i l l e d e l e c t r i c i t y s o m e t h i n g n t t h i n k w o u l d h a p p e n f i r s t s e e s i m m o n s l a t e r r e i n t r o d u c e d b r a n d o r e s t a u r a n t s i n a t r a t r y i n g p u l l f a s t o n e n t b r a n d o g o e s o f f i c e s t o r y c a m e l i f e b r b r f r a n k s i n a t r a h a n d f l a t e v e n v o c a l p e r f o r m a n c e s v i v i a n b l a i n e n e v e r h e a r d g u e s s p l a y e d r o l e b r o a d w a y s e e m e d s l o w p r o c e e d i n g s s c e n e s s i n a t r a o b v i o u s a l s o s o n g s f e l t w e a k e s t t e r m s a d v a n c i n g s t o r y c h a r a c t e r t o p g o l d w y n g i r l s n u m b e r s s e e m e d s h o e h o r n e d g l i t z e x a m p l e f r a n k m e e t s b r a n d o n i g h t c l u b c u t s t a g e r o u t i n e c a t n u m b e r c u t s b a c k g u y s c o n t i n u e n t d a n c e n u m b e r w h e n e v e r b r a n d o s i m m o n s s c r e e n g r e a t t i m e t i m e r e t u r n s i n a t r a b l a i n e s t o r y i n t e r e s t l e v e l w a n e d b r b r s o n g s g o o d o n e s p a r t i c u l a r l y f i r s t n u m b e r s t u b b y k a y e f u g u e t i n h o r n s n u m b e r s g r e a t s o n g r e m i n d e d f i r s t s o n g m u s i c m a n c a s h m e r c h a n d i s e w h a t e v e r s c a l l e d n u m b e r s e w e r c o u l d n t h e l p r e m i n d e d c o o l w e s t s i d e s t o r y b r i n g s p o i n t r e a l l y l i k e a r t d i r e c t i o n f i l m f a k e t i m e s s q u a r e c o m p l e t e l y p h o n y d r e w a t t e n t i o n h a v a n a s e q u e n c e p a r t i c u l a r l y s e w e r r e a l i z e b a c k 1 9 5 5 m u s i c a l s s h o t s e t s t h i n g s c h a n g i n g c a r o u s e l e x a m p l e m a d e g r e a t u s e l o c a t i o n p h o t o g r a p h y e v e n t o w n s h o t s c e n e s m a h a t t a n 1 9 4 9 t i m e g e t w e s t s i d e s t o r y 1 9 6 1 s g i v e n s t u f f t a k i n g p l a c e m a n h a t t a n a c t u a l l y s h o t m a n h a t t a n c o m p a r i s o n g u y s d o l l s s e t b o u n d m a n h a t t a n f e l t d a t e d l i t t l e c u t e c h a n g i n g l i n d y s m i n d y s r e a l l y l e g a l r e a s o n s a l w a y s t h o u g h t g u y s d o l l s m u s i c a l s i n a t r a b r a n d o a d v e n t u r e s v a r i o u s g i r l s m u c h f o c u s e d c r e d i t r e g a r d m u c h b e t t e r l e s g i r l s i n t e r e s t i n g s r i g h t c e r t a i n s h a l l o w n e s s b r b r o n e m a j o r c o m p l a i n t g u y s d o l l s n t k n o w e n d e m i c o r i g i n a l s t a g e s h o w j e a n s i m m o n s r e a l i z e s b r a n d o n e v e r t o o k m o n e y b e t m a d e s i n a t r a e v e n s a i d l o s t b e t r u n s f i n d c u t w e d d i n g s e e m s s c e n e b r a n d o s i m m o n s w o u l d a d d e d i m p a c t s t o r y s e e b r a n d o c o m e a r o u n d c a m e a r o u n d w o u l d g r e a t s c e n e s c e n e m u s i c m a n s p o i l e r s a h e a d h a r o l d m a r i o n d u e t s w a i t i n g c h a n g e s u p s t a i r s h o u s e s s i d e w a l k s s i n g i n g 7 6 t r o m b o n e s s s i n g i n g g o o d n i g h t s o m e o n e s u d d e n l y s w i t c h s i n g s s o n g s b e a u t i f u l w a y c o n v e y c r o s s s e m o t i o n a l h i g h m o m e n t f i l m s t i l l g u y s d o l l s l o t g o i n g
,,,,


<IPython.core.display.HTML object>


In [None]:
save_info(indices, reviews_raw, targets, model_out_list, attr_list, fname='lime_outs.pkl')

In [None]:
# 

In [None]:
# from lime.lime_text import LimeTextExplainer
# explainer = LimeTextExplainer(
#     class_names = ['negative', 'positive'],
#     bow=True
#   )

# def lime_pred_sentence(sentence):
#     if isinstance(sentence, str):
#         sentence = [sentence]
#     sent_prepped, sent_vec, sent_tensor = preprocess_pipelined(sentence)
#     print(f'sent tensor size {sent_tensor.size()}')
#     with torch.no_grad():
#         out = model(sent_tensor)
#         out_complement = 1-out
#         out_concat = torch.cat([out_complement, out], dim=1)
#         out_concat_np = out_concat.detach().numpy()
    
# #         print(f'{sent_prepped}')
# #         print(f'{sent_vec}')
# #         print(f'{sent_tensor}')
# #         print(f'{out_concat_np.shape}')
#         return out_concat_np

# for r in reviews_raw:
#     explanation = explainer.explain_instance(r, lime_pred_sentence)
#     explanation.show_in_notebook()