#### [Get data and top words](#data)
- Get data from file and construct DataSet object
- Get top words, placebo words

#### [Edit sentences and get embeddings for edited sentences](#sentence_edit)
- Edit sentences by removing top / placebo / empty words
- Sentence embedding by concatenating last 4 layers of BERT embeddings

#### [Match sentences and calculate ITE](#ite_match)
- Treatment match
- Placebo match
- Control match
- E.g., "This is a good movie" matched with "This is a bad movie"

In [2]:
import pickle
import io, time
from io import BytesIO
from IPython.display import display
import pickle, tarfile, random, re, requests
from collections import Counter, defaultdict
import numpy as np
import pandas as pd
# Show full cell contents when displaying DataFrames. `None` means "no limit";
# the legacy value -1 is deprecated and rejected by recent pandas releases.
pd.set_option('display.max_colwidth', None)

import sklearn
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.linear_model import LogisticRegression

import torch
from transformers import * # here introduces bert
from tqdm.notebook import tqdm

import warnings
warnings.filterwarnings("ignore")

# from allennlp.predictors.predictor import Predictor

In [3]:
from data_structure import Dataset, SentenceEdit, get_IMDB, get_kindle, get_toxic_comment, get_toxic_tw

#### Get data and top words <a id='data'></a>

In [4]:
def simple_vectorize(df):
    """
    Vectorize text with a binary bag-of-words representation.

    df: DataFrame with a `text` column (documents) and a `label` column.

    returns:
        X: binary document-term matrix produced by the vectorizer
        y: array of labels (df.label.values)
        vec: the fitted CountVectorizer
        feats: numpy array of feature names, aligned with the columns of X
    """
    # keep terms appearing in at least 5 documents and at most 80% of documents
    vec = CountVectorizer(min_df=5, binary=True, max_df=.8)
    X = vec.fit_transform(df.text)
    print(X.shape)
    y = df.label.values

    # `get_feature_names` was removed from recent scikit-learn releases in
    # favour of `get_feature_names_out`; support both so the notebook runs on
    # old and new versions alike.
    get_names = getattr(vec, 'get_feature_names_out', None) or vec.get_feature_names
    feats = np.array(list(get_names()))

    return X, y, vec, feats

In [5]:
def print_coef(clf, feats, n=10, pattern=None):
    """
    Print, for every class, the n features with the largest coefficients.

    clf: fitted linear classifier exposing `classes_` and `coef_`
    feats: numpy array of feature names aligned with the coefficient columns
    n: number of features printed per class
    pattern: optional regex; coefficients of features that do not match it
             are set to 0 before ranking
    """
    is_binary = len(clf.classes_) == 2
    if is_binary:
        # a binary model stores a single coefficient row (for the second
        # class); the first class gets the negated row
        per_class_coefs = [-1*clf.coef_[0], clf.coef_[0]]
    else:
        per_class_coefs = clf.coef_

    for cls, weights in zip(clf.classes_, per_class_coefs):
        print("\nTop features for class %s" % str(cls))
        if pattern:
            # work on a copy so the classifier's own coefficients stay intact
            weights = weights.copy()
            unmatched = [idx for idx, name in enumerate(feats)
                         if len(re.findall(pattern, name)) == 0]
            weights[unmatched] = 0

        ranked = weights.argsort()[::-1][:n]
        pieces = ['%s/%.2f' % (name, w) for name, w in zip(feats[ranked], weights[ranked])]
        print(' '.join(pieces))
        
def get_top_terms(dataset, coef_thresh=.5, placebo_thresh=.5, C=1):
    """
    Fit a logistic regression on the dataset and split features by coefficient strength.

    Top features (high coef):  abs(coef) >= coef_thresh
    Placebos (low coef):       abs(coef) <= placebo_thresh

    dataset: object exposing `X` (feature matrix), `y` (labels) and `feats` (feature names)
    coef_thresh: minimum absolute coefficient for a top feature
    placebo_thresh: maximum absolute coefficient for a placebo feature
    C: inverse regularization strength passed to LogisticRegression

    returns:
        top_feature_idx: indices of top features
        placebo_feature_idx: indices of placebo features
        feature_coef: coefficients of the first coefficient row, rounded to 3 decimals

    NOTE: only the first coefficient row (clf.coef_[0]) is used, i.e. the
    thresholds are meaningful for binary classification.
    """
    # 'balanced' is the supported spelling of the legacy 'auto' option, which
    # current scikit-learn rejects as an invalid class_weight.
    clf = LogisticRegression(class_weight='balanced', C=C, solver='lbfgs', max_iter=1000)
    clf.fit(dataset.X, dataset.y)

    print_coef(clf, dataset.feats, n=100)

    coef = clf.coef_[0]
    abs_coef = np.abs(coef)
    top_feature_idx = np.where(abs_coef >= coef_thresh)[0]
    placebo_feature_idx = np.where(abs_coef <= placebo_thresh)[0]
    feature_coef = np.round(coef, 3)

    return top_feature_idx, placebo_feature_idx, feature_coef

#### Edit sentences (remove top words from sentences) <a id='sentence_edit'></a>

In [8]:
def get_wd_context(sentence, word, window=5):
    """
    Remove word from sentence and return its context
    (at most `window` whitespace tokens before and after the first token containing the word).

    E.g.,
        word: delicious
        sentence: Love this book full of delicious foods
        context: Love this book full of ... foods

    sentence: the original sentence
    word: the word to remove (matched case-insensitively on word boundaries)
    window: number of tokens kept on each side of the matched token

    returns:
        the context string with every occurrence of the word inside the window
        replaced by a single space, or None when no token contains the word.
    """
    # escape the word so it is always matched literally
    pattern = re.compile(r'\b%s\b' % re.escape(word.lower()), re.IGNORECASE)
    toks = sentence.split()

    for i, tok in enumerate(toks):
        if pattern.search(tok):
            # slicing clamps the upper bound by itself; the previous
            # `len(toks)-1` bound silently dropped the sentence's last token
            context = ' '.join(toks[max(0, i - window):i + window + 1])
            # replace all occurrences (the flag used to be passed as `count`,
            # which capped the number of replacements at 2)
            return pattern.sub(' ', context)

    return None
        
def remove_wd_from_sentences(X, sentences, labels, vec, remove_wd_list, exclude_sent_idx, window=5):
    """
    for each word to be removed
      for each treatment sentence (sentence containing this word)
        make a copy of the sentence with this word removed

    X: feature matrix (rows = sentences, columns = vocabulary terms);
    sentences: sentence texts, indexable by row id of X;
    labels: sentence labels, indexable by row id of X;
    vec: fitted CountVectorizer (its `vocabulary_` maps word -> column of X);
    remove_wd_list: words to be removed
        - top_words: abs(coef) > thresh
        - placebo_words: abs(coef) < thresh
    exclude_sent_idx:
        - sentence ids to skip (sentences containing placebo_words should not contain top_words)
    window: number of tokens to keep before and after the removed word

    edited sentence: sentence with a specific word removed;
        E.g., "Love this book full of delicious foods"
              "Love this book full of ... foods"

    returns:
       sentenceEdit_objs: a list of SentenceEdit objects
       word2sentenceEdit: map from word to the list of SentenceEdit objects created for this word
       sentence_idx: list of distinct processed sentence ids
    """
    word2sentenceEdit = defaultdict(list)
    sentenceEdit_objs = []
    sentence_idx = []

    # a set gives O(1) membership tests inside the nested loop below
    excluded = set(exclude_sent_idx)

    for word in remove_wd_list:
        wi = vec.vocabulary_[word]
        for sent_id in X[:, wi].nonzero()[0]:  # sentences containing current word
            if sent_id in excluded:
                continue
            wd_context = get_wd_context(sentences[sent_id], word, window)
            sEdit_obj = SentenceEdit(wd_context, sent_id, word, labels[sent_id])
            word2sentenceEdit[word].append(sEdit_obj)
            sentenceEdit_objs.append(sEdit_obj)
            sentence_idx.append(sent_id)

    return sentenceEdit_objs, word2sentenceEdit, list(set(sentence_idx))

In [9]:
def get_original_sentences(df):
    """
    Wrap every unedited sentence of df in a SentenceEdit object
    (empty string as the removed word).

    Used for the control match, where no word is removed from the candidate.
    Side effect: adds an `i_th` column (0-based row position) to df.
    """
    df['i_th'] = range(df.shape[0])
    return [
        SentenceEdit(row['text'], row['i_th'], '', row['label'])
        for _, row in df.iterrows()
    ]

#### BERT embeddings for edited sentences

In [10]:
def embed_one_sentence(sentence, sentence_model, tokenizer, layers=(0, 1, 2, 3)):
    """
    Embed one sentence by concatenating selected hidden-state layers and
    averaging over tokens.

    sentence: text to embed
    sentence_model: callable model; element [2] of its output is used as the
        sequence of per-layer hidden states (for a BERT model created with
        output_hidden_states=True the original author lists the output as
        (sequence output, pooler_output, hidden states, attentions))
    tokenizer: object whose `encode(sentence, add_special_tokens=True)` returns token ids
    layers: indices into the hidden-state sequence to concatenate.
        NOTE(review): the default (0, 1, 2, 3) selects the FIRST four entries,
        although this notebook describes the embedding as "last four layers".
        The default is kept so previously computed embeddings stay
        reproducible; pass layers=(-1, -2, -3, -4) for the last four layers.

    returns:
        1-D numpy array of size hidden_size * len(layers)
        (3072 = 768 * 4 for bert-base with four layers)
    """
    token_ids = torch.tensor([tokenizer.encode(sentence, add_special_tokens=True)])
    with torch.no_grad():
        hidden_states = sentence_model(token_ids)[2]
        selected_layers = [hidden_states[i] for i in layers]  # each: [1, n_tokens, hidden]
        # concatenate over the feature dimension: [1, n_tokens, hidden * len(layers)]
        cat_hidden_states = torch.cat(selected_layers, dim=-1)
        # average over tokens, then drop the batch dimension
        cat_sentence_embedding = torch.mean(cat_hidden_states, dim=1).squeeze()
        return cat_sentence_embedding.numpy()
        
        
def embed_all_sentences(sentences, bert_tokenizer=None, sentence_model=None):
    """
    Compute an embedding for every sentence and store it on the object.

    sentences: a list of SentenceEdit objects; each object's `context` is
        embedded and the result is stored in its `embedding` attribute
    bert_tokenizer: tokenizer to use; 'bert-base-uncased' is loaded when omitted
    sentence_model: model to use; 'bert-base-uncased' (with hidden states and
        attentions returned) is loaded when omitted

    Loading the pretrained weights is slow, so callers embedding several
    lists should load the tokenizer/model once and pass them in.
    """
    # default each component independently so a caller may supply just one
    if bert_tokenizer is None:
        bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    if sentence_model is None:
        sentence_model = BertModel.from_pretrained('bert-base-uncased',output_hidden_states=True, output_attentions=True)

    for s in tqdm(sentences):
        s.embedding = embed_one_sentence(s.context, sentence_model, bert_tokenizer)

#### Run experiment for: get data, edit sentences, get embeddings

In [12]:
def get_dataset_embeddings():
    """
    1. Get data from file and construct DataSet object
    2. Get top words, placebo words
    3. Edit sentences and get embeddings for edited sentences:
        Remove top words;
        Remove placebo words;
        Original sentences without edit;

    returns:
        list of Dataset objects (imdb, kindle, toxic, toxic_tw), each carrying
        its top / placebo features and the embedded SentenceEdit lists.
    """
    random.seed(42)

    # load BERT once and reuse it for every embedding pass instead of
    # re-reading the pretrained weights for each call
    bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    sentence_model = BertModel.from_pretrained('bert-base-uncased',output_hidden_states=True, output_attentions=True)

    datasets = []
    # (loader, name, threshold for top terms, threshold for placebo terms)
    for get_data_df, moniker, coef_thresh, placebo_thresh in [
            (get_IMDB, 'imdb', 1.0, 0.1),
            (get_kindle, 'kindle', 0.9, 0.2),
            (get_toxic_comment, 'toxic', 1.0, 0.05),
            (get_toxic_tw, 'toxic_tw', 0.7, 0.2),
        ]:

        # Get data and show basic information
        df = get_data_df()
        X, y, vec, feats = simple_vectorize(df) # vectorize text
        ds = Dataset(X, y, vec, df, moniker) # construct dataset object

        print('%s dataset, %d instances' % (moniker,len(df)))
        print('Label distribution: %s' % str(Counter(y).items()))
        print('Feature matrix: %s' % str(X.shape))

        # Get top features
        ds.top_feature_idx, ds.placebo_feature_idx, ds.coef = get_top_terms(ds, coef_thresh=coef_thresh, placebo_thresh=placebo_thresh, C=1)
        ds.top_features = feats[ds.top_feature_idx]
        ds.placebo_features = feats[ds.placebo_feature_idx]
        top_coef = ds.coef[ds.top_feature_idx]
        placebo_coef = ds.coef[ds.placebo_feature_idx]
        print('\n%d top terms: %d pos, %d neg\n' % (len(ds.top_features), (top_coef > 0).sum(), (top_coef < 0).sum()))
        print('\n%d placebo terms: %d pos, %d neg\n' % (len(ds.placebo_features), (placebo_coef > 0).sum(), (placebo_coef < 0).sum()))

        # Edit sentences (remove top features, placebo features, keep as it is) and get embedding representations

        # sentences containing top words
        ds.topwd_sentObj_list, ds.topwd_sentObj_dict, topwd_sent_idx = remove_wd_from_sentences(X,df.text,df.label,vec,ds.top_features,exclude_sent_idx=[])
        print('Edit %d sentences for top terms\n' % len(ds.topwd_sentObj_list))
        embed_all_sentences(ds.topwd_sentObj_list, bert_tokenizer, sentence_model)

        # sentences containing placebo words (skipping sentences that contain top words)
        ds.placebowd_sentObj_list, ds.placebowd_sentObj_dict, placebowd_sent_idx = remove_wd_from_sentences(X,df.text,df.label,vec,ds.placebo_features,topwd_sent_idx)
        print('Edit %d sentences for placebo terms\n' % len(ds.placebowd_sentObj_list))
        embed_all_sentences(ds.placebowd_sentObj_list, bert_tokenizer, sentence_model)

        # original sentences converted to sentence edit objects
        ds.original_sentObj_list = get_original_sentences(df)
        # the count was previously missing, so the literal '%d' was printed
        print('%d original sentences' % len(ds.original_sentObj_list))
        embed_all_sentences(ds.original_sentObj_list, bert_tokenizer, sentence_model)

        datasets.append(ds)

    return datasets

# pickle.dump(datasets, open('/data/zwang/2020_S/Toxic/Concat_last4_emb/data_with_placebo_match/datasets_emb.pickle','wb'))


In [13]:
# Build and embed all datasets, then report the elapsed wall-clock time in minutes.
start = time.time()
datasets = get_dataset_embeddings()
end = time.time()
elapsed_minutes = (end - start) / 60
print(elapsed_minutes)

(10662, 4574)
new dataset with 10662 records


Unnamed: 0,label,text
0,-1,"simplistic , silly and tedious ."


imdb dataset, 10662 instances
Label distribution: dict_items([(-1, 5331), (1, 5331)])
Feature matrix: (10662, 4574)

Top features for class -1
boring/2.15 bore/2.11 dull/2.06 supposed/2.00 fails/1.96 badly/1.89 unless/1.79 routine/1.77 waste/1.77 mindless/1.76 pie/1.75 junk/1.74 plodding/1.74 unfunny/1.71 neither/1.68 flat/1.68 worst/1.68 generic/1.66 stupid/1.62 ill/1.62 too/1.62 suffers/1.58 disappointment/1.58 incoherent/1.58 intentions/1.58 wasn/1.54 superficial/1.54 mediocre/1.53 devoid/1.50 disguise/1.49 exhausting/1.48 uninspired/1.48 stunt/1.48 product/1.47 stale/1.46 propaganda/1.45 animal/1.44 tv/1.43 schneider/1.43 mess/1.43 lacking/1.43 benigni/1.43 artificial/1.42 sadly/1.41 tedious/1.41 inept/1.38 uneasy/1.38 lack/1.36 pretentious/1.33 hasn/1.33 sheridan/1.33 seagal/1.32 already/1.32 pointless/1.30 poorly/1.30 bland/1.30 barely/1.30 unfortunately/1.29 conceived/1.29 lifeless/1.29 choppy/1.28 god/1.28 unintentionally/1.28 college/1.27 pathetic/1.27 bits/1.26 none/1.26 bana

HBox(children=(FloatProgress(value=0.0, max=8882.0), HTML(value='')))


Edit 12996 sentences for placebo terms



HBox(children=(FloatProgress(value=0.0, max=12996.0), HTML(value='')))


%d original sentences


HBox(children=(FloatProgress(value=0.0, max=10662.0), HTML(value='')))


(12000, 3390)
new dataset with 12000 records


Unnamed: 0,text,rating,label
0,"this book jumped around so much, it was confusing at certain parts, did not make since, I skimmed through it, did not enjoy",2,-1


kindle dataset, 12000 instances
Label distribution: dict_items([(-1, 6000), (1, 6000)])
Feature matrix: (12000, 3390)

Top features for class -1
waste/2.18 disappointing/2.10 poorly/1.96 deleted/1.91 flat/1.90 weird/1.85 lacks/1.84 stupid/1.78 bored/1.78 religious/1.77 boring/1.70 unless/1.62 sorry/1.60 unrealistic/1.57 not/1.57 silly/1.57 disappointment/1.49 unfortunately/1.48 lost/1.45 wasted/1.43 title/1.43 shallow/1.40 finish/1.40 99/1.40 okay/1.39 worst/1.38 nothing/1.38 mildly/1.36 didnt/1.35 didn/1.33 skip/1.33 free/1.32 dialog/1.32 call/1.31 weak/1.31 chicken/1.29 standard/1.27 lacking/1.27 looked/1.26 basically/1.26 blah/1.25 lacked/1.24 rush/1.24 paid/1.24 lack/1.23 horrible/1.22 english/1.21 star/1.21 lame/1.21 hopes/1.20 print/1.19 desired/1.19 15/1.19 stock/1.19 cinderella/1.18 pat/1.17 sick/1.17 unbelievable/1.16 annoying/1.14 poor/1.13 impressed/1.13 quit/1.13 potential/1.13 disgusting/1.13 grammar/1.11 decent/1.11 chapters/1.10 captivating/1.10 chapter/1.09 luck/1.09 tr

HBox(children=(FloatProgress(value=0.0, max=32734.0), HTML(value='')))


Edit 5243 sentences for placebo terms



HBox(children=(FloatProgress(value=0.0, max=5243.0), HTML(value='')))


%d original sentences


HBox(children=(FloatProgress(value=0.0, max=12000.0), HTML(value='')))


(20872, 13025)
new dataset with 20872 records


Unnamed: 0,id,target,text,label
0,6048366,0.428571,"Zuma is nothing if we compare him with ramaphosa and rupert, black people struggling till now in their own own country and rupert and wmc are busy in looting from our country.",-1


toxic dataset, 20872 instances
Label distribution: dict_items([(-1, 10436), (1, 10436)])
Feature matrix: (20872, 13025)

Top features for class -1
meaningful/1.57 card/1.55 irrational/1.44 recognize/1.41 ndp/1.40 quarters/1.38 campus/1.34 gain/1.32 superior/1.32 severe/1.30 exceptions/1.30 honored/1.30 adding/1.27 figures/1.26 46/1.26 market/1.26 india/1.25 items/1.25 103/1.24 editorial/1.24 spin/1.22 hunting/1.21 reduced/1.20 helicopters/1.20 confused/1.19 censorship/1.18 evident/1.18 niqab/1.16 july/1.14 diminish/1.14 filed/1.14 mps/1.14 theirs/1.13 planes/1.12 reject/1.12 priorities/1.11 trusting/1.11 kettle/1.11 denominations/1.10 opposing/1.10 virtue/1.09 accurately/1.09 despite/1.09 robert/1.09 prosecuted/1.09 physically/1.09 internal/1.09 911/1.08 reform/1.08 shariah/1.07 fuel/1.07 plight/1.06 william/1.05 supporter/1.05 rail/1.05 colorado/1.05 prevalent/1.05 holocaust/1.05 understanding/1.05 happening/1.05 helpless/1.04 bravery/1.04 blackberry/1.04 totalitarian/1.04 ben/1.03 cl

HBox(children=(FloatProgress(value=0.0, max=29014.0), HTML(value='')))


Edit 43542 sentences for placebo terms



HBox(children=(FloatProgress(value=0.0, max=43542.0), HTML(value='')))


%d original sentences


HBox(children=(FloatProgress(value=0.0, max=20872.0), HTML(value='')))


(6774, 1866)
new dataset with 6774 records


Unnamed: 0,id,text,label
0,1105487920813821952,@FlyGuyCree Nigga whatever one you gave me 🤦🏻‍♀️,-1


toxic_tw dataset, 6774 instances
Label distribution: dict_items([(-1, 3186), (1, 3588)])
Feature matrix: (6774, 1866)

Top features for class -1
season/1.70 sleep/1.58 tears/1.47 used/1.44 omg/1.44 tryna/1.40 love/1.35 voice/1.27 meet/1.25 ily/1.23 week/1.22 bruh/1.21 holy/1.20 miss/1.19 cute/1.18 2nd/1.15 reasons/1.14 happen/1.14 calling/1.14 bigger/1.13 process/1.11 draw/1.11 above/1.10 forever/1.10 sunday/1.10 jihad/1.09 summer/1.09 rn/1.09 country/1.08 ago/1.07 today/1.04 skills/1.03 mad/1.03 myself/1.03 fly/1.02 owner/1.01 cry/1.00 month/0.99 james/0.99 starting/0.97 dying/0.97 terms/0.97 killed/0.97 within/0.96 given/0.96 events/0.96 lock/0.95 adult/0.95 perfect/0.95 appreciate/0.95 chicken/0.95 children/0.95 gone/0.94 ye/0.94 birthday/0.93 thru/0.93 level/0.93 jumped/0.93 compare/0.92 bus/0.92 basketball/0.91 find/0.91 together/0.90 honestly/0.90 racism/0.90 basically/0.90 karma/0.90 reach/0.89 kinda/0.89 phone/0.88 walk/0.87 naruto/0.87 classy/0.87 test/0.87 pain/0.86 defense/0

HBox(children=(FloatProgress(value=0.0, max=9224.0), HTML(value='')))


Edit 5457 sentences for placebo terms



HBox(children=(FloatProgress(value=0.0, max=5457.0), HTML(value='')))


%d original sentences


HBox(children=(FloatProgress(value=0.0, max=6774.0), HTML(value='')))


103.93969039122264


In [14]:
# pickle.dump(datasets, open('/data/zwang/2020_S/EMNLP/V_7_rerun/datasets_emb.pickle','wb'))

### Matching sentences and calculate ITE <a id='ite_match'></a>

- Treatment match:  context_A + top_termA = context_B + top_term_B <br>

- Placebo match:  context_A + top_termA = context_B + non_top_term_B <br>

- Control match:  context_A + top_termA = sentence_B (not contain top_termA) <br>

- Takes 6 hours to run one matching strategy for all datasets (one time run and save for future use)

In [15]:
def find_matched_sentence(t_sentObj, c_sentObj_list, min_sim=.7):
    """
    For one treatment sentence (sentence containing a top word), find the
    candidate sentence with the most similar context.

    context_A = sentence_A - word_A
    context_B = sentence_B - word_B

    Candidates are visited in descending cosine similarity to context_A; the
    first candidate satisfying all of the following is the match:
        - similarity >= min_sim
        - it comes from a different sentence (sentence_idx differs)
        - its removed word differs from word_A
        - its context does not contain word_A

    t_sentObj: treatment SentenceEdit object (needs embedding, sentence_idx, remove_wd, label)
    c_sentObj_list: candidate SentenceEdit objects
    min_sim: minimum cosine similarity for a valid match

    returns:
        diff: t_sentObj.label - match.label, or 0 when there is no match
        match_sim: similarity of the match, or 0 when there is no match
        match: the matched SentenceEdit object, or None
    """
    # similarity between current treatment context and all candidate contexts
    sims = cosine_similarity([t_sentObj.embedding],[c_sent.embedding for c_sent in c_sentObj_list])[0]

    # compile once instead of rebuilding the pattern for every candidate;
    # escape the word so it is matched literally
    treat_wd_pattern = re.compile(r'\b%s\b' % re.escape(t_sentObj.remove_wd), re.IGNORECASE)

    match = None
    match_sim = 0
    for c_sentObj, sim in sorted(zip(c_sentObj_list, sims), key=lambda x: -x[1]):
        if sim < min_sim:
            continue
        if c_sentObj.sentence_idx == t_sentObj.sentence_idx:
            continue
        if c_sentObj.remove_wd == t_sentObj.remove_wd:
            continue
        if treat_wd_pattern.search(c_sentObj.context):
            continue
        match = c_sentObj
        match_sim = sim
        break

    # explicit None test: do not rely on the truthiness of the matched object
    if match is not None:
        diff = (t_sentObj.label - match.label)
    else:
        print('no match')
        diff = 0

    return diff, match_sim, match


In [16]:
def calculate_ites_from_matched_sentences(ds_data, matchby='treat', min_sim=0.01):
    """
    For each treatment sentence (sentence containing a top word), find a matched
    sentence (sentence with a similar context) and record the label difference.

    ds_data: dataset object holding the SentenceEdit lists
        - topwd_sentObj_list: sentences with top words removed (always the treatment list)
        - placebowd_sentObj_list: sentences with placebo words removed
        - original_sentObj_list: unedited sentences
    matchby: which list supplies the candidates
        - 'treat'   -> topwd_sentObj_list
        - 'placebo' -> placebowd_sentObj_list
        - 'control' -> original_sentObj_list
    min_sim: minimum cosine similarity for a valid match

    returns:
        DataFrame with one row per treatment sentence and columns
        term, sentence_id, treat_obj, control_obj, similarity, difference, ite.
        When no match is found, control_obj and difference are None
        (similarity and ite are both 0, as returned by find_matched_sentence).

    raises:
        ValueError: if matchby is not one of 'treat', 'placebo', 'control'.
    """
    t_sentObj_list = ds_data.topwd_sentObj_list

    if matchby == 'treat':
        c_sentObj_list = ds_data.topwd_sentObj_list
    elif matchby == 'placebo':
        c_sentObj_list = ds_data.placebowd_sentObj_list
    elif matchby == 'control':
        c_sentObj_list = ds_data.original_sentObj_list
    else:
        raise ValueError("matchby must be 'treat', 'placebo' or 'control', got %r" % (matchby,))

    matched_pairs = []
    for t_sentObj in tqdm(t_sentObj_list):
        ite_bylabel, sim, control_obj = find_matched_sentence(t_sentObj, c_sentObj_list, min_sim=min_sim)

        # an unmatched sentence has no control embedding to subtract
        if control_obj is not None:
            emb_difference = t_sentObj.embedding - control_obj.embedding
        else:
            emb_difference = None

        matched_pairs.append(
            {
                'term': t_sentObj.remove_wd,
                'sentence_id': t_sentObj.sentence_idx,
                'treat_obj': t_sentObj,
                'control_obj': control_obj, # this is an object (or None)
                'similarity': sim,
                'difference': emb_difference,
                'ite': ite_bylabel,
            }
        )
    return pd.DataFrame(matched_pairs)

#### Experiments using treatment match

In [17]:
# Load the embedded datasets, run the treatment match on each one, and save the result.
# ds_imdb, ds_kindle, ds_toxic, ds_toxic_tw = datasets
# NOTE: pickle.load must only be used on trusted files (it can execute arbitrary code).
with open('/data/zwang/2020_S/EMNLP/V_7_rerun//datasets_emb.pickle','rb') as f:
    ds_imdb, ds_kindle, ds_toxic, ds_toxic_tw = pickle.load(f)
datasets = [ds_imdb, ds_kindle, ds_toxic, ds_toxic_tw]

start = time.time()
data_ites_byTreat = []
for ds_data in datasets:
    # the matching result is attached to the dataset object itself
    ds_data.ites = calculate_ites_from_matched_sentences(ds_data, matchby='treat', min_sim=0.01)
    data_ites_byTreat.append(ds_data)

end = time.time()
print((end-start)/60)
# the context manager guarantees the pickle is flushed and closed
with open('/data/zwang/2020_S/EMNLP/V_7_rerun/datasets_treat_match.pickle', 'wb') as f:
    pickle.dump(data_ites_byTreat, f)

HBox(children=(FloatProgress(value=0.0, max=8882.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=32734.0), HTML(value='')))

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

IOPub message rate exceed




HBox(children=(FloatProgress(value=0.0, max=29014.0), HTML(value='')))

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)






HBox(children=(FloatProgress(value=0.0, max=9224.0), HTML(value='')))


1293.4846917788188


#### Experiments using placebo match

In [None]:
# Run the placebo match on every dataset and save the result.
# ds_imdb, ds_kindle, ds_toxic, ds_toxic_tw = pickle.load(open('/data/zwang/2020_S/Toxic/Concat_last4_emb/V_1_treat/datasets_emb_mindf5.pickle','rb'))
# datasets = [ds_imdb, ds_kindle, ds_toxic, ds_toxic_tw]

start = time.time()

data_ites_byPlacebo = []
for ds_data in datasets:
    # overwrites any `ites` stored on the object by a previous matching run
    ds_data.ites = calculate_ites_from_matched_sentences(ds_data, matchby='placebo', min_sim=0.01)
    data_ites_byPlacebo.append(ds_data)

end = time.time()
print((end-start)/60)
# the context manager guarantees the pickle is flushed and closed
with open('/data/zwang/2020_S/Toxic/Concat_last4_emb/V_3_placebo/datasets_placebo_match.pickle', 'wb') as f:
    pickle.dump(data_ites_byPlacebo, f)


#### Experiments using control match

In [None]:
# Run the control match on every dataset and save the result.
# ds_imdb, ds_kindle, ds_toxic, ds_toxic_tw = pickle.load(open('/data/zwang/2020_S/Toxic/Concat_last4_emb/V_1_treat/datasets_emb_mindf5.pickle','rb'))
# datasets = [ds_imdb, ds_kindle, ds_toxic, ds_toxic_tw]

start = time.time()

data_ites_byControl = []
for ds_data in datasets:
    # overwrites any `ites` stored on the object by a previous matching run
    ds_data.ites = calculate_ites_from_matched_sentences(ds_data, matchby='control', min_sim=0.01)
    data_ites_byControl.append(ds_data)

end = time.time()
print((end-start)/60)
# the context manager guarantees the pickle is flushed and closed
with open('/data/zwang/2020_S/Toxic/Concat_last4_emb/V_2_control/datasets_control_match.pickle', 'wb') as f:
    pickle.dump(data_ites_byControl, f)


#### Manually check label correctness for Kindle
- Sentence labels are inherited from document labels

In [10]:
df_kindle = get_kindle()
df_kindle.shape

(20233, 3)

In [11]:
df_kindle.head()

Unnamed: 0,text,rating,label
0,This was a very fun story,5,1
1,Not fast moving but a very well managed pace,5,1
2,The story line is an interesting take on zombie mythology and is a great journey,5,1
3,"The story was good, but I was getting very irritated at all the grammatical and spelling errors",2,-1
4,series is always a good read,5,1


In [6]:
df_kindle.iloc[3648].text

"This was a teaser, I can't wait for part 2"

In [39]:
# Draw 200 random rows for manual label checking.
# The sample is taken from index *labels*, so rows are selected with the
# label-based `.loc` (positional `.iloc` is only equivalent for a default RangeIndex).
rand_idx = random.sample(list(df_kindle.index),200)
df_rand = df_kindle.loc[rand_idx]
df_rand

Unnamed: 0,text,rating,label
3648,So I do not want to sway them either way,1,-1
819,I thought this was a sweet story,4,1
9012,"my favorite book is Deed of Paksenarrion, which takes me a day or two to read",1,-1
8024,"I think the author had fun writing this book, but it's the sort of food I throw together without the benefit of a recipe",2,-1
7314,You are supposed to laugh here if you loved this,1,-1
...,...,...,...
7216,I did finish it but would not reccomend it,2,-1
235,However it was just too confusing,2,-1
2326,"Funny, sexy and a whole lot of fun from start to finish",5,1
1929,figured why not give the story a try,2,-1


In [10]:
# df_rand.to_csv('/data/zwang/2020_S/EMNLP/kindle_random_samples.csv')

In [11]:
df_rand.head()

Unnamed: 0,text,rating,label
17911,"Apart from that, there isn't much to the plot",4,1
12430,A hot steamy love affair with secrets felonies and hotties,5,1
1319,"Corrupt cop, porn girlfriend, porn producer = predictable crap triangle",2,-1
16636,The book started to get interesting and then it ended and is over,2,-1
4264,Not much depth to it,2,-1


In [12]:
df_rand_labeled = pd.read_csv('/data/zwang/2020_S/EMNLP/kindle_random_samples.csv')
df_rand_labeled.head()

Unnamed: 0.1,Unnamed: 0,Zhao,text,rating,label
0,17911,-1,"Apart from that, there isn't much to the plot",4,1
1,12430,1,A hot steamy love affair with secrets felonies and hotties,5,1
2,1319,-1,"Corrupt cop, porn girlfriend, porn producer = predictable crap triangle",2,-1
3,16636,-1,The book started to get interesting and then it ended and is over,2,-1
4,4264,-1,Not much depth to it,2,-1


In [6]:
df_rand_labeled['Unnamed: 0'].values

array([17911, 12430,  1319, 16636,  4264])

In [15]:
df_rand_labeled[df_rand_labeled['Zhao'] == df_rand_labeled['label']].shape

(484, 5)

In [16]:
193/200, 16/500, 484/500

(0.965, 0.032, 0.968)

In [9]:
rand_idx_2 = random.sample(list(set(df_kindle.index) - set(df_rand_labeled['Unnamed: 0'].values)),300)
len(rand_idx_2)

300

In [10]:
set(rand_idx_2).intersection(set(df_rand_labeled['Unnamed: 0'].values))

set()

In [11]:
# rand_idx_2 holds index labels, so select rows by label (`.loc`) before exporting
df_rand_2 = df_kindle.loc[rand_idx_2]
df_rand_2.to_csv('/data/zwang/2020_S/EMNLP/kindle_random_samples_2.csv')

#### Out-of-vocabulary words in each train / test dataset

In [17]:
# Load the embedded datasets; context managers make sure every file handle is closed.
# NOTE: pickle.load must only be used on trusted files (it can execute arbitrary code).
with open('/data/zwang/2020_S/EMNLP/V_7_rerun/datasets_emb.pickle','rb') as f:
    ds = pickle.load(f)
ds_imdb = ds[0]
ds_kindle = ds[1]
ds_toxic = ds[2]
ds_tw = ds[3]
with open('/data/zwang/2020_S/EMNLP/V_6_shortSents/kindle_emb.pickle','rb') as f:
    ds_kindle_short = pickle.load(f)
with open('/data/zwang/2020_S/EMNLP/V_6_shortSents/toxic_emb.pickle','rb') as f:
    ds_toxic_short = pickle.load(f)

In [18]:
ds_imdb.df.shape, len(ds_imdb.top_features)

((10662, 3), 366)

In [20]:
ds_kindle_short.df.shape, len(ds_kindle_short.top_features)

((20233, 4), 270)

In [24]:
len(set(ds_imdb.top_features).intersection(set(ds_kindle_short.top_features)))

46

In [27]:
46/366

0.12568306010928962

In [22]:
ds_toxic_short.df.shape, len(ds_toxic_short.top_features)

((15216, 5), 329)

In [23]:
ds_tw.df.shape, len(ds_tw.top_features)

((6774, 4), 341)

In [25]:
len(set(ds_toxic_short.top_features).intersection(set(ds_tw.top_features)))

29

In [26]:
29/341

0.08504398826979472