In [1]:
import pandas as pd
import numpy as np
import torch
import io
import torch.nn as nn
import torch.nn.functional as F
from torch.optim import Adam
import time
from sklearn.metrics import f1_score

from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

import spacy

import warnings
# Suppress every Python warning for the rest of the session.
warnings.filterwarnings("ignore")
import sys

# Put the project directory first on the module search path so the local `utils` module resolves.
sys.path.insert(0,'/data1/YelpAnalysis/')
# NOTE(review): star import — helpers used by later cells (e.g. trainFFN, testFFN, getLexicon)
# presumably come from here or from `preprocess`; confirm which module defines which name.
from utils import *
# spaCy pipeline loaded from a local model directory (the path suggests fastText crawl vectors).
nlp = spacy.load('/data2/link10/models/fasttext/en_fasttext_crawl')

# Same trick for the dataset helpers: this directory must precede others so `preprocess` is found.
sys.path.insert(0,'/data2/Datasets/')
from preprocess import *

# Folder handed to getData()/testFFN() when a dataset is loaded by name.
dataFolder = '/data2/Datasets/Raw'
# Torch device string used when tensors are moved to the GPU in later cells.
device = 'cuda:2'




from tqdm import tqdm
from torch.utils.data import  DataLoader
import torch.optim as optim
from sklearn.feature_extraction.text import CountVectorizer
import nltk
import shap

In [2]:
# CHANGE TRAINING DATASET FOR LEXICON CREATION HERE
# Options: nrc_joy, nrc_sadness, nrc_surprise, nrc_fear, nrc_anger,
# empathy, yelp_subset, amazon_finefood_subset, amazon_toys_subset

lexiconDataset = "empathy"
trainDf, devDf, testDf = splitData(getData(dataFolder,lexiconDataset))

# Preparing Data: embed the 'text' column of each split with the spaCy model.
trainData = generateFastTextData_Spacy(trainDf, nlp, textVariable = 'text')

testData = generateFastTextData_Spacy(testDf, nlp, textVariable = 'text')

trainDataset = Dataset(trainDf, trainData)
testDataset = Dataset(testDf, testData)

# Training NN
# Use the shared `device` setting instead of a second hard-coded "cuda:2": later cells move
# their tensors to `device`, so the model must live on that same device.
NNnet = trainFFN(trainDataset, testDataset, num_epochs = 3, batchSize = 5, device = device)

cuda:2


38it [00:00, 126.32it/s]


Acc : 0.5 F1 : 0.6666666666666666
*************************
cuda:2


38it [00:00, 144.45it/s]

Acc : 0.6210526315789474 F1 : 0.6538461538461539
*************************





cuda:2


38it [00:00, 170.53it/s]

Acc : 0.5736842105263158 F1 : 0.4335664335664336
*************************





In [3]:
# Shape of the training embedding matrix.
print(trainData.shape)

(1536, 300)


In [4]:
# Function to count how many times each unique word occurs in a dataset
def getWordCount(data, textVariable = 'text'):
    """Count how often each token occurs across a text column.

    The text is tokenized with ``nltk.word_tokenize`` through a
    ``CountVectorizer`` that has no stop-word list.

    Parameters
    ----------
    data : object indexable by ``textVariable`` (e.g. a DataFrame)
        ``data[textVariable]`` must yield an iterable of strings.
    textVariable : str, default 'text'
        Name of the column holding the raw text.

    Returns
    -------
    pandas.DataFrame
        Columns ``word`` and ``wordCount``, one row per vocabulary entry, in
        the iteration order of ``vectorizer.vocabulary_``.
    """
    vectorizer = CountVectorizer(stop_words=[], tokenizer=nltk.word_tokenize)
    # fit_transform learns the vocabulary and builds the count matrix in one
    # pass, so the corpus is tokenized once instead of twice (fit + transform).
    docTermCounts = vectorizer.fit_transform(data[textVariable])
    # Summing over documents gives the corpus-wide count per vocabulary column.
    totals = np.asarray(docTermCounts.sum(axis=0)).ravel()

    vocabulary = vectorizer.vocabulary_
    wordList = list(vocabulary)
    wordCount = [totals[vocabulary[word]] for word in wordList]
    return pd.DataFrame({'word': wordList, 'wordCount': wordCount})


In [5]:
# Word-frequency tables for the train and test splits.
count_train_df, count_test_df = (getWordCount(split) for split in (trainDf, testDf))


In [6]:
# Vocabulary sizes (train, then test), one per line.
print(len(count_train_df), len(count_test_df), sep="\n")


9081
2890


In [7]:
# Reuse the pipeline loaded in the setup cell: it comes from the very same model directory,
# so loading it a second time only doubles the memory held by the word vectors.
nlp1 = nlp

In [8]:
# getWordVectors (defined below) returns the total number of tokens in the dataset, a tensor
# with one embedding row per token, the frequency of each word, and a mapping from each word
# to the IDs (row positions) of the reviews in which the word is present.

from spacy.tokenizer import Tokenizer
# Tokenizer built on nlp1's vocabulary only; no prefix/suffix/infix rules are passed
# (per the spaCy docs such a tokenizer splits on whitespace).
tokenizer1 = Tokenizer(nlp1.vocab)

def getWordVectors(dataframe, data_name = "text"):
    """Tokenize every review and embed every token occurrence.

    Each entry of ``dataframe[data_name]`` is lower-cased and split with the
    module-level ``tokenizer1``; every token occurrence is embedded with
    ``nlp1(token).vector``.

    Parameters
    ----------
    dataframe : object indexable by ``data_name`` (e.g. a DataFrame)
        ``dataframe[data_name]`` must yield strings, one per review.
    data_name : str, default "text"
        Name of the text column.

    Returns
    -------
    init_cnt : int
        Total number of token occurrences (also printed).
    bg : torch.Tensor
        Shape ``(init_cnt, dim)``; row ``i`` is the vector of the i-th token
        occurrence in reading order. ``dim`` is taken from the vectors
        themselves (300 when there are no tokens at all).
    freq_dict : dict
        word -> number of occurrences.
    occur_dict : dict
        word -> list of review positions, one entry per occurrence, so a
        position repeats when the word occurs several times in one review.
    """
    freq_dict = {}
    occur_dict = {}
    token_stream = []  # token texts in reading order; drives the row order of `bg`

    # NOTE(review): disable_pipes() is called without pipe names — confirm it disables
    # what was intended for the installed spaCy version.
    with nlp1.disable_pipes():
        # Single tokenization pass: gather counts, occurrences and the token order together.
        for review_idx, msg in enumerate(tqdm(dataframe[data_name])):
            for word in tokenizer1(msg.lower()):
                text = str(word)
                occur_dict.setdefault(text, []).append(review_idx)
                freq_dict[text] = freq_dict.get(text, 0) + 1
                token_stream.append(text)

        init_cnt = len(token_stream)
        print(init_cnt)

        # Embed each distinct word once; repeated tokens reuse the cached vector
        # instead of running the pipeline again.
        vector_cache = {}
        for text in tqdm(freq_dict):
            vector_cache[text] = torch.from_numpy(nlp1(text).vector)

        # Take the width from the data rather than assuming 300-d vectors.
        if vector_cache:
            dim = next(iter(vector_cache.values())).shape[0]
        else:
            dim = 300
        bg = torch.empty((init_cnt, dim))
        for row, text in enumerate(token_stream):
            bg[row] = vector_cache[text]

    return init_cnt, bg, freq_dict, occur_dict

In [9]:
# Show the trained network's layers, then the training matrix shape.
print(NNnet, trainData.shape, sep="\n")

NNNet(
  (fc1): Linear(in_features=300, out_features=1024, bias=True)
  (fc2): Linear(in_features=1024, out_features=512, bias=True)
  (fc3): Linear(in_features=512, out_features=128, bias=True)
  (fc4): Linear(in_features=128, out_features=2, bias=True)
)
(1536, 300)


In [10]:
# Training embeddings as a float32 tensor on the configured device.
train_bg = torch.FloatTensor(trainData).to(device)


In [11]:
# Forward pass of the trained network over all training embeddings.
# NOTE(review): `op` is only referenced by the commented-out line in the next cell.
op = NNnet(train_bg)

In [12]:
# x = op.cpu().detach().numpy()

In [13]:
# Sanity check: shape of the tensor built from trainData.
print(train_bg.shape)

torch.Size([1536, 300])


In [14]:
# Keep only the first 500 rows. From here on SHAP results cover just these rows, so anything
# that indexes them by review position must be limited to the first 500 reviews as well.
train_bg =  train_bg[:500]

In [15]:
# Deep Explainer object with the training data as background
# (train_bg has been cut down to its first 500 rows by this point).
e = shap.DeepExplainer(NNnet, train_bg)

In [None]:
# Calculating SHAP values
# The rows being explained are the same train_bg rows that serve as the explainer's background;
# the elapsed wall-clock time of the call (seconds) is printed afterwards.
st = time.time()
shap_vals = e.shap_values(train_bg)
print(time.time()-st)

In [None]:
# Dump the raw SHAP arrays at positions 0 and 1 — presumably one array per output class of the
# 2-way classifier; TODO confirm for the installed shap version.
print(shap_vals[0])
print(shap_vals[1])

In [None]:
# Shape of the SHAP array at position 1, then its mean along axis 1
# (one averaged score per row of shap_vals[1]).
print(shap_vals[1].shape)
shap_scores = shap_vals[1].mean(axis=1)

In [None]:
# SHAP scores exist only for the first len(shap_scores) reviews, so the token -> review maps
# are built from exactly those rows; using all of trainDf would yield review positions that
# have no SHAP score and break the per-token averaging.
explained_df = trainDf.iloc[:len(shap_scores)]
explain_cnt, explain_examples, explain_dict, explain_occur = getWordVectors(explained_df)

In [None]:
# Shape of the per-token embedding tensor returned by getWordVectors.
print(explain_examples.shape)

In [None]:
# Number of distinct tokens that have an occurrence list.
print(len(explain_occur))

In [None]:
# Function which finds the shap score for each word/token
# by averaging the shap scores of all the reviews in which the word occurs

def findShapForToken(shap_scores, explain_occur, explain_dict):
    """Average the per-review SHAP scores over every occurrence of each word.

    Parameters
    ----------
    shap_scores : sequence
        Score per review, indexable by review position and supporting ``len``.
    explain_occur : dict
        word -> list of review positions, one entry per occurrence.
    explain_dict : dict
        word -> occurrence count; must equal ``len(explain_occur[word])``.

    Returns
    -------
    dict
        word -> mean of ``shap_scores`` over the word's occurrence list.

    Raises
    ------
    AssertionError
        If a word's count disagrees with the length of its occurrence list.
    IndexError
        If an occurrence points at a review position that has no SHAP score;
        the message names the offending word and position.
    """
    n_scored = len(shap_scores)
    explain_scores = {}
    for word, indices in explain_occur.items():
        assert explain_dict[word] == len(indices)
        word_score = 0
        for index in indices:
            # The bound that matters is the number of scored reviews, not the number of
            # distinct words; fail with a message that identifies the culprit.
            if index >= n_scored:
                raise IndexError(
                    "word {!r} occurs in review {} but only {} SHAP scores are available".format(
                        word, index, n_scored
                    )
                )
            word_score += shap_scores[index]
        explain_scores[word] = word_score / len(indices)
    return explain_scores

In [None]:
 word_scores =findShapForToken(shap_scores, explain_occur, explain_dict)

In [None]:
# Converting into final lexicon format: one [word, score, word_count] row per token
final_data = [
    [token, score, explain_dict[token]]
    for token, score in word_scores.items()
]

In [None]:
# Tabular lexicon: one row per token with its averaged SHAP score and its occurrence count.
final_df =  pd.DataFrame(final_data, columns =['word', 'score', 'word_count'])

In [None]:
# CHANGE LEXICON SAVING DESTINATION HERE
# The file name follows lexiconDataset so a lexicon trained on one dataset is never written
# under another dataset's name (with lexiconDataset = "nrc_anger" the path is unchanged).

final_df.to_csv(path_or_buf="./FINAL_LEX/ffn_deepshap_" + lexiconDataset + "_1.csv", index=False)

In [None]:

##### EVALUATIONS

In [None]:
# NOTE(review): this replaces the final_df built above with a previously saved nrc_anger lexicon
# read from an absolute, user-specific path — confirm that this is intended and that the file
# exists on the machine running the evaluation.
final_df = pd.read_csv("/home/tjss/Final/embedding-lexica-creation/lexica/FFN_DeepShap/nrc_anger_ffn_deepshap.csv")

In [None]:
# List of datasets to be evaluated against; each name is handed to testFFN together with dataFolder.
# NOTE(review): all four are anger datasets — keep them in line with the lexicon loaded for evaluation.

dataList = ['nrc_anger', 'song_anger', 'dialog_anger', 'friends_anger']

In [None]:
lexiconWords, lexiconMap = getLexicon(df = final_df)

# One result row per evaluation dataset.
rows = [
    testFFN(NNnet, data, lexiconWords, lexiconMap, nlp, dataFolder)
    for data in dataList
]

results = pd.DataFrame(rows)
results.columns = ['TestData','modelAcc', 'modelF1', 'lexiconAcc', 'lexiconF1']
results.to_csv("Results.csv",index = False, index_label = False)

# Banner naming the training dataset, followed by the results table.
print("--------------------"+lexiconDataset+"-----------------------------")
print(results)
