## Sentiment Analysis

In [1]:
import pandas as pd
import numpy as np
import torch
import codecs
from nltk.tokenize import sent_tokenize
from transformers import AdamW, AutoTokenizer, AutoModelForSequenceClassification

In [2]:
%cd ..

/workspace/asurion_f22/Asurion-customer-propensity


In [3]:
%cd ..

/workspace/asurion_f22


In [4]:
df = pd.read_csv('data/transcript_may_encoded_100.csv')
# df = pd.read_csv('data/transcript_march_collated_encoded.csv')
df = df.drop(columns='Unnamed: 0')
# df = df[df['accepted_flg'] == True]
df.head()

Unnamed: 0,ID,sales_offer_date,accepted_flg,encrypted_collated_transcription
0,4f81e2619aae688a3a814da7f58afdecb9720e9fdd5070...,2022-05-31,False,Thank you for calling [CLIENT] Tech for speaki...
1,03bcb2c2dd3e29b8b7ba6c0cf8c7232c8637d6bc73a760...,2022-05-13,False,[CLIENT] tech My name is [NAME]. May I have yo...
2,c1eeaa4c1fe8030bb6f0deaa81a13b468001c699586f46...,2022-05-29,False,Hi. Thank you for call. tech you're speaking w...
3,5bf5d93c818534d9edcff8cbb45f28d3e0438cf7ab7850...,2022-05-22,False,Thank you for calling [CLIENT] Tech Coach. My ...
4,c484a81de8c25bcfb9d95f36976cfb425a3d2c7602a93c...,2022-05-21,False,Hi. Thank you for calling [CLIENT] Barcode Coa...


In [5]:
import nltk
nltk.download('all')

[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to /root/nltk_data...
[nltk_data]    |   Package abc is already up-to-date!
[nltk_data]    | Downloading package alpino to /root/nltk_data...
[nltk_data]    |   Package alpino is already up-to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger is already up-
[nltk_data]    |       to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger_ru to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger_ru is already
[nltk_data]    |       up-to-date!
[nltk_data]    | Downloading package basque_grammars to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Package basque_grammars is already up-to-date!
[nltk_data]    | Downloading package biocreative_ppi to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Pac

True

In [6]:
def sentence_token_nltk(str):
    sent_tokenize_list = sent_tokenize(str)
    return sent_tokenize_list

In [7]:
# pip install stanza

In [8]:
# import stanza
# def sentence_token_stanza(str):
#     nlp = stanza.Pipeline(lang='en', processors='tokenize')
#     doc = nlp(str)
#     sent_tokenize_list = [sentence.text for sentence in doc.sentences]
#     return sent_tokenize_list

In [9]:
from transformers import BertForSequenceClassification, BertTokenizer

def tokenize_inputs(text, checkpoint = 'ProsusAI/finbert', t=BertTokenizer):
    tokenizer = t.from_pretrained(checkpoint, is_split_into_words = True)
    return tokenizer(text, truncation=True)

In [10]:
# model = AutoModelForSequenceClassification.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('ProsusAI/finbert')

In [11]:
def sentiment_analysis(df, text_col:str, length:int):
    '''
    Function to do sentiment analysis by paragraphs into shorter pieces
    '''
    for i in df.index:
        # break paragraphs into sentences
        sent_list = sentence_token_nltk(df.loc[i, text_col])
        count = 0
        token_list, input_list = [], []
        mask_list, attention_list = [], []
        for sent in sent_list:
            # tokenize sentences
            res = tokenize_inputs(sent, checkpoint = 'ProsusAI/finbert', t=BertTokenizer)
            tokens, masks = res['input_ids'], res['attention_mask']
            count += len(tokens)

            if count <= length:
                token_list = token_list + tokens
                mask_list = mask_list + masks
            else:    
                input_list.append(token_list)
                attention_list.append(mask_list)
                token_list = tokens
                mask_list = masks
                count=len(tokens)

        input_list.append(token_list)
        attention_list.append(mask_list) 
        for j in range(len(input_list)):      
            # get required padding length
            pad_len = length - len(input_list[j])

            # check if list length satisfies required chunk size
            if pad_len > 0:
                # if padding length is more than 0, we gonna add padding
                input_list[j] = input_list[j] + [0] * pad_len
                attention_list[j] = attention_list[j] + [0] * pad_len

        input_ids = torch.FloatTensor(input_list)
        attention_mask = torch.FloatTensor(attention_list)
        input_dict = {'input_ids': input_ids.long(),'attention_mask': attention_mask.int()}
        outputs = model(**input_dict)
        
        probs = torch.nn.functional.softmax(outputs[0], dim=-1)
        probs = probs.mean(dim=0)
        winner = torch.argmax(probs).item()
        df.loc[i, 'sentiment'] = ['positive', 'negative', 'neutral'][winner]
    return df

In [12]:
df1=sentiment_analysis(df = df, text_col='encrypted_collated_transcription', length=512)

In [13]:
df1

Unnamed: 0,ID,sales_offer_date,accepted_flg,encrypted_collated_transcription,sentiment
0,4f81e2619aae688a3a814da7f58afdecb9720e9fdd5070...,2022-05-31,False,Thank you for calling [CLIENT] Tech for speaki...,neutral
1,03bcb2c2dd3e29b8b7ba6c0cf8c7232c8637d6bc73a760...,2022-05-13,False,[CLIENT] tech My name is [NAME]. May I have yo...,neutral
2,c1eeaa4c1fe8030bb6f0deaa81a13b468001c699586f46...,2022-05-29,False,Hi. Thank you for call. tech you're speaking w...,neutral
3,5bf5d93c818534d9edcff8cbb45f28d3e0438cf7ab7850...,2022-05-22,False,Thank you for calling [CLIENT] Tech Coach. My ...,neutral
4,c484a81de8c25bcfb9d95f36976cfb425a3d2c7602a93c...,2022-05-21,False,Hi. Thank you for calling [CLIENT] Barcode Coa...,neutral
5,b6716b08ca0380e7b5c7e9b78fcecf5787ccf3d61d1869...,2022-05-03,False,Thank you for calling Coach. This is Sean spea...,neutral
6,9f900ab27f1f249ce94d45947421b50ce9b9a85ec0d909...,2022-05-17,True,Thank you for calling [CLIENT] Tech Coach. Ben...,neutral
7,143fada0c669611e23e414d405a4f528a2528718a20c00...,2022-05-03,False,Morning. Thanks for [CLIENT] Tech Coach. My na...,neutral
8,8cbccda49661f95352e153157f8894c4d2d8e52d28740b...,2022-05-09,False,thank Hello? Hi. Thank you for calling [CLIENT...,neutral
9,46f3a9e86aa971a85d0d4cda2a886233d88f9e1da57eff...,2022-05-19,False,Hi. Thank you for [CLIENT] Tech Coach. My name...,neutral


In [14]:
# df.loc[26, 'encrypted_collated_transcription']