## Sentiment analysis

In [27]:
# One-off environment setup: uncomment and run whichever of these packages is missing.
# %pip install pandas
# %pip install -q transformers
# %pip install emoji

Collecting emoji
  Using cached emoji-1.7.0-py3-none-any.whl
Installing collected packages: emoji
Successfully installed emoji-1.7.0


In [30]:
import pandas as pd
from transformers import pipeline

In [31]:
from transformers import AutoTokenizer
# Load the tokenizer published with the "distilbert-base-uncased" checkpoint.
# NOTE(review): the sentiment pipeline created further down uses a different checkpoint
# ("finiteautomata/bertweet-base-sentiment-analysis") and is not handed this tokenizer, so the
# `tokenised_text` column presumably does not reflect what the sentiment model sees - confirm intent.
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

In [32]:
# Build the sentiment classifier once; `truncation=True` is forwarded to the pipeline so it is
# applied whenever the pipeline is called on a post.
sentiment_pipeline = pipeline(
    task="sentiment-analysis",
    model="finiteautomata/bertweet-base-sentiment-analysis",
    truncation=True,
)

In [33]:
# Read only the five columns this notebook works with from the preprocessed dump.
sep = pd.read_csv(
    './datasets/sep_combi_final_preprocessed.csv',
    usecols=[
        'num_comments',
        'link_flair_text',
        'post',
        'covid_onset',
        'cleaned_text',
    ],
    low_memory=False,
)
# Display (rows, columns) as the cell output.
sep.shape

(205594, 5)

In [34]:
# Keep submissions flaired 'Discussion' that have more than 13 comments,
# then renumber the surviving rows from 0.
sep_sent = sep.loc[
    sep['num_comments'].gt(13)
    & sep['link_flair_text'].eq('Discussion')
    & sep['post'].eq('submission')
].reset_index(drop=True)

In [35]:
# Cast the text column to str so every cell is a plain string for the tokenizer and the pipeline.
# NOTE(review): a missing value would be turned into the literal text 'nan' by this cast -
# confirm no such rows survive the filter above.
sep_sent = sep_sent.assign(cleaned_text=sep_sent['cleaned_text'].astype(str))

In [36]:
# Run the tokenizer on each cleaned post and store whatever it returns in a new column
# (the rendered table below shows these cells as [input_ids, attention_mask]).
# NOTE(review): the sentiment step reads `cleaned_text`, not this column, so this looks
# exploratory - confirm it is still needed before it is written to the output CSV.
sep_sent['tokenised_text'] = sep_sent['cleaned_text'].apply(lambda x: tokenizer(x))

In [37]:
# Score every cleaned post, one pipeline call per row, and keep the raw return value.
# Each cell then holds a list with one dict, e.g. [{'label': 'NEU', 'score': 0.63}]
# (see the row-0 output further down).
sep_sent['hugging_outcome'] = sep_sent['cleaned_text'].apply(lambda x: sentiment_pipeline(x))

# Helper that unpacks cells holding either a single dictionary or a list of dictionaries
def unpack_cell(df, unpacked_col, new_col, key):
    """
    Pull ``key`` out of the dictionaries stored in ``df[unpacked_col]``.

    The extracted values are written to ``df[new_col]`` (the dataframe is
    modified in place) and every distinct dictionary that was encountered is
    returned.

    Parameters
    ----------
    df : dataframe
        dataframe containing ``unpacked_col``; gains (or overwrites) ``new_col``.
    unpacked_col : string
        name of the column whose cells hold a dictionary, a list of
        dictionaries, an empty list, or a missing value (None / NaN).
    new_col : string
        name of the column that receives the unpacked values.
    key : string
        'key' that is being unpacked from each dictionary.

    Return
    ------
    list_col : list
        the distinct dictionaries that were unpacked, in first-seen order.

    Notes
    -----
    Per cell, ``df[new_col]`` receives:

    * list of dictionaries -> list with one value per dictionary
      (NaN where ``key`` is absent)
    * empty list, None or NaN -> NaN
    * single dictionary -> its value for ``key`` (NaN where absent)
    """
    # Imported locally: the notebook's import cell provides neither name.
    import numpy as np

    try:
        from tqdm import tqdm
    except ImportError:
        # The progress bar is cosmetic; iterate silently when tqdm is not installed.
        def tqdm(iterable, **_ignored):
            return iterable

    # store all possible dictionaries
    list_col = []

    # store list to attach to new_col
    cat_list = []

    # Series.items() is used because Series.iteritems() was removed in pandas 2.0.
    for _, row in tqdm(df[unpacked_col].items(), total=len(df)):
        if isinstance(row, list):
            if not row:
                # nothing to unpack from an empty list
                cat_list.append(np.nan)
                continue

            temp_list = []
            for entry in row:
                # remember each distinct dictionary only once
                if entry not in list_col:
                    list_col.append(entry)
                temp_list.append(entry.get(key, np.nan))
            cat_list.append(temp_list)
        elif row is None or (isinstance(row, float) and row != row):
            # missing cell (None or NaN): keep it missing instead of failing on .get
            cat_list.append(np.nan)
        else:
            # any other cell is expected to be a single dictionary
            if row not in list_col:
                list_col.append(row)
            cat_list.append(row.get(key, np.nan))

    # attach list to new_col in df
    df[new_col] = cat_list

    return list_col

In [38]:
# Lift the column-width limit so the long post texts are shown in full
# (this stays in effect for later cells too), then preview the first five rows.
pd.options.display.max_colwidth = None
sep_sent.head(5)

Unnamed: 0,num_comments,link_flair_text,post,covid_onset,cleaned_text,tokenised_text,hugging_outcome
0,32,Discussion,submission,no,What is everyone planning on getting during the VIB sale,"[input_ids, attention_mask]","[{'label': 'NEU', 'score': 0.6297136545181274}]"
1,21,Discussion,submission,no,The sale went live today for Rouge members What damage did you do Share your goodies here Also if you were not Rouge but decided to pick up some things to grab before they sell out please share as well What I purchased Bite Agave Lip Mask Tatcha Skincare for Makeup Lovers set or some similar name the one with the lip balm mask and water cream Fenty Beauty Invisimatte Blotting Powder Sephora foundation brush Fenty Beauty Gloss Bomb Dr Jart Ceramidin Liquid Caudaulie Vinoperfect Brightening set Nest Fragrances Diffuser in Moroccan Amber Pat McGrath Mothership II Eyshadow Palette in Sublime,"[input_ids, attention_mask]","[{'label': 'NEU', 'score': 0.9587526321411133}]"
2,33,Discussion,submission,no,Anyone getting anything with the 25 off of 50 they sent via email I love this coupon because I loooove SR Blue Moon which is exactly 50 Anyone ordering anything EDIT The code is 2017REWARD and it is for Rouge only I think,"[input_ids, attention_mask]","[{'label': 'POS', 'score': 0.9651825428009033}]"
3,50,Discussion,submission,no,Which Brands would you like to be present in Sephora Opinions I think I would like to see more Korean brands there because I am just in love with K skincare,"[input_ids, attention_mask]","[{'label': 'POS', 'score': 0.9509091973304749}]"
4,14,Discussion,submission,no,Sephora Play AWFUL this month I have generally been fairly impressed with Play there have been a couple products here and there that I was irritated by as I have clearly marked on my profile that I DO NOT need those like why would you send mystifying products and dry shampoo to someone with dry skin and hair but overall its been alright But this month I am LIVID 2 samples from crappy little brands I Have never even heard of PLUS a Sephora brand sample and NO BAG Did everyone get tired of their shit and unsubscribe so now they need to make cutbacks Why was this months SO bad I did not even get what they teased me with in the monthly email Am I the only one who feels this way Because I honestly feel ripped off I think Ill be unsubscribing,"[input_ids, attention_mask]","[{'label': 'NEG', 'score': 0.9753867387771606}]"


In [39]:
# Inspect the raw pipeline result stored for the row labelled 0.
sep_sent.loc[0, 'hugging_outcome']

[{'label': 'NEU', 'score': 0.6297136545181274}]

In [40]:
# Collect the 'label' of every prediction in one pipeline result,
# then keep the first one
def get_sentiment(x):
    """Return the ``'label'`` of the first prediction in ``x``.

    ``x`` is an iterable of dictionaries that each carry a ``'label'`` key.
    Every entry is read, so a KeyError is raised if any entry lacks
    ``'label'``; an empty ``x`` raises IndexError.
    """
    labels = []
    for prediction in x:
        labels.append(prediction['label'])
    return labels[0]

In [46]:
def get_score(x):
    """Return the ``'score'`` of the first prediction in ``x``.

    ``x`` is an iterable of dictionaries that each carry a ``'score'`` key.
    Every entry is read, so a KeyError is raised if any entry lacks
    ``'score'``; an empty ``x`` raises IndexError.
    """
    scores = list(map(lambda prediction: prediction['score'], x))
    return scores[0]

In [47]:
# Flatten each raw pipeline result into its predicted label.
sep_sent['sentiment'] = sep_sent['hugging_outcome'].apply(get_sentiment)

In [48]:
# Flatten each raw pipeline result into the score attached to its predicted label.
sep_sent['score'] = sep_sent['hugging_outcome'].apply(get_score)

In [52]:
# Preview the last five rows, now including the `sentiment` and `score` columns.
sep_sent.tail(n=5)

Unnamed: 0,num_comments,link_flair_text,post,covid_onset,cleaned_text,tokenised_text,hugging_outcome,sentiment,score
210,19,Discussion,submission,yes,hmm I did not know my fave MAC lipstick shade that was finished had this much product left hiding in the bottom,"[input_ids, attention_mask]","[{'label': 'NEG', 'score': 0.7708085775375366}]",NEG,0.770809
211,72,Discussion,submission,yes,predictions for 2022s birthday gifts I Am getting excited to see what the birthday gifts for 2022 are going to be so I thought it would be fun to see what other people think could be in next years line up I Am thinking maybe some kind of Charlotte Tilbury set since its one of the hottest brands right now,"[input_ids, attention_mask]","[{'label': 'POS', 'score': 0.9474762678146362}]",POS,0.947476
212,20,Discussion,submission,yes,Sephora Will Not Complete My Refund I returned two items to Sephora by mail on Dec 1st I received the refund on one of the products but when I called the help line they said the other item came incomplete essentially saying the actual product was not in the return package I 100 put the the product in the box Also how dumb would I have to be to keep the product I got super frustrated that they thought I would do such a thing Is there anything I can do Or is it essential the distributors word over mine,"[input_ids, attention_mask]","[{'label': 'NEG', 'score': 0.9566491842269897}]",NEG,0.956649
213,46,Discussion,submission,yes,What Is your favourite Tom Ford Fragrance,"[input_ids, attention_mask]","[{'label': 'NEU', 'score': 0.754647433757782}]",NEU,0.754647
214,80,Discussion,submission,yes,How many foundations do you own I have so many foundations that I want to try How many do you own I have 2 already and do not use them every day so I am afraid they will spoil before I get to use them,"[input_ids, attention_mask]","[{'label': 'NEU', 'score': 0.8904446959495544}]",NEU,0.890445


In [51]:
# Persist the scored frame without the index column.
# `sep_sent` is already a DataFrame, so it is written directly.
sep_sent.to_csv('datasets/sep_sentiment_analysis.csv', index=False)