# Zero-shot classification

In [1]:
# Third-party packages used by this notebook; uncomment to install them.
# pip install pandas
# pip install -q transformers

In [2]:
import pandas as pd
from transformers import pipeline
import numpy as np

In [3]:
# Zero-shot classification pipeline using the facebook/bart-large-mnli checkpoint.
classifier = pipeline(
    task="zero-shot-classification",
    model="facebook/bart-large-mnli",
)

In [4]:
# Read only the columns used for filtering, grouping and classification.
sep = pd.read_csv(
    './datasets/sep_combi_final_preprocessed.csv',
    usecols=['num_comments', 'link_flair_text', 'post', 'covid_onset', 'cleaned_text'],
    low_memory=False,
)
sep.shape

(205594, 5)

In [5]:
# Keep 'Discussion'-flaired submissions with more than 13 comments,
# then renumber the rows from 0.
sep_zero = sep.loc[
    (sep['num_comments'] > 13)
    & (sep['link_flair_text'] == 'Discussion')
    & (sep['post'] == 'submission')
].reset_index(drop=True)

In [6]:
# Preview the first rows of the filtered frame.
sep_zero.head()

Unnamed: 0,num_comments,link_flair_text,post,covid_onset,cleaned_text
0,32,Discussion,submission,no,What is everyone planning on getting during th...
1,21,Discussion,submission,no,The sale went live today for Rouge members Wha...
2,33,Discussion,submission,no,Anyone getting anything with the 25 off of 50 ...
3,50,Discussion,submission,no,Which Brands would you like to be present in S...
4,14,Discussion,submission,no,Sephora Play AWFUL this month I have generally...


In [7]:
# Cast the text column to str before it is passed to the classifier.
sep_zero = sep_zero.assign(cleaned_text=sep_zero['cleaned_text'].astype(str))

In [8]:
# Print the column dtypes and non-null counts of the filtered frame.
sep_zero.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 215 entries, 0 to 214
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   num_comments     215 non-null    int64 
 1   link_flair_text  215 non-null    object
 2   post             215 non-null    object
 3   covid_onset      215 non-null    object
 4   cleaned_text     215 non-null    object
dtypes: int64(1), object(4)
memory usage: 8.5+ KB


In [9]:
# Candidate topics scored against each post by the zero-shot classifier.
candidate_labels = [
    'purchases',
    'skincare',
    'fragrance',
    'online order issues',
    'shipping issues',
    'makeup lipwear',
    'customer experience',
    'makeup natural',
    'customer rewards',
    'haircare',
    'appreciation',
    'makeup longwear',
]

In [10]:
def zero_shot(x):
    """Classify ``x`` against ``candidate_labels`` with the module-level classifier.

    The classifier is called with ``multi_label=True`` and its return value
    is handed back unchanged.
    """
    return classifier(x, candidate_labels, multi_label=True)

In [12]:
# Classify every post one at a time and store the raw classifier output per row.
sep_zero['zero_shot'] = sep_zero['cleaned_text'].apply(zero_shot)

In [13]:
def get_label_score_dict(row, threshold):
    """Turn one classifier result into a ``{label: 0 or 1}`` mapping.

    Parameters
    ----------
    row : mapping
        Must provide ``'labels'`` and ``'scores'`` as parallel iterables.
    threshold : float
        A label is flagged 1 only when its score is strictly greater than
        this value; otherwise it is flagged 0.

    Returns
    -------
    dict
        One entry per label, in the order the labels appear in ``row``.
    """
    return {
        label: (1 if score > threshold else 0)
        for label, score in zip(row['labels'], row['scores'])
    }

In [14]:
th = 0.6  # score cutoff: labels scoring strictly above this are flagged 1
# One {label: 0/1} dict per post, assembled into a frame with one column per label.
result = [get_label_score_dict(row, th) for row in sep_zero['zero_shot']]
result_df = pd.DataFrame(result)

In [15]:
# Display the per-label 0/1 indicator frame.
result_df

Unnamed: 0,purchases,customer experience,makeup lipwear,customer rewards,skincare,makeup longwear,fragrance,haircare,appreciation,online order issues,makeup natural,shipping issues
0,1,0,0,0,0,0,0,0,0,0,0,0
1,1,1,1,1,1,0,0,0,1,0,0,0
2,1,1,0,1,0,0,0,0,1,0,0,0
3,0,1,0,0,1,0,0,0,1,0,0,0
4,0,1,0,0,1,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
210,0,1,1,0,0,0,0,0,0,0,0,0
211,0,0,0,0,1,0,0,0,0,0,0,0
212,1,1,1,1,1,1,1,0,0,0,1,1
213,0,0,0,0,0,0,1,0,0,0,0,0


In [16]:
# Append the 0/1 label indicator columns to the posts frame.
# Indicator columns left over from an earlier run of this cell are dropped
# first, so re-running the cell does not create duplicate column names.
sep_zero = pd.concat(
    [sep_zero.drop(columns=result_df.columns, errors='ignore'), result_df],
    axis=1,
)

In [20]:
# Load the sentiment label and its score from the sentiment-analysis CSV.
sent = pd.read_csv(
    './datasets/sep_sentiment_analysis.csv',
    usecols=['sentiment', 'score'],
    low_memory=False,
)
sent.shape

(215, 2)

In [21]:
# The sentiment rows are attached to the posts by index, so both frames must
# have the same number of rows; otherwise concat would pad the shorter frame
# with NaN instead of failing.
assert len(sep_zero) == len(sent), "sep_zero and sent must have the same number of rows"
sep_zero_sent = pd.concat([sep_zero, sent], axis=1)

In [24]:
# Display the combined frame: posts, raw zero-shot output, label indicators and sentiment columns.
sep_zero_sent

Unnamed: 0,num_comments,link_flair_text,post,covid_onset,cleaned_text,zero_shot,purchases,customer experience,makeup lipwear,customer rewards,skincare,makeup longwear,fragrance,haircare,appreciation,online order issues,makeup natural,shipping issues,sentiment,score
0,32,Discussion,submission,no,What is everyone planning on getting during th...,{'sequence': 'What is everyone planning on get...,1,0,0,0,0,0,0,0,0,0,0,0,NEU,0.629714
1,21,Discussion,submission,no,The sale went live today for Rouge members Wha...,{'sequence': 'The sale went live today for Rou...,1,1,1,1,1,0,0,0,1,0,0,0,NEU,0.958753
2,33,Discussion,submission,no,Anyone getting anything with the 25 off of 50 ...,{'sequence': 'Anyone getting anything with the...,1,1,0,1,0,0,0,0,1,0,0,0,POS,0.965183
3,50,Discussion,submission,no,Which Brands would you like to be present in S...,{'sequence': 'Which Brands would you like to b...,0,1,0,0,1,0,0,0,1,0,0,0,POS,0.950909
4,14,Discussion,submission,no,Sephora Play AWFUL this month I have generally...,{'sequence': 'Sephora Play AWFUL this month I ...,0,1,0,0,1,0,0,1,0,0,0,0,NEG,0.975387
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
210,19,Discussion,submission,yes,hmm I did not know my fave MAC lipstick shade ...,{'sequence': 'hmm I did not know my fave MAC l...,0,1,1,0,0,0,0,0,0,0,0,0,NEG,0.770809
211,72,Discussion,submission,yes,predictions for 2022s birthday gifts I Am gett...,{'sequence': 'predictions for 2022s birthday g...,0,0,0,0,1,0,0,0,0,0,0,0,POS,0.947476
212,20,Discussion,submission,yes,Sephora Will Not Complete My Refund I returned...,{'sequence': 'Sephora Will Not Complete My Ref...,1,1,1,1,1,1,1,0,0,0,1,1,NEG,0.956649
213,46,Discussion,submission,yes,What Is your favourite Tom Ford Fragrance,{'sequence': 'What Is your favourite Tom Ford ...,0,0,0,0,0,0,1,0,0,0,0,0,NEU,0.754647


In [31]:
# Total the numeric columns for each (covid_onset, sentiment) group.
# numeric_only=True keeps the text columns and the dict-valued zero_shot column
# out of the aggregation; pandas >= 2.0 no longer drops them by default.
df = sep_zero_sent.groupby(['covid_onset', 'sentiment']).sum(numeric_only=True)
df

Unnamed: 0_level_0,Unnamed: 1_level_0,num_comments,purchases,customer experience,makeup lipwear,customer rewards,skincare,makeup longwear,fragrance,haircare,appreciation,online order issues,makeup natural,shipping issues,score
covid_onset,sentiment,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
no,NEG,231,3,7,1,4,2,0,1,2,0,3,0,1,6.926614
no,NEU,710,11,11,5,6,8,1,2,0,5,2,0,3,17.112125
no,POS,185,2,2,1,1,1,0,1,0,3,0,0,0,2.903797
yes,NEG,1996,25,26,10,3,14,4,7,0,6,5,2,3,33.429064
yes,NEU,4657,49,60,23,12,57,2,12,7,31,2,1,1,74.350288
yes,POS,1588,19,31,16,7,30,3,6,1,25,2,3,0,37.999106


In [30]:
# Write the combined frame to CSV without the row index.
sep_zero_sent.to_csv('datasets/sep_zeroshot_sentana.csv', index=False)