# Zero-shot classification

In [1]:
# Dependencies (uncomment and run once if they are not already installed):
# %pip install pandas
# %pip install -q transformers

In [1]:
import pandas as pd
from transformers import pipeline
import numpy as np

In [2]:
# Zero-shot classification pipeline built on the facebook/bart-large-mnli
# checkpoint; zero_shot() below calls it once per text.
classifier = pipeline(
    task="zero-shot-classification",
    model="facebook/bart-large-mnli",
)

In [4]:
# Load only the columns this notebook works with, then show (rows, columns).
sep = pd.read_csv(
    './datasets/sep_combi_final_preprocessed.csv',
    usecols=[
        'num_comments',
        'score',
        'link_flair_text',
        'post',
        'covid_onset',
        'cleaned_text',
    ],
    low_memory=False,
)
sep.shape

(205594, 6)

In [5]:
# Keep rows that are submissions with more than 13 comments, then renumber
# the surviving rows from 0.
sep_zero = (
    sep
    .loc[lambda frame: (frame['num_comments'] > 13) & (frame['post'] == 'submission')]
    .reset_index(drop=True)
)

In [6]:
# Preview the first rows of the filtered frame.
sep_zero.head()

Unnamed: 0,num_comments,score,link_flair_text,post,covid_onset,cleaned_text
0,15,4,,submission,no,Trans woman here wanting a Sephora makeover I ...
1,17,3,,submission,no,Does Sephora still give out physical cards for...
2,14,3,,submission,no,VIB Rouge Welcome Gift Availability I became V...
3,14,6,,submission,no,Pressed powder foundation brush recommendation...
4,18,4,,submission,no,Sephora Play Subscription Box Anyone here memb...


In [7]:
# Cast the text column to str so every value later handed to the classifier
# is a string.
sep_zero['cleaned_text'] = sep_zero['cleaned_text'].astype(str)

In [8]:
# Summarise row count, per-column non-null counts and dtypes after the cast.
sep_zero.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3764 entries, 0 to 3763
Data columns (total 6 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   num_comments     3764 non-null   int64 
 1   score            3764 non-null   int64 
 2   link_flair_text  1960 non-null   object
 3   post             3764 non-null   object
 4   covid_onset      3764 non-null   object
 5   cleaned_text     3764 non-null   object
dtypes: int64(2), object(4)
memory usage: 176.6+ KB


In [9]:
# Labels every text is scored against by the zero-shot classifier.
candidate_labels = [
    'purchases',
    'skincare',
    'fragrance',
    'online order issues',
    'shipping issues',
    'makeup lipwear',
    'customer experience',
    'makeup natural',
    'customer rewards',
    'haircare',
    'appreciation',
    'makeup longwear',
]

In [10]:
def zero_shot(x):
    """Score one input against ``candidate_labels`` with the zero-shot pipeline.

    Args:
        x: Value forwarded unchanged to ``classifier`` (this notebook passes
            one ``cleaned_text`` string per call).

    Returns:
        The object returned by ``classifier`` for ``x``; later cells read its
        ``'labels'`` and ``'scores'`` entries.
    """
    # multi_label=True: per the transformers docs each label is scored
    # independently, so several labels can score high for the same text.
    return classifier(x, candidate_labels, multi_label=True)

In [12]:
# Classify every text and store the raw classifier result next to its row.
sep_zero['zero_shot'] = sep_zero['cleaned_text'].apply(zero_shot)

In [13]:
def get_label_score_dict(row, threshold):
    """Convert one classifier result into 0/1 flags keyed by label.

    Args:
        row: Mapping with parallel ``'labels'`` and ``'scores'`` sequences.
        threshold: Cut-off value; a label is flagged only when its score is
            strictly greater than this.

    Returns:
        dict mapping each label to 1 if its score exceeds ``threshold``,
        otherwise 0.
    """
    return {
        label: (1 if score > threshold else 0)
        for label, score in zip(row['labels'], row['scores'])
    }

In [14]:
th = 0.6    # score threshold; substitute whatever value you want
# One {label: 0/1} dict per classified row, then one indicator column per label.
result = [get_label_score_dict(row, th) for row in sep_zero['zero_shot']]
result_df = pd.DataFrame(result)

In [15]:
# Display the per-label 0/1 indicator frame built from the thresholded scores.
result_df

Unnamed: 0,customer experience,customer rewards,skincare,purchases,makeup lipwear,appreciation,makeup longwear,makeup natural,shipping issues,online order issues,fragrance,haircare
0,1,1,1,1,0,0,0,0,0,0,0,0
1,1,1,1,0,1,1,0,0,0,0,1,0
2,1,1,1,1,0,1,0,0,1,1,1,0
3,1,0,1,0,0,1,0,0,0,0,0,0
4,1,0,0,0,0,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
3759,0,0,1,0,0,0,0,0,0,0,0,0
3760,1,1,1,1,1,0,1,0,0,0,1,0
3761,1,0,1,1,0,0,0,0,0,0,0,0
3762,1,1,1,0,1,0,0,0,0,0,0,0


In [16]:
# Attach the label indicator columns to the posts. This cell rebinds sep_zero
# to itself plus result_df, so label columns left over from an earlier run are
# dropped first; otherwise re-running the cell would append a second copy of
# every label column. On a first run nothing is dropped.
sep_zero = pd.concat(
    [sep_zero.drop(columns=result_df.columns, errors='ignore'), result_df],
    axis=1,
)

In [17]:
# Load the sentiment label and its score from the sentiment-analysis CSV,
# then show (rows, columns).
sent = pd.read_csv(
    './datasets/sep_sentiment_analysis_bigger.csv',
    usecols=['sentiment', 'score'],
    low_memory=False,
)
sent.shape

(3764, 2)

In [18]:
# The two frames are joined side by side purely by row position/index, so a
# row-count mismatch would be padded with NaN instead of raising. Fail loudly.
assert len(sent) == len(sep_zero), (
    f"sentiment rows ({len(sent)}) do not match post rows ({len(sep_zero)})"
)
# NOTE(review): both frames carry a 'score' column, so the result holds two
# columns with that name. Left unchanged here because this frame is written to
# CSV further down and renaming would change that file's header.
sep_zero_sent = pd.concat([sep_zero, sent], axis=1)

In [19]:
# Display the combined frame: posts, label indicators and sentiment columns.
sep_zero_sent

Unnamed: 0,num_comments,score,link_flair_text,post,covid_onset,cleaned_text,zero_shot,customer experience,customer rewards,skincare,...,makeup lipwear,appreciation,makeup longwear,makeup natural,shipping issues,online order issues,fragrance,haircare,score.1,sentiment
0,15,4,,submission,no,Trans woman here wanting a Sephora makeover I ...,{'sequence': 'Trans woman here wanting a Sepho...,1,1,1,...,0,0,0,0,0,0,0,0,0.895318,NEU
1,17,3,,submission,no,Does Sephora still give out physical cards for...,{'sequence': 'Does Sephora still give out phys...,1,1,1,...,1,1,0,0,0,0,1,0,0.908152,NEU
2,14,3,,submission,no,VIB Rouge Welcome Gift Availability I became V...,{'sequence': 'VIB Rouge Welcome Gift Availabil...,1,1,1,...,0,1,0,0,1,1,1,0,0.734637,NEU
3,14,6,,submission,no,Pressed powder foundation brush recommendation...,{'sequence': 'Pressed powder foundation brush ...,1,0,1,...,0,1,0,0,0,0,0,0,0.849880,NEU
4,18,4,,submission,no,Sephora Play Subscription Box Anyone here memb...,{'sequence': 'Sephora Play Subscription Box An...,1,0,0,...,0,1,0,0,0,0,0,0,0.955396,POS
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3759,80,1,Discussion,submission,yes,How many foundations do you own I have so many...,{'sequence': 'How many foundations do you own ...,0,0,1,...,0,0,0,0,0,0,0,0,0.890445,NEU
3760,81,1,PSA,submission,yes,4X Points on Entire Order when you purchase an...,{'sequence': '4X Points on Entire Order when y...,1,1,1,...,1,0,1,0,0,0,1,0,0.502281,NEU
3761,20,1,CANADA,submission,yes,You Are telling me I wasted 100 points on THIS...,{'sequence': 'You Are telling me I wasted 100 ...,1,0,1,...,0,0,0,0,0,0,0,0,0.981611,NEG
3762,19,1,PSA,submission,yes,FYI the Points Multiplier only works on Sephor...,{'sequence': 'FYI the Points Multiplier only w...,1,1,1,...,1,0,0,0,0,0,0,0,0.879546,NEU


In [20]:
# Total every numeric column per (covid_onset, sentiment) group.
# numeric_only=True excludes the text/object columns (e.g. cleaned_text,
# zero_shot) explicitly instead of relying on pandas to drop them implicitly,
# which newer pandas versions no longer do.
df = sep_zero_sent.groupby(['covid_onset', 'sentiment']).sum(numeric_only=True)
df

Unnamed: 0_level_0,Unnamed: 1_level_0,num_comments,score,customer experience,customer rewards,skincare,purchases,makeup lipwear,appreciation,makeup longwear,makeup natural,shipping issues,online order issues,fragrance,haircare,score
covid_onset,sentiment,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
no,NEG,5484,2911,187,71,117,132,49,34,22,10,58,59,43,10,184.823355
no,NEU,10096,6689,249,148,186,172,70,118,15,7,31,35,55,11,302.80308
no,POS,5184,4513,131,72,77,84,34,98,7,3,11,8,24,7,147.214815
yes,NEG,16593,2471,417,90,251,326,125,98,45,12,132,142,94,39,439.671107
yes,NEU,45167,7960,924,262,701,744,265,415,51,33,156,106,201,103,1263.811874
yes,POS,26986,5161,610,219,410,449,184,544,26,29,33,32,124,41,751.240293


In [21]:
# Write the combined frame to CSV, omitting the index column.
sep_zero_sent.to_csv('datasets/sep_zeroshot_sentana_bigger.csv', index=False)