# Data Labeling

### Import Packages

In [1]:
import pandas as pd
import matplotlib.pyplot as plt

import nltk
from nltk.sentiment import SentimentIntensityAnalyzer
# VADER reads its lexicon from NLTK's data directory. On a machine where the
# `vader_lexicon` resource has never been downloaded the constructor raises
# LookupError, so fetch it on demand and retry once. When the resource is
# already present no download is attempted.
try:
    sentiment_analyzer = SentimentIntensityAnalyzer()
except LookupError:
    nltk.download('vader_lexicon')
    sentiment_analyzer = SentimentIntensityAnalyzer()

from textblob import TextBlob
import tweetnlp

### Load Dataset

In [2]:
# Read the combined Reddit CSV into `df`, decoding the bytes with the
# 'latin' codec, then preview the first rows.
# NOTE(review): the preview shows sequences such as "?Â?" in the `text`
# column, which looks like mojibake already present in the file — confirm
# the file's real encoding before relying on the raw `text` column.
df = pd.read_csv(
    'reddit_combined_clean.csv',
    encoding='latin',
)
df.head()

Unnamed: 0,category,subreddit,author,created_date,score,text,text_clean
0,comment,r/china,SuspiciousStable9649,5/14/2022 3:53,1,"- Workers ?Â?can see the ceiling, due to mar...",workers can see the ceiling due to market matu...
1,comment,r/unitedkingdom,Phallic_Entity,11/1/2020 1:21,8,> The country will be broke and in masses of ...,the country will be broke and in masses of deb...
2,comment,r/usa,dannylenwinn,4/2/2021 16:28,-1,"The U.S. economy added a whopping 916,000 job...",the us economy added a whopping jobs last mont...
3,comment,r/cscareerquestions,,11/30/2022 21:11,26,?Â?I don?Â?t have cobol mentioned on my resu...,i dont have cobol mentioned on my resume okay ...
4,comment,r/cscareerquestions,soprof,11/17/2016 9:41,13,20 years...\nThat's like a century in IT. Quak...,years thats like a century in it quake and dia...


### Label 1: VADER

In [3]:
%%time
# Score every cleaned text with VADER, keyed by the row's index label so the
# scores can be aligned back onto `df` later. Iterating the single column
# avoids building a full row Series per record just to read one field.
res = {
    idx: sentiment_analyzer.polarity_scores(text)
    for idx, text in df['text_clean'].items()
}

# `res` maps index label -> score dict; transposing turns each index label
# into a row and each score key (neg / neu / pos / compound) into a column.
vader_df = pd.DataFrame(res).T
vader_df.head()

Wall time: 9.43 s


Unnamed: 0,neg,neu,pos,compound
0,0.185,0.784,0.031,-0.9119
1,0.151,0.8,0.049,-0.5574
2,0.086,0.839,0.075,-0.5532
3,0.0,0.934,0.066,0.2263
4,0.063,0.806,0.131,0.6532


In [4]:
# Derive the VADER label directly from the compound score instead of
# concatenating all four score columns onto `df` and dropping them again.
# Assignment aligns on the index, which `vader_df` shares with `df`.
# Sign rule: compound > 0 -> POSITIVE, exactly 0 -> NEUTRAL, otherwise NEGATIVE.
df['label_1'] = vader_df['compound'].map(
    lambda score: 'POSITIVE' if score > 0 else 'NEUTRAL' if score == 0 else 'NEGATIVE'
)
df.head()

Unnamed: 0,category,subreddit,author,created_date,score,text,text_clean,label_1
0,comment,r/china,SuspiciousStable9649,5/14/2022 3:53,1,"- Workers ?Â?can see the ceiling, due to mar...",workers can see the ceiling due to market matu...,NEGATIVE
1,comment,r/unitedkingdom,Phallic_Entity,11/1/2020 1:21,8,> The country will be broke and in masses of ...,the country will be broke and in masses of deb...,NEGATIVE
2,comment,r/usa,dannylenwinn,4/2/2021 16:28,-1,"The U.S. economy added a whopping 916,000 job...",the us economy added a whopping jobs last mont...,NEGATIVE
3,comment,r/cscareerquestions,,11/30/2022 21:11,26,?Â?I don?Â?t have cobol mentioned on my resu...,i dont have cobol mentioned on my resume okay ...,POSITIVE
4,comment,r/cscareerquestions,soprof,11/17/2016 9:41,13,20 years...\nThat's like a century in IT. Quak...,years thats like a century in it quake and dia...,POSITIVE


### Label 2: TextBlob

In [5]:
%%time
# TextBlob polarity per cleaned text, kept as a standalone Series so no
# temporary column has to be added to `df` and removed again.
polarity = df['text_clean'].map(lambda text: TextBlob(text).sentiment.polarity)

# Same sign rule as label_1: > 0 -> POSITIVE, exactly 0 -> NEUTRAL,
# otherwise NEGATIVE.
df['label_2'] = polarity.map(
    lambda score: 'POSITIVE' if score > 0 else 'NEUTRAL' if score == 0 else 'NEGATIVE'
)
df.head()

Wall time: 4.48 s


Unnamed: 0,category,subreddit,author,created_date,score,text,text_clean,label_1,label_2
0,comment,r/china,SuspiciousStable9649,5/14/2022 3:53,1,"- Workers ?Â?can see the ceiling, due to mar...",workers can see the ceiling due to market matu...,NEGATIVE,POSITIVE
1,comment,r/unitedkingdom,Phallic_Entity,11/1/2020 1:21,8,> The country will be broke and in masses of ...,the country will be broke and in masses of deb...,NEGATIVE,NEGATIVE
2,comment,r/usa,dannylenwinn,4/2/2021 16:28,-1,"The U.S. economy added a whopping 916,000 job...",the us economy added a whopping jobs last mont...,NEGATIVE,POSITIVE
3,comment,r/cscareerquestions,,11/30/2022 21:11,26,?Â?I don?Â?t have cobol mentioned on my resu...,i dont have cobol mentioned on my resume okay ...,POSITIVE,POSITIVE
4,comment,r/cscareerquestions,soprof,11/17/2016 9:41,13,20 years...\nThat's like a century in IT. Quak...,years thats like a century in it quake and dia...,POSITIVE,POSITIVE


### Label 3: TweetNLP


In [6]:
# Load TweetNLP's sentiment classifier. The recorded output shows the weights
# being initialised from the cardiffnlp/twitter-roberta-base-sentiment-latest
# checkpoint; the "weights ... were not used" notice is emitted by that load.
model = tweetnlp.load_model('sentiment')

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
  return torch._C._cuda_getDeviceCount() > 0


In [7]:
%%time
# Classify every cleaned text with the TweetNLP model, keyed by the row's
# index label. Iterating the single column avoids materialising a full row
# Series per record.
# NOTE(review): this cell took ~53 minutes in the recorded run. If the
# installed tweetnlp version accepts a list of texts (with a batch size),
# batching would likely be much faster — confirm against the tweetnlp docs.
res = {
    idx: model.sentiment(text)
    for idx, text in df['text_clean'].items()
}

# Each result is a dict with a 'label' entry; transpose so every input row
# becomes one output row, upper-case the label to match label_1 / label_2,
# and keep only the normalised column.
pipeline_df = (
    pd.DataFrame(res).T
    .assign(label_3=lambda frame: frame['label'].str.upper())
    .drop(columns=['label'])
)
pipeline_df.head()

Wall time: 53min 35s


Unnamed: 0,label_3
0,NEGATIVE
1,NEGATIVE
2,POSITIVE
3,NEUTRAL
4,NEUTRAL


In [8]:
# Attach the TweetNLP columns to `df` (aligned on the index). Any copy of
# those columns left over from a previous run of this cell is dropped first,
# so re-running the cell replaces `label_3` instead of appending a duplicate
# column that would break the later per-column label mapping.
df = pd.concat(
    [df.drop(columns=pipeline_df.columns, errors='ignore'), pipeline_df],
    axis=1,
)
df.head()

Unnamed: 0,category,subreddit,author,created_date,score,text,text_clean,label_1,label_2,label_3
0,comment,r/china,SuspiciousStable9649,5/14/2022 3:53,1,"- Workers ?Â?can see the ceiling, due to mar...",workers can see the ceiling due to market matu...,NEGATIVE,POSITIVE,NEGATIVE
1,comment,r/unitedkingdom,Phallic_Entity,11/1/2020 1:21,8,> The country will be broke and in masses of ...,the country will be broke and in masses of deb...,NEGATIVE,NEGATIVE,NEGATIVE
2,comment,r/usa,dannylenwinn,4/2/2021 16:28,-1,"The U.S. economy added a whopping 916,000 job...",the us economy added a whopping jobs last mont...,NEGATIVE,POSITIVE,POSITIVE
3,comment,r/cscareerquestions,,11/30/2022 21:11,26,?Â?I don?Â?t have cobol mentioned on my resu...,i dont have cobol mentioned on my resume okay ...,POSITIVE,POSITIVE,NEUTRAL
4,comment,r/cscareerquestions,soprof,11/17/2016 9:41,13,20 years...\nThat's like a century in IT. Quak...,years thats like a century in it quake and dia...,POSITIVE,POSITIVE,NEUTRAL


### Final Label

In [9]:
# Work on an independent copy of the three per-model label columns so the
# voting columns added next do not end up on `df`.
label_df = df.loc[:, ['label_1', 'label_2', 'label_3']].copy()
label_df.head()

Unnamed: 0,label_1,label_2,label_3
0,NEGATIVE,POSITIVE,NEGATIVE
1,NEGATIVE,NEGATIVE,NEGATIVE
2,NEGATIVE,POSITIVE,POSITIVE
3,POSITIVE,POSITIVE,NEUTRAL
4,POSITIVE,POSITIVE,NEUTRAL


In [10]:
def _label_to_vote(label):
    """Map a sentiment label to a vote: POSITIVE -> 1, NEUTRAL -> 0, anything else -> -1."""
    if label == 'POSITIVE':
        return 1
    if label == 'NEUTRAL':
        return 0
    # Everything that is not POSITIVE / NEUTRAL (including unexpected or
    # missing values) counts as a negative vote.
    return -1


def _vote_total_to_label(total):
    """Map the summed votes to a label: > 0 POSITIVE, exactly 0 NEUTRAL, otherwise NEGATIVE."""
    if total > 0:
        return 'POSITIVE'
    if total == 0:
        return 'NEUTRAL'
    return 'NEGATIVE'


# Convert each model's label into a numeric vote (one helper instead of three
# copy-pasted lambdas).
for source in ('label_1', 'label_2', 'label_3'):
    label_df[source + '_num'] = label_df[source].map(_label_to_vote)

# The final label is the sign of the summed votes, so e.g. one POSITIVE plus
# two NEUTRAL votes yields POSITIVE.
label_df['final_label_num'] = (
    label_df['label_1_num'] + label_df['label_2_num'] + label_df['label_3_num']
)
label_df['final_label'] = label_df['final_label_num'].map(_vote_total_to_label)

In [11]:
# Attach the aggregated label to `df` (aligned on the index). A `final_label`
# column left over from a previous run of this cell is dropped first, so
# re-running replaces the column instead of appending a duplicate.
df = pd.concat(
    [df.drop(columns=['final_label'], errors='ignore'), label_df['final_label']],
    axis=1,
)
df.head()

Unnamed: 0,category,subreddit,author,created_date,score,text,text_clean,label_1,label_2,label_3,final_label
0,comment,r/china,SuspiciousStable9649,5/14/2022 3:53,1,"- Workers ?Â?can see the ceiling, due to mar...",workers can see the ceiling due to market matu...,NEGATIVE,POSITIVE,NEGATIVE,NEGATIVE
1,comment,r/unitedkingdom,Phallic_Entity,11/1/2020 1:21,8,> The country will be broke and in masses of ...,the country will be broke and in masses of deb...,NEGATIVE,NEGATIVE,NEGATIVE,NEGATIVE
2,comment,r/usa,dannylenwinn,4/2/2021 16:28,-1,"The U.S. economy added a whopping 916,000 job...",the us economy added a whopping jobs last mont...,NEGATIVE,POSITIVE,POSITIVE,POSITIVE
3,comment,r/cscareerquestions,,11/30/2022 21:11,26,?Â?I don?Â?t have cobol mentioned on my resu...,i dont have cobol mentioned on my resume okay ...,POSITIVE,POSITIVE,NEUTRAL,POSITIVE
4,comment,r/cscareerquestions,soprof,11/17/2016 9:41,13,20 years...\nThat's like a century in IT. Quak...,years thats like a century in it quake and dia...,POSITIVE,POSITIVE,NEUTRAL,POSITIVE


### Store Dataset

In [12]:
# Persist the labelled frame next to the input file; the index is not
# written, so the CSV holds only the data columns.
df.to_csv(
    path_or_buf='reddit_combined_clean_label.csv',
    index=False,
)