## Using Snorkel labeling framework  
Reference: https://www.snorkel.org/use-cases

**Note: best to execute from separate VM due to multiple dependencies**

In [1]:
import sys
print(sys.version)

3.7.12 | packaged by conda-forge | (default, Oct 26 2021, 06:08:53) 
[GCC 9.4.0]


In [2]:
import warnings
# warnings.filterwarnings('ignore')

In [3]:
import os
import pandas as pd

from textblob import TextBlob

import re
import operator

In [4]:
from tqdm.auto import tqdm
tqdm.pandas()

In [5]:
# !pip install snorkel

In [6]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn import metrics

In [7]:
import snorkel as sk

from snorkel.labeling import labeling_function, PandasLFApplier, LFAnalysis, \
                            LabelingFunction, filter_unlabeled_dataframe

from snorkel.labeling.model.label_model import LabelModel
from snorkel.labeling.model.baselines import MajorityLabelVoter



from snorkel.analysis import get_label_buckets
from snorkel.preprocess import preprocessor
from snorkel.labeling.lf.nlp import nlp_labeling_function
from snorkel.utils import probs_to_preds

#### Copy files to local FS from GCP bucket

In [8]:
path_tweets = '/home/jupyter/data/tweets'

if not os.path.isdir(path_tweets): os.mkdir(path_tweets)

### Read and pre-process data

In [9]:
tweets_raw = pd.read_json('https://storage.googleapis.com/msca-bdp-data-open/tweets/tweets.json', orient='records', lines=True)
tweets_raw = tweets_raw.sample(n=1000)
tweets_raw.shape

(1000, 10)

In [10]:
pd.set_option('display.max_colwidth', None)

In [11]:
tweets_raw.head(5)

Unnamed: 0,id,lang,created_at,screen_name,location,retweet_count,retweet,text_no_rt,tweet_class,text_clean
4355,8.869336e+17,en,2017-07-17 13:00:17+00:00,irvin_iew2,"Pennsylvania, USA",0.0,,DISSA Work Study Office Assistant - University Park Campus #PSUWorkStudy https://t.co/1fRhWYmAmM,university,DISSA Work Study Office Assistant - University Park Campus #PSUWorkStudy https://t.co/1fRhWYmAmM
36383,1.045621e+18,en,2018-09-28 10:28:40+00:00,OlisahElvis,"Akwa Ibom, Nigeria",0.0,RT,@seunthemarketer: My boss driver who could not go to the university ensured that he sent his fiancée to the university. \n\nImmediately au…,university,RT @seunthemarketer: My boss driver who could not go to the university ensured that he sent his fiancée to the university. Immediately au…
40354,9.69696e+17,en,2018-03-02 22:08:43+00:00,nmp658,,0.0,RT,"@shannonrwatts: Another day in America, another school shooting. Meanwhile, @NRA lobbyists are visiting the President in the Oval Office…",other,"RT @shannonrwatts: Another day in America, another school shooting. Meanwhile, @NRA lobbyists are visiting the President in the Oval Office…"
79894,1.035501e+18,en,2018-08-31 12:13:30+00:00,SchmidtLab,"Aurora, Colorado",0.0,RT,"@DanielBolnick: Mistakes were made. Using an unfamiliar university ordering system, I somehow purchased 1200 pounds of serological pipet…",university,"RT @DanielBolnick: Mistakes were made. Using an unfamiliar university ordering system, I somehow purchased 1200 pounds of serological pipet…"
86667,8.89258e+17,en,2017-07-23 22:56:45+00:00,dave_heller,"Whitefish Bay, WI",0.0,,Marquette wins this recruiting battle vs. #Badgers https://t.co/1ydStMS0JL,other,Marquette wins this recruiting battle vs. #Badgers https://t.co/1ydStMS0JL


In [12]:
tweets = tweets_raw[['text_clean']]
tweets = tweets.rename(columns={'text_clean': 'text'})

### Labeling with Regex

In [14]:
ABSTAIN = -1
Institution = 0
Education = 1
Other = 2

In [15]:
#Trying!!
# Labeling functions is a key concept in Snorkel. 
# So you can write as many functions that can contain any logic to label the observation


@labeling_function()
def university(x):
    return Education if re.search(r"universit*", x.text, flags=re.I) else ABSTAIN

@labeling_function()
def school(x):
    return Education if re.search(r"school*", x.text, flags=re.I) else ABSTAIN

@labeling_function()
def college(x):
    return Education if re.search(r"college*", x.text, flags=re.I) else ABSTAIN

@labeling_function()
def city(x):
    return Institution if re.search(r"cit(?:y|ies)|state\w*|province\w*", x.text, flags=re.I) else ABSTAIN

@labeling_function()
def hospital(x):
    return Institution if re.search(r"hospital", x.text, flags=re.I) else ABSTAIN

In [16]:
lfs=([university, school, college, city, hospital])

applier = PandasLFApplier(lfs)

# The output of the apply method is a label matrix, which is a # NumPy array L 
# with one column for each LF and one row for each data point
L_train = applier.apply(tweets)

  from pandas import Panel
100%|██████████| 1000/1000 [00:00<00:00, 7018.64it/s]


#### Explore labeling results

In [17]:
# Polarity: The set of unique labels this LF outputs (excluding abstains)
# Coverage: The fraction of the dataset the LF labels
# Overlaps: The fraction of the dataset where this LF and at least one other LF label
# Conflicts: The fraction of the dataset where this LF and at least one other LF label and disagree

LFAnalysis(L=L_train, lfs=lfs).lf_summary()

Unnamed: 0,j,Polarity,Coverage,Overlaps,Conflicts
university,0,[1],0.424,0.071,0.024
school,1,[1],0.036,0.012,0.0
college,2,[1],0.048,0.04,0.004
city,3,[0],0.042,0.021,0.021
hospital,4,[0],0.005,0.003,0.003


#### Explore the records where "College" labeled as "Education"

In [18]:
tweets.iloc[L_train[:, 2] == Education].sample(10, random_state=1) #Slicing corresponds to "j"

Unnamed: 0,text
28140,RT @lupeyz_: It’s okay to not go to college. It’s okay to go to a community college. It’s okay to go to University. It’s okay to take out l…
47966,RT @Breaking911: WHAT?: Students walk out of a diversity discussion at Portland State University when Evergreen State College biologist Hea…
29529,RT @lupeyz_: It’s okay to not go to college. It’s okay to go to a community college. It’s okay to go to University. It’s okay to take out l…
60671,"RT @cassdalr12: Going to a university, does not make you anymore educated than someone going to a community college"
25634,RT @itscollegebabes: University of Alabama 📍 https://t.co/6wTsfAClYy
83234,RT @otaaee: if u aint press to go back to yo college/university then u at the wrong one. https://t.co/V1dUXtx2Fj
27759,RT @lupeyz_: It’s okay to not go to college. It’s okay to go to a community college. It’s okay to go to University. It’s okay to take out l…
79055,RT @Laneybehr: University: hi :))) college is expensive!!! We offer our students 10% off at our bookstore :)) Spotify: YOU WANT MUSIC????…
30373,RT @lupeyz_: It’s okay to not go to college. It’s okay to go to a community college. It’s okay to go to University. It’s okay to take out l…
7137,RT @itscollegebabes: University of Southern California 📍 https://t.co/rlaU8quM6C


#### Explore the records where "School" labeled and "University" abstained

In [19]:
buckets = get_label_buckets(L_train[:, 0], L_train[:, 1]) #Slicing corresponds to "j"
tweets.iloc[buckets[(ABSTAIN, Education)]].sample(10, random_state=1)

Unnamed: 0,text
40978,RT @shannonrwatts: Since Parkland shooting: A gun was fired unintentionally at a K-12 school in Florida A Georgia teacher fired a gun ins…
84652,@AndyManar I wish Andy would fight that hard for his own schools like he fight for Chicago
68010,RT @bhramabull: Big shout to the great event yesterday at the #stoptheviolence rally at #marshallhighschool celebrity ev...…
40133,RT @shannonrwatts: Since Parkland shooting: A gun was fired unintentionally at a K-12 school in Florida A Georgia teacher fired a gun ins…
42355,"RT @shannonrwatts: Another day in America, another school shooting. Meanwhile, @NRA lobbyists are visiting the President in the Oval Office…"
68296,"RT @solomonortizjr: ""As a UT graduate, I've never been more embarrassed for my school"" .@farenthold said I'm embarrassed for Dist. 27! ht…"
28661,Medical School students address environmental issues from physicians’ perspective https://t.co/96MZlVLMAH via @Harvard
67979,RT @bhramabull: Big shout to the great event yesterday at the #stoptheviolence rally at #marshallhighschool celebrity ev...…
40967,RT @shannonrwatts: Since Parkland shooting: A gun was fired unintentionally at a K-12 school in Florida A Georgia teacher fired a gun ins…
39825,RT @shannonrwatts: Since Parkland shooting: A gun was fired unintentionally at a K-12 school in Florida A Georgia teacher fired a gun ins…


### Labeling with sentiment score using TextBlob

In [20]:
@preprocessor(memoize=True)
def textblob_sentiment(x):
    scores = TextBlob(x.text)
    x.polarity = scores.sentiment.polarity
    x.subjectivity = scores.sentiment.subjectivity
    return x

@labeling_function(pre=[textblob_sentiment])
def textblob_polarity(x):
    return Other if x.polarity > 0.9 else ABSTAIN

@labeling_function(pre=[textblob_sentiment])
def textblob_subjectivity(x):
    return Other if x.subjectivity >= 0.5 else ABSTAIN

In [21]:
lfs = [textblob_polarity, textblob_subjectivity]

applier = PandasLFApplier(lfs)
L_train = applier.apply(tweets)

LFAnalysis(L_train, lfs).lf_summary()

  from pandas import Panel
100%|██████████| 1000/1000 [00:01<00:00, 934.90it/s]


Unnamed: 0,j,Polarity,Coverage,Overlaps,Conflicts
textblob_polarity,0,[2],0.006,0.004,0.0
textblob_subjectivity,1,[2],0.275,0.004,0.0


## Labeling using the number of words

In [22]:
@labeling_function()
def short_comment(x):
    return Other if len(x.text.split()) <= 5 else ABSTAIN

In [23]:
lfs = [short_comment]

applier = PandasLFApplier(lfs)
L_train = applier.apply(tweets)

LFAnalysis(L_train, lfs).lf_summary()

  from pandas import Panel
100%|██████████| 1000/1000 [00:00<00:00, 4140.67it/s]


Unnamed: 0,j,Polarity,Coverage,Overlaps,Conflicts
short_comment,0,[2],0.027,0.0,0.0


## Labeling based on SpaCy NER

In [24]:
@nlp_labeling_function()
def has_gpe_nlp(x):
    if len(x.text.split()) >= 5 and any([ent.label_ == "GPE" for ent in x.doc.ents]):
# Label everything longer than 5 words containing Geo-Political entities as Institution        
        return Institution
    else:
        return ABSTAIN

In [25]:
lfs = [has_gpe_nlp]

applier = PandasLFApplier(lfs)
L_train = applier.apply(tweets)

LFAnalysis(L_train, lfs).lf_summary()

  from pandas import Panel
100%|██████████| 1000/1000 [00:13<00:00, 75.80it/s]


Unnamed: 0,j,Polarity,Coverage,Overlaps,Conflicts
has_gpe_nlp,0,[0],0.325,0.0,0.0


#### Explore data labeled with SpaCy NER

In [26]:
tweets.iloc[L_train[:, 0] == Institution].sample(10, random_state=1) #Slicing corresponds to "j"

Unnamed: 0,text
72565,"RT @ccc_prod: 14 days to go!!! See you all on September 14 at the University of Baguio for The Dream Tour @maymayentrata07 In Concert, Live…"
53429,"@KylieJenner omg do you realize stormi, chicago and koko’s daughter are already best friends? Just imagine how cute the pics will be"
60618,RT @SheriffClarke: Somebody notify Black LIES Matter rioters in St. Louis to report to Chicago now!! Blacks are killing other blacks. https…
97104,RT @BostonGlobe: A federal judge in Chicago on Thursday ordered the immediate release of a 9-year-old boy who had remained in government cu…
54136,"RT @poetrypotion: Poet Profile: Kwame Aidoo Kwame ‘Write’ Aidoo (Ghana, 1986) is a writer, artist and cultural manager with a background o…"
43176,"RT @HinaQuotes: On the security front, in 2015 the military launched an offensive to remove extremist groups in northwestern Pakistan and a…"
94400,RT @BTSxIllinois: Tune in to @1035KISSFM Chicago on Sunday at 10 PM CST to hear @BTS_twt on the @iHeartCountdown! 🎈 ------- #TeenChoice #Ch…
85050,"#ChicagoCubs #Cubs Royals beat White Sox 5-4, extending Chicago's skid to 9 https://t.co/0alb6YNTpK #ChicagoCubs"
15746,"RT @ImTheBombDotCom: PLEASE RT THIS🙏🏽 Project #Halloween4Homeless CHICAGO, IL 🖤🖤🖤🖤🖤 https://t.co/cU3k3qxKyM"
47670,RT @kotafoundation: Poached populations of #elephants in Tanzania show more than 6% are tuskless according to student researchers from Stir…


## Combining labeling functions and exploring conflicts and overlaps

In [27]:
lfs = [
    university, 
    school, 
    college, 
    city, 
    hospital,
    short_comment,
    has_gpe_nlp
]

applier = PandasLFApplier(lfs)
L_train = applier.apply(tweets)

LFAnalysis(L_train, lfs).lf_summary()

  from pandas import Panel
100%|██████████| 1000/1000 [00:00<00:00, 2338.81it/s]


Unnamed: 0,j,Polarity,Coverage,Overlaps,Conflicts
university,0,[1],0.424,0.115,0.069
school,1,[1],0.036,0.027,0.016
college,2,[1],0.048,0.04,0.004
city,3,[0],0.042,0.037,0.021
hospital,4,[0],0.005,0.004,0.003
short_comment,5,[2],0.027,0.005,0.005
has_gpe_nlp,6,[0],0.325,0.078,0.061


## Convert labels from LF into a single label

### Baseline model: the majority vote on a per-data point basis
Our goal is now to convert the labels from our LFs into a single noise-aware probabilistic (or confidence-weighted) label per data point. A simple baseline for doing this is to take the majority vote on a per-data point basis

In [28]:
majority_model = MajorityLabelVoter(cardinality=3, verbose=True) # cardinality = number of categories
preds_train = majority_model.predict(L=L_train)

In [29]:
preds_train[0:10]

array([ 1, -1,  1, -1,  0,  1,  0,  1,  0,  0])

### More sophisticated Snorkel LabelModel, combining outputs of the LFs

This model will ultimately produce a single set of noise-aware training labels, which are probabilistic or confidence-weighted labels. We will then use these labels to train a classifier for our task. For more technical details of this overall approach, see Snorkel NeurIPS 2016 and AAAI 2019 papers.  

Note that no gold labels are used during the training process. The only information we need is the label matrix, which contains the output of the LFs on our training set. The LabelModel is able to learn weights for the labeling functions using only the label matrix as input. We also specify the cardinality, or number of classes.

In [30]:
label_model = LabelModel(cardinality=3, verbose=True)
label_model.fit(L_train=L_train, n_epochs=500, log_freq=100, seed=123)

## Using Snorkel labeling function - with hand labeled data

In [31]:
tweets = tweets_raw[['text_clean', 'tweet_class']]
tweets = tweets.rename(columns={'text_clean': 'text'})

In [32]:
tweets.groupby(['tweet_class']).size().reset_index(name='counts')

Unnamed: 0,tweet_class,counts
0,city,14
1,game,30
2,other,562
3,university,394


In [33]:
tweets.head(5)

Unnamed: 0,text,tweet_class
1086,I don't know where university will I go on college.,university
18383,RT @sofiaorden: Gaelic in modern Scotland - for iBooks - The Open University |... #Linguistics https://t.co/PlTn7Bb9M0 #Linguistics,university
63534,RT @DrNeelakshiGswm: Dumbo bhakt frm Assam don't even know tht Dr Mamoni R Goswami was Professor of Delhi University. Unlike their part…,university
23310,The Grown Woman Tales III: For Boss Babes - Jul 23 #chicago https://t.co/469XmwbSnu,other
62171,Sean McGowan The Turnaround Guy! https://t.co/zQr6OYImDh #wsj #reuters #nasdaq #business #forbes #bloomberg #nytimes #cnn #foxnews #Chicago,other


In [34]:
tweets['label'] = tweets.tweet_class.replace(['city', 'game', 'other', 'university'], [0, 2, 2, 1])
tweets.groupby(['label']).size().reset_index(name='counts')

Unnamed: 0,label,counts
0,0,14
1,1,394
2,2,592


#### Split the data into train / test

In [35]:
df_train, df_test = train_test_split(tweets, test_size=0.2, random_state=100)

# Create label vectors
Y_test = df_test.label.values

print ("Train Size:", df_train.shape)
print ("Test Size:", df_test.shape)

Train Size: (800, 3)
Test Size: (200, 3)


In [36]:
lfs = [
    university, 
    school, 
    college, 
    city, 
    hospital,
    short_comment,
    has_gpe_nlp
]

applier = PandasLFApplier(lfs=lfs)
L_train = applier.apply(df=df_train)
L_test = applier.apply(df=df_test)

LFAnalysis(L_train, lfs).lf_summary()

  from pandas import Panel
100%|██████████| 800/800 [00:09<00:00, 80.04it/s]
100%|██████████| 200/200 [00:02<00:00, 87.78it/s]


Unnamed: 0,j,Polarity,Coverage,Overlaps,Conflicts
university,0,[1],0.43,0.12625,0.075
school,1,[1],0.03875,0.02875,0.01625
college,2,[1],0.0525,0.0425,0.0025
city,3,[0],0.03625,0.03375,0.02375
hospital,4,[0],0.005,0.00375,0.0025
short_comment,5,[2],0.0275,0.00375,0.00375
has_gpe_nlp,6,[0],0.32625,0.075,0.06375


#### Applying Majority Model

In [37]:
majority_model = MajorityLabelVoter(cardinality=3, verbose=True) # cardinality = number of categories
preds_train = majority_model.predict(L=L_train)

#### Applying Snorkel LabelModel

In [38]:
label_model = LabelModel(cardinality=3, verbose=True)
label_model.fit(L_train=L_train, n_epochs=500, log_freq=100, seed=123)

#### Comparing model performance accuracy on labeled data  
These tweets have not been properly labeled, so the accuracy will not be really indicative of the model performance

In [39]:
majority_acc = majority_model.score(L=L_test, Y=Y_test, tie_break_policy="random")[
    "accuracy"
]
print(f"{'Majority Vote Accuracy:':<25} {majority_acc * 100:.1f}%")

label_model_acc = label_model.score(L=L_test, Y=Y_test, tie_break_policy="random")[
    "accuracy"
]
print(f"{'Label Model Accuracy:':<25} {label_model_acc * 100:.1f}%")


Majority Vote Accuracy:   51.5%
Label Model Accuracy:     50.5%


### Filtering out training data points which did not recieve a label from any LF
These data points contain no signal.

In [40]:
probs_train = label_model.predict_proba(L_train)

In [41]:
df_train_filtered, probs_train_filtered = filter_unlabeled_dataframe(
    X=df_train, y=probs_train, L=L_train
)

In [42]:
df_train_filtered.shape

(605, 3)

## Training a Classifier on Snorkel Labels
The output of the Snorkel LabelModel is just a set of labels which can be used with most popular libraries for performing supervised classification for NLP (i.e. Scikit-Learn, TensorFlow, Keras, PyTorch, etc.)

#### Prepare and vectorize the features

In [43]:
vectorizer = CountVectorizer(ngram_range=(1, 5))
X_train = vectorizer.fit_transform(df_train_filtered.text.tolist())
X_test = vectorizer.transform(df_test.text.tolist())

If we want to use a library or model that doesn’t accept probabilistic labels (such as Scikit-Learn), we can instead replace each label distribution with the label of the class that has the maximum probability. This can easily be done using the probs_to_preds helper method. Please note: this transformation is lossy, as we no longer have values for our confidence in each label.

In [44]:
preds_train_filtered = probs_to_preds(probs=probs_train_filtered)

### Naive Bayes Model

In [45]:
# instantiate and train Multinomial Naive Bayes model
nb = MultinomialNB()
nb.fit(X=X_train, y=preds_train_filtered)

# make class predictions for X_test
y_pred_class = nb.predict(X_test)

In [46]:
print(f"Test Accuracy: {metrics.accuracy_score(Y_test, y_pred_class) * 100:.1f}%")

Test Accuracy: 38.0%


In [47]:
# calculate precision and recall
print(classification_report(Y_test, y_pred_class))

              precision    recall  f1-score   support

           0       0.07      1.00      0.13         6
           1       0.61      0.95      0.74        74
           2       0.00      0.00      0.00       120

    accuracy                           0.38       200
   macro avg       0.23      0.65      0.29       200
weighted avg       0.23      0.38      0.28       200



  'precision', 'predicted', average, warn_for)


### Logistic Regression Model

In [48]:
# instantiate and train a logistic regression model
logreg = LogisticRegression(solver = 'lbfgs')
logreg.fit(X_train, y=preds_train_filtered)

# make class predictions for X_test
y_pred_class = logreg.predict(X_test)



In [49]:
# calculate accuracy of class predictions
print(f"Test Accuracy: {metrics.accuracy_score(Y_test, y_pred_class) * 100:.1f}%")

Test Accuracy: 41.5%


In [50]:
# calculate precision and recall
print(classification_report(Y_test, y_pred_class))

              precision    recall  f1-score   support

           0       0.06      1.00      0.11         6
           1       0.80      1.00      0.89        74
           2       1.00      0.03      0.05       120

    accuracy                           0.41       200
   macro avg       0.62      0.67      0.35       200
weighted avg       0.90      0.41      0.36       200



In [51]:
import datetime
import pytz

datetime.datetime.now(pytz.timezone('US/Central')).strftime("%a, %d %B %Y %H:%M:%S")

'Thu, 02 February 2023 20:37:02'