In [23]:
# This is a tutorial provided by SkillC

In [None]:
! pip install snorkel
! pip install utils

In [1]:
# Colab only: mount Google Drive at /content/drive so that the project folder
# used by the following cells is reachable from the notebook.
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


In [2]:
# Work from the project folder on the mounted Drive; later cells use paths
# and imports that are relative to this directory.
%cd /content/drive/My Drive/Colab Notebooks/Spam Detection/

/content/drive/My Drive/Colab Notebooks/Spam Detection


In [None]:
# !git clone https://github.com/snorkel-team/snorkel-tutorials.git
# Change : snorkel-tutorials -> snorkeltutorials

Cloning into 'snorkel-tutorials'...
remote: Enumerating objects: 1362, done.[K
remote: Counting objects: 100% (303/303), done.[K
remote: Compressing objects: 100% (122/122), done.[K
remote: Total 1362 (delta 148), reused 266 (delta 132), pack-reused 1059[K
Receiving objects: 100% (1362/1362), 3.56 MiB | 7.22 MiB/s, done.
Resolving deltas: 100% (863/863), done.


In [5]:
# Pulling the demo SPAM dataset from the Snorkel tutorials checkout.
# NOTE(review): statement order matters here. The import presumably has to run
# while the working directory is still the parent of `snorkeltutorials`
# (so the package can be found) — confirm before reordering.
from snorkeltutorials.spam.utils import load_spam_dataset

# Enter the checkout before loading.
# NOTE(review): presumably load_spam_dataset() resolves its data files relative
# to the repository root — confirm against snorkeltutorials/spam/utils.py.
%cd /content/drive/My Drive/Colab Notebooks/Spam Detection/snorkeltutorials

df_train, df_test = load_spam_dataset()

# Keep the test labels as a plain array for scoring the models later.
Y_test = df_test.label.values

/content/drive/My Drive/Colab Notebooks/Spam Detection/snorkeltutorials


In [6]:
# Peek at the first training rows; note that every `label` shown is -1 (unlabeled).
df_train.head()

Unnamed: 0,author,date,text,label,video
0,Alessandro leite,2014-11-05T22:21:36,pls http://www10.vakinha.com.br/VaquinhaE.aspx...,-1.0,1
1,Salim Tayara,2014-11-02T14:33:30,"if your like drones, plz subscribe to Kamal Ta...",-1.0,1
2,Phuc Ly,2014-01-20T15:27:47,go here to check the views :3﻿,-1.0,1
3,DropShotSk8r,2014-01-19T04:27:18,"Came here to check the views, goodbye.﻿",-1.0,1
4,css403,2014-11-07T14:25:48,"i am 2,126,492,636 viewer :D﻿",-1.0,1


In [10]:
df_train['video'].value_counts() # The training comments (spam and ham) come from 4 different videos.

4    448
3    438
1    350
2    350
Name: video, dtype: int64

In [11]:
df_test['video'].value_counts() # The test comments all come from a 5th video, which is held out for evaluating the classifier.

5    250
Name: video, dtype: int64

**Class labels for spam = 1, ham = 0, abstain = -1**

In [12]:
# The training set carries no usable labels, so we look only at the other
# columns and will label the comments automatically.
# `random_state` is fixed so the same sample is shown on every run.
df_train[["author", "text", "video"]].sample(20, random_state=2)
# Based on this sample, we will define the labeling functions.

Unnamed: 0,author,text,video
4,ambareesh nimkar,"""eye of the tiger"" ""i am the champion"" seems l...",2
87,pratik patel,"mindblowing dance.,.,.superbbb song﻿",3
14,RaMpAgE420,Check out Berzerk video on my channel ! :D,4
80,Jason Haddad,"Hey, check out my new website!! This site is a...",1
104,austin green,Eminem is my insperasen and fav﻿,4
305,M.E.S,hey guys look im aware im spamming and it piss...,4
22,John Monster,Οh my god ... Roar is the most liked video at ...,2
338,Alanoud Alsaleh,I started hating Katy Perry after finding out ...,2
336,Leonardo Baptista,http://www.avaaz.org/po/petition/Youtube_Corpo...,1
143,UKz DoleSnacher,Remove This video its wank﻿,1


In [16]:
#Let's generate Weak Labels with the help of Labelling Functions

In [13]:
from snorkel.labeling import labeling_function
from snorkel.labeling import PandasLFApplier
from snorkel.labeling import LFAnalysis
from snorkel.analysis import get_label_buckets

In [14]:
# Integer codes returned by every labeling function:
#   ABSTAIN (-1): the function has no opinion on the comment
#   HAM      (0): the comment looks legitimate
#   SPAM     (1): the comment looks like spam
ABSTAIN, HAM, SPAM = -1, 0, 1

In [15]:
# Spam comments often contain the phrase "check out", so let's write a labeling function for that.

import re
@labeling_function()
def regex_check_out(x):
    """1st LF: vote SPAM when the comment matches "check ... out".

    The pattern is case-insensitive and allows any characters between
    "check" and "out" (e.g. "check it out", "check this out").
    Returns ABSTAIN when the pattern is not found.
    """
    if re.search(r"check.*out", x.text, flags=re.I):
        return SPAM
    return ABSTAIN

In [17]:
# Writing an LF to gauge sentiment using TextBlob (we can use other packages with snorkel)

from snorkel.preprocess import preprocessor
from textblob import TextBlob
@preprocessor(memoize=True)
def textblob_sentiment(x):
    """Attach TextBlob sentiment scores to a data point.

    Sets two attributes on ``x`` and returns it:
      * ``x.polarity``     - sentiment; +1 is positive, -1 is negative, 0 is neutral.
      * ``x.subjectivity`` - TextBlob's subjectivity score for the text.

    ``memoize=True`` lets Snorkel reuse the result when several labeling
    functions share this preprocessor.
    """
    sentiment = TextBlob(x.text).sentiment
    x.polarity = sentiment.polarity
    x.subjectivity = sentiment.subjectivity
    return x

@labeling_function(pre=[textblob_sentiment])
def textblob_polarity(x):
    """Vote HAM for very positive comments (polarity > 0.9); otherwise ABSTAIN."""
    if x.polarity > 0.9:
        return HAM
    return ABSTAIN

@labeling_function(pre=[textblob_sentiment])
def textblob_subjectivity(x):
    """Vote HAM for subjective comments (subjectivity >= 0.5); otherwise ABSTAIN."""
    if x.subjectivity >= 0.5:
        return HAM
    return ABSTAIN

In [20]:
# Writing some LFs for SPAM
from snorkel.labeling import LabelingFunction

def keyword_lookup(x, keywords, label):
    """Return ``label`` if any keyword occurs in the comment text, else ABSTAIN.

    Args:
        x: data point exposing the comment as ``x.text`` (a string).
        keywords: iterable of keyword strings to look for.
        label: value to return when at least one keyword is found.

    Matching is case-insensitive and substring-based, so the keyword "my"
    also matches inside longer words such as "army".
    """
    # Lower-case the text once instead of once per keyword.
    text = x.text.lower()
    # Lower-case the keywords too: against lower-cased text, a keyword
    # containing uppercase characters could otherwise never match.
    if any(word.lower() in text for word in keywords):
        return label
    return ABSTAIN


def make_keyword_lf(keywords, label=SPAM):
    """Build a keyword-based LabelingFunction.

    Args:
        keywords: non-empty list of keywords; the first one is used to name
            the labeling function (``keyword_<first keyword>``).
        label: label the function votes for on a keyword hit (SPAM by default).

    Returns:
        A LabelingFunction that runs ``keyword_lookup`` with the given
        keywords and label passed in as resources.
    """
    lf_name = f"keyword_{keywords[0]}"
    lf_resources = {"keywords": keywords, "label": label}
    return LabelingFunction(name=lf_name, f=keyword_lookup, resources=lf_resources)


# Keyword labeling functions. The label each one votes for is spelled out
# explicitly so its polarity is visible at a glance.

# Spam comments talk about 'my channel', 'my video', etc.
keyword_my = make_keyword_lf(["my"], label=SPAM)

# Spam comments ask users to subscribe to their channels.
keyword_subscribe = make_keyword_lf(["subscribe"], label=SPAM)

# Spam comments post links to other channels.
# (The LF itself is named after its first keyword, i.e. "keyword_http".)
keyword_link = make_keyword_lf(["http"], label=SPAM)

# Spam comments make requests rather than commenting.
keyword_please = make_keyword_lf(["please", "plz"], label=SPAM)

# Ham comments actually talk about the video's content.
keyword_song = make_keyword_lf(["song"], label=HAM)


In [21]:
#Ham comments are often short, such as 'cool video!'

@labeling_function()
def short_comment(x):
    """Vote HAM for comments with fewer than 5 whitespace-separated words; otherwise ABSTAIN."""
    word_count = len(x.text.split())
    if word_count < 5:
        return HAM
    return ABSTAIN

In [22]:
from snorkel.labeling.lf.nlp import nlp_labeling_function
# Using NLP labeling: if the comment mentions a person and it's short, vote HAM; otherwise ABSTAIN.

@nlp_labeling_function()
def has_person_nlp(x):
    """Vote HAM for short comments that mention a person; otherwise ABSTAIN.

    ``x.doc`` is the parsed document the ``nlp_labeling_function`` decorator
    attaches to the data point. A comment qualifies when the document has
    fewer than 20 tokens and at least one named entity labelled "PERSON".
    """
    is_short = len(x.doc) < 20
    if is_short and any(ent.label_ == "PERSON" for ent in x.doc.ents):
        return HAM
    return ABSTAIN

In [24]:
# Put all the labeling functions into a list.
# Keep the order stable: the `j` index in the LF summary below follows it.
lfs = [
    # keyword-based LFs
    keyword_my,
    keyword_subscribe,
    keyword_link,
    keyword_please,
    keyword_song,
    # pattern / heuristic LFs
    regex_check_out,
    short_comment,
    # LFs backed by third-party NLP models
    has_person_nlp,
    textblob_polarity,
    textblob_subjectivity
]

In [25]:
# Apply every labeling function to every row of the train and test dataframes.
# Each call returns a label matrix (rows = comments, columns = LFs) whose
# entries are the integer labels returned by the LFs, with -1 meaning ABSTAIN.
applier = PandasLFApplier(lfs=lfs)
L_train = applier.apply(df=df_train)
L_test = applier.apply(df=df_test)

100%|██████████| 1586/1586 [00:16<00:00, 97.95it/s] 
100%|██████████| 250/250 [00:02<00:00, 107.43it/s]


In [26]:
LFAnalysis(L=L_train, lfs=lfs).lf_summary()
# How to read the summary columns:
# Coverage  - the fraction of the dataset that this LF labels (does not abstain on)
# Overlaps  - the fraction of the dataset where this LF and at least one other LF both assign a label
# Conflicts - the fraction of the dataset where this LF and at least one other LF both assign a label and disagree

Unnamed: 0,j,Polarity,Coverage,Overlaps,Conflicts
keyword_my,0,[1],0.198613,0.186003,0.110971
keyword_subscribe,1,[1],0.127364,0.107188,0.067465
keyword_http,2,[1],0.119168,0.101513,0.082598
keyword_please,3,[1],0.112232,0.10971,0.057377
keyword_song,4,[0],0.141866,0.111602,0.043506
regex_check_out,5,[1],0.233922,0.129256,0.083859
short_comment,6,[0],0.225725,0.144388,0.074401
has_person_nlp,7,[0],0.083859,0.062421,0.027743
textblob_polarity,8,[0],0.035309,0.030895,0.005044
textblob_subjectivity,9,[0],0.357503,0.261665,0.160151


In [27]:
# Each data point can only have one training label, so we cannot use the
# outputs of the different LFs as they are; we combine them into 1 final label.

In [45]:
# Inspect the raw label matrix: one row per training comment, one column per LF.
L_train

array([[-1, -1,  1, ..., -1, -1, -1],
       [-1,  1, -1, ..., -1, -1,  0],
       [-1, -1, -1, ..., -1, -1, -1],
       ...,
       [ 1,  1, -1, ..., -1, -1,  0],
       [-1,  1, -1, ..., -1, -1,  0],
       [ 1, -1, -1, ..., -1, -1, -1]])

### Majority-Vote-based Model

In [28]:
# Our goal is now to convert the labels from our LFs into
# a single noise-aware probabilistic (or confidence-weighted)
# label per data point

In [29]:
from snorkel.labeling.model import MajorityLabelVoter

# Baseline: combine the LF outputs by taking a majority vote per data point.
majority_model = MajorityLabelVoter()
preds_train = majority_model.predict(L=L_train)

# Display the combined labels; the output below still contains some -1 entries.
preds_train

array([ 1,  1, -1, ...,  1,  1,  1])

### Confidence-weighted Label Model

In [30]:
# Our LFs have varying properties and should not be treated identically.
# LFs may be correlated, resulting in certain signals being overrepresented in a majority-vote-based model.
# To handle this, we use a more sophisticated Snorkel LabelModel to combine LF outputs.

# This model will ultimately produce a single set of noise-aware training labels,
# which are probabilistic or confidence-weighted labels

In [31]:
from snorkel.labeling.model import LabelModel

# cardinality=2: there are two classes (HAM and SPAM).
label_model = LabelModel(cardinality=2, verbose=True)
# Train on the label matrix alone; the seed is fixed so the fit is repeatable.
label_model.fit(L_train=L_train, n_epochs=500, log_freq=100, seed=123)

# Probabilistic training labels: one row per comment, one column per class.
probs_train = label_model.predict_proba(L=L_train)
probs_train

100%|██████████| 500/500 [00:01<00:00, 395.00epoch/s]


array([[6.57400061e-01, 3.42599939e-01],
       [2.25180862e-06, 9.99997748e-01],
       [5.00000000e-01, 5.00000000e-01],
       ...,
       [2.27481634e-07, 9.99999773e-01],
       [1.08918560e-03, 9.98910814e-01],
       [1.22570277e-08, 9.99999988e-01]])

In [32]:
# Compare the two label-combination models on the test set.
# Both are scored with exactly the same arguments; ties between classes are
# broken at random.
score_kwargs = dict(L=L_test, Y=Y_test, tie_break_policy="random")

majority_acc = majority_model.score(**score_kwargs)["accuracy"]
print(f"{'Majority Vote Accuracy:':<25} {majority_acc * 100:.1f}%")

label_model_acc = label_model.score(**score_kwargs)["accuracy"]
print(f"{'Label Model Accuracy:':<25} {label_model_acc * 100:.1f}%")

Majority Vote Accuracy:   84.4%
Label Model Accuracy:     87.6%


In [34]:
# Some data points did not receive a label from any LF, so we filter them out
# (together with their rows in the probabilistic labels) before training.
from snorkel.labeling import filter_unlabeled_dataframe

df_train_filtered, probs_train_filtered = filter_unlabeled_dataframe(
    X=df_train, y=probs_train, L=L_train
)

In [35]:
#Using the model to classify

from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-n-grams features built from 1-grams up to 5-grams.
vectorizer = CountVectorizer(ngram_range=(1, 5))
# The vocabulary is learned from the filtered training text only ...
X_train = vectorizer.fit_transform(df_train_filtered.text.tolist())
# ... and reused unchanged to encode the test text.
X_test = vectorizer.transform(df_test.text.tolist())

In [36]:
from snorkel.utils import probs_to_preds
# Turn the probabilistic labels into a single predicted class per data point,
# which is what the scikit-learn classifier below is trained on.
preds_train_filtered = probs_to_preds(probs=probs_train_filtered)

In [37]:
from sklearn.linear_model import LogisticRegression

# Final classifier: logistic regression on the n-gram counts.
# C is scikit-learn's inverse regularization strength, so C=1e3 means weak regularization.
sklearn_model = LogisticRegression(C=1e3, solver="liblinear")
# Train on the labels produced by the label model, not on hand labels.
sklearn_model.fit(X=X_train, y=preds_train_filtered)

In [38]:
# Accuracy of the trained classifier against the true labels of the test set.
print(f"Test Accuracy: {sklearn_model.score(X=X_test, y=Y_test) * 100:.1f}%")

Test Accuracy: 93.6%


In [39]:
#Predictions
import pandas as pd

# Two hand-written comments to classify with the trained model.
new_review = ['check out my channel. it is the best',
              'your channel is the best']
new_review_df = pd.DataFrame(new_review, columns=['review'])
# Encode with the already-fitted vectorizer (transform only, no refitting).
# The features get their own name instead of overwriting the dataframe, so
# each variable holds one kind of object and the cell can be re-run safely.
X_new = vectorizer.transform(new_review_df['review'])
results = sklearn_model.predict(X_new)

In [41]:
# Report each prediction with a 1-based review number.
# `enumerate` replaces the hand-maintained counter, and the predicted class is
# compared against the named HAM constant instead of a bare 0.
for i, item in enumerate(results, start=1):
    verdict = "ham" if item == HAM else "spam"
    print(f'Review #{i} is {verdict}')

Review #1 is spam
Review #2 is ham
