In [49]:
import pandas as pd
import random
base = 'database.csv'

data = pd.read_csv(base)
data = data[["is_retweet","text"]]

In [50]:
IMPORTANT = 1
ABSTAIN = 0

In [51]:
data

Unnamed: 0,is_retweet,text
0,False,From Donald Trump: Wishing everyone a wonderfu...
1,False,Trump International Tower in Chicago ranked 6t...
2,False,Wishing you and yours a very Happy and Bountif...
3,False,Donald Trump Partners with TV1 on New Reality ...
4,False,"--Work has begun, ahead of schedule, to build ..."
...,...,...
35214,False,".@ICEgov HSI agents and ERO officers, on behal..."
35215,False,Thank you @GOPLeader Kevin McCarthy! Couldn’t ...
35216,False,"As I made very clear today, our country needs ..."
35217,True,"RT @Scavino45: “Utilities cutting rates, cite ..."


In [52]:
from sklearn.model_selection import train_test_split
df_train, df_test = train_test_split(data,train_size=0.8)

In [53]:
from snorkel.labeling import labeling_function,PandasLFApplier,LFAnalysis

In [54]:
@labeling_function()
def label1(x):
    return IMPORTANT if "vaccine" in x.text.lower() else ABSTAIN


@labeling_function()
def label2(x):
    return IMPORTANT if "trump" in x.text.lower() else ABSTAIN


@labeling_function()
def label3(x):
    return IMPORTANT if "pandemic" in x.text.lower() else ABSTAIN

In [55]:
lfs = [label1, label2, label3]

In [56]:
applier = PandasLFApplier(lfs=lfs)
L_train = applier.apply(df=df_train)

100%|██████████| 28175/28175 [00:01<00:00, 26672.69it/s]


In [57]:
L_train

array([[0, 0, 0],
       [0, 0, 0],
       [0, 0, 0],
       ...,
       [0, 1, 0],
       [0, 1, 0],
       [0, 0, 0]])

In [58]:
LFAnalysis(L=L_train,lfs=lfs).lf_summary()

Unnamed: 0,j,Polarity,Coverage,Overlaps,Conflicts
label1,0,"[0, 1]",1.0,1.0,0.38921
label2,1,"[0, 1]",1.0,1.0,0.38921
label3,2,"[0, 1]",1.0,1.0,0.38921


In [59]:
from snorkel.labeling.model import MajorityLabelVoter,LabelModel

In [60]:
label_model = LabelModel(cardinality=2,verbose=True)
label_model.fit(L_train=L_train,n_epochs=500,log_freq=100,seed=123)

INFO:root:Computing O...
INFO:root:Estimating \mu...
  0%|          | 0/500 [00:00<?, ?epoch/s]INFO:root:[0 epochs]: TRAIN:[loss=3.388]
INFO:root:[100 epochs]: TRAIN:[loss=0.003]
 34%|███▍      | 171/500 [00:00<00:00, 1708.12epoch/s]INFO:root:[200 epochs]: TRAIN:[loss=0.001]
INFO:root:[300 epochs]: TRAIN:[loss=0.001]
 73%|███████▎  | 366/500 [00:00<00:00, 1849.34epoch/s]INFO:root:[400 epochs]: TRAIN:[loss=0.000]
100%|██████████| 500/500 [00:00<00:00, 1856.41epoch/s]
INFO:root:Finished Training


In [61]:
df_train.head()

Unnamed: 0,is_retweet,text
1449,False,If you treat people right they will treat you ...
25120,False,They just arrested pol Shelly Silver in New Yo...
32317,False,ObamaCare will explode and we will all get tog...
21914,False,"""@BettyeBear: @realDonaldTrump Every working c..."
1610,False,Aberdeen tourism is booming because of my grea...


In [62]:
df_train['Labels'] = label_model.predict(L=L_train,tie_break_policy="abstain")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_train['Labels'] = label_model.predict(L=L_train,tie_break_policy="abstain")


In [63]:
df_train.query('Labels == 0')

Unnamed: 0,is_retweet,text,Labels
1449,False,If you treat people right they will treat you ...,0
25120,False,They just arrested pol Shelly Silver in New Yo...,0
32317,False,ObamaCare will explode and we will all get tog...,0
21914,False,"""@BettyeBear: @realDonaldTrump Every working c...",0
1610,False,Aberdeen tourism is booming because of my grea...,0
...,...,...,...
23766,False,"""@gabriellebragg4: How do I find out where @re...",0
7763,False,.@TrumpDoral’s record $200M renovations are on...,0
15377,False,"""@TheBigJamesG staff at Trump Doral was amazi...",0
17730,False,"""@pandoerra: @realDonaldTrump I think you coul...",0
