# Set Up

In [1]:
import pandas as pd
import re

In [14]:
data = pd.read_csv("/Users/yara/GitHub/mumin-classifier/mumin-small-final.csv")

In [15]:
def replace_user_link(text):
  text = re.sub(r"http\S+","<url>",text) 
  text = re.sub(r"@\S+","<user>",text) 

  return text

In [16]:
data.full_m_text = data.full_m_text.apply(replace_user_link)
data.full_en_text = data.full_en_text.apply(replace_user_link)

In [19]:
path = '/Users/yara/GitHub/mumin-classifier/'
# Split and save data for later 
data.label = data.label.replace("misinformation", 1)
data.label = data.label.replace("factual", 0)

train = data.query('train_mask == True')
val = data.query('val_mask == True')
test = data.query('test_mask == True')

train_m = train[["full_m_text", "label"]].rename(columns={"full_m_text": "text"})
val_m = val[["full_m_text", "label"]].rename(columns={"full_m_text": "text"})
test_m = test[["full_m_text", "label"]].rename(columns={"full_m_text": "text"})

train_en = train[["full_en_text", "label"]].rename(columns={"full_en_text": "text"})
val_en = val[["full_en_text", "label"]].rename(columns={"full_en_text": "text"})
test_en = test[["full_en_text", "label"]].rename(columns={"full_en_text": "text"})

train_m.to_csv(f"{path}data/train_m.csv")
val_m.to_csv(f"{path}data/val_m.csv")
test_m.to_csv(f"{path}data/test_m.csv")

train_en.to_csv(f"{path}data/train_en.csv")
val_en.to_csv(f"{path}data/val_en.csv")
test_en.to_csv(f"{path}data/test_en.csv")

In [27]:
# Split up the data
train = data.query('train_mask == True')
val = data.query('val_mask == True')
test = data.query('test_mask == True')

# Extract original language tweet
X_train_m = train.full_m_text
X_val_m = val.full_m_text
X_test_m = test.full_m_text

# Extract tweet in english
X_train_en = train.full_en_text
X_val_en = val.full_en_text
X_test_en = test.full_en_text

# Extract the labels
y_train = train.label
y_val = val.label
y_test = test.label

In [5]:
def report(test_scores):
  print(f'Misinformation F1: {100 * test_scores[1]:.2f}%')
  print(f'Factual F1: {100 * test_scores[0]:.2f}%')
  print(f'Macro-average F1: {100 * test_scores.mean():.2f}%')

In [33]:
classes = data.label.unique()

# Classifiers

### Majority Class

In [34]:
from sklearn.dummy import DummyClassifier
from sklearn.metrics import f1_score

In [35]:
model = DummyClassifier(strategy="most_frequent")

model.fit(X_train_en, y_train)

train_preds = model.predict(X_train_en)
val_preds = model.predict(X_val_en)
test_preds = model.predict(X_test_en)

train_scores = f1_score(y_train, train_preds, average=None)
val_scores = f1_score(y_val, val_preds, average=None)
test_scores = f1_score(y_test, test_preds, average=None)

report(train_scores)
report(val_scores)
report(test_scores)

Misinformation F1: 95.41%
Factual F1: 0.00%
Macro-average F1: 47.70%
Misinformation F1: 99.38%
Factual F1: 0.00%
Macro-average F1: 49.69%
Misinformation F1: 97.93%
Factual F1: 0.00%
Macro-average F1: 48.97%


In [36]:
model = DummyClassifier(strategy="uniform", random_state=17)

model.fit(X_train_en, y_train)

train_preds = model.predict(X_train_en)
val_preds = model.predict(X_val_en)
test_preds = model.predict(X_test_en)

train_scores = f1_score(y_train, train_preds, average=None)
val_scores = f1_score(y_val, val_preds, average=None)
test_scores = f1_score(y_test, test_preds, average=None)

report(train_scores)
report(val_scores)
print('*** Test scores ***')
report(test_scores)

Misinformation F1: 65.02%
Factual F1: 14.63%
Macro-average F1: 39.83%
Misinformation F1: 67.21%
Factual F1: 2.45%
Macro-average F1: 34.83%
*** Test scores ***
Misinformation F1: 66.33%
Factual F1: 7.59%
Macro-average F1: 36.96%


### Zero-shot RoBERTa


In [14]:
!pip install transformers
!pip install sentencepiece



In [37]:
from transformers import pipeline, RobertaTokenizer
from transformers import RobertaForSequenceClassification

In [38]:
model = RobertaForSequenceClassification.from_pretrained("roberta-base")
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
classifierRB = pipeline("zero-shot-classification", tokenizer=tokenizer,
                      model=model) 

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.dense.weight', 'lm_head.layer_norm.bias', 'roberta.pooler.dense.weight', 'lm_head.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.weight', 'lm_head.dense.bias', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.weight', 'classifie

In [39]:
def classifyRB(text):
  result = classifierRB(text, classes)
  return result["labels"][0]

In [40]:
test_preds = X_test_en.apply(classifyRB)

In [42]:
test_scores = f1_score(y_test, test_preds, average=None)
print('*** Test scores ***')
report(test_scores)

*** Test scores ***
Misinformation F1: 97.64%
Factual F1: 0.00%
Macro-average F1: 48.82%


### Zero-shot XLM-RoBERTa


In [43]:
from transformers import pipeline, XLMRobertaTokenizer, XLMRobertaForSequenceClassification

In [44]:
model = XLMRobertaForSequenceClassification.from_pretrained("xlm-roberta-base")
tokenizer = XLMRobertaTokenizer.from_pretrained("xlm-roberta-base")
classifierXLMRB = pipeline("zero-shot-classification", tokenizer=tokenizer, model=model) #, device=0, use_fast=True, multi_label=False)

Some weights of the model checkpoint at xlm-roberta-base were not used when initializing XLMRobertaForSequenceClassification: ['lm_head.dense.weight', 'lm_head.layer_norm.bias', 'roberta.pooler.dense.weight', 'lm_head.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.weight', 'lm_head.dense.bias', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense

In [45]:
def classifyXLMRB(text):
  result = classifierXLMRB(text, classes)
  return result["labels"][0]

In [50]:
test_preds = X_test_m.apply(classifyXLMRB)

In [51]:
test_scores = f1_score(y_test, test_preds, average=None)
print('*** Test scores ***')
report(test_scores)

*** Test scores ***
Misinformation F1: 48.48%
Factual F1: 10.58%
Macro-average F1: 29.53%


In [48]:
test_preds = X_test_en.apply(classifyXLMRB)

In [49]:
test_scores = f1_score(y_test, test_preds, average=None)
print('*** Test scores ***')
report(test_scores)

*** Test scores ***
Misinformation F1: 48.48%
Factual F1: 10.58%
Macro-average F1: 29.53%
