# Importing and installing required libraries

In [None]:
!pip install transformers torch pandas
!pip install transformers[torch]
!pip install datasets

Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.7/23.7 MB[0m [31m11.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting nvidia-cuda-runtime-cu12==12.1.105 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m823.6/823.6 kB[0m [31m22.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting nvidia-cuda-cupti-cu12==12.1.105 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m14.1/14.1 MB[0m [31m31.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting nvidia-cudnn-cu12==8.9.2.26 (from torch)
  Downloading nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import AlbertTokenizer, AlbertForSequenceClassification, Trainer, TrainingArguments, AlbertConfig, AlbertModel
from transformers import DistilBertConfig, DistilBertForSequenceClassification, DistilBertTokenizer, DistilBertModel
import torch
from datasets import *

#Models

In [None]:
models = {}

class Dataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

def trainModel(model, config, testSize, tokenizer, numEpochs, dataset):
  train_df, val_df = train_test_split(dataset, test_size=testSize)
  # Convert the 'text' column to string, just in case some of them are not
  train_df['text'] = train_df['text'].astype(str)
  val_df['text'] = val_df['text'].astype(str)

  # Replace NaN values with an empty string
  train_df['text'].fillna('', inplace=True)
  val_df['text'].fillna('', inplace=True)

  # Tokenize the text for the entire dataset
  train_encodings = tokenizer(train_df['text'].tolist(), truncation=True, padding=True, max_length=512)
  val_encodings = tokenizer(val_df['text'].tolist(), truncation=True, padding=True, max_length=512)

  # Create datasets
  train_dataset = Dataset(train_encodings, train_df['label'].tolist())
  val_dataset = Dataset(val_encodings, val_df['label'].tolist())

  training_args = TrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=numEpochs,      # total number of training epochs
    per_device_train_batch_size=8,   # batch size per device during training
    per_device_eval_batch_size=8,    # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    logging_steps=10,
    evaluation_strategy="epoch"
  )

  trainer = Trainer(
      model=model,
      args=training_args,
      train_dataset=train_dataset,
      eval_dataset=val_dataset
  )

  trainer.train()

  return model

#Health Domain

##Covid Fake news dataset

In [None]:
url = "https://raw.githubusercontent.com/waleed-munir/covid_fake_news_dataset/main/covid_fake_new_dataset.csv"
df_covid_fake_news_dataset = pd.read_csv(url)

#df = df.drop(df.columns[0], axis=1)
#df = df.drop(df.columns[2], axis=1)

#df['combined_text'] = df['title'] + " [SEP] " + df['text']
#df.dropna(subset=['label'], inplace=True)

#covid_fake_news_dataset = df

df_covid_fake_news_dataset.head()

Unnamed: 0.1,Unnamed: 0,title,text,subcategory,label
0,0,FACEBOOK DELETES MICHIGAN ANTI-LOCKDOWN GROUP ...,Facebook has shuttered a popular group for Mic...,false news,0
1,1,Other Viewpoints: COVID-19 is worse than the flu,We can now officially put to rest all comparis...,true,1
2,2,Bermuda's COVID-19 cases surpass 100,The Ministry of Health in Bermuda has confirme...,true,1
3,3,Purdue University says students face 'close to...,"Purdue University President Mitch Daniels, the...",partially false,0
4,4,THE HIGH COST OF LOCKING DOWN AMERICA: “WE’VE ...,Locking down much of the country may have help...,false news,0


In [None]:
df_covid_fake_news_dataset = df_covid_fake_news_dataset[['text', 'label', 'title']].assign(metadata=df_covid_fake_news_dataset.iloc[:,3:].agg(dict,1))
df_covid_fake_news_dataset.head()

Unnamed: 0,text,label,title,metadata
0,Facebook has shuttered a popular group for Mic...,0,FACEBOOK DELETES MICHIGAN ANTI-LOCKDOWN GROUP ...,"{'subcategory': 'false news', 'label': 0}"
1,We can now officially put to rest all comparis...,1,Other Viewpoints: COVID-19 is worse than the flu,"{'subcategory': 'true', 'label': 1}"
2,The Ministry of Health in Bermuda has confirme...,1,Bermuda's COVID-19 cases surpass 100,"{'subcategory': 'true', 'label': 1}"
3,"Purdue University President Mitch Daniels, the...",0,Purdue University says students face 'close to...,"{'subcategory': 'partially false', 'label': 0}"
4,Locking down much of the country may have help...,0,THE HIGH COST OF LOCKING DOWN AMERICA: “WE’VE ...,"{'subcategory': 'false news', 'label': 0}"


In [None]:
df_covid_fake_news_dataset['label'] = 1 - df_covid_fake_news_dataset['label']
df_covid_fake_news_dataset.head()

Unnamed: 0,text,label,title,metadata
0,Facebook has shuttered a popular group for Mic...,1,FACEBOOK DELETES MICHIGAN ANTI-LOCKDOWN GROUP ...,"{'subcategory': 'false news', 'label': 0}"
1,We can now officially put to rest all comparis...,0,Other Viewpoints: COVID-19 is worse than the flu,"{'subcategory': 'true', 'label': 1}"
2,The Ministry of Health in Bermuda has confirme...,0,Bermuda's COVID-19 cases surpass 100,"{'subcategory': 'true', 'label': 1}"
3,"Purdue University President Mitch Daniels, the...",1,Purdue University says students face 'close to...,"{'subcategory': 'partially false', 'label': 0}"
4,Locking down much of the country may have help...,1,THE HIGH COST OF LOCKING DOWN AMERICA: “WE’VE ...,"{'subcategory': 'false news', 'label': 0}"


In [None]:
df_covid_fake_news_dataset['text'] = df_covid_fake_news_dataset['title'] + ' [SEP] ' + df_covid_fake_news_dataset['text']
df_covid_fake_news_dataset.head()

Unnamed: 0,text,label,title,metadata
0,FACEBOOK DELETES MICHIGAN ANTI-LOCKDOWN GROUP ...,1,FACEBOOK DELETES MICHIGAN ANTI-LOCKDOWN GROUP ...,"{'subcategory': 'false news', 'label': 0}"
1,Other Viewpoints: COVID-19 is worse than the f...,0,Other Viewpoints: COVID-19 is worse than the flu,"{'subcategory': 'true', 'label': 1}"
2,Bermuda's COVID-19 cases surpass 100 [SEP] The...,0,Bermuda's COVID-19 cases surpass 100,"{'subcategory': 'true', 'label': 1}"
3,Purdue University says students face 'close to...,1,Purdue University says students face 'close to...,"{'subcategory': 'partially false', 'label': 0}"
4,THE HIGH COST OF LOCKING DOWN AMERICA: “WE’VE ...,1,THE HIGH COST OF LOCKING DOWN AMERICA: “WE’VE ...,"{'subcategory': 'false news', 'label': 0}"


##FNIR Dataset

In [None]:
fakeNewsUrl = "https://raw.githubusercontent.com/waleed-munir/covid_fake_news_dataset/main/fakeNews.csv"
trueNewsUrl = "https://raw.githubusercontent.com/waleed-munir/covid_fake_news_dataset/main/trueNews.csv"

df_FNIR_true = pd.read_csv(trueNewsUrl)
df_FNIR_false = pd.read_csv(fakeNewsUrl)

df_FNIR_false['Binary Label'] = 1
df_FNIR_true['Binary Label'] = 0

df_FNIR_false.head()

Unnamed: 0,Date Posted,Link,Text,Region,Country,Explanation,Origin,Origin_URL,Fact_checked_by,Poynter_Label,Binary Label
0,2/7/20,https://www.poynter.org/?ifcn_misinformation=t...,Tencent revealed the real number of deaths.\t\t,Europe,France,The screenshot is questionable.,Twitter,https://www.liberation.fr/checknews/2020/02/07...,CheckNews,Misleading,1
1,2/7/20,https://www.poynter.org/?ifcn_misinformation=t...,Taking chlorine dioxide helps fight coronavir...,Europe,Germany,Chlorine dioxide does guard against the coron...,Website,https://correctiv.org/faktencheck/medizin-und-...,Correctiv,FALSE,1
2,2/7/20,https://www.poynter.org/?ifcn_misinformation=t...,This video shows workmen uncovering a bat-inf...,India,India,A video shows bats nesting in the roof; howev...,Facebook,https://factcheck.afp.com/video-shows-workmen-...,AFP,MISLEADING,1
3,2/7/20,https://www.poynter.org/?ifcn_misinformation=t...,The Asterix comic books and The Simpsons pred...,India,India,Coronavirus has been around since the 1960s. ...,Twitter,https://www.boomlive.in/health/did-the-simpson...,BOOM FactCheck,Misleading,1
4,2/7/20,https://www.poynter.org/?ifcn_misinformation=c...,Chinese President Xi Jinping visited a mosque...,India,India,Chinese President Xi Jinping's visit to the m...,Facebook,http://newsmobile.in/articles/2020/02/07/chine...,NewsMobile,FALSE,1


In [None]:
df_FNIR_true.head()

Unnamed: 0,Date Posted,Link,Text,Region,Username,Publisher,Label,Binary Label
0,2/11/20,https://twitter.com/the_hindu/status/122725962...,Just in: Novel coronavirus named 'Covid-19': U...,India,the_hindu,The Hindu,1,0
1,2/12/20,https://twitter.com/ndtv/status/12274908434742...,WHO officially names #coronavirus as Covid-19....,India,ndtv,NDTV,1,0
2,2/12/20,https://twitter.com/the_hindu/status/122744471...,"The #UN #health agency announced that ""COVID-1...",India,the_hindu,The Hindu,1,0
3,2/14/20,https://twitter.com/IndiaToday/status/12282764...,The Indian Embassy in Tokyo has said that one ...,India,indiatoday,IndiaToday,1,0
4,2/15/20,https://twitter.com/the_hindu/status/122854247...,Ground Zero | How Kerala used its experience i...,India,the_hindu,The Hindu,1,0


In [None]:
df_FNIR_true = df_FNIR_true[['Text', 'Binary Label', 'Link', 'Date Posted']]
df_FNIR_false = df_FNIR_false[['Text', 'Binary Label', 'Origin_URL', 'Date Posted']]

df_FNIR_true.rename(columns={'Text': 'text', 'Binary Label': 'label', 'Link': 'link', 'Date Posted': 'date'}, inplace=True)
df_FNIR_false.rename(columns={'Text': 'text', 'Binary Label': 'label', 'Origin_URL': 'link', 'Date Posted': 'date'}, inplace=True)

df_FNIR = pd.concat([df_FNIR_true, df_FNIR_false], axis=0, ignore_index=True)
df_FNIR = df_FNIR[['text', 'label']].assign(metadata=df_FNIR.iloc[:,3:].agg(dict,1))

df_FNIR.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_FNIR_true.rename(columns={'Text': 'text', 'Binary Label': 'label', 'Link': 'link', 'Date Posted': 'date'}, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_FNIR_false.rename(columns={'Text': 'text', 'Binary Label': 'label', 'Origin_URL': 'link', 'Date Posted': 'date'}, inplace=True)


Unnamed: 0,text,label,metadata
0,Just in: Novel coronavirus named 'Covid-19': U...,0,{'date': '2/11/20'}
1,WHO officially names #coronavirus as Covid-19....,0,{'date': '2/12/20'}
2,"The #UN #health agency announced that ""COVID-1...",0,{'date': '2/12/20'}
3,The Indian Embassy in Tokyo has said that one ...,0,{'date': '2/14/20'}
4,Ground Zero | How Kerala used its experience i...,0,{'date': '2/15/20'}


##Final Training dataset

In [None]:
df_covid_fake_news_dataset.head()

Unnamed: 0,text,label,title,metadata
0,FACEBOOK DELETES MICHIGAN ANTI-LOCKDOWN GROUP ...,1,FACEBOOK DELETES MICHIGAN ANTI-LOCKDOWN GROUP ...,"{'subcategory': 'false news', 'label': 0}"
1,Other Viewpoints: COVID-19 is worse than the f...,0,Other Viewpoints: COVID-19 is worse than the flu,"{'subcategory': 'true', 'label': 1}"
2,Bermuda's COVID-19 cases surpass 100 [SEP] The...,0,Bermuda's COVID-19 cases surpass 100,"{'subcategory': 'true', 'label': 1}"
3,Purdue University says students face 'close to...,1,Purdue University says students face 'close to...,"{'subcategory': 'partially false', 'label': 0}"
4,THE HIGH COST OF LOCKING DOWN AMERICA: “WE’VE ...,1,THE HIGH COST OF LOCKING DOWN AMERICA: “WE’VE ...,"{'subcategory': 'false news', 'label': 0}"


In [None]:
df_FNIR.head()

Unnamed: 0,text,label,metadata
0,Just in: Novel coronavirus named 'Covid-19': U...,0,{'date': '2/11/20'}
1,WHO officially names #coronavirus as Covid-19....,0,{'date': '2/12/20'}
2,"The #UN #health agency announced that ""COVID-1...",0,{'date': '2/12/20'}
3,The Indian Embassy in Tokyo has said that one ...,0,{'date': '2/14/20'}
4,Ground Zero | How Kerala used its experience i...,0,{'date': '2/15/20'}


In [None]:
health_dataset = pd.concat([df_FNIR, df_covid_fake_news_dataset], ignore_index=True)
health_dataset.head()

Unnamed: 0,text,label,metadata,title
0,Just in: Novel coronavirus named 'Covid-19': U...,0,{'date': '2/11/20'},
1,WHO officially names #coronavirus as Covid-19....,0,{'date': '2/12/20'},
2,"The #UN #health agency announced that ""COVID-1...",0,{'date': '2/12/20'},
3,The Indian Embassy in Tokyo has said that one ...,0,{'date': '2/14/20'},
4,Ground Zero | How Kerala used its experience i...,0,{'date': '2/15/20'},


##Training

In [None]:
# Load the configuration of ALBERT
config = AlbertConfig.from_pretrained('albert-base-v2')

# Load ALBERT model
model = AlbertForSequenceClassification.from_pretrained('albert-base-v2', num_labels=2)

# Let's say you want to freeze the first two layers of the encoder.
for layer in model.albert.encoder.albert_layer_groups[:2]:
    for param in layer.parameters():
        param.requires_grad = False

# Now the first two layers' parameters will not be updated during training.
tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')

testSize = 0.1
numEpochs = 2

model = trainModel(model, config, testSize, tokenizer, numEpochs, health_dataset)

Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-base-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Epoch,Training Loss,Validation Loss
1,0.274,0.219812
2,0.329,0.180029


Checkpoint destination directory ./results/checkpoint-500 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Checkpoint destination directory ./results/checkpoint-1000 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Checkpoint destination directory ./results/checkpoint-1500 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Checkpoint destination directory ./results/checkpoint-2000 already exists and is non-empty. Saving will proceed but saved results may be invalid.


In [None]:
models["health"] = {"model": model, "tokenizer":tokenizer}

In [None]:
from google.colab import drive
drive.mount('/content/drive')

# Specify the path in Google Drive to save the model
model_save_path = '/content/drive/My Drive'

# Save the model
model.save_pretrained(model_save_path)

# Save the tokenizer
tokenizer.save_pretrained(model_save_path)

# Step 4: Load the model and tokenizer from the saved directory
# Note: Ensure Google Drive is mounted if you're loading in a new session

model_loaded = AlbertForSequenceClassification.from_pretrained(model_save_path)
tokenizer_loaded = AlbertTokenizer.from_pretrained(model_save_path)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


You are using a model of type distilbert to instantiate a model of type albert. This is not supported for all configurations of models and can yield errors.


ValueError: The state dictionary of the model you are trying to load is corrupted. Are you sure it was properly saved?

#Science Domain

##Climate Dataset

In [None]:
dataset = load_dataset("climate_fever")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading readme:   0%|          | 0.00/8.09k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/869k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/1535 [00:00<?, ? examples/s]

In [None]:
def pd_set_see_full_dataframe():
    # Permanently changes the pandas settings
    pd.set_option('display.max_rows', None)
    pd.set_option('display.max_columns', None)
    pd.set_option('display.width', None)
    # pd.set_option('display.max_colwidth', -1)

def pd_reset_options():
    # Resets the options
    pd.reset_option('all')

def display_whole_df(df):
    pd_set_see_full_dataframe()
    display(df)
    pd_reset_options()

In [None]:
df_climate_dataset = pd.DataFrame()

for x in dataset.keys():
    df_climate_dataset = pd.concat([df_climate_dataset, dataset[x].to_pandas()], axis=0, ignore_index=True)

In [None]:
# display_whole_df(df_climate_dataset)
df_climate_dataset = df_climate_dataset.rename(columns={'claim_label': 'label', 'claim': 'text', 'evidences':'metadata'})
df_climate_dataset.head()

Unnamed: 0,claim_id,text,label,metadata
0,0,Global warming is driving polar bears toward e...,0,[{'evidence_id': 'Extinction risk from global ...
1,5,The sun has gone into ‘lockdown’ which could c...,0,"[{'evidence_id': 'Famine:386', 'evidence_label..."
2,6,The polar bear population has been growing.,1,"[{'evidence_id': 'Polar bear:1332', 'evidence_..."
3,9,Ironic' study finds more CO2 has slightly cool...,1,"[{'evidence_id': 'Atmosphere of Mars:131', 'ev..."
4,10,Human additions of CO2 are in the margin of er...,1,[{'evidence_id': 'Carbon dioxide in Earth's at...


In [None]:
df_climate_dataset = df_climate_dataset[['text', 'label', 'metadata']]
df_climate_dataset.head()

Unnamed: 0,text,label,metadata
0,Global warming is driving polar bears toward e...,0,[{'evidence_id': 'Extinction risk from global ...
1,The sun has gone into ‘lockdown’ which could c...,0,"[{'evidence_id': 'Famine:386', 'evidence_label..."
2,The polar bear population has been growing.,1,"[{'evidence_id': 'Polar bear:1332', 'evidence_..."
3,Ironic' study finds more CO2 has slightly cool...,1,"[{'evidence_id': 'Atmosphere of Mars:131', 'ev..."
4,Human additions of CO2 are in the margin of er...,1,[{'evidence_id': 'Carbon dioxide in Earth's at...


In [None]:
df_climate_dataset['label'].value_counts()

0    654
2    474
1    253
3    154
Name: label, dtype: int64

In [None]:
df_climate_dataset_true = df_climate_dataset.loc[df_climate_dataset['label'] == 0]
df_climate_dataset_false = df_climate_dataset.loc[df_climate_dataset['label'] == 1]
df_climate_dataset_true['label'].value_counts()
df_climate_dataset_false['label'].value_counts()

1    253
Name: label, dtype: int64

In [None]:
df_climate_dataset = pd.concat([df_climate_dataset_true, df_climate_dataset_false], ignore_index=True)
df_climate_dataset['label'].value_counts()

0    654
1    253
Name: label, dtype: int64

In [None]:
df_climate_dataset.head()

Unnamed: 0,text,label,metadata
0,Global warming is driving polar bears toward e...,0,[{'evidence_id': 'Extinction risk from global ...
1,The sun has gone into ‘lockdown’ which could c...,0,"[{'evidence_id': 'Famine:386', 'evidence_label..."
2,They tell us that we are the primary forces co...,0,"[{'evidence_id': 'Carbon dioxide:183', 'eviden..."
3,The Great Barrier Reef is experiencing the mos...,0,"[{'evidence_id': 'Coral bleaching:52', 'eviden..."
4,"Volcanoes Melting West Antarctic Glaciers, Not...",0,"[{'evidence_id': 'Antarctica:375', 'evidence_l..."


##Training

Creating the model and freezing the first few layers

In [None]:
# Load the configuration of ALBERT
config = AlbertConfig.from_pretrained('albert-base-v2')

# Load ALBERT model
model = AlbertForSequenceClassification.from_pretrained('albert-base-v2', num_labels=2)


# Now the first two layers' parameters will not be updated during training.
tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')

testSize = 0.1
numEpochs = 4

model = trainModel(model, config, testSize, tokenizer, numEpochs, df_climate_dataset)

config.json:   0%|          | 0.00/684 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/47.4M [00:00<?, ?B/s]

Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-base-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/760k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.31M [00:00<?, ?B/s]

NameError: name 'trainModel' is not defined

In [None]:
models["science"] = {"model": model, "tokenizer":tokenizer}

In [None]:
from google.colab import drive
drive.mount('/content/drive')

# Specify the path in Google Drive to save the model
model_save_path = '/content/drive/My Drive'

# Save the model
model.save_pretrained(model_save_path)

# Save the tokenizer
tokenizer.save_pretrained(model_save_path)

# Step 4: Load the model and tokenizer from the saved directory
# Note: Ensure Google Drive is mounted if you're loading in a new session

model_loaded = AlbertForSequenceClassification.from_pretrained(model_save_path)
tokenizer_loaded = AlbertTokenizer.from_pretrained(model_save_path)

#Social Media Domain

##GossipCop Dataset

In [None]:
def pd_set_see_full_dataframe():
    # Permanently changes the pandas settings
    pd.set_option('display.max_rows', None)
    pd.set_option('display.max_columns', None)
    pd.set_option('display.width', None)
    # pd.set_option('display.max_colwidth', -1)

def pd_reset_options():
    # Resets the options
    pd.reset_option('all')

def display_whole_df(df):
    pd_set_see_full_dataframe()
    display(df)
    pd_reset_options()

In [None]:
df_gossip_false = pd.read_csv("https://raw.githubusercontent.com/KaiDMML/FakeNewsNet/master/dataset/gossipcop_fake.csv")
df_gossip_true = pd.read_csv("https://raw.githubusercontent.com/KaiDMML/FakeNewsNet/master/dataset/gossipcop_real.csv")

In [None]:
df_gossip_true['label'] = 0
df_gossip_false['label'] = 1

In [None]:
df_gossip = pd.concat([df_gossip_true, df_gossip_false], axis=0, ignore_index=True)
df_gossip.head()

Unnamed: 0,id,news_url,title,tweet_ids,label
0,gossipcop-882573,https://www.brides.com/story/teen-mom-jenelle-...,Teen Mom Star Jenelle Evans' Wedding Dress Is ...,912371411146149888\t912371528343408641\t912372...,0
1,gossipcop-875924,https://www.dailymail.co.uk/tvshowbiz/article-...,Kylie Jenner refusing to discuss Tyga on Life ...,901989917546426369\t901989992074969089\t901990...,0
2,gossipcop-894416,https://en.wikipedia.org/wiki/Quinn_Perkins,Quinn Perkins,931263637246881792\t931265332022579201\t931265...,0
3,gossipcop-857248,https://www.refinery29.com/en-us/2018/03/19192...,I Tried Kim Kardashian's Butt Workout & Am For...,868114761723936769\t868122567910936576\t868128...,0
4,gossipcop-884684,https://www.cnn.com/2017/10/04/entertainment/c...,Celine Dion donates concert proceeds to Vegas ...,915528047004209152\t915529285171122176\t915530...,0


In [None]:
df_gossip['label'].value_counts()

0    16817
1     5323
Name: label, dtype: int64

In [None]:
df_gossip.rename(columns={'title':'text', 'news_url':'link'}, inplace=True)
df_gossip = df_gossip[['text', 'label']].assign(metadata=df_gossip.iloc[:,2:].agg(dict,1))
df_gossip.head()

Unnamed: 0,text,label,metadata
0,Teen Mom Star Jenelle Evans' Wedding Dress Is ...,0,{'text': 'Teen Mom Star Jenelle Evans' Wedding...
1,Kylie Jenner refusing to discuss Tyga on Life ...,0,{'text': 'Kylie Jenner refusing to discuss Tyg...
2,Quinn Perkins,0,"{'text': 'Quinn Perkins', 'tweet_ids': '931263..."
3,I Tried Kim Kardashian's Butt Workout & Am For...,0,{'text': 'I Tried Kim Kardashian's Butt Workou...
4,Celine Dion donates concert proceeds to Vegas ...,0,{'text': 'Celine Dion donates concert proceeds...


##Training

In [None]:
# Load the configuration
config = DistilBertConfig.from_pretrained('distilbert-base-uncased')
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

# Load the model with the specified configuration
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', config=config)


# Now the first two layers parameters will not be updated during training.
testSize = 0.1
numEpochs = 1

model = trainModel(model, config, testSize, tokenizer, numEpochs, df_gossip)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Epoch,Training Loss,Validation Loss
1,0.5361,0.316052


Checkpoint destination directory ./results/checkpoint-500 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Checkpoint destination directory ./results/checkpoint-1000 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Checkpoint destination directory ./results/checkpoint-1500 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Checkpoint destination directory ./results/checkpoint-2000 already exists and is non-empty. Saving will proceed but saved results may be invalid.


In [None]:
models["crime"] = {"model": model, "tokenizer":tokenizer}

In [None]:
from google.colab import drive

drive.mount('/content/drive')

model_save_path = '/content/drive/My Drive'

# Save the model
model.save_pretrained(model_save_path)

# Save the tokenizer
tokenizer.save_pretrained(model_save_path)

model_loaded = DistilBertForSequenceClassification.from_pretrained(model_save_path)
tokenizer_loaded = DistilBertTokenizer.from_pretrained(model_save_path)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


#Politics Domain

##liar Dataset

In [None]:
liar_dataset = load_dataset("liar")
df_liar_dataset = pd.DataFrame()
df_liar_dataset = pd.DataFrame()

for x in liar_dataset.keys():
    df_liar_dataset = pd.concat([df_liar_dataset, liar_dataset[x].to_pandas()], axis=0, ignore_index=True)

df_liar_dataset.head()


Downloading data:   0%|          | 0.00/1.30M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/171k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/172k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/10269 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1283 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1284 [00:00<?, ? examples/s]

Unnamed: 0,id,label,statement,subject,speaker,job_title,state_info,party_affiliation,barely_true_counts,false_counts,half_true_counts,mostly_true_counts,pants_on_fire_counts,context
0,2635.json,0,Says the Annies List political group supports ...,abortion,dwayne-bohac,State representative,Texas,republican,0.0,1.0,0.0,0.0,0.0,a mailer
1,10540.json,1,When did the decline of coal start? It started...,"energy,history,job-accomplishments",scott-surovell,State delegate,Virginia,democrat,0.0,0.0,1.0,1.0,0.0,a floor speech.
2,324.json,2,"Hillary Clinton agrees with John McCain ""by vo...",foreign-policy,barack-obama,President,Illinois,democrat,70.0,71.0,160.0,163.0,9.0,Denver
3,1123.json,0,Health care reform legislation is likely to ma...,health-care,blog-posting,,,none,7.0,19.0,3.0,5.0,44.0,a news release
4,9028.json,1,The economic turnaround started at the end of ...,"economy,jobs",charlie-crist,,Florida,democrat,15.0,9.0,20.0,19.0,2.0,an interview on CNN


In [None]:
df_liar_dataset.rename(columns={'statement': 'text'}, inplace=True)

##Fake News Dataset

In [None]:
df_fake_news_dataset = pd.read_csv("https://raw.githubusercontent.com/waleed-munir/covid_fake_news_dataset/main/train1.csv")
df_fake_news_dataset.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4999 entries, 0 to 4998
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      4999 non-null   int64 
 1   title   4857 non-null   object
 2   author  4521 non-null   object
 3   text    4988 non-null   object
 4   label   4999 non-null   int64 
dtypes: int64(2), object(3)
memory usage: 195.4+ KB


In [None]:
# display_whole_df(df_fake_news_dataset)
df_fake_news_dataset['text'] = df_fake_news_dataset['title'] + ' [SEP] ' + df_fake_news_dataset['text']
df_fake_news_dataset.head()

Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",0
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,Why the Truth Might Get You Fired [SEP] Why th...,1
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,15 Civilians Killed In Single US Airstrike Hav...,1
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Iranian woman jailed for fictional unpublished...,1


##ISOT Dataset

In [None]:
df_isot_dataset_true= pd.read_csv("https://raw.githubusercontent.com/waleed-munir/covid_fake_news_dataset/main/True1.csv")
df_isot_dataset_false= pd.read_csv("https://raw.githubusercontent.com/waleed-munir/covid_fake_news_dataset/main/Fake1.csv")

In [None]:
df_isot_dataset_true.info(verbose=True)
df_isot_dataset_false.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8999 entries, 0 to 8998
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   title    8999 non-null   object
 1   text     8999 non-null   object
 2   subject  8999 non-null   object
 3   date     8999 non-null   object
dtypes: object(4)
memory usage: 281.3+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8999 entries, 0 to 8998
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   title    8999 non-null   object
 1   text     8999 non-null   object
 2   subject  8999 non-null   object
 3   date     8999 non-null   object
dtypes: object(4)
memory usage: 281.3+ KB


In [None]:
df_isot_dataset_true['label'] = 0
df_isot_dataset_false['label'] = 1

In [None]:
df_isot_dataset = pd.concat([df_isot_dataset_true, df_isot_dataset_false], axis=0, ignore_index=True)

In [None]:
# display_whole_df(df_fake_news_dataset)
display(df_isot_dataset.head())
set(df_isot_dataset['subject'].tolist())

Unnamed: 0,title,text,subject,date,label
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017",0
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017",0
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,"December 31, 2017",0
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,politicsNews,"December 30, 2017",0
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,politicsNews,"December 29, 2017",0


{'News', 'politicsNews'}

##Final Training Dataset

In [None]:
politics_dataset = pd.concat([df_isot_dataset, df_fake_news_dataset], ignore_index=True)
politics_dataset = politics_dataset[['text', 'label']]
politics_dataset.head()

Unnamed: 0,text,label
0,WASHINGTON (Reuters) - The head of a conservat...,0
1,WASHINGTON (Reuters) - Transgender people will...,0
2,WASHINGTON (Reuters) - The special counsel inv...,0
3,WASHINGTON (Reuters) - Trump campaign adviser ...,0
4,SEATTLE/WASHINGTON (Reuters) - President Donal...,0


In [None]:
labels = df_liar_dataset["label"]
unique_labels = set(labels)
print(unique_labels)

{0, 1, 2, 3, 4, 5}


##Training

In [None]:
config = DistilBertConfig.from_pretrained('distilbert-base-uncased')
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased', max_length=512)

# Load the model with the specified configuration
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', config=config)

# Freeze the first 2 layers of the model
for param in model.distilbert.transformer.layer[:2]:
    param.requires_grad = False
# Now the first two layers' parameters will not be updated during training.
testSize = 0.1
numEpochs = 5

model = trainModel(model, config, testSize, tokenizer, numEpochs, politics_dataset)

In [None]:
models["socialmedia"] = {"model": model, "tokenizer":tokenizer}

In [None]:
from google.colab import drive

drive.mount('/content/drive')

model_save_path = '/content/drive/My Drive'

# Save the model
model.save_pretrained(model_save_path)

# Save the tokenizer
tokenizer.save_pretrained(model_save_path)

model_loaded = DistilBertForSequenceClassification.from_pretrained(model_save_path)
tokenizer_loaded = DistilBertTokenizer.from_pretrained(model_save_path)

#Crime Domain

In [None]:
url = "https://raw.githubusercontent.com/waleed-munir/covid_fake_news_dataset/main/FA-KES-Dataset.csv"

In [None]:
def pd_set_see_full_dataframe():
    # Permanently changes the pandas settings
    pd.set_option('display.max_rows', None)
    pd.set_option('display.max_columns', None)
    pd.set_option('display.width', None)
    # pd.set_option('display.max_colwidth', -1)

def pd_reset_options():
    # Resets the options
    pd.reset_option('all')

def display_whole_df(df):
    pd_set_see_full_dataframe()
    display(df)
    pd_reset_options()

In [None]:
df_fa_kes = pd.read_csv(url, encoding='latin-1')
df_fa_kes = df_fa_kes[['article_title','article_content','labels','date','location', 'source']]

In [None]:
display(df_fa_kes['labels'].value_counts())
df_fa_kes['labels'].replace({0:False, 1:True}, inplace=True)
df_fa_kes['labels'].replace({False:1, True:0}, inplace=True)

display(df_fa_kes['labels'].value_counts())
df_fa_kes.rename(columns={'article_title':'title','article_content':'text', 'labels':'label'}, inplace=True)
display(df_fa_kes.head())

df_fa_kes = df_fa_kes[['text', 'label']].assign(metadata=df_fa_kes.iloc[:,2:].agg(dict,1))
display(df_fa_kes.head())

1    426
0    378
Name: labels, dtype: int64

0    426
1    378
Name: labels, dtype: int64

Unnamed: 0,title,text,label,date,location,source
0,Syria attack symptoms consistent with nerve ag...,Wed 05 Apr 2017 Syria attack symptoms consiste...,1,4/5/2017,idlib,nna
1,Homs governor says U.S. attack caused deaths b...,Fri 07 Apr 2017 at 0914 Homs governor says U.S...,1,4/7/2017,homs,nna
2,Death toll from Aleppo bomb attack at least 112,Sun 16 Apr 2017 Death toll from Aleppo bomb at...,1,4/16/2017,aleppo,nna
3,Aleppo bomb blast kills six Syrian state TV,Wed 19 Apr 2017 Aleppo bomb blast kills six Sy...,1,4/19/2017,aleppo,nna
4,29 Syria Rebels Dead in Fighting for Key Alepp...,Sun 10 Jul 2016 29 Syria Rebels Dead in Fighti...,1,7/10/2016,aleppo,nna


Unnamed: 0,text,label,metadata
0,Wed 05 Apr 2017 Syria attack symptoms consiste...,1,"{'label': 1, 'date': '4/5/2017', 'location': '..."
1,Fri 07 Apr 2017 at 0914 Homs governor says U.S...,1,"{'label': 1, 'date': '4/7/2017', 'location': '..."
2,Sun 16 Apr 2017 Death toll from Aleppo bomb at...,1,"{'label': 1, 'date': '4/16/2017', 'location': ..."
3,Wed 19 Apr 2017 Aleppo bomb blast kills six Sy...,1,"{'label': 1, 'date': '4/19/2017', 'location': ..."
4,Sun 10 Jul 2016 29 Syria Rebels Dead in Fighti...,1,"{'label': 1, 'date': '7/10/2016', 'location': ..."


##Training

In [None]:
# Load the configuration of ALBERT
config = AlbertConfig.from_pretrained('albert-base-v2')

# Load ALBERT model
model = AlbertForSequenceClassification.from_pretrained('albert-base-v2', num_labels=2)

# Let's say you want to freeze the first two layers of the encoder.
for layer in model.albert.encoder.albert_layer_groups[:2]:
    for param in layer.parameters():
        param.requires_grad = False

# Now the first two layers parameters will not be updated during training.
tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')

testSize = 0.1
numEpochs = 5

model = trainModel(model, config, testSize, tokenizer, numEpochs, df_fa_kes)

In [None]:
models["politics"] = {"model": model, "tokenizer":tokenizer}

#Baseline model

##Combined Dataset

In [None]:
df_combined = pd.concat([politics_dataset, df_gossip, df_climate_dataset, health_dataset], ignore_index=True)

##Training

In [None]:
# Load the configuration
config = DistilBertConfig.from_pretrained('distilbert-base-uncased')
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

# Load the model with the specified configuration
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', config=config)

# Freeze the first 2 layers of the model
for param in model.distilbert.transformer.layer[:2]:
    param.requires_grad = False
# Now the first two layers' parameters will not be updated during training.
testSize = 0.1
numEpochs = 1

model = trainModel(model, config, testSize, tokenizer, numEpochs, df_combined)

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Epoch,Training Loss,Validation Loss
1,0.1012,0.194588


In [None]:
drive.mount('/content/drive')

# Specify the path in Google Drive to save the model
model_save_path = '/content/drive/My Drive'

# Save the model
model.save_pretrained(model_save_path)

# Save the tokenizer
tokenizer.save_pretrained(model_save_path)

# Step 4: Load the model and tokenizer from the saved directory
# Note: Ensure Google Drive is mounted if you're loading in a new session

model_loaded = AlbertForSequenceClassification.from_pretrained(model_save_path)
tokenizer_loaded = AlbertTokenizer.from_pretrained(model_save_path)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


You are using a model of type distilbert to instantiate a model of type albert. This is not supported for all configurations of models and can yield errors.


ValueError: The state dictionary of the model you are trying to load is corrupted. Are you sure it was properly saved?

In [None]:
models["baseline"] = {"model": model, "tokenizer":tokenizer}

#Classifier

##Adding classifier labels

In [None]:
health_dataset['category'] = 0
df_climate_dataset['category'] = 1
df_gossip['category'] = 2
politics_dataset['category'] = 3
df_fa_kes['category'] = 4
df_combined = pd.concat([politics_dataset, df_gossip, df_climate_dataset, health_dataset], ignore_index=True)
df_combined['label'] = df_combined['category']

##Training

In [None]:
# Load the configuration
config = DistilBertConfig.from_pretrained('distilbert-base-uncased')
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

# Load DistilBert For Sequence Classification with 5 labels
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=5)

# Freeze the first 2 layers of the model
for param in model.distilbert.transformer.layer[:2]:
    param.requires_grad = False
# Now the first two layers' parameters will not be updated during training.
testSize = 0.1
numEpochs = 1

model = trainModel(model, config, testSize, tokenizer, numEpochs, df_combined)

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Epoch,Training Loss,Validation Loss
