In [1]:
import pandas as pd

  from . import _distributor_init


In [2]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

In [3]:
from datasets import Dataset
import torch
from torch.utils.data import Dataset, DataLoader

from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer
)

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [5]:
import torch.nn as nn

In [6]:
df = pd.read_csv("clean_data_merged.csv")

In [7]:
df

Unnamed: 0,title,text,subject,date,label,content,clean_content
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017",1,"As U.S. budget fight looms, Republicans flip t...",u.s. budget fight loom republicans flip fiscal...
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017",1,U.S. military to accept transgender recruits o...,u.s. military accept transgender recruit monda...
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,"December 31, 2017",1,Senior U.S. Republican senator: 'Let Mr. Muell...,senior u.s. republican senator let mr. mueller...
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,politicsNews,"December 30, 2017",1,FBI Russia probe helped by Australian diplomat...,fbi russia probe help australian diplomat tip ...
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,politicsNews,"December 29, 2017",1,Trump wants Postal Service to charge 'much mor...,trump want postal service charge amazon shipme...
...,...,...,...,...,...,...,...
44893,McPain: John McCain Furious That Iran Treated ...,21st Century Wire says As 21WIRE reported earl...,Middle-east,"January 16, 2016",0,McPain: John McCain Furious That Iran Treated ...,mcpain john mccain furious iran treat sailors ...
44894,JUSTICE? Yahoo Settles E-mail Privacy Class-ac...,21st Century Wire says It s a familiar theme. ...,Middle-east,"January 16, 2016",0,JUSTICE? Yahoo Settles E-mail Privacy Class-ac...,justice yahoo settles e mail privacy class act...
44895,Sunnistan: US and Allied â€˜Safe Zoneâ€™ Plan to T...,Patrick Henningsen 21st Century WireRemember ...,Middle-east,"January 15, 2016",0,Sunnistan: US and Allied â€˜Safe Zoneâ€™ Plan to T...,sunnistan allied safe zone plan territorial bo...
44896,How to Blow $700 Million: Al Jazeera America F...,21st Century Wire says Al Jazeera America will...,Middle-east,"January 14, 2016",0,How to Blow $700 Million: Al Jazeera America F...,blow $ al jazeera america finally call quit ce...


In [8]:
texts = df["clean_content"].astype(str)
labels = df["label"]

In [9]:
df = df.drop_duplicates(subset="clean_content").reset_index(drop=True)

In [10]:
X_train, X_test, y_train, y_test = train_test_split(
    texts, labels,
    test_size=0.2,
    random_state=42,
    stratify=labels
)

In [11]:
X_train = X_train.reset_index(drop=True)
y_train = y_train.reset_index(drop=True)
X_test  = X_test.reset_index(drop=True)
y_test  = y_test.reset_index(drop=True)

In [12]:
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

MAX_LEN = 256

In [13]:
class TextDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts = texts.tolist() if hasattr(texts, "tolist") else list(texts)
        self.labels = labels.tolist() if hasattr(labels, "tolist") else list(labels)
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):

        # ðŸ”¥ HANDLE BATCH INDICES
        if isinstance(idx, (list, tuple)):
            texts = [str(self.texts[i]) for i in idx]
            labels = [self.labels[i] for i in idx]

            encoding = self.tokenizer(
                texts,
                padding="max_length",
                truncation=True,
                max_length=self.max_len,
                return_tensors="pt"
            )

            return {
                "input_ids": encoding["input_ids"],        # (batch, seq)
                "label": torch.tensor(labels, dtype=torch.float32)
            }

        # ðŸ”¥ HANDLE SINGLE INDEX
        if hasattr(idx, "item"):
            idx = idx.item()
        idx = int(idx)

        text = str(self.texts[idx])
        label = self.labels[idx]

        encoding = self.tokenizer(
            text,
            padding="max_length",
            truncation=True,
            max_length=self.max_len,
            return_tensors="pt"
        )

        return {
            "input_ids": encoding["input_ids"].squeeze(0),
            "label": torch.tensor(label, dtype=torch.float32)
        }

In [14]:
train_dataset = TextDataset(X_train, y_train, tokenizer, MAX_LEN)
test_dataset  = TextDataset(X_test, y_test, tokenizer, MAX_LEN)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader  = DataLoader(test_dataset, batch_size=32)

In [15]:
class LSTMModel(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.lstm = nn.LSTM(embed_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, 1)

    def forward(self, x):
        x = self.embedding(x)
        _, (hidden, _) = self.lstm(x)
        out = self.fc(hidden[-1])
        return torch.sigmoid(out).squeeze(1)   # safer

In [16]:
class BiLSTMModel(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)

        self.lstm = nn.LSTM(
            embed_dim,
            hidden_dim,
            batch_first=True,
            bidirectional=True
        )

        self.fc = nn.Linear(hidden_dim * 2, 1)   # ðŸ”¥ doubled

    def forward(self, x):
        x = self.embedding(x)
        _, (hidden, _) = self.lstm(x)

        # forward + backward hidden states
        hidden = torch.cat((hidden[-2], hidden[-1]), dim=1)

        out = self.fc(hidden)
        return torch.sigmoid(out).squeeze(1)   # BCELoss compatible

In [17]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


In [18]:
def train_one_epoch(model, loader, optimizer, criterion):
    model.train()
    total_loss = 0

    for batch in loader:
        X = batch["input_ids"].to(device)
        y = batch["label"].to(device)

        optimizer.zero_grad()
        preds = model(X)
        loss = criterion(preds, y)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    return total_loss / len(loader)

In [19]:
vocab_size = tokenizer.vocab_size
model = LSTMModel(vocab_size, 128, 128)
model.to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
criterion = torch.nn.BCELoss()

In [20]:
for batch in train_loader:
    X = batch["input_ids"].to(device)
    y = batch["label"].to(device)

    preds = model(X)

    print("preds:", preds.shape)
    print("y:", y.shape)
    break

preds: torch.Size([32])
y: torch.Size([32])


In [21]:
EPOCHS = 5

for epoch in range(EPOCHS):
    train_loss = train_one_epoch(model, train_loader, optimizer, criterion)
    print(f"Epoch {epoch+1}/{EPOCHS}  Loss: {train_loss:.4f}")

Epoch 1/5  Loss: 0.6068
Epoch 2/5  Loss: 0.1195
Epoch 3/5  Loss: 0.0271
Epoch 4/5  Loss: 0.0133
Epoch 5/5  Loss: 0.0114


In [22]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

def evaluate_metrics(model, loader):
    model.eval()

    all_preds = []
    all_labels = []

    with torch.no_grad():
        for batch in loader:
            X = batch["input_ids"].to(device)
            y = batch["label"].to(device)

            preds = model(X)
            predicted = (preds >= 0.5).float()

            all_preds.extend(predicted.cpu().numpy())
            all_labels.extend(y.cpu().numpy())

    acc = accuracy_score(all_labels, all_preds)
    precision = precision_score(all_labels, all_preds)
    recall = recall_score(all_labels, all_preds)
    f1 = f1_score(all_labels, all_preds)

    return acc, precision, recall, f1

In [23]:
acc, prec, rec, f1 = evaluate_metrics(model, test_loader)

print(f"Accuracy : {acc:.4f}")
print(f"Precision: {prec:.4f}")
print(f"Recall   : {rec:.4f}")
print(f"F1 Score : {f1:.4f}")

Accuracy : 0.9982
Precision: 0.9995
Recall   : 0.9967
F1 Score : 0.9981


In [24]:
VOCAB_SIZE = tokenizer.vocab_size
EMBED_DIM = 128
HIDDEN_DIM = 128

bi_model = BiLSTMModel(VOCAB_SIZE, EMBED_DIM, HIDDEN_DIM).to(device)

In [25]:
criterion = torch.nn.BCELoss()
optimizer = torch.optim.Adam(bi_model.parameters(), lr=1e-3)

In [26]:
EPOCHS = 5

for epoch in range(EPOCHS):
    train_loss = train_one_epoch(bi_model, train_loader, optimizer, criterion)
    print(f"Epoch {epoch+1}")
    print(f"Train Loss: {train_loss:.4f}")

Epoch 1
Train Loss: 0.0435
Epoch 2
Train Loss: 0.0104
Epoch 3
Train Loss: 0.0097
Epoch 4
Train Loss: 0.0030
Epoch 5
Train Loss: 0.0021


In [27]:
acc, prec, rec, f1 = evaluate_metrics(bi_model, test_loader)

print(f"Accuracy : {acc:.4f}")
print(f"Precision: {prec:.4f}")
print(f"Recall   : {rec:.4f}")
print(f"F1 Score : {f1:.4f}")

Accuracy : 0.9992
Precision: 0.9993
Recall   : 0.9991
F1 Score : 0.9992


In [28]:
train_set = set(X_train)
test_set  = set(X_test)

overlap = train_set.intersection(test_set)

print("Exact duplicate samples:", len(overlap))

# show some examples
for i, text in enumerate(list(overlap)[:5]):
    print(f"\nDuplicate {i+1}:\n{text[:200]}")

Exact duplicate samples: 1805

Duplicate 1:
mainstream media wonâ€™t new white house communications director anthony scaramucci explain call trump hack video mainstream medium side story hedge fund magnate anthony scaramucci appear fox business n

Duplicate 2:
arizona biker violent dreamer bad nightmare upcoming phoenix trump rally left itch fight problem ve itch fight trump supporter outnumber t fight bite bit chew time group biker say plan protect support

Duplicate 3:
breaking video obama state dept miraculously find + email ambassador chris stevens day hillary testimony great time joe biden announce hmmm wonder barack timing discovery email dirty chicago politic c

Duplicate 4:
cnn hack attempts gotcha moment trump immediately regret video watch cnn dana bash ask donald question wish didn t trump shut dana bash question take time attend hotel grand opening pic.twitter.com/lt

Duplicate 5:
army threatens green beret war hero court martial whistleblowing failed hostage rescue army t 

In [31]:
def predict_lstm(text, model, tokenizer, max_len=256):
    model.eval()

    encoding = tokenizer(
        text,
        truncation=True,
        padding="max_length",
        max_length=max_len,
        return_tensors="pt"
    )

    input_ids = encoding["input_ids"].to(device)

    with torch.no_grad():
        prob = model(input_ids)   # shape (1,)
        pred = (prob >= 0.5).long().item()

    return pred

In [34]:
text = "U.S. economy grows at 1.4% rate in the fourth quarter, slower than expected"
pred = predict_lstm(text, model, tokenizer)
print(pred)

0
