# Project NLP

In [None]:
import pandas as pd 
import numpy as np  
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer
import torch
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# Load the training set. The file is parsed as tab-separated with no header row,
# and its two columns are named 'label' and 'headline'.
train_df = pd.read_csv('training_data.csv', sep="\t", header=None, names=['label', 'headline'])
# X is the headline text column (model input); y is the label column (target).
X = train_df['headline']
y = train_df['label']   
# Sanity check: first rows and the assigned column names.
print(train_df.head())
print(train_df.columns)

In [None]:
# Hold out 20% of the rows for validation. random_state=42 fixes the shuffle so the
# split is reproducible. No `stratify` argument is passed for this split.
X_train, X_val, y_train, y_val= train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# TF-IDF features with English stop words removed and the vocabulary capped at 5000 terms.
vectoraizer = TfidfVectorizer(stop_words='english', max_features=5000)
# Fit the vocabulary/IDF weights on the training split only, then reuse them for validation
# so no information from the validation headlines influences the features.
X_train_tfidf = vectoraizer.fit_transform(X_train)
X_val_tfidf = vectoraizer.transform(X_val)

In [None]:
# Baseline classifier: logistic regression on the TF-IDF features, with the
# iteration limit set to 1000.
clf = LogisticRegression(max_iter=1000)
clf.fit(X_train_tfidf, y_train)


In [None]:
# Score the baseline classifier on the held-out validation split.
y_pred = clf.predict(X_val_tfidf)
val_accuracy = accuracy_score(y_val, y_pred)
val_report = classification_report(y_val, y_pred)
print("Accuracy:", val_accuracy)
print("Classification Report:\n", val_report)

In [None]:
# Display the training DataFrame as the cell output.
train_df

In [None]:
# Load the test set with the same parsing options as the training file
# (tab-separated, no header row, columns named 'label' and 'headline').
test_df = pd.read_csv('testing_data.csv', sep="\t", header=None, names=['label', 'headline'])
X_test = test_df['headline']
# Reuse the vectorizer fitted on the training split; it is only transformed here, not refitted.
X_test_tfidf = vectoraizer.transform(X_test)
# `clf` is the LogisticRegression fitted above when the cells are run top to bottom.
y_test_pred = clf.predict(X_test_tfidf)


In [None]:
# 3. Build the text-classification pipeline from the pre-trained checkpoint.
# truncation=True is passed at construction time.
from transformers import pipeline
classifier = pipeline("text-classification", model="jy46604790/Fake-News-Bert-Detect", truncation=True)

# 4. Classify each validation headline individually; a headline is mapped to 1 when the
# pipeline's top result is 'LABEL_1' and to 0 otherwise.
print("Running predictions on the validation set...")
preds = [
    1 if classifier(text)[0]['label'] == 'LABEL_1' else 0
    for text in X_val
]

# 5. Validation accuracy of the pre-trained model (no fine-tuning at this point).
acc = accuracy_score(y_val, preds)
print(f"\nValidation Accuracy for Pre-trained BERT Model: {acc:.4f}")

In [None]:
# Overwrite the 'label' column of the test frame with the predicted values.
# NOTE(review): the comment originally here mentioned replacing '2' — presumably the
# placeholder label stored in the test file; confirm against the data.
test_df['label'] = y_test_pred

In [None]:
# Write the test frame (now carrying predicted labels) to disk without the row index.
# No `sep` or `header` arguments are passed, so pandas' defaults apply:
# comma-separated output with a header row.
test_df.to_csv('predicted_test_data.csv', index=False)
print("Predictions saved to 'predicted_test_data.csv'")

In [None]:
#Exploring models

# Ensemble (VotingClassifier: majority vote) built from its own, separate instances of
# the three base estimators, so it does not share fitted state with the standalone models.
ensemble = VotingClassifier(
    estimators=[
        ('lr', LogisticRegression(max_iter=1000)),
        ('nb', MultinomialNB()),
        ('rf', RandomForestClassifier(n_estimators=200, random_state=42, n_jobs=-1)),
    ],
    voting='hard',
)

# Candidate models keyed by display name; insertion order is the evaluation order.
models = dict([
    ("Logistic Regression", LogisticRegression(max_iter=1000)),
    ("Naive Bayes", MultinomialNB()),
    ("Random Forest", RandomForestClassifier(n_estimators=200, random_state=42, n_jobs=-1)),
    ("Ensemble", ensemble),
])

In [None]:
# Fit every candidate on the training split and report its validation metrics.
for name, model in models.items():
    # fit() returns the fitted estimator, so prediction can be chained directly.
    y_pred = model.fit(X_train_tfidf, y_train).predict(X_val_tfidf)
    val_accuracy = accuracy_score(y_val, y_pred)
    val_report = classification_report(y_val, y_pred)
    print(f"{name} Accuracy:", val_accuracy)
    print(" Classification Report:\n", val_report)

In [None]:
# Select the model to use for the final predictions.
best_model = models["Ensemble"]  # changed after comparing above validation accuracy
# Refit on every labelled headline (train + validation). The vectorizer is only
# transformed here, so the features still use the vocabulary fitted on X_train.
best_model.fit(vectoraizer.transform(X), y)  # retrain on full training set

In [None]:
# Vectorize the test headlines with the already-fitted TF-IDF vectorizer and predict
# with the model selected above.
X_test = test_df["headline"]
X_test_tfidf = vectoraizer.transform(X_test)
y_test_pred = best_model.predict(X_test_tfidf)

# Replace labels with predictions
test_df["label"] = y_test_pred

# Save predictions without the row index and without a header row, so the file has the
# headerless two-column layout that the read_csv(..., header=None) calls in this notebook expect.
# NOTE(review): the separator is left as a comma because this notebook later reads
# testing_predictions.csv with sep=","; if the deliverable must be tab-separated like the
# input files, change this writer and that reader together.
predictions_path = "testing_predictions.csv"
test_df.to_csv(predictions_path, index=False, header=False)
# Report the path that was actually written.
print(f"Predictions saved to {predictions_path}")

In [None]:
from transformers import pipeline
# Checkpoint id used for both the model and the tokenizer of this pipeline.
MODEL = "jy46604790/Fake-News-Bert-Detect"
# NOTE(review): this rebinds `clf`, which earlier in the notebook held the fitted
# LogisticRegression. Re-running an earlier cell that calls clf.predict(...) after this
# one will use the pipeline object instead.
clf = pipeline("text-classification", model=MODEL, tokenizer=MODEL)

In [None]:
# 1. Exploring pretrained models with Transformer pipeline
# Read a predictions file back as comma-separated with no header row, naming the two
# columns 'label' and 'headline'.
# NOTE(review): an earlier cell writes "testing_predictions.csv" to the working directory,
# while this reads "../dataset/testing_predictions.csv" — confirm the path is the intended one.
df = pd.read_csv("../dataset/testing_predictions.csv", sep=",", header=None, names=["label", "headline"])

# Show the first 20 rows as the cell output.
df.head(20)

In [None]:
from sklearn.metrics import accuracy_score

# 1) Run predictions on the TEST split
# Headlines are cast to str and collected into a plain Python list before being passed in.
texts = test_df["headline"].astype(str).tolist()

# You can do it in batch (faster and safer)
# `classifier` is the pipeline object created earlier in the notebook (not `clf`).
results = classifier(texts, truncation=True)

# Map model labels to 0/1 as you expect
def to01(r):
    """Return 1 when the result's "label" is "LABEL_1" or "FAKE", otherwise 0.

    Args:
        r: a mapping with a "label" key (one entry of the pipeline output).

    Returns:
        int: 1 or 0.
    """
    # adjust these names if your model outputs different labels
    positive_labels = ("LABEL_1", "FAKE")
    return int(r["label"] in positive_labels)

# Convert every pipeline result to a 0/1 label.
preds = [to01(r) for r in results]

# 2) Check lengths before scoring
print("len preds:", len(preds), "len reference:", len(test_df))

# 3) Agreement with the labels currently stored in test_df.
# test_df["label"] was overwritten with model predictions in earlier cells, so it is not
# ground truth: this score measures how often the pre-trained pipeline agrees with those
# predictions, not how accurate it is. Validation accuracy (against y_val) is the
# meaningful accuracy figure for this model.
acc = accuracy_score(test_df["label"].values, preds)
print(f"Agreement with predicted test labels: {acc:.2f}")


In [None]:
import pandas as pd
from datasets import Dataset
from transformers import AutoTokenizer
# 1. Load your data with pandas
# Same file and parsing options as the first load in this notebook; `train_df` is rebound here.
train_df = pd.read_csv("training_data.csv", sep="\t", header=None, names=["label", "headline"])

# 2. Split your data
# We create full dataframes for easy conversion to Datasets
# This split is stratified on 'label', so its rows are not necessarily the same as the
# X_train/X_val split made earlier without `stratify`.
train_data, val_data = train_test_split(train_df, test_size=0.2, random_state=42, stratify=train_df['label'])

# 3. Convert pandas DataFrames to Hugging Face Dataset objects
train_dataset = Dataset.from_pandas(train_data)
val_dataset = Dataset.from_pandas(val_data)

# 4. Load the correct tokenizer
# The tokenizer is loaded from the same checkpoint id that is used for the model.
model_name = "jy46604790/Fake-News-Bert-Detect"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# 5. Tokenization function applied to the datasets.
# The key read from the batch must match the text column name ('headline').
def tokenize_function(examples):
    """Tokenize the 'headline' entries of a batch.

    Every sequence is truncated and padded to exactly 128 tokens
    (padding="max_length", max_length=128).

    Args:
        examples: a batch mapping that contains a 'headline' entry.

    Returns:
        The value returned by the module-level `tokenizer` for those headlines.
    """
    headlines = examples['headline']
    encoded = tokenizer(headlines, padding="max_length", truncation=True, max_length=128)
    return encoded

# 6. Apply the tokenizer using the correct .map() method
# Now this will work because train_dataset is a Dataset object
# batched=True makes .map() call tokenize_function on batches of rows rather than one row at a time.
tokenized_train = train_dataset.map(tokenize_function, batched=True)
tokenized_val = val_dataset.map(tokenize_function, batched=True)

# You can now proceed to load your model and use the Trainer API with
# tokenized_train and tokenized_val.
print("Tokenization successful!")
# Print the tokenized training dataset as a quick check of the result.
print(tokenized_train)

In [None]:
from transformers import AutoModelForSequenceClassification
# Load the sequence-classification model from the pre-trained checkpoint, configured for 2 labels.
model = AutoModelForSequenceClassification.from_pretrained("jy46604790/Fake-News-Bert-Detect", num_labels=2)
from sklearn.metrics import precision_recall_fscore_support
def compute_metrics(pred):
    """Compute accuracy, precision, recall and F1 for one evaluation pass.

    Args:
        pred: an object that unpacks into (predictions, labels). The predicted class
            is taken as the argmax of `predictions` along axis 1.

    Returns:
        dict with the keys "accuracy", "precision", "recall" and "f1"; precision,
        recall and F1 are computed with average='binary'.
    """
    raw_outputs, gold_labels = pred
    predicted_classes = raw_outputs.argmax(axis=1)
    # precision_recall_fscore_support returns a 4-tuple; the last element is not used.
    prf = precision_recall_fscore_support(gold_labels, predicted_classes, average='binary')
    return {
        "accuracy": accuracy_score(gold_labels, predicted_classes),
        "precision": prf[0],
        "recall": prf[1],
        "f1": prf[2],
    }

In [None]:
import sys, os, inspect, transformers
from transformers import TrainingArguments

# Environment diagnostics: show which transformers installation and Python interpreter
# this kernel is using.
print("transformers version:", transformers.__version__)
print("python exe:", sys.executable)
print("transformers file:", transformers.__file__)
print("TA module:", TrainingArguments.__module__)
# Check whether the installed TrainingArguments signature mentions `evaluation_strategy`.
print("TA has evaluation_strategy?:", "evaluation_strategy" in str(inspect.signature(TrainingArguments.__init__)))

# Check whether any local file/folder named 'transformers*' exists in the working
# directory that could be shadowing the installed package.
print("local shadows:", [p for p in os.listdir() if p.lower().startswith("transformers")])


In [None]:
from transformers import TrainingArguments, Trainer
# Fine-tuning configuration: evaluate and save once per epoch, log every 50 steps,
# and disable external experiment reporting.
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.1,
    save_strategy="epoch",
    logging_dir="./logs",
    logging_strategy="steps",
    logging_steps=50,
    # At the end of training, reload the checkpoint with the best "accuracy"
    # (the key returned by compute_metrics).
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    report_to="none",
)
# NOTE(review): recent transformers releases appear to deprecate Trainer's `tokenizer=`
# argument in favour of `processing_class=` — confirm against the installed version.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [None]:
# Run fine-tuning with the Trainer configured above; this updates `model` itself.
trainer.train()

In [None]:
# 1. Evaluate model BEFORE fine-tuning
# `model` has already been updated by trainer.train() in the previous cell, so evaluating
# `trainer` here would only measure the fine-tuned weights (and "before" would always equal
# "after"). A fresh copy of the pre-trained checkpoint is loaded to obtain a real baseline.
print("\n Evaluating the model BEFORE fine-tuning")
baseline_model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)
baseline_trainer = Trainer(
    model=baseline_model,
    # Evaluation-only arguments: same eval batch size as the fine-tuning run, no reporting.
    args=TrainingArguments(
        output_dir="./results_baseline",
        per_device_eval_batch_size=16,
        report_to="none",
    ),
    eval_dataset=tokenized_val,
    compute_metrics=compute_metrics,
)
eval_results_before = baseline_trainer.evaluate()
accuracy_before = eval_results_before["eval_accuracy"]

# 2. Fine-tuning (already done above with trainer.train())

# 3. Evaluate model AFTER fine-tuning
# `trainer` holds the fine-tuned model (the best checkpoint, since load_best_model_at_end=True).
print("\n Evaluating the model AFTER fine-tuning")
eval_results_after = trainer.evaluate(eval_dataset=tokenized_val)
accuracy_after = eval_results_after["eval_accuracy"]

# 4. Final Comparison
print("\n-- Performance Comparison --")
print(f"Accuracy BEFORE Fine-Tuning: {accuracy_before:.2f}")
print(f"Accuracy AFTER Fine-Tuning:  {accuracy_after:.2f}")
