# In [None]:
#%% Step 1: Import Required Libraries
import re
import zipfile
import os
import pandas as pd
import numpy as np
from collections import Counter
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import (accuracy_score, precision_score, recall_score, 
                            f1_score, confusion_matrix, roc_curve, auc, 
                            RocCurveDisplay)

#%% Step 2: Data Preparation
# Load datasets
fake_df = pd.read_csv("Fake[1].csv")
true_df = pd.read_csv("True[1].csv")

# Add labels and combine (0 = fake, 1 = true)
fake_df['label'] = 0
true_df['label'] = 1
combined_df = pd.concat([fake_df, true_df], ignore_index=True)

# Combine title and text.
# Missing values are replaced with empty strings first: concatenating NaN
# with a string yields NaN, which would discard the other field as well.
combined_df['full_text'] = (
    combined_df['title'].fillna('') + " " + combined_df['text'].fillna('')
)

#%% Step 3: Custom Tokenizer Implementation
class VeritasTokenizer:
    """Regex-based tokenizer for news text.

    Processing order in :meth:`tokenize`:

    1. pad emoticons with spaces so they become separate tokens;
    2. split contractions into stem + clitic ("don't" -> "do n't");
    3. collapse a run of three or more identical letters into one letter
       followed by a ``<REPEAT:n>`` marker (n = length of the run);
    4. extract tokens and lowercase them.
    """

    def __init__(self):
        # Verbose-mode regex source for emoticons in both orientations
        # (eyes-nose-mouth and mouth-nose-eyes).
        self.emoticons = r"""
            (?:
                [<>]?
                [:;=8]                     # eyes
                [\-o\*\']?                 # optional nose
                [\)\]\(\[dDpP/\:\}\{@\|\\] # mouth      
                |
                [\)\]\(\[dDpP/\:\}\{@\|\\] # mouth
                [:;=8]                     # eyes
                [\-o\*\']?                 # optional nose
                [<>]?
            )"""
        # An emoticon edge that is a letter/digit must sit on a word boundary,
        # otherwise "said:" or "2018:" would be read as "d:" / "8:" and the
        # word would be split. "://" is excluded so URL schemes are not ":/".
        guarded_emoticon = (
            r"(?:(?=\W)|\b)(?!://)" + self.emoticons + r"(?:(?<=\W)|\b)"
        )
        self._emoticon_re = re.compile(guarded_emoticon, re.VERBOSE)

        # Lazy stem so that "n't" can actually match ("don't" -> "do", "n't").
        self.contractions = re.compile(
            r"\b(\w+?)(n['’]t|['’](?:ll|ve|re|d|s|m))\b")

        # Lazy prefix so group 2 is the FIRST letter of the run and the whole
        # run is counted; letters only, so numbers like "2000" are left alone.
        self.repeats = re.compile(r"\b(\w*?)([^\W\d_])\2{2,}(\w*)")

        # Raw string is essential here: in a normal string "\b" is a
        # backspace character and the word alternative would never match.
        self._token_re = re.compile(
            r"""
            \b\w+['’]?\w*\b              # words, optionally with an apostrophe
            | """ + guarded_emoticon + r"""
            | <REPEAT:\d+>               # repeat markers
            | ['’](?:ll|ve|re|d|s|m)\b   # clitics split off from contractions
            | [.,!?;:()\"'`]             # punctuation
            """,
            re.VERBOSE,
        )

    @staticmethod
    def _mark_repeat(match):
        """Rewrite one elongated word as ``prefix+letter <REPEAT:n> rest``."""
        head, letter, tail = match.group(1, 2, 3)
        run_length = len(match.group(0)) - len(head) - len(tail)
        return f"{head}{letter} <REPEAT:{run_length}> {tail}"

    def tokenize(self, text):
        """Split *text* into a list of lowercase tokens.

        Args:
            text: the string to tokenize.

        Returns:
            list[str]: words, emoticons, clitics, punctuation marks and
            repeat markers, all lowercased (markers come out as
            ``<repeat:n>``).
        """
        # Handle emoticons
        text = self._emoticon_re.sub(lambda m: " " + m.group() + " ", text)

        # Handle contractions
        text = self.contractions.sub(r"\1 \2", text)

        # Handle repeated characters
        text = self.repeats.sub(self._mark_repeat, text)

        # Final tokenization
        tokens = self._token_re.findall(text)
        return [token.lower() for token in tokens if token.strip()]
#%% Step 1: Import Required Libraries
import re
import zipfile
import os
import pandas as pd
import numpy as np
from collections import Counter
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import (accuracy_score, precision_score, recall_score, 
                            f1_score, confusion_matrix, roc_curve, auc, 
                            RocCurveDisplay)

#%% Step 2: Data Preparation
# Load datasets
fake_df = pd.read_csv("Fake[1].csv")
true_df = pd.read_csv("True[1].csv")

# Add labels and combine (0 = fake, 1 = true)
fake_df['label'] = 0
true_df['label'] = 1
combined_df = pd.concat([fake_df, true_df], ignore_index=True)

# Combine title and text.
# Missing values are replaced with empty strings first: concatenating NaN
# with a string yields NaN, which would discard the other field as well.
combined_df['full_text'] = (
    combined_df['title'].fillna('') + " " + combined_df['text'].fillna('')
)

#%% Step 3: Custom Tokenizer Implementation
class VeritasTokenizer:
    """Regex-based tokenizer for news text.

    Processing order in :meth:`tokenize`:

    1. pad emoticons with spaces so they become separate tokens;
    2. split contractions into stem + clitic ("don't" -> "do n't");
    3. collapse a run of three or more identical letters into one letter
       followed by a ``<REPEAT:n>`` marker (n = length of the run);
    4. extract tokens and lowercase them.
    """

    def __init__(self):
        # Verbose-mode regex source for emoticons in both orientations
        # (eyes-nose-mouth and mouth-nose-eyes).
        self.emoticons = r"""
            (?:
                [<>]?
                [:;=8]                     # eyes
                [\-o\*\']?                 # optional nose
                [\)\]\(\[dDpP/\:\}\{@\|\\] # mouth      
                |
                [\)\]\(\[dDpP/\:\}\{@\|\\] # mouth
                [:;=8]                     # eyes
                [\-o\*\']?                 # optional nose
                [<>]?
            )"""
        # An emoticon edge that is a letter/digit must sit on a word boundary,
        # otherwise "said:" or "2018:" would be read as "d:" / "8:" and the
        # word would be split. "://" is excluded so URL schemes are not ":/".
        guarded_emoticon = (
            r"(?:(?=\W)|\b)(?!://)" + self.emoticons + r"(?:(?<=\W)|\b)"
        )
        self._emoticon_re = re.compile(guarded_emoticon, re.VERBOSE)

        # Lazy stem so that "n't" can actually match ("don't" -> "do", "n't").
        self.contractions = re.compile(
            r"\b(\w+?)(n['’]t|['’](?:ll|ve|re|d|s|m))\b")

        # Lazy prefix so group 2 is the FIRST letter of the run and the whole
        # run is counted; letters only, so numbers like "2000" are left alone.
        self.repeats = re.compile(r"\b(\w*?)([^\W\d_])\2{2,}(\w*)")

        # Raw string is essential here: in a normal string "\b" is a
        # backspace character and the word alternative would never match.
        self._token_re = re.compile(
            r"""
            \b\w+['’]?\w*\b              # words, optionally with an apostrophe
            | """ + guarded_emoticon + r"""
            | <REPEAT:\d+>               # repeat markers
            | ['’](?:ll|ve|re|d|s|m)\b   # clitics split off from contractions
            | [.,!?;:()\"'`]             # punctuation
            """,
            re.VERBOSE,
        )

    @staticmethod
    def _mark_repeat(match):
        """Rewrite one elongated word as ``prefix+letter <REPEAT:n> rest``."""
        head, letter, tail = match.group(1, 2, 3)
        run_length = len(match.group(0)) - len(head) - len(tail)
        return f"{head}{letter} <REPEAT:{run_length}> {tail}"

    def tokenize(self, text):
        """Split *text* into a list of lowercase tokens.

        Args:
            text: the string to tokenize.

        Returns:
            list[str]: words, emoticons, clitics, punctuation marks and
            repeat markers, all lowercased (markers come out as
            ``<repeat:n>``).
        """
        # Handle emoticons
        text = self._emoticon_re.sub(lambda m: " " + m.group() + " ", text)

        # Handle contractions
        text = self.contractions.sub(r"\1 \2", text)

        # Handle repeated characters
        text = self.repeats.sub(self._mark_repeat, text)

        # Final tokenization
        tokens = self._token_re.findall(text)
        return [token.lower() for token in tokens if token.strip()]

#%% Step 4: Rule-Based POS Tagger
class VeritasPOSTagger:
    """Suffix-driven part-of-speech tagger.

    Every token starts as ``NOUN``. Each tag in ``self.rules`` whose suffix
    patterns match the token overwrites the current tag, so when several
    tags match, the one listed last in ``self.rules`` wins.
    """

    def __init__(self):
        # tag -> regex suffix patterns, checked in insertion order
        self.rules = {
            'VERB': [r'ing$', r'ed$', r's$'],
            'ADJ': [r'ous$', r'ive$', r'al$', r'ic$'],
            'ADV': [r'ly$']
        }

    def tag(self, tokens):
        """Return ``(token, tag)`` pairs for *tokens*, in the same order."""
        result = []
        previous = None
        for word in tokens:
            label = 'NOUN'  # fallback when no suffix rule applies
            for candidate, suffix_patterns in self.rules.items():
                if any(re.search(p, word) for p in suffix_patterns):
                    label = candidate
            # A word directly after "to" that no rule claimed is read as
            # an infinitive verb.
            if previous == 'to' and label == 'NOUN':
                label = 'VERB'
            result.append((word, label))
            previous = word
        return result

#%% Step 5: Custom Lemmatizer
class VeritasLemmatizer:
    """Suffix-stripping lemmatizer keyed by part-of-speech tag."""

    def __init__(self):
        # tag -> ordered (suffix, replacement) rules. The first matching rule
        # wins, so a longer suffix must come before any shorter suffix it
        # ends with ('ies' and 'es' before 's'), or it can never fire.
        self.suffix_map = {
            'VERB': [('ing', ''), ('ed', 'e'), ('s', '')],
            'NOUN': [('ies', 'y'), ('es', ''), ('s', '')],
            'ADJ': [('er', ''), ('est', '')]
        }

    def lemmatize(self, token, pos):
        """Return the lowercase lemma of *token* for tag *pos*.

        Args:
            token: the word to reduce.
            pos: a key of ``self.suffix_map``; any other tag applies no rule.

        Returns:
            str: the token with the first matching suffix replaced, or the
            lowercased token when nothing applies.
        """
        token = token.lower()
        for suffix, replacement in self.suffix_map.get(pos, []):
            # Require a non-empty stem so a token that IS the suffix
            # (e.g. "s") is not reduced to an empty string.
            if len(token) > len(suffix) and token.endswith(suffix):
                return token[:-len(suffix)] + replacement
        return token

#%% Step 6: Full Text Processing Pipeline
tokenizer = VeritasTokenizer()
pos_tagger = VeritasPOSTagger()
lemmatizer = VeritasLemmatizer()

def process_text(text):
    """Tokenize, POS-tag and lemmatize one document.

    Args:
        text: the document; non-string values are converted with ``str``.

    Returns:
        list: one lemma per token, in order. Missing input (``None`` or a
        float NaN) yields an empty list.
    """
    # str(NaN) is "nan", which would otherwise enter the corpus as a word.
    if text is None or (isinstance(text, float) and text != text):
        return []
    tokens = tokenizer.tokenize(str(text))
    tagged = pos_tagger.tag(tokens)
    return [lemmatizer.lemmatize(token, pos) for token, pos in tagged]

# Process every article
combined_df['processed'] = combined_df['full_text'].apply(process_text)

#%% Step 7: Feature Extraction
# Convert processed text to strings for vectorization
combined_df['processed_str'] = combined_df['processed'].apply(' '.join)
y = combined_df['label']

# TF-IDF vectorizer (fitted below, on the training documents only)
tfidf = TfidfVectorizer(max_features=10000, stop_words='english')

#%% Step 8: Train/Test Split
# Split the documents BEFORE fitting TF-IDF: fitting on the full corpus lets
# the vocabulary and IDF weights see the test set and inflates the scores.
docs_train, docs_test, y_train, y_test = train_test_split(
    combined_df['processed_str'], y,
    test_size=0.2, random_state=42, stratify=y)

X_train = tfidf.fit_transform(docs_train)
X_test = tfidf.transform(docs_test)

# Whole corpus in the same feature space (keeps the name `X` available)
X = tfidf.transform(combined_df['processed_str'])

#%% Step 9: Model Training
# Multinomial Naive Bayes with light additive smoothing
nb = MultinomialNB(alpha=0.1).fit(X_train, y_train)

# Linear SVM: hinge loss + L2 penalty, optimised with SGD for efficiency
svm = SGDClassifier(
    loss='hinge',
    penalty='l2',
    alpha=0.0001,
    max_iter=1000,
    random_state=42,
).fit(X_train, y_train)

#%% Step 10: Evaluation
def evaluate_model(model, name):
    """Print metrics and plot the confusion matrix and ROC curve for *model*.

    Uses the module-level ``X_test`` / ``y_test`` hold-out data.

    Args:
        model: a fitted classifier exposing ``predict`` and, for the ROC
            curve, either ``predict_proba`` or ``decision_function``.
        name: label used in the printed report and plot titles.
    """
    y_pred = model.predict(X_test)

    # ROC needs a continuous score. Prefer the positive-class probability and
    # fall back to the decision margin, so margin-only models (such as the
    # hinge-loss SGDClassifier) still get a curve instead of being skipped.
    if hasattr(model, "predict_proba"):
        scores = model.predict_proba(X_test)[:, 1]
    elif hasattr(model, "decision_function"):
        scores = model.decision_function(X_test)
    else:
        scores = None

    print(f"\n{name} Performance:")
    print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
    print(f"Precision: {precision_score(y_test, y_pred):.4f}")
    print(f"Recall: {recall_score(y_test, y_pred):.4f}")
    print(f"F1 Score: {f1_score(y_test, y_pred):.4f}")

    # Confusion Matrix
    cm = confusion_matrix(y_test, y_pred)
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
    plt.title(f'{name} Confusion Matrix')
    plt.show()

    # ROC Curve
    if scores is not None:
        fpr, tpr, _ = roc_curve(y_test, scores)
        roc_auc = auc(fpr, tpr)
        RocCurveDisplay(fpr=fpr, tpr=tpr, roc_auc=roc_auc,
                       estimator_name=name).plot()
        plt.title(f'{name} ROC Curve (AUC = {roc_auc:.4f})')
        plt.show()

evaluate_model(nb, "Naive Bayes")
evaluate_model(svm, "SVM")

#%% Step 11: Visualizations
# Word Clouds
def generate_wordcloud(text, title):
    """Render *text* as an 800x400 white-background word cloud titled *title*."""
    cloud = WordCloud(width=800, height=400, background_color='white')
    image = cloud.generate(text)

    _, ax = plt.subplots(figsize=(10, 5))
    ax.imshow(image)
    ax.set_title(title)
    ax.axis("off")  # the image has no meaningful axes
    plt.show()

# Fake news word cloud
fake_text = ' '.join(combined_df[combined_df['label'] == 0]['processed_str'])
generate_wordcloud(fake_text, "Fake News Word Cloud")

# Real news word cloud
true_text = ' '.join(combined_df[combined_df['label'] == 1]['processed_str'])
generate_wordcloud(true_text, "Real News Word Cloud")

# Repeat token analysis
# The pipeline lowercases every token, so the marker has to be matched
# case-insensitively; an exact '<REPEAT' test never finds anything.
repeat_counts = Counter()
for tokens in combined_df['processed']:
    repeat_counts.update(
        token for token in tokens if '<repeat' in token.lower())

# most_common gives the 10 most frequent markers (slicing the dict keys
# would give the first 10 encountered instead).
top_repeats = repeat_counts.most_common(10)
if top_repeats:
    repeat_labels, repeat_values = zip(*top_repeats)
    plt.figure(figsize=(10, 5))
    sns.barplot(x=list(repeat_labels), y=list(repeat_values))
    plt.title("Top 10 Repeated Character Patterns")
    plt.xticks(rotation=45)
    plt.show()
else:
    print("No repeated-character tokens found.")
        

#%% Step 4: Rule-Based POS Tagger
class VeritasPOSTagger:
    """Suffix-driven part-of-speech tagger.

    Every token starts as ``NOUN``. Each tag in ``self.rules`` whose suffix
    patterns match the token overwrites the current tag, so when several
    tags match, the one listed last in ``self.rules`` wins.
    """

    def __init__(self):
        # tag -> regex suffix patterns, checked in insertion order
        self.rules = {
            'VERB': [r'ing$', r'ed$', r's$'],
            'ADJ': [r'ous$', r'ive$', r'al$', r'ic$'],
            'ADV': [r'ly$']
        }

    def tag(self, tokens):
        """Return ``(token, tag)`` pairs for *tokens*, in the same order."""
        result = []
        previous = None
        for word in tokens:
            label = 'NOUN'  # fallback when no suffix rule applies
            for candidate, suffix_patterns in self.rules.items():
                if any(re.search(p, word) for p in suffix_patterns):
                    label = candidate
            # A word directly after "to" that no rule claimed is read as
            # an infinitive verb.
            if previous == 'to' and label == 'NOUN':
                label = 'VERB'
            result.append((word, label))
            previous = word
        return result

#%% Step 5: Custom Lemmatizer
class VeritasLemmatizer:
    """Suffix-stripping lemmatizer keyed by part-of-speech tag."""

    def __init__(self):
        # tag -> ordered (suffix, replacement) rules. The first matching rule
        # wins, so a longer suffix must come before any shorter suffix it
        # ends with ('ies' and 'es' before 's'), or it can never fire.
        self.suffix_map = {
            'VERB': [('ing', ''), ('ed', 'e'), ('s', '')],
            'NOUN': [('ies', 'y'), ('es', ''), ('s', '')],
            'ADJ': [('er', ''), ('est', '')]
        }

    def lemmatize(self, token, pos):
        """Return the lowercase lemma of *token* for tag *pos*.

        Args:
            token: the word to reduce.
            pos: a key of ``self.suffix_map``; any other tag applies no rule.

        Returns:
            str: the token with the first matching suffix replaced, or the
            lowercased token when nothing applies.
        """
        token = token.lower()
        for suffix, replacement in self.suffix_map.get(pos, []):
            # Require a non-empty stem so a token that IS the suffix
            # (e.g. "s") is not reduced to an empty string.
            if len(token) > len(suffix) and token.endswith(suffix):
                return token[:-len(suffix)] + replacement
        return token

#%% Step 6: Full Text Processing Pipeline
tokenizer = VeritasTokenizer()
pos_tagger = VeritasPOSTagger()
lemmatizer = VeritasLemmatizer()

def process_text(text):
    """Tokenize, POS-tag and lemmatize one document.

    Args:
        text: the document; non-string values are converted with ``str``.

    Returns:
        list: one lemma per token, in order. Missing input (``None`` or a
        float NaN) yields an empty list.
    """
    # str(NaN) is "nan", which would otherwise enter the corpus as a word.
    if text is None or (isinstance(text, float) and text != text):
        return []
    tokens = tokenizer.tokenize(str(text))
    tagged = pos_tagger.tag(tokens)
    return [lemmatizer.lemmatize(token, pos) for token, pos in tagged]

# Process every article
combined_df['processed'] = combined_df['full_text'].apply(process_text)

#%% Step 7: Feature Extraction
# Convert processed text to strings for vectorization
combined_df['processed_str'] = combined_df['processed'].apply(' '.join)
y = combined_df['label']

# TF-IDF vectorizer (fitted below, on the training documents only)
tfidf = TfidfVectorizer(max_features=10000, stop_words='english')

#%% Step 8: Train/Test Split
# Split the documents BEFORE fitting TF-IDF: fitting on the full corpus lets
# the vocabulary and IDF weights see the test set and inflates the scores.
docs_train, docs_test, y_train, y_test = train_test_split(
    combined_df['processed_str'], y,
    test_size=0.2, random_state=42, stratify=y)

X_train = tfidf.fit_transform(docs_train)
X_test = tfidf.transform(docs_test)

# Whole corpus in the same feature space (keeps the name `X` available)
X = tfidf.transform(combined_df['processed_str'])

#%% Step 9: Model Training
# Multinomial Naive Bayes with light additive smoothing
nb = MultinomialNB(alpha=0.1).fit(X_train, y_train)

# Linear SVM: hinge loss + L2 penalty, optimised with SGD for efficiency
svm = SGDClassifier(
    loss='hinge',
    penalty='l2',
    alpha=0.0001,
    max_iter=1000,
    random_state=42,
).fit(X_train, y_train)

#%% Step 10: Evaluation
def evaluate_model(model, name):
    """Print metrics and plot the confusion matrix and ROC curve for *model*.

    Uses the module-level ``X_test`` / ``y_test`` hold-out data.

    Args:
        model: a fitted classifier exposing ``predict`` and, for the ROC
            curve, either ``predict_proba`` or ``decision_function``.
        name: label used in the printed report and plot titles.
    """
    y_pred = model.predict(X_test)

    # ROC needs a continuous score. Prefer the positive-class probability and
    # fall back to the decision margin, so margin-only models (such as the
    # hinge-loss SGDClassifier) still get a curve instead of being skipped.
    if hasattr(model, "predict_proba"):
        scores = model.predict_proba(X_test)[:, 1]
    elif hasattr(model, "decision_function"):
        scores = model.decision_function(X_test)
    else:
        scores = None

    print(f"\n{name} Performance:")
    print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
    print(f"Precision: {precision_score(y_test, y_pred):.4f}")
    print(f"Recall: {recall_score(y_test, y_pred):.4f}")
    print(f"F1 Score: {f1_score(y_test, y_pred):.4f}")

    # Confusion Matrix
    cm = confusion_matrix(y_test, y_pred)
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
    plt.title(f'{name} Confusion Matrix')
    plt.show()

    # ROC Curve
    if scores is not None:
        fpr, tpr, _ = roc_curve(y_test, scores)
        roc_auc = auc(fpr, tpr)
        RocCurveDisplay(fpr=fpr, tpr=tpr, roc_auc=roc_auc,
                       estimator_name=name).plot()
        plt.title(f'{name} ROC Curve (AUC = {roc_auc:.4f})')
        plt.show()

evaluate_model(nb, "Naive Bayes")
evaluate_model(svm, "SVM")

#%% Step 11: Visualizations
# Word Clouds
def generate_wordcloud(text, title):
    """Render *text* as an 800x400 white-background word cloud titled *title*."""
    cloud = WordCloud(width=800, height=400, background_color='white')
    image = cloud.generate(text)

    _, ax = plt.subplots(figsize=(10, 5))
    ax.imshow(image)
    ax.set_title(title)
    ax.axis("off")  # the image has no meaningful axes
    plt.show()

# Fake news word cloud
fake_text = ' '.join(combined_df[combined_df['label'] == 0]['processed_str'])
generate_wordcloud(fake_text, "Fake News Word Cloud")

# Real news word cloud
true_text = ' '.join(combined_df[combined_df['label'] == 1]['processed_str'])
generate_wordcloud(true_text, "Real News Word Cloud")

# Repeat token analysis
# The pipeline lowercases every token, so the marker has to be matched
# case-insensitively; an exact '<REPEAT' test never finds anything.
repeat_counts = Counter()
for tokens in combined_df['processed']:
    repeat_counts.update(
        token for token in tokens if '<repeat' in token.lower())

# most_common gives the 10 most frequent markers (slicing the dict keys
# would give the first 10 encountered instead).
top_repeats = repeat_counts.most_common(10)
if top_repeats:
    repeat_labels, repeat_values = zip(*top_repeats)
    plt.figure(figsize=(10, 5))
    sns.barplot(x=list(repeat_labels), y=list(repeat_values))
    plt.title("Top 10 Repeated Character Patterns")
    plt.xticks(rotation=45)
    plt.show()
else:
    print("No repeated-character tokens found.")