<a href="https://colab.research.google.com/github/vidya100804/AI-ML/blob/main/Python.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
!pip install catboost

Collecting catboost
  Downloading catboost-1.2.8-cp312-cp312-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.8-cp312-cp312-manylinux2014_x86_64.whl (99.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.2/99.2 MB[0m [31m8.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.8


In [4]:
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

# --- Conditional Imports for External Libraries ---
import xgboost as xgb
try:
    from catboost import CatBoostClassifier
    CATBOOST_AVAILABLE = True
except ImportError:
    CATBOOST_AVAILABLE = False
    print("CatBoost not installed. Skipping CatBoost.")

try:
    import tensorflow as tf
    from tensorflow.keras.preprocessing.text import Tokenizer
    from tensorflow.keras.preprocessing.sequence import pad_sequences
    from tensorflow.keras.models import Sequential
    from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
    LSTM_AVAILABLE = True
except ImportError:
    LSTM_AVAILABLE = False
    print("TensorFlow/Keras not installed. Skipping LSTM.")

# --- 1. Load Data ---
TRAIN_FILE = '/content/telugu_news_dataset.parquet'
TEST_FILE = '/content/telugu_news_test.parquet'

try:
    train_df = pd.read_parquet(TRAIN_FILE)
    test_df = pd.read_parquet(TEST_FILE)
except Exception as e:
    print(f"Error loading data: {e}. Ensure the files are accessible.")
    exit()

# --- 2. Preprocessing and Feature Engineering ---

# Define columns (assuming standard structure based on metadata snippets)
TEXT_COLS = ['title', 'text']
LABEL_COL = 'category'

# Fill NA values and combine text
for col in TEXT_COLS:
    train_df[col] = train_df[col].fillna('')
    test_df[col] = test_df[col].fillna('')

train_df['combined_text'] = train_df[TEXT_COLS].agg(' '.join, axis=1)
test_df['combined_text'] = test_df[TEXT_COLS].agg(' '.join, axis=1)

# Text cleaning function (generic for multilingual data)
def clean_text(text):
    text = re.sub(r'[^\w\s]', ' ', text) # Remove punctuation
    text = re.sub(r'\d+', ' ', text)      # Remove numbers
    text = re.sub(r'\s+', ' ', text).strip() # Replace multiple spaces with single space
    return text

train_df['combined_text'] = train_df['combined_text'].apply(clean_text)
test_df['combined_text'] = test_df['combined_text'].apply(clean_text)

# --- 3. Label Encoding for Binary Classification (Fake/Correct) ---

# Filter to the two most frequent categories to create a clear binary classification
if LABEL_COL not in train_df.columns:
    print(f"Label column '{LABEL_COL}' not found in training data. Cannot proceed with classification.")
    exit()

top_categories = train_df[LABEL_COL].value_counts().nlargest(2).index.tolist()

if len(top_categories) < 2:
    print("Insufficient categories for binary classification (Fake/Correct). Please check the 'category' column.")
    exit()

# Filter the training data to only include the top 2 categories
train_df_filtered = train_df[train_df[LABEL_COL].isin(top_categories)].copy()

# Map the categories: First most frequent category -> 0 (Fake), Second most frequent category -> 1 (Correct)
category_map = {top_categories[0]: 0, top_categories[1]: 1}
train_df_filtered['label'] = train_df_filtered[LABEL_COL].map(category_map)

print(f"Categories used: {top_categories}. Mapping: {top_categories[0]} -> 0 (Fake), {top_categories[1]} -> 1 (Correct)")

X_train_data = train_df_filtered['combined_text']
y_train = train_df_filtered['label']

# Split the training data for model evaluation
X_train, X_val, y_train_split, y_val_split = train_test_split(
    X_train_data, y_train, test_size=0.2, random_state=42, stratify=y_train
)

# --- 4. Feature Extraction: TF-IDF Vectorizer (for ML Models) ---

tfidf = TfidfVectorizer(max_features=10000, ngram_range=(1, 2))
X_train_tfidf = tfidf.fit_transform(X_train)
X_val_tfidf = tfidf.transform(X_val)
X_test_tfidf = tfidf.transform(test_df['combined_text'])

# --- 5. Model Training and Evaluation (ML Models) ---

models = {
    'Logistic Regression': LogisticRegression(solver='liblinear', random_state=42, max_iter=1000),
    'Naive Bayes': MultinomialNB(),
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1),
    'Decision Trees': DecisionTreeClassifier(random_state=42),
    'XGBoost': xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42, n_jobs=-1)
}

if CATBOOST_AVAILABLE:
    models['CatBoost'] = CatBoostClassifier(verbose=0, random_state=42, allow_writing_files=False)

accuracies = {}
trained_models = {}

# Train and evaluate ML models
for name, model in models.items():
    print(f"Training {name}...")
    model.fit(X_train_tfidf, y_train_split)
    y_pred = model.predict(X_val_tfidf)
    accuracy = accuracy_score(y_val_split, y_pred)
    accuracies[name] = accuracy
    trained_models[name] = model
    print(f"  {name} Accuracy: {accuracy:.4f}")

# --- 6. LSTM Model Training and Evaluation (DL Model) ---

if LSTM_AVAILABLE:
    # Hyperparameters for LSTM
    MAX_WORDS = 20000
    MAX_SEQUENCE_LENGTH = 150
    EMBEDDING_DIM = 128

    # Tokenization and Padding
    tokenizer = Tokenizer(num_words=MAX_WORDS, oov_token="<unk>")
    tokenizer.fit_on_texts(X_train)

    X_train_seq = tokenizer.texts_to_sequences(X_train)
    X_val_seq = tokenizer.texts_to_sequences(X_val)
    X_test_seq = tokenizer.texts_to_sequences(test_df['combined_text'])

    X_train_padded = pad_sequences(X_train_seq, maxlen=MAX_SEQUENCE_LENGTH, padding='post', truncating='post')
    X_val_padded = pad_sequences(X_val_seq, maxlen=MAX_SEQUENCE_LENGTH, padding='post', truncating='post')
    X_test_padded = pad_sequences(X_test_seq, maxlen=MAX_SEQUENCE_LENGTH, padding='post', truncating='post')


    # Build LSTM Model
    lstm_model = Sequential([
        Embedding(MAX_WORDS, EMBEDDING_DIM, input_length=MAX_SEQUENCE_LENGTH),
        Dropout(0.3),
        LSTM(128),
        Dropout(0.3),
        Dense(1, activation='sigmoid')
    ])

    lstm_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

    print("\nTraining LSTM...")
    # Train the model (using a small number of epochs for speed)
    lstm_model.fit(
        X_train_padded, np.array(y_train_split),
        epochs=4,
        batch_size=64,
        validation_data=(X_val_padded, np.array(y_val_split)),
        verbose=0
    )
    trained_models['LSTM'] = lstm_model

    # Evaluate LSTM
    _, lstm_accuracy = lstm_model.evaluate(X_val_padded, np.array(y_val_split), verbose=0)
    accuracies['LSTM'] = lstm_accuracy
    print(f"  LSTM Accuracy: {lstm_accuracy:.4f}")

# --- 7. Display Accuracies and Find Best Model ---

print("\n--- Model Accuracies ---")
for name, acc in sorted(accuracies.items(), key=lambda item: item[1], reverse=True):
    print(f"{name}: {acc:.4f}")

best_model_name = max(accuracies, key=accuracies.get)
best_accuracy = accuracies[best_model_name]
print(f"\nHighest Accurate Technique: {best_model_name} ({best_accuracy:.4f})")
final_model = trained_models[best_model_name]

# --- 8. Apply Highest Accurate Technique on Test Data ---

if best_model_name == 'LSTM' and LSTM_AVAILABLE:
    # LSTM prediction on padded sequences
    y_pred_proba = final_model.predict(X_test_padded)
    # Convert probability to binary class (0 or 1)
    y_pred_final = (y_pred_proba > 0.5).astype(int).flatten()
else:
    # ML model prediction on TF-IDF features
    y_pred_final = final_model.predict(X_test_tfidf)

# --- 9. Output to CSV ---
# Create a DataFrame for the output
# 0 as fake, 1 as correct (consistent with the mapping)
output_df = pd.DataFrame({
    'Prediction': y_pred_final
})

# Save to CSV
output_df.to_csv('fake_news_predictions.csv', index=False)

print("\nPrediction complete. Output saved to 'fake_news_predictions.csv'.")

Categories used: ['eenadu_sports', 'eenadu_national']. Mapping: eenadu_sports -> 0 (Fake), eenadu_national -> 1 (Correct)
Training Logistic Regression...
  Logistic Regression Accuracy: 0.9733
Training Naive Bayes...
  Naive Bayes Accuracy: 0.9713
Training Random Forest...
  Random Forest Accuracy: 0.9738
Training Decision Trees...
  Decision Trees Accuracy: 0.9388
Training XGBoost...


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


  XGBoost Accuracy: 0.9772
Training CatBoost...
  CatBoost Accuracy: 0.9699





Training LSTM...
  LSTM Accuracy: 0.9329

--- Model Accuracies ---
XGBoost: 0.9772
Random Forest: 0.9738
Logistic Regression: 0.9733
Naive Bayes: 0.9713
CatBoost: 0.9699
Decision Trees: 0.9388
LSTM: 0.9329

Highest Accurate Technique: XGBoost (0.9772)

Prediction complete. Output saved to 'fake_news_predictions.csv'.
