# In [None]:
# ✅ STEP 1: Import Libraries
import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from sklearn.model_selection import train_test_split

# ✅ STEP 2: Upload JSON File
# `uploaded` is iterated by key below and each key is opened as a path, so the
# uploaded files are expected to be present in the working directory.
from google.colab import files
uploaded = files.upload()

# ✅ STEP 3: Load Dataset from JSON
import io
# Every uploaded file is parsed as JSON Lines: one JSON object per line.
data = []
for filename in uploaded:
    # Explicit encoding so parsing does not depend on the platform default.
    with open(filename, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            # A blank line is not valid JSON; skip it instead of crashing.
            if not line:
                continue
            data.append(json.loads(line))

df = pd.DataFrame(data)
# A bare `df.head()` is only rendered when it is the last expression of a
# notebook cell; print it so the preview is always shown.
print(df.head())

# ✅ STEP 4: Data Preprocessing
# Headlines are the model input; `is_sarcastic` is the binary target.
sentences = df['headline'].values
labels = df['is_sarcastic'].values

# Tokenize text
vocab_size = 10000      # passed to Tokenizer(num_words=...) and to the Embedding layer
max_length = 32         # every sequence is padded/truncated to this many tokens
trunc_type = 'post'     # cut overly long headlines at the end
padding_type = 'post'   # pad short headlines at the end
oov_tok = "<OOV>"       # placeholder token for words outside the vocabulary

# Train-test split, decided on row indices *before* the tokenizer is fitted.
# Fitting the vocabulary on every headline would let words that only occur in
# the test rows influence the encoding (data leakage). With a fixed
# random_state the partition depends only on the number of rows, so these are
# the same rows that splitting the padded array directly would select.
train_idx, test_idx = train_test_split(
    np.arange(len(sentences)), test_size=0.2, random_state=42
)

# Learn the vocabulary from the training headlines only.
tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(sentences[train_idx])

# Encode and pad the full corpus with that training-only vocabulary.
sequences = tokenizer.texts_to_sequences(sentences)
padded = pad_sequences(sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)

X_train, X_test = padded[train_idx], padded[test_idx]
y_train, y_test = labels[train_idx], labels[test_idx]

# ✅ STEP 5: Build LSTM Model
# Layers are stacked one at a time: token embedding -> single LSTM ->
# dense hidden layer with dropout on both sides -> one sigmoid output unit.
model = Sequential()
model.add(Embedding(vocab_size, 64, input_length=max_length))
model.add(LSTM(64, return_sequences=False))
model.add(Dropout(0.3))
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.3))
model.add(Dense(1, activation='sigmoid'))

# Binary cross-entropy pairs with the single sigmoid output.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)
model.summary()

# ✅ STEP 6: Train the Model
# The held-out split is passed as validation data, so per-epoch validation
# metrics are recorded in `history` alongside the training metrics.
history = model.fit(
    x=X_train,
    y=y_train,
    batch_size=128,
    epochs=5,
    validation_data=(X_test, y_test),
)

# ✅ STEP 7: Evaluate the Model
# evaluate() yields the loss followed by the compiled metric (accuracy).
loss, accuracy = model.evaluate(x=X_test, y=y_test)
print(f"\nTest Accuracy: {accuracy*100:.2f}%")

# ✅ STEP 8: Plot Accuracy and Loss
plt.figure(figsize=(12, 4))

# One side-by-side panel per metric:
# (subplot position, history key, panel title, train label, validation label)
for _pos, _metric, _title, _train_label, _val_label in (
    (1, 'accuracy', 'Model Accuracy', 'Train Acc', 'Val Acc'),
    (2, 'loss', 'Model Loss', 'Train Loss', 'Val Loss'),
):
    plt.subplot(1, 2, _pos)
    plt.plot(history.history[_metric], label=_train_label)
    # Validation curves are stored under the same key with a 'val_' prefix.
    plt.plot(history.history['val_' + _metric], label=_val_label)
    plt.title(_title)
    plt.legend()
    plt.grid(True)

plt.show()

# ✅ STEP 9: Test on Custom Headline
def predict_headline(headline):
    """Score a single headline with the trained model and print the verdict.

    The headline is encoded with the module-level ``tokenizer`` and padded
    with the same ``max_length`` / ``padding_type`` / ``trunc_type`` settings
    used for the training data, then passed to ``model.predict``.

    Args:
        headline: Text of the headline to classify.

    Returns:
        None. The headline, its score (two decimals) and the label are printed.
    """
    encoded = pad_sequences(
        tokenizer.texts_to_sequences([headline]),
        maxlen=max_length,
        padding=padding_type,
        truncating=trunc_type,
    )
    # predict() returns one row per input; take the single sigmoid output.
    prediction = model.predict(encoded)[0][0]
    # Scores below 0.5 are reported as genuine, everything else as sarcastic.
    if prediction < 0.5:
        label = "🟢 Genuine"
    else:
        label = "🔴 Sarcastic / Fake"
    print(f"Headline: {headline}\nPrediction Score: {prediction:.2f} → {label}")

# ➕ Example
# Run the classifier on a couple of sample headlines, in order.
for _sample_headline in (
    "Scientists discover water on Mars",
    "World ends tomorrow, women and minorities hardest hit",
):
    predict_headline(_sample_headline)
