# Import Necessary Libraries

In [1]:
import pandas as pd
import re
import numpy as np

In [2]:
import nltk
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

In [3]:
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Vineeth\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

# Import Data

In [4]:
data = pd.read_csv("inputData.csv")

In [5]:
print(f"Shape of the data {data.shape}\nColumns Present in data {data.columns}")

Shape of the data (20800, 5)
Columns Present in data Index(['id', 'title', 'author', 'text', 'label'], dtype='object')


# Some Preprocessing

In [6]:
data.isnull().sum()

id           0
title      558
author    1957
text        39
label        0
dtype: int64

In [7]:
data = data.dropna() # Drop nan values from data

In [8]:
x = data.drop(['label'],axis=1)
y = data['label']

In [9]:
x['text']

0        House Dem Aide: We Didn’t Even See Comey’s Let...
1        Ever get the feeling your life circles the rou...
2        Why the Truth Might Get You Fired October 29, ...
3        Videos 15 Civilians Killed In Single US Airstr...
4        Print \nAn Iranian woman has been sentenced to...
                               ...                        
20795    Rapper T. I. unloaded on black celebrities who...
20796    When the Green Bay Packers lost to the Washing...
20797    The Macy’s of today grew from the union of sev...
20798    NATO, Russia To Hold Parallel Exercises In Bal...
20799      David Swanson is an author, activist, journa...
Name: text, Length: 18285, dtype: object

In [10]:
sentences = x.copy()
sentences.reset_index(inplace=True)

In [11]:
stemmer = PorterStemmer()

In [12]:
nltk.download("punkt_tab")

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\Vineeth\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [15]:
print(sentences['title'][1003])

N.C. Trooper Investigated in Fatal Shooting of Deaf Motorist - The New York Times


In [27]:
from tqdm import tqdm
stop_words = set(stopwords.words("english"))
def clean_text_generator(sentences):
    for sentence in tqdm(sentences, desc="Number of Sentences Processed:"):
        cleaned_sentence = ""
        sentence = re.sub("[^a-zA-Z]", " ", sentence)
        sentence = sentence.lower()
        words = nltk.word_tokenize(sentence)
        for word in words:
            if word not in stop_words:
                cleaned_sentence += " " + stemmer.stem(word)
        yield cleaned_sentence.strip()
# Use a generator
corpus = list(clean_text_generator(sentences['title']))

Number of Sentences Processed:: 100%|██████████| 18285/18285 [00:03<00:00, 5087.62it/s]


In [28]:
corpus

['hous dem aid even see comey letter jason chaffetz tweet',
 'flynn hillari clinton big woman campu breitbart',
 'truth might get fire',
 'civilian kill singl us airstrik identifi',
 'iranian woman jail fiction unpublish stori woman stone death adulteri',
 'jacki mason hollywood would love trump bomb north korea lack tran bathroom exclus video breitbart',
 'beno hamon win french socialist parti presidenti nomin new york time',
 'back channel plan ukrain russia courtesi trump associ new york time',
 'obama organ action partner soro link indivis disrupt trump agenda',
 'bbc comedi sketch real housew isi caus outrag',
 'russian research discov secret nazi militari base treasur hunter arctic photo',
 'us offici see link trump russia',
 'ye paid govern troll social media blog forum websit',
 'major leagu soccer argentin find home success new york time',
 'well fargo chief abruptli step new york time',
 'anonym donor pay million releas everyon arrest dakota access pipelin',
 'fbi close hilla

# Model Building

In [29]:
from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Dense

In [30]:
voc_size=5000

In [31]:
onehot_repr=[one_hot(words,voc_size)for words in corpus] 
onehot_repr

[[3292, 2335, 2153, 303, 2537, 620, 2117, 3513, 631, 3913],
 [2932, 1930, 406, 925, 3814, 3837, 3188],
 [2882, 678, 532, 1464],
 [4204, 3455, 1492, 1679, 2188, 1384],
 [3659, 3814, 2349, 511, 3662, 2938, 3814, 3483, 2520, 4082],
 [3588,
  3463,
  4346,
  4890,
  843,
  2239,
  4294,
  3891,
  279,
  3080,
  3736,
  4157,
  3646,
  249,
  3188],
 [3636, 1416, 3349, 3662, 1172, 3781, 3236, 4969, 2463, 1174, 2143],
 [4782, 2797, 783, 3949, 4250, 4679, 2239, 1783, 2463, 1174, 2143],
 [2831, 1559, 1740, 3675, 3406, 2374, 3542, 4363, 2239, 1168],
 [2146, 2306, 466, 3705, 3798, 1847, 3844, 2964],
 [2548, 252, 272, 1228, 3393, 13, 2402, 4111, 3483, 4288, 2597],
 [1679, 4725, 2537, 2374, 2239, 4250],
 [3544, 3613, 470, 4982, 988, 4755, 3185, 1399, 3375],
 [3272, 4037, 3428, 321, 4650, 971, 2335, 2463, 1174, 2143],
 [1405, 288, 4906, 3611, 237, 2463, 1174, 2143],
 [1458, 4218, 3952, 2034, 1216, 2132, 2015, 4586, 1717, 1952],
 [3212, 408, 1930],
 [1220, 2218, 4962, 491, 2239, 2238, 4980, 3188],
 

In [None]:

sent_length = max([len(seq) for seq in onehot_repr])
print(sent_length)
embedded_docs=pad_sequences(onehot_repr,padding='pre',maxlen=sent_length)
print(embedded_docs)

47
[[   0    0    0 ... 3513  631 3913]
 [   0    0    0 ... 3814 3837 3188]
 [   0    0    0 ...  678  532 1464]
 ...
 [   0    0    0 ... 2463 1174 2143]
 [   0    0    0 ... 4500  840 2706]
 [   0    0    0 ... 1633 1850 1056]]


In [63]:
print(len(embedded_docs))

18285


In [75]:
## Creating model
embedding_vector_features=40
model=Sequential()
model.add(Embedding(voc_size,embedding_vector_features,input_length=sent_length))
model.add(LSTM(100,return_sequences=True))
model.add(LSTM(50))
model.add(Dense(1,activation='sigmoid'))
model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])
model.build(input_shape=(None, sent_length))  # Batch size is None for flexibility
print(model.summary())



None


In [76]:
import numpy as np
X_final=np.array(embedded_docs)
y_final=np.array(y)

In [66]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_final, y_final, test_size=0.20, random_state=42)

In [67]:
len(X_train)

14628

In [77]:
### Finally Training
model.fit(X_train,y_train,validation_data=(X_test,y_test),epochs=10,batch_size=64)

Epoch 1/10
[1m229/229[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 24ms/step - accuracy: 0.7885 - loss: 0.4180 - val_accuracy: 0.9103 - val_loss: 0.2044
Epoch 2/10
[1m229/229[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 23ms/step - accuracy: 0.9470 - loss: 0.1368 - val_accuracy: 0.9150 - val_loss: 0.2056
Epoch 3/10
[1m229/229[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 24ms/step - accuracy: 0.9652 - loss: 0.1000 - val_accuracy: 0.9185 - val_loss: 0.2157
Epoch 4/10
[1m229/229[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 23ms/step - accuracy: 0.9781 - loss: 0.0673 - val_accuracy: 0.9161 - val_loss: 0.2691
Epoch 5/10
[1m229/229[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 25ms/step - accuracy: 0.9854 - loss: 0.0480 - val_accuracy: 0.9130 - val_loss: 0.2886
Epoch 6/10
[1m229/229[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 26ms/step - accuracy: 0.9916 - loss: 0.0309 - val_accuracy: 0.9098 - val_loss: 0.3243
Epoch 7/10
[1m229/229

<keras.src.callbacks.history.History at 0x20dcab61070>

In [78]:
from sklearn.metrics import confusion_matrix
y_pred_prob = model.predict(X_test)  # Returns probabilities

# Convert probabilities to class labels
y_pred = (y_pred_prob > 0.5).astype(int)  # Shape: (n_samples,)

# Compute confusion matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)
print(cm)


[1m115/115[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 9ms/step
[[1878  204]
 [ 129 1446]]


In [80]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test,y_pred)

0.9089417555373257