In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.metrics import confusion_matrix, accuracy_score,precision_score
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D,GlobalMaxPooling1D, Dense
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Load the dataset
file_path = 'FakeNewsNet.csv'
df = pd.read_csv(file_path)
# Data Preprocessing
df = df.dropna()
X = df['title'].values
y = df['real'].values
# K-fold cross-validation
kf = KFold(n_splits=5, shuffle=True, random_state=42)
for fold, (train_idx, test_idx) in enumerate(kf.split(X)):
  print(f"Fold {fold + 1}")
  # Split the dataset into train and test sets for this fold
  X_train, X_test = X[train_idx], X[test_idx]
  y_train, y_test = y[train_idx], y[test_idx]
  # Tokenize and pad text data for training
  tokenizer = Tokenizer(num_words=5000, oov_token='<OOV>')
  tokenizer.fit_on_texts(X_train)
  X_train_sequences = tokenizer.texts_to_sequences(X_train)
  X_train_padded = pad_sequences(X_train_sequences, maxlen=50,
  padding='post', truncating='post')
  # Define and compile the model
  model = Sequential()
  model.add(Embedding(input_dim=5000, output_dim=16,
  input_length=50))
  model.add(Conv1D(128, 5, activation='relu'))
  model.add(GlobalMaxPooling1D())
  model.add(Dense(1, activation='sigmoid'))
  model.compile(optimizer='adam', loss='binary_crossentropy',
  metrics=['accuracy'])
  # Train the model on the current fold
  model.fit(X_train_padded, y_train, epochs=5, batch_size=64,
  verbose=1)
  # Tokenize and pad text data for testing
  X_test_sequences = tokenizer.texts_to_sequences(X_test)
  X_test_padded = pad_sequences(X_test_sequences, maxlen=50,
  padding='post', truncating='post')
  # Predictions and Confusion Matrix
  y_pred = (model.predict(X_test_padded) > 0.5).astype('int')
  conf_matrix = confusion_matrix(y_test, y_pred)
  # Extract values from the confusion matrix
  tn, fp, fn, tp = conf_matrix.ravel()
  # Print True Positives, True Negatives, False Positives, and False Negatives
  print(f'True Positives: {tp}')
  print(f'True Negatives: {tn}')
  print(f'False Positives: {fp}')
  print(f'False Negatives: {fn}')
  # Calculate and print evaluation metrics
  accuracy = accuracy_score(y_test, y_pred)
  precision = precision_score(y_test, y_pred)
  print(f'Accuracy: {accuracy * 100:.2f}%')
  print(f'Precision: {precision * 100:.2f}%')
  print('FINISH')


Fold 1
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
True Positives: 3139
True Negatives: 633
False Positives: 472
False Negatives: 330
Accuracy: 82.47%
Precision: 86.93%
FINISH
Fold 2
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
True Positives: 3162
True Negatives: 646
False Positives: 493
False Negatives: 272
Accuracy: 83.27%
Precision: 86.51%
FINISH
Fold 3
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
True Positives: 3176
True Negatives: 627
False Positives: 442
False Negatives: 328
Accuracy: 83.16%
Precision: 87.78%
FINISH
Fold 4
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
True Positives: 3184
True Negatives: 620
False Positives: 458
False Negatives: 311
Accuracy: 83.18%
Precision: 87.42%
FINISH
Fold 5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
True Positives: 3156
True Negatives: 617
False Positives: 487
False Negatives: 313
Accuracy: 82.51%
Precision: 86.63%
FINISH


In [5]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans

# Load the dataset
file_path = 'FakeNewsNet.csv'
df = pd.read_csv(file_path)

# Data Preprocessing
df = df.dropna()
titles = df['title'].values

# TF-IDF vectorization
vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(titles)

# Apply K-means clustering
num_clusters = 5  # Adjust the number of clusters as needed
kmeans = KMeans(n_clusters=num_clusters, random_state=42)
kmeans.fit(X)

# Print the top terms per cluster
order_centroids = kmeans.cluster_centers_.argsort()[:, ::-1]
terms = vectorizer.get_feature_names_out()
for i in range(num_clusters):
    print(f"Cluster {i}:")
    for ind in order_centroids[i, :10]:  # Print top 10 terms per cluster
        print(f"{terms[ind]}")
    print()

# Assign each article to a cluster
article_clusters = kmeans.predict(X)

# Count the number of articles in each cluster
cluster_counts = {}
for cluster in article_clusters:
    if cluster not in cluster_counts:
        cluster_counts[cluster] = 1
    else:
        cluster_counts[cluster] += 1

# Print the count of articles in each cluster
for cluster, count in cluster_counts.items():
    print(f"Cluster {cluster}: {count} Articles")

# Rename clusters
cluster_names = {
    0: "Royal News",
    1: "Entertainment Awards",
    2: "TV Show and Series",
    3: "Celebrity Gossip (Kardashians)",
    4: "Celebrity Relationships and Gossip"
}

# Print the renamed clusters
for cluster, name in cluster_names.items():
    print(f"{name}: {cluster_counts[cluster]} Articles")




Cluster 0:
new
2018
season
awards
jenner
jennifer
star
baby
says
best

Cluster 1:
kardashian
kim
khloe
kourtney
west
kanye
thompson
tristan
jenner
baby

Cluster 2:
selena
gomez
bieber
justin
weeknd
relationship
timeline
complete
hailey
theroux

Cluster 3:
brad
pitt
angelina
jolie
aniston
jennifer
divorce
kids
dating
custody

Cluster 4:
meghan
markle
prince
harry
royal
wedding
middleton
kate
william
queen

Cluster 0: 20101 Articles
Cluster 3: 524 Articles
Cluster 4: 625 Articles
Cluster 1: 1165 Articles
Cluster 2: 451 Articles
Royal News: 20101 Articles
Entertainment Awards: 1165 Articles
TV Show and Series: 451 Articles
Celebrity Gossip (Kardashians): 524 Articles
Celebrity Relationships and Gossip: 625 Articles
