In [3]:
!pip install rdflib

import rdflib
import numpy as np
import pandas as pd
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.layers import (
    Concatenate,
    Conv1D,
    Dense,
    Dropout,
    Embedding,
    Flatten,
    Input,
    Lambda,
    MaxPooling1D,
)
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.metrics import accuracy_score, precision_score, confusion_matrix
from sklearn.model_selection import train_test_split

# Load RDF graph
g = rdflib.Graph()
g.parse("/content/populated_ontology.ttl", format="ttl")

# Every class and property read below lives under this ontology IRI prefix.
ONTOLOGY_BASE = "http://www.semanticweb.org/talha/ontologies/2024/3/rumour_detection/"


def _onto(local_name):
    """Return the URIRef for `local_name` inside the rumour-detection ontology."""
    return rdflib.URIRef(ONTOLOGY_BASE + local_name)


def _text_or_empty(node):
    """Convert an RDF node to `str`, mapping a missing node (None) to ''.

    A plain `str(None)` produces the literal text "None", which would be
    tokenized later as if it were a real word of the title / domain.
    """
    return "" if node is None else str(node)


# Scan the graph to extract Article and Source information
articles = {}
skipped_articles = 0
for s in g.subjects(rdflib.RDF.type, _onto("Article")):
    tweet_count_node = g.value(s, _onto("tweetCount"))
    is_real_node = g.value(s, _onto("isReal"))
    if tweet_count_node is None or is_real_node is None:
        # int(None) raises TypeError; an article without a label or a tweet
        # count cannot be used for supervised training, so leave it out.
        skipped_articles += 1
        continue

    source = g.value(s, _onto("publishedBy"))
    source_domain_node = None
    if source is not None:
        source_domain_node = g.value(source, _onto("sourceDomain"))

    articles[s] = {
        'title': _text_or_empty(g.value(s, _onto("title"))),
        'tweet_count': int(tweet_count_node),
        'is_real': int(is_real_node),
        'source_domain': _text_or_empty(source_domain_node),
    }

if skipped_articles:
    print(f"Skipped {skipped_articles} article(s) without tweetCount or isReal")

# Convert dictionary to DataFrame (one row per article, indexed by article IRI)
df = pd.DataFrame.from_dict(articles, orient='index')

# Text data preprocessing
# Fit the vocabulary on BOTH text columns before converting either of them.
# Each fit_on_texts call rebuilds the word -> id mapping from the accumulated
# counts, so converting titles, re-fitting on the domains and only then
# converting the domains would give the two column groups different id
# assignments even though they are looked up in one shared embedding table.
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(df['title'])
tokenizer.fit_on_texts(df['source_domain'])

title_sequences = tokenizer.texts_to_sequences(df['title'])
title_data = pad_sequences(title_sequences, maxlen=100)

source_sequences = tokenizer.texts_to_sequences(df['source_domain'])
source_data = pad_sequences(source_sequences, maxlen=10)

# Normalizing tweet counts (z-score)
tweet_counts_normalized = np.array(df['tweet_count'], dtype=float)
tweet_count_std = np.std(tweet_counts_normalized)
if tweet_count_std == 0:
    # All counts identical: avoid 0/0 -> NaN; the centred column is all zeros.
    tweet_count_std = 1.0
tweet_counts_normalized = (tweet_counts_normalized - np.mean(tweet_counts_normalized)) / tweet_count_std
tweet_counts_normalized = tweet_counts_normalized.reshape(-1, 1)  # Column vector for horizontal stacking

# Preparing data and labels
# Column layout of X: 100 title token ids | 10 source-domain token ids | 1 normalized tweet count.
X = np.hstack((title_data, source_data, tweet_counts_normalized))
y = df['is_real'].values

# Total number of feature columns per sample
input_length = X.shape[1]

# Split data (80 % train / 20 % test, fixed seed for repeatability)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# CNN Model
# X mixes two kinds of columns: integer token ids (title + source domain)
# followed by ONE real-valued column, the z-scored tweet count. Passing the
# whole matrix through an Embedding layer would treat that z-score as a
# vocabulary index (a truncated, possibly negative number). The model
# therefore still accepts the full X matrix - so fit()/predict() callers do
# not change - but splits it internally: token ids go through the
# embedding + convolution branch, while the tweet count bypasses it and joins
# the dense head as a plain numeric feature.
features_in = Input(shape=(X_train.shape[1],), name="features")
token_ids = Lambda(lambda t: t[:, :-1], name="token_ids")(features_in)      # all columns but the last
tweet_count = Lambda(lambda t: t[:, -1:], name="tweet_count")(features_in)  # last column, kept 2-D

# NOTE(review): the ids reach Embedding as floats because X is a float matrix
# (same as before this change) - confirm the layer's integer cast on the
# installed Keras version.
text_branch = Embedding(input_dim=5000, output_dim=64)(token_ids)
text_branch = Conv1D(64, 5, activation='relu')(text_branch)
text_branch = MaxPooling1D(5)(text_branch)
text_branch = Flatten()(text_branch)

merged = Concatenate()([text_branch, tweet_count])
hidden = Dense(64, activation='relu')(merged)
hidden = Dropout(0.5)(hidden)
prediction = Dense(1, activation='sigmoid')(hidden)

model = Model(inputs=features_in, outputs=prediction)

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train model
model.fit(X_train, y_train, epochs=10, batch_size=32)

# Evaluate model
# Threshold the sigmoid output at 0.5 to obtain hard 0/1 predictions.
y_pred = (model.predict(X_test) > 0.5).astype("int32")
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)

# Rows of the matrix are true labels, columns are predicted labels, both in
# the order given by `labels`. Flattening the 2x2 binary matrix therefore
# yields tn, fp, fn, tp - NOT tp, fn, fp, tn.
# labels=[0, 1] keeps the matrix 2x2 even if one class is missing from the
# test split (assumes is_real is encoded as 0/1 - TODO confirm).
conf_matrix = confusion_matrix(y_test, y_pred, labels=[0, 1])
tn, fp, fn, tp = conf_matrix.ravel()

print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"True Positives: {tp}")
print(f"False Negatives: {fn}")
print(f"False Positives: {fp}")
print(f"True Negatives: {tn}")


Collecting rdflib
  Downloading rdflib-7.0.0-py3-none-any.whl (531 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/531.9 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m174.1/531.9 kB[0m [31m5.4 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m531.9/531.9 kB[0m [31m9.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting isodate<0.7.0,>=0.6.0 (from rdflib)
  Downloading isodate-0.6.1-py2.py3-none-any.whl (41 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/41.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m41.7/41.7 kB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: isodate, rdflib
Successfully installed isodate-0.6.1 rdflib-7.0.0
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [11]:
!pip install rdflib
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
import rdflib

# Load RDF graph
g = rdflib.Graph()
g.parse("/content/populated_ontology.ttl", format="ttl")

# IRIs of the class and property read from the ontology
article_class = rdflib.URIRef("http://www.semanticweb.org/talha/ontologies/2024/3/rumour_detection/Article")
title_property = rdflib.URIRef("http://www.semanticweb.org/talha/ontologies/2024/3/rumour_detection/title")

# Collect the title of every Article in the graph, as plain strings.
# NOTE(review): the lookup result is passed straight to str(); if an article
# has no title this presumably adds the text "None" - confirm against the data.
articles = [
    str(g.value(article, title_property))
    for article in g.subjects(rdflib.RDF.type, article_class)
]

# The list of titles is the corpus to vectorize
corpus = articles

# TF-IDF vectorization (English stop words removed)
vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(corpus)

# Apply K-means clustering
num_clusters = 5  # Adjust the number of clusters as needed
# n_init is pinned explicitly: the library default for the number of
# restarts differs between scikit-learn releases, and a different number of
# restarts can yield a different clustering for the same random_state - which
# would invalidate any cluster names assigned by hand from the top terms.
kmeans = KMeans(n_clusters=num_clusters, n_init=10, random_state=42)
kmeans.fit(X)

# Show the ten highest-weighted vocabulary terms of each cluster centroid.
# argsort is ascending, so reversing each row puts the heaviest terms first.
order_centroids = kmeans.cluster_centers_.argsort()[:, ::-1]
terms = vectorizer.get_feature_names_out()
for i in range(num_clusters):
    print(f"Cluster {i}:")
    top_term_indices = order_centroids[i, :10]
    for term in terms[top_term_indices]:
        print(f"{term}")
    print()

# Cluster label of every article, in corpus order
article_clusters = kmeans.predict(X)

# Tally how many articles fall into each cluster
# (keys appear in order of first occurrence in article_clusters)
cluster_counts = {}
for label in article_clusters:
    cluster_counts[label] = cluster_counts.get(label, 0) + 1

# Report the size of each cluster
for label in cluster_counts:
    print(f"Cluster {label}: {cluster_counts[label]} Articles")


Cluster 0:
meghan
markle
prince
harry
royal
wedding
middleton
kate
william
queen

Cluster 1:
2018
awards
2017
music
list
choice
winners
tv
carpet
best

Cluster 2:
season
renewed
premiere
finale
cast
trailer
new
13
episode
little

Cluster 3:
kardashian
kim
khloe
kourtney
west
kanye
thompson
tristan
jenner
baby

Cluster 4:
new
jennifer
justin
jenner
baby
star
brad
says
pitt
selena

Cluster 4: 19331 Articles
Cluster 1: 1121 Articles
Cluster 3: 1159 Articles
Cluster 2: 630 Articles
Cluster 0: 625 Articles


In [12]:
# Human-readable names for the K-means cluster ids, chosen from the top terms
# printed for each cluster. The ids are only meaningful for that particular
# clustering run; re-check the names whenever the clustering is re-fit.
cluster_names = {
    0: "Royal News",                          # meghan, markle, prince, harry, royal, ...
    1: "Entertainment Awards",                # awards, music, winners, carpet, ...
    2: "TV Show and Series",                  # season, renewed, premiere, finale, ...
    3: "Celebrity Gossip (Kardashians)",      # kardashian, kim, khloe, kourtney, ...
    4: "Celebrity Relationships and Gossip",  # jennifer, justin, brad, pitt, selena, ...
}

# Print the article count of each cluster under its new name
for cluster in cluster_names:
    print(f"{cluster_names[cluster]}: {cluster_counts[cluster]} Articles")


Royal News: 625 Articles
Entertainment Awards: 1121 Articles
TV Show and Series: 630 Articles
Celebrity Gossip (Kardashians): 1159 Articles
Celebrity Relationships and Gossip: 19331 Articles
