In [48]:
import rdflib
import numpy as np
import pandas as pd
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, MaxPooling1D, Dense, Flatten, Dropout
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.metrics import accuracy_score, precision_score, confusion_matrix
from sklearn.model_selection import train_test_split

# Load RDF graph
g = rdflib.Graph()
g.parse("/content/populated_ontology.ttl", format="ttl")

# Scan the graph to extract Article and Source information
articles = {}
for s in g.subjects(rdflib.RDF.type, rdflib.URIRef("http://www.semanticweb.org/talha/ontologies/2024/3/rumour_detection/Article")):
    title = str(g.value(s, rdflib.URIRef("http://www.semanticweb.org/talha/ontologies/2024/3/rumour_detection/title")))
    tweet_count = int(g.value(s, rdflib.URIRef("http://www.semanticweb.org/talha/ontologies/2024/3/rumour_detection/tweetCount")))
    is_real = int(g.value(s, rdflib.URIRef("http://www.semanticweb.org/talha/ontologies/2024/3/rumour_detection/isReal")))
    source = g.value(s, rdflib.URIRef("http://www.semanticweb.org/talha/ontologies/2024/3/rumour_detection/publishedBy"))
    source_domain = str(g.value(source, rdflib.URIRef("http://www.semanticweb.org/talha/ontologies/2024/3/rumour_detection/sourceDomain")))

    articles[s] = {'title': title, 'tweet_count': tweet_count, 'is_real': is_real, 'source_domain': source_domain}

# Convert dictionary to DataFrame
df = pd.DataFrame.from_dict(articles, orient='index')

# Text data preprocessing
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(df['title'])
title_sequences = tokenizer.texts_to_sequences(df['title'])
title_data = pad_sequences(title_sequences, maxlen=100)

tokenizer.fit_on_texts(df['source_domain'])
source_sequences = tokenizer.texts_to_sequences(df['source_domain'])
source_data = pad_sequences(source_sequences, maxlen=10)

# Normalizing tweet counts
tweet_counts_normalized = np.array(df['tweet_count'])
tweet_counts_normalized = (tweet_counts_normalized - np.mean(tweet_counts_normalized)) / np.std(tweet_counts_normalized)
tweet_counts_normalized = tweet_counts_normalized.reshape(-1, 1)  # Reshape for horizontal stacking

# Preparing data and labels
X = np.hstack((title_data, source_data, tweet_counts_normalized))
y = df['is_real'].values

# Check the total length of input features
input_length = X.shape[1]

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# CNN Model
model = Sequential([
    Embedding(input_dim=5000, output_dim=64, input_length=input_length),  # Correct input_length based on actual data width
    Conv1D(64, 5, activation='relu'),
    MaxPooling1D(5),
    Flatten(),
    Dense(64, activation='relu'),
    Dropout(0.5),
    Dense(1, activation='sigmoid')
])

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train model
model.fit(X_train, y_train, epochs=10, batch_size=32)

# Evaluate model
y_pred = (model.predict(X_test) > 0.5).astype("int32")
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
tp, fn, fp, tn = conf_matrix.ravel()

print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"True Positives: {tp}")
print(f"False Negatives: {fn}")
print(f"False Positives: {fp}")
print(f"True Negatives: {tn}")


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Accuracy: 0.830126803672934
Precision: 0.8960235640648012
True Positives: 755
False Negatives: 353
False Positives: 424
True Negatives: 3042
