# PREDICT VALUE FOR MISSING QUESTION TOPIC

## Import Libraries

In [1]:
import os
os.environ['TF_MIN_GPU_MULTIPROCESSOR_COUNT'] = '6' # Needed so I can use my old GPU with the new one
os.environ['TF_ENABLE_ONEDNN_OPTS'] = '0' # Turns off oneDNN custom operations
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '1' # Hides message regarding TensorFlow optimization
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split

2025-11-20 15:31:04.965149: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1763613065.031574     555 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1763613065.051774     555 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1763613065.421332     555 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1763613065.421377     555 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1763613065.421380     555 computation_placer.cc:177] computation placer alr

## Import CSV File & Remove Duplicates
NB:

'question_topic_valid.csv' represents records from the original dataset where 'question_topic' was not empty while 'question_topic_null.csv' are records with missing values for 'question_topic'.

In [2]:
# Import patitioned libraries of original dataset 
df_topic_exists = pd.read_csv('../data/question_topic_valid.csv', usecols=[2,3,4,13,14]) # Import only essential columns
df_topic_null = pd.read_csv('../data/question_topic_null.csv')

# Drops duplicate values for 'question_content' that were created in the original dataset because of the 'response' columns
df_topic_exists.drop_duplicates(inplace=True)
df_topic_exists.reset_index(drop=True, inplace=True)

## Tokenize The Questions

In [3]:
def list_of_sentences(col):
    sentence_list = []
    for text in col:
        splitted_text = text.lower().split()
        sentence_list.append(splitted_text)
    return sentence_list

sentences_topic_exist = list_of_sentences(df_topic_exists.question_content)
sentences_topic_null = list_of_sentences(df_topic_null.question_content)

# init the tokenizer with a out_of_vocabulary token 
tokenizer_X = Tokenizer(oov_token="<OOV>")

# generate word indexes for all sentences 
tokenizer_X.fit_on_texts(sentences_topic_exist+sentences_topic_null)

# generate separate sequences for both with topic values and missing values
X = tokenizer_X.texts_to_sequences(sentences_topic_exist)
X_topic_null = tokenizer_X.texts_to_sequences(sentences_topic_null)

## Determine Word Counts & Maximum Sentence Length

In [4]:
print(f'The total number of words from all questions is {len(tokenizer_X.word_counts)}.')

max_len = 0
for l in X + X_topic_null:
    if len(l) > max_len:
        max_len = len(l)

print(f'The highest number of words in any sentence is {max_len}.')


The total number of words from all questions is 1292953.
The highest number of words in any sentence is 197.


In [5]:
max_features = 20000     # Use 20000 most frequent words from the total of 1292953 words

## Create Train & Test Datasets & Prepare For Model

In [6]:
# Split data into training and test sets
X_train_df, X_test_df, y_train_df, y_test_df = train_test_split(X, df_topic_exists.question_topic, test_size=0.2,
                                                                stratify=df_topic_exists.question_topic, random_state=42)

# Format X and y for model
X_train = np.array(sequence.pad_sequences(X_train_df, maxlen=max_len))
X_test = np.array(sequence.pad_sequences(X_test_df, maxlen=max_len))

y_train_one_hot = pd.get_dummies(y_train_df)
y_train = y_train_one_hot.to_numpy()
y_test_one_hot = pd.get_dummies(y_test_df)
y_test = y_test_one_hot.to_numpy()

## Configure Model

In [7]:
# Create transformer block class
class TransformerBlock(layers.Layer):
    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1):
        super(TransformerBlock, self).__init__()
        self.att = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.ffn = tf.keras.Sequential([
            layers.Dense(ff_dim, activation='relu'),
            layers.Dense(embed_dim)
        ])
        self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = layers.Dropout(rate)
        self.dropout2 = layers.Dropout(rate)

    def call(self, inputs, training=None):
        attn_output = self.att(inputs, inputs)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(inputs + attn_output)
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        return self.layernorm2(out1 + ffn_output)

In [8]:
# Define the model with an embedding layer, transformer block, and output layer
embed_dim = 32 # embedding dimension for each word vector
num_heads = 2  # the number of attention heads in the multi-head attention layer
ff_dim = 32    # number of units in the feed forward layer

inputs = layers.Input(shape=(max_len,))

embedding_layer = layers.Embedding(input_dim=max_features, output_dim=embed_dim)
out = embedding_layer(inputs)
transformer_block = TransformerBlock(embed_dim, num_heads, ff_dim)
out = transformer_block(out, training=True)
out = layers.GlobalAveragePooling1D()(out)
out = layers.Dropout(0.1)(out)
out = layers.Dense(20, activation='relu')(out)
out = layers.Dropout(0.1)(out)
outputs = layers.Dense(148, activation='softmax')(out)

model = tf.keras.Model(inputs=inputs, outputs=outputs)

I0000 00:00:1763613477.288486     555 gpu_device.cc:2019] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 5518 MB memory:  -> device: 0, name: NVIDIA GeForce RTX 4060 Ti, pci bus id: 0000:01:00.0, compute capability: 8.9
I0000 00:00:1763613477.291889     555 gpu_device.cc:2019] Created device /job:localhost/replica:0/task:0/device:GPU:1 with 2857 MB memory:  -> device: 1, name: NVIDIA GeForce GTX 1050 Ti, pci bus id: 0000:07:00.0, compute capability: 6.1


## Compile & Train Model

In [9]:
# Compile model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Train model
model.fit(X_train, y_train, epochs=3, batch_size=1024, validation_split=0.2)

2025-11-20 15:38:02.293059: W external/local_xla/xla/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 2318536340 exceeds 10% of free system memory.
2025-11-20 15:38:05.934049: W external/local_xla/xla/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 435461140 exceeds 10% of free system memory.
2025-11-20 15:38:07.584169: W external/local_xla/xla/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 435461140 exceeds 10% of free system memory.


Epoch 1/3


I0000 00:00:1763613498.396410     831 service.cc:152] XLA service 0x72892c009b30 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1763613498.406317     831 service.cc:160]   StreamExecutor device (0): NVIDIA GeForce RTX 4060 Ti, Compute Capability 8.9
I0000 00:00:1763613498.406343     831 service.cc:160]   StreamExecutor device (1): NVIDIA GeForce GTX 1050 Ti, Compute Capability 6.1
I0000 00:00:1763613500.283499     831 cuda_dnn.cc:529] Loaded cuDNN version 91002
I0000 00:00:1763613520.246825     831 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


[1m2874/2874[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 46ms/step - accuracy: 0.5738 - loss: 1.7989

2025-11-20 15:40:52.506728: W external/local_xla/xla/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 579634676 exceeds 10% of free system memory.


[1m2874/2874[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m183s[0m 53ms/step - accuracy: 0.7362 - loss: 1.0314 - val_accuracy: 0.8242 - val_loss: 0.5198
Epoch 2/3
[1m2874/2874[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m122s[0m 42ms/step - accuracy: 0.8166 - loss: 0.5726 - val_accuracy: 0.8257 - val_loss: 0.4836
Epoch 3/3
[1m2874/2874[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m124s[0m 43ms/step - accuracy: 0.8194 - loss: 0.5316 - val_accuracy: 0.8263 - val_loss: 0.4615


<keras.src.callbacks.history.History at 0x728d8a6ec350>

## Evaluate Model

In [10]:
test_loss, test_acc = model.evaluate(X_test, y_test)
print("Test Accuracy:", test_acc)

2025-11-20 15:45:27.078871: W external/local_xla/xla/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 724543148 exceeds 10% of free system memory.


[1m28734/28734[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m164s[0m 6ms/step - accuracy: 0.8271 - loss: 0.4597
Test Accuracy: 0.8271299600601196


## Extract Failed Predictions

In [11]:
# Store in a list the column names for one-hot encoding (question_topic)
one_hot_columns = list(y_test_one_hot.columns)

# Store predictions for X_test
y_pred = model.predict(X_test)

# Add predictions column to y_test_df
y_test_df = y_test_df.to_frame()
y_test_df['predictions'] = [one_hot_columns[i] for i in np.argmax(y_pred, axis=1)]

# Merge index associated rows from the original source dataset along with the predictions 
test_df = pd.merge(df_topic_exists, y_test_df, left_index=True, right_index=True)

# Create new dataframe that stores rows from test df where predictions were incorrect plus adds the predictions column
false_predictions = pd.DataFrame()
for i,v in test_df.iterrows():
    if v.question_topic_x != v.predictions:
        row = pd.DataFrame({'question_language' : [v.question_language], 'question_content' : [v.question_content],
                            'question_user_status' : [v.question_user_status], 'question_user_country_code' : [v.question_user_country_code],
                            'question_topic': [v.question_topic_x], 'predictions' : [v.predictions]
                            })
        false_predictions = pd.concat([false_predictions, row], ignore_index=True)

[1m28734/28734[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m116s[0m 4ms/step


# Export Test df For Predictions Versus Actual Analysis

In [12]:
test_df.to_csv('../data/prediction_vs_actual_topic.csv', index=False)

## Check For Any Indicators For Failure Rate

In [13]:
print(f'The test data % failure rate by {(false_predictions.question_language.value_counts() / test_df.question_language.value_counts()) * 100}\n')
print(f'The test data % failure rate by {(false_predictions.question_user_country_code.value_counts() / test_df.question_user_country_code.value_counts()) * 100}\n')
print(f'The test data % failure rate by {(false_predictions.question_user_status.value_counts() / test_df.question_user_status.value_counts()) * 100}')

The test data % failure rate by question_language
eng    18.167660
swa    16.958074
nyn    11.906100
lug    15.306250
Name: count, dtype: float64

The test data % failure rate by question_user_country_code
ke    19.857620
ug    16.139403
tz    13.898236
gb    23.529412
Name: count, dtype: float64

The test data % failure rate by question_user_status
live         16.862772
zombie       18.568636
destroyed    18.450623
blocked      16.258896
Name: count, dtype: float64


In [14]:
question_topic_failed = false_predictions.question_topic.value_counts().rename_axis('question_topic').reset_index(name='failed_prediction')
question_topic_total = test_df.question_topic_x.value_counts().rename_axis('question_topic').reset_index(name='total')
question_topic = pd.merge(question_topic_failed, question_topic_total, how='inner')
question_topic['percentage_failed'] = (question_topic['failed_prediction'] / question_topic['total']) * 100
question_topic = question_topic.sort_values(by=['percentage_failed'],ascending=False).reset_index(drop=True)

print(f'The top 60 failure rates \n {question_topic.head(60)}\n')
print(f'The bottom 60 failure rates \n {question_topic.tail(60)}')

The top 60 failure rates 
       question_topic  failed_prediction  total  percentage_failed
0          courgette                 56     56         100.000000
1        castor-bean                  7      7         100.000000
2              vetch                  7      7         100.000000
3       purple-vetch                  2      2         100.000000
4          caliandra                 13     13         100.000000
5          asparagus                 12     12         100.000000
6             celery                 11     11         100.000000
7           snap-pea                 10     10         100.000000
8            apricot                  8      8         100.000000
9               leek                 17     17         100.000000
10          chickpea                 26     26         100.000000
11        gooseberry                 30     30         100.000000
12           setaria                  5      5         100.000000
13          leucaena                  6      6   

# Free Memory For Next Step

NB: Optional step if system resources are limited

In [15]:
# %xdel sentences_topic_exist
# %xdel X 
# %xdel X_train_df
# %xdel X_test_df
# %xdel y_train_df
# %xdel y_test_df
# %xdel X_train
# %xdel X_test
# %xdel y_train_one_hot
# %xdel y_train
# %xdel y_test_one_hot
# %xdel y_test
# %xdel y_pred
# %xdel false_predictions
# %xdel df_topic_exists

## Make Predictions For Missing question_topic Values

In [16]:
# Create X input and make predictions
X_topic_null_predict = np.array(sequence.pad_sequences(X_topic_null, maxlen=max_len))
y_pred_topic_null = model.predict(X_topic_null_predict)

# Convert predictions to labels
topic_null_predictions = [one_hot_columns[i] for i in np.argmax(y_pred_topic_null, axis=1)]

# Insert predictions into 'question_topic' column for null dataframe
df_topic_null['question_topic'] = topic_null_predictions

[1m110555/110555[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m409s[0m 4ms/step


# Free Memory For Next Step

NB: Optional step if system resources are limited

In [17]:
# %xdel X_topic_null_predict
# %xdel y_pred_topic_null
# %xdel topic_null_predictions

# Export To CSV File

In [18]:
# Import datset without missing values for 'question_topic'
chunks = pd.read_csv('../data/question_topic_valid.csv',
                     dtype={'question_user_gender': str, 'response_user_gender': str}, # Removes mixed dtypes error message
                     chunksize=100000
                    )

df_topic_exists = pd.DataFrame()
for chunk in chunks:
    df_topic_exists = pd.concat([df_topic_exists,chunk], axis=0)


# Combine dataset without missing values with the predicted values to recreate the full dataset
df_no_missing = pd.concat([df_topic_exists, df_topic_null], axis=0)


# Export the predicted values only and the full dataset now with no missing values
df_topic_null.to_csv('../data/question_topic_predicted.csv', index=False)
df_no_missing.to_csv('../data/question_topic_no_missing.csv', index=False)