In [1]:
import pandas as pd
import os
from Book_module.Book import Book
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report
from sklearn.utils import class_weight
from imblearn.over_sampling import SMOTE
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

character_data = []
stories_folder = 'books'
story_files = [f for f in os.listdir(stories_folder) if f.endswith('.txt')]

#Dictionary mapping story filenames to perpetrators
story_perpetrators = {
    'book_bottomless_well.txt': 'Harrison',
    'book_eye_of_apollo.txt': 'Kevin',
    'book_face_in_the_target.txt': 'Jenkins',
    'book_fad_of_the_fisherman.txt': 'Merivale',
    'book_hole_in_the_wall.txt': 'Haddow',
    'book_secret_garden.txt': 'Valentin',
    'book_sign_of_the_broken_sword.txt': 'Arthur',
    'book_soul_of_schoolboy.txt': 'Morty',
    'book_temple_of_silence.txt': 'Verner',
    'book_the_blue_cross.txt': 'Hercule',
    'book_the_flying_stars.txt': 'Hercule',
    'book_the_hammer_of_god.txt': 'Wilfred',
    'book_the_queer_feet.txt': 'Hercule',
    'book_the_sins_of_prince_saradine.txt': 'Saradine',
    'book_the_wrong_shape.txt': 'Harris',
    'book_three_tools_of_death.txt': 'Aaron',
    'book_vanishing_prince.txt': 'Wilson',
    'book_vengeance_of_the_statue.txt': 'Horne'
}

#Process each story and extract features
for story_file in story_files:
    file_path = os.path.join(stories_folder, story_file)
    book = Book(file_path)
    book.pre_process()
    book.feature_extraction()
    
    #Get the perp for this story
    perpetrator = story_perpetrators.get(story_file)
    if perpetrator is None:
        print(f"No perpetrator found for {story_file}. Skipping.")
        continue
    
    for character in book.names:
        #Character-specific features
        mention_count = book.character_mentions_all.get(character, 0)
        first_mention = book.character_mentions_first.get(character, -1)
        sentiment = book.character_sentiments.get(character, 0)
        
        #Rank of the character by mentions
        mentions_sorted = sorted(book.character_mentions_all.items(), key=lambda x: x[1], reverse=True)
        char_rank = next((rank for rank, (name, _) in enumerate(mentions_sorted, 1) if name == character), None)
        
        #Proximity features
        proximity_score = sum(count for (char1, char2), count in book.character_proximity.items() if character in (char1, char2))
        
        #Label 1 if this character is the perp, 0 otherwise
        label = 1 if character == perpetrator else 0

        #Plot phase of first mention
        first_mention_phase = None
        if hasattr(book, 'plot_structure'):
            for phase, (start_idx, end_idx) in book.plot_structure.items():
                if first_mention >= start_idx and first_mention <= end_idx:
                    first_mention_phase = phase
                    break
        if first_mention_phase is None:
            first_mention_phase = 'Unknown'

        #One-hot encode the plot phase
        plot_phases = ['Exposition', 'Rising Action', 'Climax', 'Falling Action', 'Resolution', 'Unknown']
        plot_phase_features = {f'First_Mention_{phase.replace(" ", "_")}': int(first_mention_phase == phase) for phase in plot_phases}

        #Mentions per plot phase
        mentions_per_phase = {phase: 0 for phase in plot_phases}

        #Map sentence indices to plot phases
        sentence_plot_phases = {}
        if hasattr(book, 'plot_structure'):
            for phase, (start_idx, end_idx) in book.plot_structure.items():
                for idx in range(start_idx, end_idx + 1):
                    sentence_plot_phases[idx] = phase
        else:
            for idx in range(len(book.sentences)):
                sentence_plot_phases[idx] = 'Unknown'

        #Count mentions per plot phase
        for idx, sentence in enumerate(book.sentences):
            phase = sentence_plot_phases.get(idx, 'Unknown')
            if character in sentence:
                mentions_per_phase[phase] += 1

        #Add mentions per phase to the features
        total_mentions = sum(mentions_per_phase.values())
        if total_mentions > 0:
            mentions_proportion = {f'Mentions_{phase.replace(" ", "_")}_Proportion': mentions_per_phase[phase] / total_mentions for phase in plot_phases}
        else:
            mentions_proportion = {f'Mentions_{phase.replace(" ", "_")}_Proportion': 0 for phase in plot_phases}
        
        #Prepare the row data
        row = {
            'Story': story_file,
            'Character': character,
            'Label': label,
            'Mention_Count': mention_count,
            'First_Mention': first_mention,
            'Sentiment': sentiment,
            'Rank': char_rank,
            'Proximity_Score': proximity_score,
        }

        #Add the plot phase features
        row.update(plot_phase_features)
        row.update(mentions_proportion)
        
        character_data.append(row)

character_df = pd.DataFrame(character_data)
character_df.fillna(0, inplace=True)

features = [
    'Mention_Count', 'First_Mention', 'Sentiment', 'Rank', 'Proximity_Score',
    'First_Mention_Exposition', 'First_Mention_Rising_Action', 'First_Mention_Climax',
    'First_Mention_Falling_Action', 'First_Mention_Resolution', 'First_Mention_Unknown',
    'Mentions_Exposition_Proportion', 'Mentions_Rising_Action_Proportion',
    'Mentions_Climax_Proportion', 'Mentions_Falling_Action_Proportion',
    'Mentions_Resolution_Proportion', 'Mentions_Unknown_Proportion',
]

#Prepare the dataset
x = character_df[features]
y = character_df['Label']

#Feature scaling
scaler = StandardScaler()
x_scaled = scaler.fit_transform(x)

#Split the data
x_train, x_test, y_train, y_test = train_test_split(x_scaled, y, test_size=0.2, stratify=y, random_state=42)

#Handle class imbalance
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(x_train, y_train)

#Define the NN
model = keras.Sequential([
    layers.Dense(64, activation='relu', input_shape=(X_train_resampled.shape[1],)),
    layers.Dense(32, activation='relu'),
    layers.Dense(1, activation='sigmoid')
])

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

#Calculate class weights
class_weights = class_weight.compute_class_weight(
    class_weight='balanced',
    classes=np.unique(y_train_resampled),
    y=y_train_resampled
)
class_weights = dict(enumerate(class_weights))

#Train the model
history = model.fit(
    X_train_resampled,
    y_train_resampled,
    epochs=50,
    batch_size=32,
    validation_data=(x_test, y_test),
    class_weight=class_weights,
    verbose=1
)

#Evaluate the model
y_pred_prob = model.predict(x_test)
y_pred = (y_pred_prob > 0.5).astype(int).flatten()

print(classification_report(y_test, y_pred))


2024-10-28 07:31:35.519248: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-10-28 07:31:35.653526: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-10-28 07:31:36.483242: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-10-28 07:31:37.084114: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1730115097.452535   25373 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1730115097.53

Epoch 1/50
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 112ms/step - accuracy: 0.6408 - loss: 0.6489 - val_accuracy: 0.4783 - val_loss: 0.7034
Epoch 2/50
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step - accuracy: 0.6942 - loss: 0.5966 - val_accuracy: 0.5652 - val_loss: 0.7000
Epoch 3/50
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step - accuracy: 0.7547 - loss: 0.5496 - val_accuracy: 0.5652 - val_loss: 0.6984
Epoch 4/50
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step - accuracy: 0.8239 - loss: 0.5012 - val_accuracy: 0.6087 - val_loss: 0.6925
Epoch 5/50
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step - accuracy: 0.8373 - loss: 0.4785 - val_accuracy: 0.6522 - val_loss: 0.6751
Epoch 6/50
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step - accuracy: 0.8238 - loss: 0.4776 - val_accuracy: 0.6522 - val_loss: 0.6580
Epoch 7/50
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━