In [2]:
# Import necessary libraries
import pandas as pd
import os
from Book_module.Book import Book
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report
from sklearn.utils import class_weight
from imblearn.over_sampling import SMOTE
import numpy as np

# Import TensorFlow and Keras
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

# Initialize an empty list to store character data
character_data = []

# Path to the folder containing your stories
stories_folder = 'books'

# List all story files
story_files = [f for f in os.listdir(stories_folder) if f.endswith('.txt')]

# Dictionary mapping story filenames to perpetrators
story_perpetrators = {
    'book_bottomless_well.txt': 'Harrison',
    'book_eye_of_apollo.txt': 'Kevin',
    'book_face_in_the_target.txt': 'Jenkins',
    'book_fad_of_the_fisherman.txt': 'Merivale',
    'book_hole_in_the_wall.txt': 'Haddow',
    'book_secret_garden.txt': 'Valentin',
    'book_sign_of_the_broken_sword.txt': 'Arthur',
    'book_soul_of_schoolboy.txt': 'Morty',
    'book_temple_of_silence.txt': 'Verner',
    'book_the_blue_cross.txt': 'Hercule',
    'book_the_flying_stars.txt': 'Hercule',
    'book_the_hammer_of_god.txt': 'Wilfred',
    'book_the_queer_feet.txt': 'Hercule',
    'book_the_sins_of_prince_saradine.txt': 'Saradine',
    'book_the_wrong_shape.txt': 'Harris',
    'book_three_tools_of_death.txt': 'Aaron',
    'book_vanishing_prince.txt': 'Wilson',
    'book_vengeance_of_the_statue.txt': 'Horne'
}

# Process each story and extract features
for story_file in story_files:
    file_path = os.path.join(stories_folder, story_file)
    book = Book(file_path)
    book.pre_process()
    book.feature_extraction()
    
    # Get the perpetrator for this story
    perpetrator = story_perpetrators.get(story_file)
    if perpetrator is None:
        print(f"No perpetrator found for {story_file}. Skipping.")
        continue
    
    for character in book.names:
        # Character-specific features
        mention_count = book.character_mentions_all.get(character, 0)
        first_mention = book.character_mentions_first.get(character, -1)
        sentiment = book.character_sentiments.get(character, 0)
        
        # Rank of the character by mentions
        mentions_sorted = sorted(book.character_mentions_all.items(), key=lambda x: x[1], reverse=True)
        char_rank = next((rank for rank, (name, _) in enumerate(mentions_sorted, 1) if name == character), None)
        
        # Proximity features
        proximity_score = sum(count for (char1, char2), count in book.character_proximity.items() if character in (char1, char2))
        
        # Label: 1 if this character is the perpetrator, 0 otherwise
        label = 1 if character == perpetrator else 0

        # New Feature: Plot Phase of First Mention
        first_mention_phase = None
        if hasattr(book, 'plot_structure'):
            for phase, (start_idx, end_idx) in book.plot_structure.items():
                if first_mention >= start_idx and first_mention <= end_idx:
                    first_mention_phase = phase
                    break
        if first_mention_phase is None:
            first_mention_phase = 'Unknown'

        # One-hot encode the plot phase
        plot_phases = ['Exposition', 'Rising Action', 'Climax', 'Falling Action', 'Resolution', 'Unknown']
        plot_phase_features = {f'First_Mention_{phase.replace(" ", "_")}': int(first_mention_phase == phase) for phase in plot_phases}

        # New Feature: Mentions per Plot Phase
        mentions_per_phase = {phase: 0 for phase in plot_phases}

        # Map sentence indices to plot phases
        sentence_plot_phases = {}
        if hasattr(book, 'plot_structure'):
            for phase, (start_idx, end_idx) in book.plot_structure.items():
                for idx in range(start_idx, end_idx + 1):
                    sentence_plot_phases[idx] = phase
        else:
            for idx in range(len(book.sentences)):
                sentence_plot_phases[idx] = 'Unknown'

        # Count mentions per plot phase
        for idx, sentence in enumerate(book.sentences):
            phase = sentence_plot_phases.get(idx, 'Unknown')
            if character in sentence:
                mentions_per_phase[phase] += 1

        # Add mentions per phase to the features
        total_mentions = sum(mentions_per_phase.values())
        if total_mentions > 0:
            mentions_proportion = {f'Mentions_{phase.replace(" ", "_")}_Proportion': mentions_per_phase[phase] / total_mentions for phase in plot_phases}
        else:
            mentions_proportion = {f'Mentions_{phase.replace(" ", "_")}_Proportion': 0 for phase in plot_phases}
        
        # Prepare the row data
        row = {
            'Story': story_file,
            'Character': character,
            'Label': label,
            'Mention_Count': mention_count,
            'First_Mention': first_mention,
            'Sentiment': sentiment,
            'Rank': char_rank,
            'Proximity_Score': proximity_score,
        }

        # Add the plot phase features
        row.update(plot_phase_features)
        row.update(mentions_proportion)
        
        character_data.append(row)

# Create the DataFrame
character_df = pd.DataFrame(character_data)

# Handle any missing values
character_df.fillna(0, inplace=True)

# Define the feature list
features = [
    'Mention_Count', 'First_Mention', 'Sentiment', 'Rank', 'Proximity_Score',
    'First_Mention_Exposition', 'First_Mention_Rising_Action', 'First_Mention_Climax',
    'First_Mention_Falling_Action', 'First_Mention_Resolution', 'First_Mention_Unknown',
    'Mentions_Exposition_Proportion', 'Mentions_Rising_Action_Proportion',
    'Mentions_Climax_Proportion', 'Mentions_Falling_Action_Proportion',
    'Mentions_Resolution_Proportion', 'Mentions_Unknown_Proportion',
]

# Prepare the dataset
X = character_df[features]
y = character_df['Label']

# Feature scaling
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, stratify=y, random_state=42)

# Handle class imbalance using SMOTE
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# Define the neural network model
model = keras.Sequential([
    layers.Dense(64, activation='relu', input_shape=(X_train_resampled.shape[1],)),
    layers.Dense(32, activation='relu'),
    layers.Dense(1, activation='sigmoid')  # Output layer for binary classification
])

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Calculate class weights
class_weights = class_weight.compute_class_weight(
    class_weight='balanced',
    classes=np.unique(y_train_resampled),
    y=y_train_resampled
)
class_weights = dict(enumerate(class_weights))

# Train the model
history = model.fit(
    X_train_resampled,
    y_train_resampled,
    epochs=50,
    batch_size=32,
    validation_data=(X_test, y_test),
    class_weight=class_weights,
    verbose=1
)

# Evaluate the model
y_pred_prob = model.predict(X_test)
y_pred = (y_pred_prob > 0.5).astype(int).flatten()

print(classification_report(y_test, y_pred))


[__init__]: Initializing book from file: books/book_secret_garden.txt
get_book(): Text obtained from file 'books/book_secret_garden.txt'.




[__init__]: Initializing book from file: books/book_eye_of_apollo.txt
get_book(): Text obtained from file 'books/book_eye_of_apollo.txt'.




[__init__]: Initializing book from file: books/book_soul_of_schoolboy.txt
get_book(): Text obtained from file 'books/book_soul_of_schoolboy.txt'.




[__init__]: Initializing book from file: books/book_the_sins_of_prince_saradine.txt
get_book(): Text obtained from file 'books/book_the_sins_of_prince_saradine.txt'.




[__init__]: Initializing book from file: books/book_sign_of_the_broken_sword.txt
get_book(): Text obtained from file 'books/book_sign_of_the_broken_sword.txt'.




[__init__]: Initializing book from file: books/book_the_hammer_of_god.txt
get_book(): Text obtained from file 'books/book_the_hammer_of_god.txt'.




[__init__]: Initializing book from file: books/book_the_flying_stars.txt
get_book(): Text obtained from file 'books/book_the_flying_stars.txt'.




[__init__]: Initializing book from file: books/book_fad_of_the_fisherman.txt
get_book(): Text obtained from file 'books/book_fad_of_the_fisherman.txt'.




[__init__]: Initializing book from file: books/book_the_blue_cross.txt
get_book(): Text obtained from file 'books/book_the_blue_cross.txt'.




[__init__]: Initializing book from file: books/book_the_wrong_shape.txt
get_book(): Text obtained from file 'books/book_the_wrong_shape.txt'.




[__init__]: Initializing book from file: books/book_the_queer_feet.txt
get_book(): Text obtained from file 'books/book_the_queer_feet.txt'.




[__init__]: Initializing book from file: books/book_vanishing_prince.txt
get_book(): Text obtained from file 'books/book_vanishing_prince.txt'.




[__init__]: Initializing book from file: books/book_three_tools_of_death.txt
get_book(): Text obtained from file 'books/book_three_tools_of_death.txt'.




[__init__]: Initializing book from file: books/book_vengeance_of_the_statue.txt
get_book(): Text obtained from file 'books/book_vengeance_of_the_statue.txt'.




[__init__]: Initializing book from file: books/book_temple_of_silence.txt
get_book(): Text obtained from file 'books/book_temple_of_silence.txt'.




[__init__]: Initializing book from file: books/book_bottomless_well.txt
get_book(): Text obtained from file 'books/book_bottomless_well.txt'.




[__init__]: Initializing book from file: books/book_face_in_the_target.txt
get_book(): Text obtained from file 'books/book_face_in_the_target.txt'.




[__init__]: Initializing book from file: books/book_hole_in_the_wall.txt
get_book(): Text obtained from file 'books/book_hole_in_the_wall.txt'.


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/50
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 57ms/step - accuracy: 0.7249 - loss: 0.6460 - val_accuracy: 0.5652 - val_loss: 0.6766
Epoch 2/50
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - accuracy: 0.8115 - loss: 0.6023 - val_accuracy: 0.5652 - val_loss: 0.6508
Epoch 3/50
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - accuracy: 0.8209 - loss: 0.5783 - val_accuracy: 0.5652 - val_loss: 0.6277
Epoch 4/50
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - accuracy: 0.7845 - loss: 0.5528 - val_accuracy: 0.6957 - val_loss: 0.6118
Epoch 5/50
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - accuracy: 0.8161 - loss: 0.5082 - val_accuracy: 0.6957 - val_loss: 0.6019
Epoch 6/50
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step - accuracy: 0.8008 - loss: 0.5003 - val_accuracy: 0.6522 - val_loss: 0.5879
Epoch 7/50
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━