In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix

from nltk.tokenize import word_tokenize


In [2]:
import os
import json
import time
import multiprocessing as mp

import io
import os.path
import re
import tarfile
import sys
import gensim
import pandas as pd
import numpy as np


In [3]:
# Resolve the directory two levels above the current working directory;
# the repository paths used below are built relative to it.
two_levels_up = os.path.join(os.getcwd(), os.pardir, os.pardir)
repos_path = os.path.abspath(two_levels_up)

gutenberg_corpus_analysis_repo = os.path.join(
    repos_path,
    'gutenberg_corpus_analysis',
)

In [4]:
# Build the paths of two more repositories under `repos_path`.
gutenberg_repo_path = os.path.join(repos_path, 'gutenberg')
gutenberg_analysis_repo = os.path.join(repos_path, 'gutenberg-analysis')

# Put gutenberg-analysis/src on the import path *before* importing from it
# (presumably where `data_io` lives; the import below relies on this append).
src_dir = os.path.join(gutenberg_analysis_repo,'src')
sys.path.append(src_dir)
from data_io import get_book


# Same pattern for gutenberg/src (presumably the home of `metaquery`).
gutenberg_src_dir = os.path.join(gutenberg_repo_path,'src')
sys.path.append(gutenberg_src_dir)

from metaquery import meta_query

# The corpus-analysis repository root itself goes on the path, so `misc_utils`
# is imported as a package relative to that root.
sys.path.append(gutenberg_corpus_analysis_repo)
import misc_utils.dataset_filtering as dataset_filtering

In [5]:
# Path of the `data/text` directory inside the gutenberg repository.
text_fold = os.path.join(
    gutenberg_repo_path,
    'data',
    'text',
)

In [6]:
# Train/test CSVs both sit in the corpus-analysis repo's `sample_dataset` folder.
sample_dataset_dir = os.path.join(gutenberg_corpus_analysis_repo, 'sample_dataset')
train_csv = os.path.join(sample_dataset_dir, 'final_train.csv')
test_csv = os.path.join(sample_dataset_dir, 'final_test.csv')

# Catalogue CSV inside the gutenberg repo's `metadata` folder.
pg_catalog_filepath = os.path.join(
    gutenberg_repo_path,
    'metadata',
    'pg_catalog.csv',
)

In [7]:
# Seed NumPy's global random generator so NumPy-driven randomness repeats across runs.
np.random.seed(seed=500)

In [8]:
# Load the training split, using the CSV's 'Unnamed: 0' column as the row index,
# then preview the first five rows.
train_df = pd.read_csv(
    train_csv,
    index_col='Unnamed: 0',
)
train_df.head(n=5)

Unnamed: 0,id,title,author,authoryearofbirth,authoryearofdeath,language,downloads,subjects
2439,PG12810,"Uncle Sam's Boys with Pershing's Troops: Or, D...","Hancock, H. Irving (Harrie Irving)",1868.0,1922.0,['en'],78,"{'World War, 1914-1918 -- Juvenile fiction', '..."
2446,PG12819,"Dick Prescott's Second Year at West Point: Or,...","Hancock, H. Irving (Harrie Irving)",1868.0,1922.0,['en'],94,{'United States Military Academy -- Juvenile f...
25920,PG40605,"The Motor Boat Club at Nantucket; or, The Myst...","Hancock, H. Irving (Harrie Irving)",1868.0,1922.0,['en'],189,"{'Motorboats -- Juvenile fiction', 'Nantucket ..."
55435,PG8153,"The Young Engineers in Arizona; or, Laying Tra...","Hancock, H. Irving (Harrie Irving)",1868.0,1922.0,['en'],190,"{'Civil engineers -- Fiction', 'Arizona -- Fic..."
32899,PG48863,"The Motor Boat Club off Long Island; or, A Dar...","Hancock, H. Irving (Harrie Irving)",1868.0,1922.0,['en'],85,"{'Motorboats -- Juvenile fiction', 'Long Islan..."


In [9]:
# Number of distinct values in the author column (a missing author, if any, counts as one value).
train_df['author'].nunique(dropna=False)

80

In [10]:
# Flatten each row's stringified subject set (e.g. "{'A -- B', 'C, D'}") into one
# whitespace-separated document, one token group per subject:
#   * ' -- ' inside a subject becomes '-'
#   * spaces inside a subject are removed
#   * commas inside a subject become spaces
# Rows with no subjects ('set()' or a non-string value such as NaN) are passed
# through as NaN / unchanged.
subj = train_df['subjects'].replace('set()', np.nan)
subj_docs = []
for raw_subjects in subj:
    if not isinstance(raw_subjects, str):
        # Missing subjects: keep the placeholder so subj_docs stays row-aligned.
        subj_docs.append(raw_subjects)
        continue
    # Drop the surrounding braces, then the outermost opening/closing quote.
    inner = raw_subjects.strip("{}")[1:-1]
    # repr() wraps a subject containing an apostrophe in double quotes, so the
    # separator between two subjects may use either quote character on each side.
    # Splitting directly (rather than via a '_' placeholder) also keeps subjects
    # that themselves contain an underscore intact.
    subjects = re.split(r"""['"], ['"]""", inner)
    tokens = [
        subject.replace(' -- ', '-').replace(' ', '').replace(',', ' ')
        for subject in subjects
    ]
    subj_docs.append(' '.join(tokens))

In [11]:
# Store the flattened subject documents as a new 'subj_str' column
# (subj_docs holds one entry per row, in row order).
train_df['subj_str']=subj_docs

In [12]:
#train_df['subject_str'] = train_df['subjects'].apply(lambda x: split_subjects(x))

In [14]:
#train_df = train_df.sample(500)

In [27]:
def _load_book_text(book_id):
    """Fetch one book via `get_book` from the gutenberg repository at level 'text'."""
    return get_book(book_id, path_gutenberg=gutenberg_repo_path, level='text')


# Load every book into a new 'text' column and report the wall-clock time taken.
start = time.time()
train_df['text'] = train_df['id'].apply(_load_book_text)
end = time.time()
elapsed = end - start
print(f'Time elapsed: {elapsed} seconds')

Time elapsed: 4.847460031509399 seconds


# Author classification using only the book text

In [29]:
# Pair each row's author with its loaded text, in row order.
texts = list(zip(train_df['author'], train_df['text']))

In [30]:
# Unzip the (author, text) pairs into two parallel tuples: labels and features.
# Note: despite its name, each entry of `sentences` is the whole text loaded for one book.
author_column, text_column = zip(*texts)
authors, sentences = author_column, text_column

In [32]:
# 2. Hold out 30% of the books for evaluation; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    sentences,
    authors,
    test_size=0.3,
    random_state=42,
)


In [33]:
# 3. Text-classification pipeline: TF-IDF features feeding an SVM classifier.
tfidf_step = ('tfidf', TfidfVectorizer())
svm_step = ('svm', SVC(kernel='linear'))  # linear kernel for text classification
model = Pipeline(steps=[tfidf_step, svm_step])

In [34]:
# 4. Fit the whole pipeline (vectorizer, then classifier) on the training portion.
model.fit(X=X_train, y=y_train)

In [35]:
# 5. Predict on the training portion and evaluate.
# These are in-sample scores: they describe how well the model fits the data it
# was trained on, not how well it generalises.
train_y_pred = model.predict(X_train)
print("Classification Report:\n")
# Fixed: the original call ended with an extra ')' and raised a SyntaxError.
train_class_rpt = classification_report(y_train, train_y_pred)
print(train_class_rpt)

Classification Report:

                                          precision    recall  f1-score   support

                             A. L. O. E.       1.00      0.12      0.21        17
                         Aimard, Gustave       0.00      0.00      0.00        10
                     Alger, Horatio, Jr.       0.84      0.89      0.86        18
                       Balzac, Honoré de       1.00      0.07      0.12        15
                    Barbour, Ralph Henry       1.00      0.65      0.79        17
               Baring-Gould, S. (Sabine)       1.00      0.11      0.19        19
            Baum, L. Frank (Lyman Frank)       1.00      0.29      0.44        14
                         Belloc, Hilaire       0.07      0.95      0.13        21
                         Bennett, Arnold       0.78      0.33      0.47        21
         Benson, E. F. (Edward Frederic)       0.00      0.00      0.00        15
                     Blanchard, Amy Ella       1.00      0.62      0.76  

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [45]:
# 5. Score the held-out test portion and print the per-author metrics.
y_pred = model.predict(X_test)
print("Classification Report:\n")
test_class_rpt = classification_report(
    y_true=y_test,
    y_pred=y_pred,
)
print(test_class_rpt)

Classification Report:

                                          precision    recall  f1-score   support

                             A. L. O. E.       0.00      0.00      0.00         7
                         Aimard, Gustave       0.00      0.00      0.00        14
                     Alger, Horatio, Jr.       0.86      1.00      0.92         6
                       Balzac, Honoré de       0.00      0.00      0.00         9
                    Barbour, Ralph Henry       0.00      0.00      0.00         7
               Baring-Gould, S. (Sabine)       0.00      0.00      0.00         5
            Baum, L. Frank (Lyman Frank)       0.00      0.00      0.00        10
                         Belloc, Hilaire       0.02      0.67      0.03         3
                         Bennett, Arnold       0.00      0.00      0.00         3
         Benson, E. F. (Edward Frederic)       0.00      0.00      0.00         9
                     Blanchard, Amy Ella       1.00      0.09      0.17  

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
