In [21]:
import pandas as pd
import os
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from collections import defaultdict
import math
from sklearn.model_selection import train_test_split
from sklearn.decomposition import TruncatedSVD
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import classification_report
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler

In [22]:
CURRENT_FILE_PATH = os.path.dirname(os.path.abspath("__file__"))
metadata = pd.read_csv(CURRENT_FILE_PATH + '/../data/external/textbooks_archive/Metadata.csv')
metadata.head()

Unnamed: 0,File_name,Contents Page,Summary Page,Book Title,Author,Edition,Product Type,Copyright Year,Language,Language Collection,Series Title,Subject Classification,Unnamed: 12,Unnamed: 13,Unnamed: 14,Unnamed: 15,Unnamed: 16,Unnamed: 17
0,Fundamentals_of_Power_Electronics.pdf,616,"52:52,73:73,117:118,140:141,193:193,271:271,33...",Fundamentals of Power Electronics,"Robert W. Erickson, Dragan Maksimovic",2nd ed. 2001,Graduate/advanced undergraduate textbook,2001,EN,English/International,,"Engineering; Circuits and Systems; Energy, gen...",,,,,,
1,Handbook_of_the_Life_Course.pdf,1720,"33:35,59:62,90:92,129:131,152:154,172:174,192:...",Handbook of the Life Course,"Jeylan T. Mortimer, Michael J. Shanahan",2003,Graduate/advanced undergraduate textbook,2003,EN,English/International,Handbooks of Sociology and Social Research,"Social Sciences; Sociology, general; Clinical ...",,,,,,
2,All_of_Statistics.pdf,1218,0,All of Statistics,Larry Wasserman,2004,Graduate/advanced undergraduate textbook,2004,EN,English/International,Springer Texts in Statistics,Mathematics; Computational Mathematics and Num...,,,,,,
3,Social_Anxiety_and_Social_Phobia_in_Youth.pdf,910,"31:31,57:57,79:79,101:101,131:131,183:183",Social Anxiety and Social Phobia in Youth,Christopher Kearney,2005,Graduate/advanced undergraduate textbook,2005,EN,English/International,Series in Anxiety and Related Disorders,Psychology,Clinical Psychology,Personality and Social Psychology,Community and Environmental Psychology,,,
4,Discrete_Mathematics.pdf,79,0,Discrete Mathematics,"László Lovász, József Pelikán, Katalin Veszter...",2003,Undergraduate textbook,2003,EN,English/International,Undergraduate Texts in Mathematics,Mathematics,Combinatorics,Number Theory,,,,


In [23]:
textbook_names = [file_name for file_name in metadata.loc[metadata['Summary Page'].notnull()]['File_name']]

textbooks_df = {}

for name in textbook_names:
    textbooks_df[name] = pd.read_csv(CURRENT_FILE_PATH + f'/../data/processed/tesseract_csvs/{name}.csv')

list(textbooks_df.keys())

['Fundamentals_of_Power_Electronics.pdf',
 'Handbook_of_the_Life_Course.pdf',
 'All_of_Statistics.pdf',
 'Social_Anxiety_and_Social_Phobia_in_Youth.pdf',
 'Discrete_Mathematics.pdf']

In [24]:
def sort_words_in_reading_order(words):
    result = []
    for word in sorted(words, key=lambda x: (math.floor(x['top']/25.0), math.floor(x['left']/25.0))):
        if isinstance(word['text'], str):
            result.append(word['text'])
    return ' '.join(result)


def check_if_content_page(textbook_name, page_number, metadata):
    contents_page = metadata.loc[(metadata['File_name'] == textbook_name), 'Contents Page'].iloc[0]
    start_page, end_page = [int(page_num) for page_num in contents_page.split(',')]
    return page_number >= start_page and page_number <= end_page

def check_if_summary_page(textbook_name, page_number, metadata):
    summary_pages = metadata.loc[(metadata['File_name'] == textbook_name), 'Summary Page'].iloc[0]
    if not isinstance(summary_pages, float) and ':' in summary_pages:
        page_ranges = [(int(page_range.split(':')[0]), int(page_range.split(':')[1])) for page_range in summary_pages.split(',')]
    else:
        page_ranges = []
        
    return any([page_number >= page_range[0] and page_number <= page_range[1] for page_range in page_ranges])

def get_page_class(name, page_num, metadata):
    classes = {
        1: check_if_content_page,
        2: check_if_summary_page,
    }
    
    for page_class, check_func in classes.items():
        if check_func(name, page_num, metadata):
            return page_class
    return 0

pages = defaultdict(list)

for name, df in textbooks_df.items():
    page_numbers = sorted(df['page_num'].unique())
    for page_num in page_numbers:
        words = df.loc[df['page_num'] == page_num, ['text', 'left', 'top', 'width', 'height']].to_dict('records')
        pages['textbook_name'].append(name)
        pages['page_number'].append(page_num)
        pages['page_texts'].append(sort_words_in_reading_order(words))
        pages['page_class'].append(get_page_class(name, page_num, metadata))

pages_text_df = pd.DataFrame.from_dict(pages)
pages_text_df.head()

Unnamed: 0,textbook_name,page_number,page_texts,page_class
0,Fundamentals_of_Power_Electronics.pdf,1,"ELE Sato, Power Electronics SA eee a > ;.",0
1,Fundamentals_of_Power_Electronics.pdf,2,Fundamentals of Power Electronics SECOND EDITION,0
2,Fundamentals_of_Power_Electronics.pdf,3,Fundamentals of Power Electronics SECOND EDITI...,0
3,Fundamentals_of_Power_Electronics.pdf,4,"Distributors for North, Central and South Amer...",0
4,Fundamentals_of_Power_Electronics.pdf,5,"Dedicated to Linda, William, and Richard Lidij...",0


In [25]:
train, test = train_test_split(pages_text_df, test_size=0.2)

train.head()

Unnamed: 0,textbook_name,page_number,page_texts,page_class
2477,Discrete_Mathematics.pdf,207,202 13. Coloring Maps and Graphs Uu ~ aan \ uy...,0
872,Fundamentals_of_Power_Electronics.pdf,873,"874 Index of buck-boost, 16, 124, 420 in low h...",0
1710,All_of_Statistics.pdf,111,98 7. Estimating the CDF and Statistical Funct...,0
2048,Social_Anxiety_and_Social_Phobia_in_Youth.pdf,3,SERIES IN ANXIETY AND RELATED DISORDERS Series...,0
2400,Discrete_Mathematics.pdf,130,6.10 How to Test Whether a Number is a Prime? ...,0


In [26]:
def upsample(tfidf, df):
    oversample = SMOTE()
    upsampled_train, upsampled_classes = oversample.fit_resample(tfidf, df['page_class'])

    return upsampled_train, upsampled_classes

def downsample(tfidf, df):
    downsample = RandomUnderSampler()
    upsampled_train, upsampled_classes = downsample.fit_resample(tfidf, df['page_class'])

    return upsampled_train, upsampled_classes

tfidf_vect = CountVectorizer(stop_words='english', ngram_range=(1,3))
tfidf_vect.fit(pages_text_df['page_texts'].values.tolist())

train_tfidf = tfidf_vect.transform(train['page_texts'])
test_tfidf = tfidf_vect.transform(test['page_texts'])

tsvd= TruncatedSVD(n_components=1000)
tsvd.fit(train_tfidf)
train_tsvd=tsvd.transform(train_tfidf)
test_tsvd=tsvd.transform(test_tfidf)

print(f'tsvd variance {tsvd.explained_variance_ratio_.sum()}')

upsampled_train, upsampled_classes = upsample(train_tsvd, train)

upsampled_classes = pd.DataFrame({'page_class': upsampled_classes})

tsvd variance 0.8256106425494552


In [27]:
print(f"Number of training samples: {len(upsampled_classes)}")
print(f"Number of positive contents page training samples: {len(upsampled_classes.loc[upsampled_classes.page_class == 1, :])}")
print(f"Number of positive summary page training samples: {len(upsampled_classes.loc[upsampled_classes.page_class == 2, :])}")

print(f"Number of test samples: {len(test)}")
print(f"Number of positive contents page test samples: {len(test.loc[test.page_class == 1, :])}")
print(f"Number of positive summary page test samples: {len(test.loc[test.page_class == 2, :])}")

Number of training samples: 5856
Number of positive contents page training samples: 1952
Number of positive summary page training samples: 1952
Number of test samples: 514
Number of positive contents page test samples: 7
Number of positive summary page test samples: 17


In [28]:
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)

In [29]:
classifier = LogisticRegression(multi_class='multinomial', solver='saga')

parameters = {
    'solver': ['saga', 'sag', 'newton-cg', 'liblinear'],
    'penalty': ['none', 'l2'],
    'C': [100.0, 10.0, 1.0, 0.1],
    'max_iter': [100]
}

scoring_evals = ['recall_micro']
randm_src = RandomizedSearchCV(estimator=classifier, cv = cv, n_jobs=-1, refit='recall_micro', param_distributions=parameters, scoring=scoring_evals)
randm_src.fit(upsampled_train, upsampled_classes.values.ravel())

print(" Results from Random Search " )
print("\n The best estimator across ALL searched params:\n", randm_src.best_estimator_)
print("\n The best score across ALL searched params:\n", randm_src.best_score_)

print("\n Classification report for test set")
predictions = randm_src.best_estimator_.predict(test_tsvd)
print(classification_report(test['page_class'], predictions))

60 fits failed out of a total of 300.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
30 fits failed with the following error:
Traceback (most recent call last):
  File "/home/omid-dev/Personal/venvs/textbook_page_classifier/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/omid-dev/Personal/venvs/textbook_page_classifier/lib/python3.9/site-packages/sklearn/linear_model/_logistic.py", line 1461, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "/home/omid-dev/Personal/venvs/textbook_page_classifier/lib/python3.9/site-packages/sklearn/linear_model/_logistic.py", line 464, in _check_solver
    raise V

 Results from Random Search 

 The best estimator across ALL searched params:
 LogisticRegression(C=100.0, multi_class='multinomial', penalty='none',
                   solver='sag')

 The best score across ALL searched params:
 0.9914046653637099

 Classification report for test set
              precision    recall  f1-score   support

           0       0.96      0.99      0.98       490
           1       0.67      0.57      0.62         7
           2       0.00      0.00      0.00        17

    accuracy                           0.95       514
   macro avg       0.54      0.52      0.53       514
weighted avg       0.92      0.95      0.94       514





In [30]:
test['predicted_class'] = predictions
test.to_csv('../output/v1_logistic_regression_results.csv')