In [225]:
import pandas as pd
import os
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from collections import defaultdict
import math
from sklearn.model_selection import train_test_split
from sklearn.decomposition import TruncatedSVD

In [226]:
CURRENT_FILE_PATH = os.path.dirname(os.path.abspath("__file__"))
metadata = pd.read_csv(CURRENT_FILE_PATH + '/../data/external/textbooks_archive/Metadata.csv')
metadata.head()

Unnamed: 0,File_name,Contents Page,Summary Page,Book Title,Author,Edition,Product Type,Copyright Year,Language,Language Collection,Series Title,Subject Classification,Unnamed: 12,Unnamed: 13,Unnamed: 14,Unnamed: 15,Unnamed: 16,Unnamed: 17
0,Fundamentals_of_Power_Electronics.pdf,616,"52:52,73:73,117:118,140:141,193:193,271:271,33...",Fundamentals of Power Electronics,"Robert W. Erickson, Dragan Maksimovic",2nd ed. 2001,Graduate/advanced undergraduate textbook,2001,EN,English/International,,"Engineering; Circuits and Systems; Energy, gen...",,,,,,
1,Handbook_of_the_Life_Course.pdf,1720,"33:35,59:62,90:92,129:131,152:154,172:174,192:...",Handbook of the Life Course,"Jeylan T. Mortimer, Michael J. Shanahan",2003,Graduate/advanced undergraduate textbook,2003,EN,English/International,Handbooks of Sociology and Social Research,"Social Sciences; Sociology, general; Clinical ...",,,,,,
2,All_of_Statistics.pdf,1218,0,All of Statistics,Larry Wasserman,2004,Graduate/advanced undergraduate textbook,2004,EN,English/International,Springer Texts in Statistics,Mathematics; Computational Mathematics and Num...,,,,,,
3,Social_Anxiety_and_Social_Phobia_in_Youth.pdf,910,"31:31,57:57,79:79,101:101,131:131,183:183",Social Anxiety and Social Phobia in Youth,Christopher Kearney,2005,Graduate/advanced undergraduate textbook,2005,EN,English/International,Series in Anxiety and Related Disorders,Psychology,Clinical Psychology,Personality and Social Psychology,Community and Environmental Psychology,,,
4,Discrete_Mathematics.pdf,79,0,Discrete Mathematics,"László Lovász, József Pelikán, Katalin Veszter...",2003,Undergraduate textbook,2003,EN,English/International,Undergraduate Texts in Mathematics,Mathematics,Combinatorics,Number Theory,,,,


In [227]:
textbook_names = [file_name for file_name in metadata.loc[metadata['Summary Page'].notnull()]['File_name']]

textbooks_df = {}

for name in textbook_names:
    textbooks_df[name] = pd.read_csv(CURRENT_FILE_PATH + f'/../data/processed/tesseract_csvs/{name}.csv')

list(textbooks_df.keys())

['Fundamentals_of_Power_Electronics.pdf',
 'Handbook_of_the_Life_Course.pdf',
 'All_of_Statistics.pdf',
 'Social_Anxiety_and_Social_Phobia_in_Youth.pdf',
 'Discrete_Mathematics.pdf']

In [253]:
def sort_words_in_reading_order(words):
    result = []
    for word in sorted(words, key=lambda x: (math.floor(x['top']/25.0), math.floor(x['left']/25.0))):
        if isinstance(word['text'], str):
            result.append(word['text'])
    return ' '.join(result)


def check_if_content_page(textbook_name, page_number, metadata):
    contents_page = metadata.loc[(metadata['File_name'] == textbook_name), 'Contents Page'].iloc[0]
    start_page, end_page = [int(page_num) for page_num in contents_page.split(',')]
    return page_number >= start_page and page_number <= end_page

def check_if_summary_page(textbook_name, page_number, metadata):
    summary_pages = metadata.loc[(metadata['File_name'] == textbook_name), 'Summary Page'].iloc[0]
    if not isinstance(summary_pages, float) and ':' in summary_pages:
        page_ranges = [(int(page_range.split(':')[0]), int(page_range.split(':')[1])) for page_range in summary_pages.split(',')]
    else:
        page_ranges = []
        
    return any([page_number >= page_range[0] and page_number <= page_range[1] for page_range in page_ranges])

pages = defaultdict(list)

for name, df in textbooks_df.items():
    page_numbers = sorted(df['page_num'].unique())
    for page_num in page_numbers:
        words = df.loc[df['page_num'] == page_num, ['text', 'left', 'top', 'width', 'height']].to_dict('records')
        pages['page_texts'].append(sort_words_in_reading_order(words))
        pages['is_content_page'].append(check_if_content_page(name, page_num, metadata))
        pages['is_summary_page'].append(check_if_summary_page(name, page_num, metadata))

pages_text_df = pd.DataFrame.from_dict(pages)
pages_text_df.head()

breaking
breaking
breaking
breaking
breaking


Unnamed: 0,page_texts,is_content_page,is_summary_page
0,"ELE Sato, Power Electronics SA eee a > ;.",False,False
1,Fundamentals of Power Electronics SECOND EDITION,False,False
2,Fundamentals of Power Electronics SECOND EDITI...,False,False
3,"Distributors for North, Central and South Amer...",False,False
4,"Dedicated to Linda, William, and Richard Lidij...",False,False


In [254]:
train, valid = train_test_split(pages_text_df, test_size=0.2)
train.head()

Unnamed: 0,page_texts,is_content_page,is_summary_page
11,Contents xiii 13.5 Several Types of Magnetic D...,True,False
5,Contents Preface xix 1 Introduction 1 1.1 Intr...,True,False
130,14 1. Probability 3. Let 2 be a sample space a...,False,False
192,34 CHAPTER 2 side of caution and carefully ass...,False,False
80,12 Glen H. Elder Jr. et al. successful life co...,False,False


In [255]:
print(f"Number of training samples: {len(train)}")
print(f"Number of positive contents page training samples: {len(train.loc[train['is_content_page'] == True, :])}")
print(f"Number of positive summary page training samples: {len(train.loc[train['is_summary_page'] == True, :])}")
print(f"Number of validation samples: {len(valid)}")
print(f"Number of positive contents page validation samples: {len(valid.loc[valid['is_content_page'] == True, :])}")
print(f"Number of positive summary page validation samples: {len(valid.loc[valid['is_summary_page'] == True, :])}")

Number of training samples: 200
Number of positive contents page training samples: 19
Number of positive summary page training samples: 3
Number of validation samples: 50
Number of positive contents page validation samples: 8
Number of positive summary page validation samples: 1


In [256]:
# vectorize page text
tfidf_vect = TfidfVectorizer(stop_words='english', ngram_range=(1,3))
tfidf_vect.fit(pages_text_df['page_texts'].values.tolist())

train_tfidf = tfidf_vect.transform(train['page_texts'])
valid_tfidf = tfidf_vect.transform(valid['page_texts'])

train_tfidf.shape

(200, 83030)

In [257]:
tsvd= TruncatedSVD(n_components=10)
train_tsvd=tsvd.fit_transform(train_tfidf)
valid_tsvd=tsvd.fit_transform(valid_tfidf)

train_tsvd.shape

(200, 10)

In [258]:
classifier = LogisticRegression(solver='sag')
classifier.fit(train_tsvd, train['is_content_page'])

classifier.score(valid_tsvd, valid['is_content_page'])

0.84

In [259]:
classifier = LogisticRegression(solver='sag')
classifier.fit(train_tsvd, train['is_summary_page'])

classifier.score(valid_tsvd, valid['is_summary_page'])

0.98