In [136]:
import pandas as pd
import os
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from collections import defaultdict
import math
from sklearn.model_selection import train_test_split
from sklearn.decomposition import TruncatedSVD

In [137]:
CURRENT_FILE_PATH = os.path.dirname(os.path.abspath("__file__"))
metadata = pd.read_csv(CURRENT_FILE_PATH + '/../data/external/textbooks_archive/Metadata.csv')
metadata.head()

Unnamed: 0,File_name,Contents Page,Book Title,Author,Edition,Product Type,Copyright Year,Language,Language Collection,Series Title,Subject Classification,Unnamed: 11,Unnamed: 12,Unnamed: 13,Unnamed: 14,Unnamed: 15,Unnamed: 16
0,Fundamentals_of_Power_Electronics.pdf,616,Fundamentals of Power Electronics,"Robert W. Erickson, Dragan Maksimovic",2nd ed. 2001,Graduate/advanced undergraduate textbook,2001,EN,English/International,,"Engineering; Circuits and Systems; Energy, gen...",,,,,,
1,Handbook_of_the_Life_Course.pdf,1720,Handbook of the Life Course,"Jeylan T. Mortimer, Michael J. Shanahan",2003,Graduate/advanced undergraduate textbook,2003,EN,English/International,Handbooks of Sociology and Social Research,"Social Sciences; Sociology, general; Clinical ...",,,,,,
2,All_of_Statistics.pdf,1218,All of Statistics,Larry Wasserman,2004,Graduate/advanced undergraduate textbook,2004,EN,English/International,Springer Texts in Statistics,Mathematics; Computational Mathematics and Num...,,,,,,
3,Social_Anxiety_and_Social_Phobia_in_Youth.pdf,910,Social Anxiety and Social Phobia in Youth,Christopher Kearney,2005,Graduate/advanced undergraduate textbook,2005,EN,English/International,Series in Anxiety and Related Disorders,Psychology,Clinical Psychology,Personality and Social Psychology,Community and Environmental Psychology,,,
4,Discrete_Mathematics.pdf,79,Discrete Mathematics,"László Lovász, József Pelikán, Katalin Veszter...",2003,Undergraduate textbook,2003,EN,English/International,Undergraduate Texts in Mathematics,Mathematics,Combinatorics,Number Theory,,,,


In [138]:
textbook_names = [file_name for file_name in metadata.loc[metadata['Contents Page'].notnull()]['File_name']]

textbooks_df = {}

for name in textbook_names:
    textbooks_df[name] = pd.read_csv(CURRENT_FILE_PATH + f'/../data/processed/tesseract_csvs/{name}.csv')

list(textbooks_df.keys())

['Fundamentals_of_Power_Electronics.pdf',
 'Handbook_of_the_Life_Course.pdf',
 'All_of_Statistics.pdf',
 'Social_Anxiety_and_Social_Phobia_in_Youth.pdf',
 'Discrete_Mathematics.pdf',
 'Developmental_Neurobiology.pdf',
 'Intuitive_Probability_and_Random_Processes_using_MATLAB.pdf',
 'Handbook_of_Disaster_Research.pdf',
 'Handbook_of_the_Sociology_of_Gender.pdf',
 'Handbook_of_Sociological_Theory.pdf',
 'Clinical_Neuroanatomy.pdf',
 'Acquired_Brain_Injury.pdf',
 'Numerical_Optimization.pdf',
 'Handbook_of_Biological_Confocal_Microscopy.pdf',
 'Ceramic_Materials.pdf']

In [139]:
def sort_words_in_reading_order(words):
    result = []
    for word in sorted(words, key=lambda x: (math.floor(x['top']/25.0), math.floor(x['left']/25.0))):
        if isinstance(word['text'], str):
            result.append(word['text'])
    return ' '.join(result)


def check_if_content_page(textbook_name, page_number, metadata):
    contents_page = metadata.loc[(metadata['File_name'] == textbook_name), 'Contents Page'].iloc[0]
    start_page, end_page = [int(page_num) for page_num in contents_page.split(',')]
    return page_number >= start_page and page_number <= end_page

pages = defaultdict(list)

for name, df in textbooks_df.items():
    page_numbers = sorted(df['page_num'].unique())
    for page_num in page_numbers:
        words = df.loc[df['page_num'] == page_num, ['text', 'left', 'top', 'width', 'height']].to_dict('records')
        pages['page_texts'].append(sort_words_in_reading_order(words))
        pages['is_content_page'].append(check_if_content_page(name, page_num, metadata))

pages_text_df = pd.DataFrame.from_dict(pages)
pages_text_df.head()

Unnamed: 0,page_texts,is_content_page
0,"ELE Sato, Power Electronics SA eee a > ;.",False
1,Fundamentals of Power Electronics SECOND EDITION,False
2,Fundamentals of Power Electronics SECOND EDITI...,False
3,"Distributors for North, Central and South Amer...",False
4,"Dedicated to Linda, William, and Richard Lidij...",False


In [140]:
train, valid = train_test_split(pages_text_df, test_size=0.2)
train.head()

Unnamed: 0,page_texts,is_content_page
92,Brief Survey of 4.2. A Power Semiconductor Dev...,False
7586,32 Chapter 2 J.B. Pawley ¢ 200 um i a 5x5 | Se...,False
827,"826 Simulation of Converters points, the quies...",False
7149,288 CHAPTER 11. NONLINEAR EQUATIONS We now inv...,False
8505,"Index 951 plants. See also, Botanical specimen...",False


In [141]:
print(f"Number of training samples: {len(train)}")
print(f"Number of positive training samples: {len(train.loc[train['is_content_page'] == True, :])}")
print(f"Number of validation samples: {len(valid)}")
print(f"Number of positive validation samples: {len(valid.loc[valid['is_content_page'] == True, :])}")

Number of training samples: 7452
Number of positive training samples: 63
Number of validation samples: 1863
Number of positive validation samples: 22


In [142]:
# vectorize page text
tfidf_vect = TfidfVectorizer(stop_words='english', ngram_range=(1,3))
tfidf_vect.fit(pages_text_df['page_texts'].values.tolist())

train_tfidf = tfidf_vect.transform(train['page_texts'])
valid_tfidf = tfidf_vect.transform(valid['page_texts'])

train_tfidf.shape

(7452, 4310363)

In [143]:
tsvd= TruncatedSVD(n_components=1)
train_tsvd=tsvd.fit_transform(train_tfidf)
valid_tsvd=tsvd.fit_transform(valid_tfidf)

train_tsvd.shape

(7452, 1)

In [144]:
classifier = LogisticRegression(solver='sag')
classifier.fit(train_tsvd, train['is_content_page'])

classifier.score(valid_tsvd, valid['is_content_page'])

0.988191089640365