# Booker Hathi Download.py (147 lines, 109 loc, 4.79 KB)
# NOTE(review): this copy was scraped from the GitHub file viewer; the
# navigation text and line-number gutter artifacts were removed so the
# script below is runnable. The actual source starts at the shebang line.
#!/usr/bin/env python
# coding: utf-8
# In[ ]:
# NOTE(review): notebook export — get_ipython() exists only under
# IPython/Jupyter, so these install lines fail in a plain interpreter.
get_ipython().system('pip install htrc-feature-reader')
get_ipython().system('pip install pyLDAvis')
get_ipython().system('pip install htrc')
import pandas as pd
from htrc_features import FeatureReader, Volume
from tqdm.notebook import trange, tqdm
import os
import shutil
import nltk
#import gensim
# Because some of our libraries will throw many, many warnings for future changes to their code:
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
# Hard-coded working directory — valid only on the original author's machine.
os.chdir('/Users/tomwilliams/Library/Mobile Documents/com~apple~CloudDocs/UVA/Thesis/Code')
from htrc import workset
# Fetch the list of HathiTrust volume IDs for the Booker collection (network call).
bookerVolIds = workset.load_hathitrust_collection('https://babel.hathitrust.org/cgi/mb?a=listis&c=1335516225')
FeatureReader(ids=bookerVolIds).first().title#This checks first title in the list
#This creates a list of volume IDs for me to work with
x = bookerVolIds[0:10]
print(x)
def ef_vol_to_bow_df(volume, save_to_tsv=False, stop_words=None):
    """
    Convert an HTRC Extracted Features volume into a page-level bag-of-words DataFrame.

    Parameters
    ----------
    volume : htrc_features.Volume
        Volume whose body tokens are extracted per page (case-folded, POS-collapsed).
    save_to_tsv : bool, optional
        If True, also write the resulting DataFrame to '<htid>.tsv' in the CWD.
    stop_words : collection of str, optional
        Words to exclude. Defaults to the module-level ``en_stop`` set, which
        preserves the original behavior for existing callers.

    Returns
    -------
    pandas.DataFrame
        One row per page, columns ['htid', 'page_number', 'page_tokens'];
        'page_tokens' is a list of cleaned alphabetic tokens repeated by count.
    """
    import pandas as pd
    from tqdm.notebook import tqdm

    if stop_words is None:
        # Fall back to the notebook-level stop list, as the original code did.
        stop_words = en_stop
    htid = volume.id
    outfile_name = htid + '.tsv'
    # Collect row dicts and build the DataFrame once at the end:
    # row-by-row DataFrame.append was quadratic and was removed in pandas 2.0.
    rows = []
    for page in tqdm(volume.pages(), total=volume.page_count):
        # Page repr looks like '<page NNNNNNNN of volume ...>' — take the number.
        page_num = str(page).split(' ')[1]
        page_df = page.tokenlist(section='body', case=False, pos=False)
        tkn_list = []
        for idx, row in page_df.iterrows():
            # tokenlist index is (page, section, token); the token is position 2.
            clean_tkn = idx[2].strip()
            count = int(row.iloc[0])  # .iloc: positional row[0] lookup is deprecated
            # Filtering the token once is equivalent to filtering each repetition.
            if clean_tkn.isalpha() and clean_tkn not in stop_words:
                tkn_list.extend([clean_tkn] * count)
        rows.append({'htid': htid, 'page_number': page_num, 'page_tokens': tkn_list})
    vol_df = pd.DataFrame(rows, columns=['htid', 'page_number', 'page_tokens'])
    if save_to_tsv:
        vol_df.to_csv(outfile_name, sep='\t', index=False)
        print(f'Saved {volume.title} to TSV named {outfile_name}')
    print(f'Reformatted "{volume.title}" ({htid}) to bag-of-words')
    return vol_df
# Download NLTK's stopword corpus (a no-op if already cached) and build the
# stop list, then extend it with stray punctuation tokens plus a handful of
# common but uninformative words — added in one pass instead of one-by-one.
nltk.download('stopwords')
en_stop = set(nltk.corpus.stopwords.words('english'))
extra_stop_words = ("'", '"', ' ', 'would', 'could', 'should', 'said', 'also')
en_stop.update(extra_stop_words)
# Build a single page-level DataFrame covering every volume in the workset.
workset_page_df = pd.DataFrame(columns=['htid','page_number','page_tokens'])
_vol_frames = []
for book in tqdm(bookerVolIds):
    fr_vol = Volume(book)
    book_df = ef_vol_to_bow_df(fr_vol, save_to_tsv=True)
    # Accumulate frames and concatenate once: DataFrame.append was removed
    # in pandas 2.0 and was quadratic besides.
    _vol_frames.append(book_df)
if _vol_frames:
    workset_page_df = pd.concat(_vol_frames, ignore_index=True)
# Bug fix: original referenced undefined `volume_list` (NameError).
print(f"Reformatted {len(bookerVolIds)} volumes to bag-of-words pages.")
# Debugging aid: I couldn't work out why the batch loop was breaking, so I
# pulled out the individual volume IDs to find which books caused the problem.
x, y, z, a = (bookerVolIds[i] for i in (32, 150, 181, 215))
print(x)
print(y)
print(z)
print(a)
# To drop the problem volumes from the workset, uncomment:
#bookerVolIds.remove(x)
#bookerVolIds.remove(y)
#bookerVolIds.remove(z)
#bookerVolIds.remove(a)
# These 4 volumes caused the next cell to crash. They are:
print(x, y, z, a)
# Re-run the full workset conversion (after the problem volumes were identified).
workset_page_df = pd.DataFrame(columns=['htid','page_number','page_tokens'])
_vol_frames = []
for book in tqdm(bookerVolIds):
    fr_vol = Volume(book)
    book_df = ef_vol_to_bow_df(fr_vol, save_to_tsv=True)
    # Accumulate and concatenate once — DataFrame.append was removed in pandas 2.0.
    _vol_frames.append(book_df)
if _vol_frames:
    workset_page_df = pd.concat(_vol_frames, ignore_index=True)
# Bug fix: original referenced undefined `volume_list` (NameError).
print(f"Reformatted {len(bookerVolIds)} volumes to bag-of-words pages.")
# Retry the four volumes that failed above, using their escaped-ark ID forms.
missingVolIds = ["ucbk.ark+=28722=h24m91c3s", "ucbk.ark+=228722=h27940w7z","ucbk.ark+=228722=h2tx35d4d","ucbk.ark+=228722=h2x34n702"]
print(missingVolIds)
workset_page_df2 = pd.DataFrame(columns=['htid','page_number','page_tokens'])
_missing_frames = []
for book in tqdm(missingVolIds):
    fr_vol = Volume(book)
    book_df = ef_vol_to_bow_df(fr_vol, save_to_tsv=True)
    # Accumulate and concatenate once — DataFrame.append was removed in pandas 2.0.
    _missing_frames.append(book_df)
if _missing_frames:
    # Bug fix: original assigned each append to `workset_page_df` (clobbering
    # the main DataFrame and keeping only the last book); accumulate into
    # workset_page_df2 as clearly intended.
    workset_page_df2 = pd.concat(_missing_frames, ignore_index=True)
# Bug fix: original referenced undefined `volume_list` (NameError).
print(f"Reformatted {len(missingVolIds)} volumes to bag-of-words pages.")