## Calculating feature vectors for the files that I have to label

In [1]:
import pandas as pd
# Load the CSV that lists the files to be labelled.
to_label = pd.read_csv('./fnames_to_label.csv')
# Number of distinct values in each column.
print(to_label.nunique())
# Preview the first rows (rendered as the cell's output).
to_label.head()

Filename          97688
DocumentID        97688
ESTC_ID           87270
Date                131
Title             81957
Vol_Number          146
Author            18892
Imprint           69271
Field_Headings    41218
TableName             8
dtype: int64


Unnamed: 0,Filename,DocumentID,ESTC_ID,Date,Title,Vol_Number,Author,Imprint,Field_Headings,TableName
0,0031300100.xml,31300100,T021625,1736,"Bibliotheca topographica Anglicana: or, a new ...",0,"Worrall, John",London : printed for J. Worrall at the Dove in...,"Books, Prices, Catalogs, Booksellers', Great B...",Manifest_GenRef
1,0031300200.xml,31300200,T013049,1787,"A catalogue of books printed for, and sold by ...",0,"Dilly, Charles","[London], s.n, 1787.","Catalogs, Booksellers', Early works to 1800",Manifest_GenRef
2,0031300300.xml,31300300,T057382,1800,"Rules of a reading-society, established April ...",0,Anon,"London : printed by H.D. Steel, No. 51, Lothbu...","[London Reading Society], Rules and practice, ...",Manifest_GenRef
3,0031300400.xml,31300400,T012488,1787,Rules for regulating the subscription library ...,0,Anon,"Stamford : printed by Newcomb and Peat, [1787].","[Stamford Subscription Library], Rules and pra...",Manifest_GenRef
4,0031300500.xml,31300500,W029739,1773,"A catalogue of books, imported and to be sold ...",0,"Knox, Henry","[Boston : Sold by Henry Knox, 1773].","Booksellers and bookselling, Massachusetts, Bo...",Manifest_GenRef


In [2]:
# Keep only the file name and the table (folder) name of each document.
wanted_columns = ['Filename', 'TableName']
df = to_label[wanted_columns]
df.head()

Unnamed: 0,Filename,TableName
0,0031300100.xml,Manifest_GenRef
1,0031300200.xml,Manifest_GenRef
2,0031300300.xml,Manifest_GenRef
3,0031300400.xml,Manifest_GenRef
4,0031300500.xml,Manifest_GenRef


In [3]:
# Distinct TableName values present in df.
set(df['TableName'].tolist())

{'Manifest_GenRef',
 'Manifest_HistAndGeo',
 'Manifest_Law',
 'Manifest_LitAndLang1',
 'Manifest_LitAndLang2',
 'Manifest_MedSciTech',
 'Manifest_RelandPhil',
 'Manifest_SSAndFineArt'}

In [4]:
# Model 2: All words > len(3)

from nltk.tokenize import word_tokenize, sent_tokenize
from collections import defaultdict
from heapq import nlargest

def calculate_frequencies(sentences_ll, user_stopwords=None):  # sentences_ll is a list of lists
    """Count how often each sufficiently long word occurs.

    Every word is lower-cased before counting, and only words longer than
    three characters are counted.

    Args:
        sentences_ll: Iterable of sentences, where each sentence is an
            iterable of word strings.
        user_stopwords: Optional iterable of words to leave out of the
            counts. Matching is case-insensitive. ``None`` (the default)
            excludes nothing.

    Returns:
        A ``defaultdict(int)`` mapping each counted word to its number of
        occurrences.
    """
    # Lower-case the stopwords once so they compare against lower-cased tokens.
    if user_stopwords:
        stopwords = frozenset(stopword.lower() for stopword in user_stopwords)
    else:
        stopwords = frozenset()

    frequency = defaultdict(int)    # default value : 0

    for sentence in sentences_ll:
        for word in sentence:
            word = word.lower()

            if len(word) > 3 and word not in stopwords:
                frequency[word] += 1

    return frequency

def get_features(text, n, user_stopwords=None):  # n is the desired no. of features
    """Return the ``n`` most frequent words in ``text``.

    The text is split into sentences and then into words with NLTK, and the
    words are counted by ``calculate_frequencies``.

    Args:
        text: Document contents, either UTF-8 encoded bytes (for example the
            result of reading a file opened in binary mode) or an
            already-decoded ``str``.
        n: Maximum number of words to return.
        user_stopwords: Optional stopwords, forwarded to
            ``calculate_frequencies``.

    Returns:
        A list of at most ``n`` words, ordered from most to least frequent.
    """
    # Only bytes-like input needs decoding; calling .decode() on a str would
    # raise AttributeError.
    if isinstance(text, (bytes, bytearray)):
        text = text.decode('utf8')

    # One list of word tokens per sentence.
    sentences_ll = [word_tokenize(sentence) for sentence in sent_tokenize(text)]

    frequency = calculate_frequencies(sentences_ll, user_stopwords)
    return nlargest(n, frequency, key=frequency.get)

In [5]:
# Turn every row of df into a plain tuple.
tuples = list(map(tuple, df.values))
print(tuples[:4])

[('0031300100.xml', 'Manifest_GenRef'), ('0031300200.xml', 'Manifest_GenRef'), ('0031300300.xml', 'Manifest_GenRef'), ('0031300400.xml', 'Manifest_GenRef')]


In [7]:
# Read files from Dataset and keep the 1000 most frequent words of each one
path = './FINAL_Dataset/'

map_filename_to_1000words = {}

counter = 0
for fname, folder in tuples[:100]:
    # Each document is read from <path><folder>/<fname>.txt as raw bytes.
    with open(path + folder + '/' + fname + '.txt', 'rb') as f:
        raw_bytes = f.read()
    map_filename_to_1000words[fname] = get_features(raw_bytes, 1000)

    # Print a progress marker for every tenth file.
    if not counter % 10:
        print(counter)
    counter += 1

# Should stop at 95000
# Started running at 3PM Tuesday

0
10
20
30
40
50
60
70
80
90


In [13]:
import pickle

# Persist the filename -> feature-word-list mapping to disk.
# NOTE(review): the file name says 97k, but the loop that fills the mapping
# only iterates over tuples[:100] — confirm which run this file should hold.
with open('./features_97k.pickle', 'wb') as f:
    pickle.dump(map_filename_to_1000words, f)

In [14]:
# Load the pickled mapping back from disk and list its keys.
# NOTE(review): this rebinds `df`, which earlier in the notebook held the
# Filename/TableName DataFrame; earlier cells that use `df` will see a dict
# if they are re-run after this cell.
# pickle.load can execute arbitrary code, so only load files you created.
with open('./features_97k.pickle', 'rb') as f:
    df = pickle.load(f)
    
print(df.keys())

dict_keys(['0031300100.xml', '0031300200.xml', '0031300300.xml', '0031300400.xml', '0031300500.xml', '0031300600.xml', '0031300700.xml', '0031300800.xml', '0031300900.xml', '0031301000.xml', '0031301100.xml', '0031301200.xml', '0031301300.xml', '0031301400.xml', '0031301500.xml', '0031301700.xml', '0031301800.xml', '0031301900.xml', '0031302000.xml', '0031302100.xml', '0031302200.xml', '0031302300.xml', '0031302400.xml', '0031302500.xml', '0031400101.xml', '0031400102.xml', '0031400103.xml', '0031400104.xml', '0031400105.xml', '0031400106.xml', '0031400107.xml', '0031400108.xml', '0031400200.xml', '0031400300.xml', '0031400400.xml', '0031400500.xml', '0031400600.xml', '0031500100.xml', '0031500200.xml', '0031500300.xml', '0031500400.xml', '0031500500.xml', '0031500600.xml', '0031500700.xml', '0031500800.xml', '0031500900.xml', '0031501000.xml', '0031501100.xml', '0031501200.xml', '0031501300.xml', '0031501400.xml', '0031501500.xml', '0031501600.xml', '0031501700.xml', '0031501800.xml',