# Model interpretation

This code translates a model (that is, a set of results produced by Mallet) into a human-interpretable form so that we can label and categorize the topics.

The Mallet output that we will use comes in three files. There's a document-topic matrix (```doctopics```), a list of keywords (```keys```), and some ```diagnostics``` on the topics.

In [3]:
import pandas as pd
import numpy as np
from collections import Counter
import xml.etree.ElementTree as ET

Although we remove only a minimal list of stopwords before topic inference, many of the remaining words (mostly function words) are difficult for human readers to interpret. Those words are filtered out of each topic's keyword list before we present it to readers, so we need to load the list of function words.

In [2]:
# Load the function words, one per line, that will be hidden from the
# keyword lists shown to human readers.
with open('functionwords.txt', encoding = 'utf-8') as f:
    lines = f.readlines()

# Drop the newline (and any surrounding whitespace) from each entry.
functionwords = list(map(str.strip, lines))

We also need metadata for the corpus used in modeling. For all the models we'll be using this is "corpus4," which underwent some pruning of e.g. nonfiction and collected works.

In [12]:
corpus = pd.read_csv('../metadata/corpus4.tsv', sep = '\t', low_memory = False)

For any given model, we start by constructing a dictionary of diagnostic statistics associated with topics.

To interpret these, consult [the Mallet documentation.](http://mallet.cs.umass.edu/diagnostics.php)

In [40]:
def extract_diag_stats(xmlfile):
    """Collect per-topic diagnostic statistics from a diagnostics XML file.

    Every ``topic`` element directly under the XML root contributes one
    entry, keyed by its integer ``id`` attribute. Each entry is a dict with:

    * ``'tokens'``: the ``tokens`` attribute converted to float
    * ``'document_entropy'``, ``'coherence'``, ``'word-length'``,
      ``'rank_1_docs'``: the corresponding attributes, kept as strings
    * ``'tokenpct'``: this topic's share of all tokens, as a percentage
      rounded to three decimal places

    The total token count across topics is printed as a side effect.
    """
    copied_attributes = ('document_entropy', 'coherence', 'word-length', 'rank_1_docs')

    topics = dict()
    alltokencount = 0

    for element in ET.parse(xmlfile).getroot().findall('topic'):
        attributes = element.attrib
        tnum = int(attributes['id'])

        stats = {'tokens': float(attributes['tokens'])}
        alltokencount += stats['tokens']

        # These values are only ever displayed, so they stay as strings.
        for name in copied_attributes:
            stats[name] = attributes[name]

        topics[tnum] = stats

    # The percentage can only be computed once the grand total is known.
    for stats in topics.values():
        stats['tokenpct'] = round(100 * stats['tokens'] / alltokencount, 3)

    print(alltokencount)

    return topics
        

Next we translate the keywords into a more human-interpretable form by deleting function words that are too general.

In [41]:
def translate_keys(keyfile, topicdict):
    """Turn a topic-keys file into one display string per topic.

    Each non-blank line of ``keyfile`` is split on whitespace into a topic
    number, a second field (the alpha value, which is not reported), and the
    topic's keywords. Keywords found in the module-level ``functionwords``
    collection are dropped; the first 70 that remain are laid out ten to a
    line.

    Parameters
    ----------
    keyfile : str
        Path to the keys file, read as UTF-8.
    topicdict : dict
        Maps topic number to a dict with the keys ``'tokenpct'``,
        ``'document_entropy'``, ``'word-length'``, ``'coherence'`` and
        ``'rank_1_docs'``. All but ``'tokenpct'`` must be strings, because
        they are concatenated directly into the output.

    Returns
    -------
    list of str
        One string per topic, in file order: a double-quoted statistics
        cell, a tab, a double-quoted keyword cell (seven lines), and a
        trailing tab.

    Raises
    ------
    KeyError
        If a topic number in the file is missing from ``topicdict``.
    ValueError
        If the first field of a line is not an integer.
    """
    words_per_row = 10
    rows_shown = 7
    max_words = words_per_row * rows_shown

    # Build the lookup set once: testing membership in a list would rescan
    # the whole list for every keyword of every topic.
    skipwords = set(functionwords)

    outlines = []

    with open(keyfile, encoding = 'utf-8') as f:
        for line in f:
            tokens = line.split()
            if not tokens:
                # Tolerate blank lines, e.g. a stray newline at end of file.
                continue

            topicnum = int(tokens[0])

            # tokens[1] is skipped; keywords start at position 2.
            wordsneeded = []
            for w in tokens[2 : ]:
                if w in skipwords:
                    continue
                wordsneeded.append(w)
                # Only max_words keywords are ever displayed.
                if len(wordsneeded) >= max_words:
                    break

            thistopic = topicdict[topicnum]

            # First cell: quoted, multi-line block of topic statistics.
            pieces = [
                '"' + 'TOPIC ' + str(topicnum) + '\n',
                'pct corpus = ' + str(thistopic['tokenpct']) + '%\n',
                'doc entropy = ' + thistopic['document_entropy'] + '\n',
                'word length = ' + thistopic['word-length'] + '\n',
                'coherence = ' + thistopic['coherence'] + '\n',
                'rank 1 docs = ' + thistopic['rank_1_docs'] + '\n"\t"',
            ]

            # Second cell: always rows_shown lines, even if some are empty.
            for wordfloor in range(0, max_words, words_per_row):
                pieces.append(' '.join(wordsneeded[wordfloor : wordfloor + words_per_row]) + '\n')

            pieces.append('"\t')

            outlines.append(''.join(pieces))

    return outlines

Finally, we define three functions: one reads the doctopic matrix and averages it by document, one finds the n documents with the highest proportion of each topic, and one combines the two.

In [5]:
def get_doctopics(filename):
    """Read a doc-topics file and average the chunk vectors of each document.

    Each non-blank line is split on whitespace. The second field is a chunk
    id whose part before the first underscore is taken as the document id;
    every field after the second is parsed as a float topic proportion.

    Parameters
    ----------
    filename : str
        Path to the doc-topics file, read as UTF-8.

    Returns
    -------
    docs : dict
        Maps document id to a numpy array: the element-wise mean of that
        document's chunk vectors.
    docsizes : dict
        Maps document id to the number of chunks read for that document.
    """
    chunks = dict()
    with open(filename, encoding = 'utf-8') as f:
        for line in f:
            fields = line.split()
            if not fields:
                # Tolerate blank lines, e.g. a stray newline at end of file.
                continue

            docid = fields[1].split('_')[0]
            vector = np.array([float(x) for x in fields[2 : ]])
            chunks.setdefault(docid, []).append(vector)

    docs = dict()
    docsizes = dict()    # not actually used in current version I think
                        # early on I used number of chunks as a proxy
                        # but now using actual number of tokens, from metadata
    for docid, vectors in chunks.items():
        docs[docid] = np.mean(vectors, axis = 0)
        # Count this document's chunks. The previous code took the length of
        # the last vector read from the file, which is the number of topics
        # and was the same for every document.
        docsizes[docid] = len(vectors)

    return docs, docsizes

def docs2maxdocs(docs, howmany):
    """Rank documents within each topic and keep the top ``howmany``.

    ``docs`` maps a document id to a sequence of topic proportions, where
    position in the sequence is the topic index. The result maps each topic
    index to a list of ``(proportion rounded to 3 places, docid)`` tuples,
    ordered from highest proportion to lowest; ties are ordered by
    descending document id.
    """
    by_topic = dict()

    # Regroup the per-document vectors into per-topic lists.
    for docid, vector in docs.items():
        for topic, fraction in enumerate(vector):
            by_topic.setdefault(topic, []).append((fraction, docid))

    maxdocs = dict()

    for topic, pairs in by_topic.items():
        ranked = sorted(pairs, reverse = True)
        maxdocs[topic] = [(round(fraction, 3), docid) for fraction, docid in ranked[ : howmany]]

    return maxdocs

def get_maxdocs(filename, howmany):
    """Load a doc-topics file and rank its documents within each topic.

    Returns a ``(maxdocs, docsizes)`` tuple: the per-topic ranking produced
    by ``docs2maxdocs`` and the ``docsizes`` dict from ``get_doctopics``.
    """
    docs, docsizes = get_doctopics(filename)
    return docs2maxdocs(docs, howmany), docsizes
    

The function below combines all the functions above into a single recipe.

Note, in particular, that we instruct `get_maxdocs` to return the top 200 documents for each topic, and use all 200 to infer, for instance, date quartiles.

But we only individually list the top seven.

In [44]:
def interpret_model(modelnumber):
    """Write a human-readable interpretation table for one topic model.

    Reads ``final/k<modelnumber>doctopics.txt``, ``final/k<modelnumber>keys.txt``
    and ``final/k<modelnumber>diagnostics.xml``, then writes
    ``final/k<modelnumber>interpret.tsv`` with one row per topic holding:

    * the topic statistics and filtered keywords from ``translate_keys``
    * the seven documents in which the topic is most prominent
    * the author with the largest token-weighted contribution among the top
      200 documents, as a percentage of the topic's total tokens
    * the 25th and 75th percentiles of ``firstpub`` over those 200 documents

    Metadata comes from the module-level ``corpus`` DataFrame; its columns
    ``docid``, ``hathi_author``, ``hathi_title``, ``firstpub`` and ``tokens``
    are read.

    Parameters
    ----------
    modelnumber : int or str
        Used only to build the file names.

    Raises
    ------
    IndexError
        If a top document is missing from ``corpus``, or if a topic has no
        documents at all.
    """
    prefix = "final/k" + str(modelnumber)
    doctopicfile = prefix + 'doctopics.txt'
    keyfile = prefix + 'keys.txt'
    diagnosticfile = prefix + 'diagnostics.xml'

    topicdict = extract_diag_stats(diagnosticfile)

    outlines = translate_keys(keyfile, topicdict)

    # Get the top 200 books per topic and use all 200 to find the
    # "biggest author" and "date quartiles".
    maxdocs, _ = get_maxdocs(doctopicfile, 200)

    finalout = []

    # NOTE(review): the position of a line in the keys file is used as the
    # topic number for maxdocs and topicdict, which presumes the keys file
    # lists topics 0..k-1 in order; confirm.
    for idx, outline in enumerate(outlines):
        outline = outline + '"'
        authorproportions = Counter()
        firstpubs = []

        for rank, (fraction, doc) in enumerate(maxdocs[idx], start = 1):
            # Select the metadata rows for this document once, instead of
            # rescanning the whole corpus for every field.
            docrows = corpus.loc[corpus.docid == doc]

            author = docrows['hathi_author'].values[0]
            if pd.isnull(author):
                author = "Unknown Author"
            else:
                author = author[ : 30]

            title = docrows['hathi_title'].values[0]
            if pd.isnull(title):
                title = "Unknown Title"
            else:
                # Embedded quotes would break the quoted TSV cell.
                title = title.replace('"', '')[ : 33]

            firstpub = docrows['firstpub'].values[0]
            firstpubs.append(firstpub)

            if rank <= 7:   # but only list the top seven individually
                outline = outline + str(fraction) + ' | ' + author + ' | ' + title + ' | ' + str(firstpub) + '\n'

            # Weight the topic proportion by the book's length in tokens.
            authorproportions[author] += fraction * float(docrows['tokens'].values[0])

        outline = outline + '"\t'

        biggestauth, biggestsum = authorproportions.most_common(1)[0]

        bigfrac = round(100 * biggestsum / topicdict[idx]['tokens'], 2)
        outline = outline + biggestauth + " = " + str(bigfrac) + '%\t'

        percent25, percent75 = np.percentile(firstpubs, [25, 75])
        outline = outline + str(int(percent25)) + '-' + str(int(percent75)) + '\n'
        finalout.append(outline)

    outfile = keyfile.replace('keys.txt', 'interpret.tsv')
    # The codec name was previously spelled 'utf=8'; that appears to have
    # resolved only through Python's lenient codec-name normalization.
    with open(outfile, mode = 'w', encoding = 'utf-8') as f:
        f.write('topicstats\tkeywords\ttopbooks\tbiggestauth\tdatequartiles\n')
        f.writelines(finalout)

In [45]:
interpret_model(200)

1400787771.0


## Unrelated stuff below

After starting to examine some early topic models, I found a lot of nonfiction floated to the top, and used this notebook to identify and remove some of it.

In [93]:
def docstoremove(topiclist, modelnumber, alreadyremoved, alreadycleared):
    """Interactively review the top documents of suspicious topics.

    For each topic in ``topiclist``, the 25 documents with the highest
    proportion of that topic are shown one at a time (proportion, author,
    title), and the user is prompted with ``remove? ``. Answering ``y`` adds
    the document id to ``alreadyremoved``; any other answer adds it to
    ``alreadycleared``. Documents already in either set are skipped.

    Parameters
    ----------
    topiclist : iterable of int
        Topic numbers to review.
    modelnumber : int or str
        Used to build the doc-topics file name ``k<modelnumber>doctopics.txt``.
    alreadyremoved, alreadycleared : set
        Document ids decided so far. Both sets are modified in place.

    Returns
    -------
    tuple of (set, set)
        The same two set objects that were passed in.
    """
    doctopicfile = "k" + str(modelnumber) + 'doctopics.txt'

    # get_maxdocs returns (maxdocs, docsizes). Indexing the tuple itself by
    # topic number, as this function used to, does not reach the rankings.
    maxdocs, _ = get_maxdocs(doctopicfile, 25)

    for t in topiclist:
        print()
        print('TOPIC ', t)
        for fraction, doc in maxdocs[t]:
            if doc in alreadyremoved or doc in alreadycleared:
                continue

            # Look the document up once and read both fields from the result.
            docrows = corpus.loc[corpus.docid == doc]
            author = docrows['hathi_author'].values[0]
            title = docrows['hathi_title'].values[0]
            print(fraction, ' | ', author, ' | ', title)

            if input('remove? ') == 'y':
                alreadyremoved.add(doc)
            else:
                alreadycleared.add(doc)

    return alreadyremoved, alreadycleared
  

In [78]:
alreadyremoved = set()
alreadycleared = set()
suspicious = [5, 18]

In [94]:
alreadyremoved, alreadycleared = docstoremove(suspicious, 100, alreadyremoved, alreadycleared)


TOPIC  5
0.361  |  Farrington, Frank  |  Efficiency in the business


remove?  y


0.283  |  Watson, Nancy Dingman  |  What is one?


remove?  n


0.249  |  Butler, Ellis Parker  |  Robinson Crusoe and thrift stamps


remove?  n


0.248  |  Harvey, W. H  |  Coin on money, trusts, and imperialism


remove?  y


0.22  |  nan  |  Report from the Select Committee on Premium Bonds : together with the proceedings of the Committee, minutes of evidence and appendices


remove?  y


0.204  |  Landho, Red  |  Seventh city of Cibola


remove?  n


0.195  |  nan  |  Medicare : change in contigency reserve funding held down increase in Part B premium : briefing report to the Chairman, Special Committee on Aging, U.S. Senate


remove?  y


0.193  |  Van Dyke, Theodore S. (Theodore Strong)  |  Millionaires of a day: an inside history of the great southern California "boom."


remove?  n


0.183  |  Johnston, J. P. (James Perry)  |  Twenty years of hus'ling


remove?  n


0.169  |  Chester, George Randolph  |  Young Wallingford


remove?  n


0.164  |  Updegraff, Robert R. (Robert Rawls)  |  Captains in conflict; the story of the struggle of a business generation


remove?  n


0.164  |  Kyne, Peter B. (Peter Bernard)  |  Cappy Ricks comes back


remove?  n


0.163  |  Coolidge, Dane  |  The trail of gold


remove?  n


0.163  |  Shoop, Max  |  The fable of Economy Isle


remove?  n


0.16  |  Post, Louis Freeland  |  The open shop and the closed shop


remove?  y


0.159  |  Pritchard, Myron T. (Myron Thomas)  |  Stories of thrift for young Americans


remove?  n


0.157  |  Kyne, Peter B. (Peter Bernard)  |  Cappy Ricks; or, The subjugation of Matt Peasley


remove?  n


0.157  |  Lynch, Edward Melville  |  Killboylan bank, or, Every man his own banker : being the account of how Killboylan characters concerned themselves about cooperative credit


remove?  n


0.155  |  Clarke, Peter A  |  The English nobility under Edward the Confessor


remove?  y


0.155  |  Walbourn, Charles H  |  Confessions of a Pullman conductor


remove?  n


0.154  |  Stead, Christina  |  House of all nations


remove?  n


0.152  |  Fox-Davies, Arthur Charles  |  The finances of Sir John Kynnersley


remove?  n


0.151  |  Dodge, Henry Irving  |  The other Mr. Barclay


remove?  n


0.149  |  Train, Arthur Cheney  |  Paper profits : a novel of Wall Street


remove?  n


0.148  |  Kyne, Peter B. (Peter Bernard)  |  Cappy Ricks


remove?  n



TOPIC  18
0.82  |  Oxley, William  |  The idea and its imminence : a poet's philosophy


remove?  y


0.807  |  nan  |  Dialectical logic : essays on its history and theory


remove?  y


0.756  |  Stambaugh, Joan  |  Nietzsche's thought of eternal return


remove?  y


0.698  |  Wei, Wu Wei  |  Open secret


remove?  y


0.66  |  Ernst, Katharina  |  "Death" in the poetry of Emily Dickinson


remove?  y


0.632  |  Heidegger, Martin  |  Early Greek thinking


remove?  y


0.617  |  Royce, Josiah  |  The conception of immortality


remove?  y


0.613  |  Wheeler, Charles Kirkland  |  Critique of pure Kant or, A real realism vs. a fictitious idealism; in a word, the bubble and monstrosity of the Kantian metaphysic


remove?  y


0.574  |  Steiner, Rudolf  |  Ancient myths: their meaning and connection with evolution


remove?  y


0.554  |  Jevons, F. B (Frank Byron)  |  Philosophy, What is it?


remove?  y


0.542  |  Coke, Zachary  |  The art of logic, 1654


remove?  y


0.53  |  Jackson, Dawson  |  Against destruction


remove?  n


0.529  |  Krishnamurti, J. (Jiddu)  |  Mind without measure : talks in India, 1982-83 : authentic report


remove?  y


0.501  |  Conrad, Lawrence Henry  |  An address delivered before the annual meeting of Michigan authors association in Detroit on October 14, 1926


remove?  y


0.485  |  Krishnamurti, J. (Jiddu)  |  On nature and the environment


remove?  y


0.475  |  Neale, Robert E  |  The art of dying


remove?  n


0.474  |  Osho  |  The great challenge : exploring the world within


remove?  y


0.452  |  Packer, Toni  |  The work of this moment


remove?  n


0.45  |  Fiske, Amos Kidder  |  Beyond the bourn;


remove?  n


0.448  |  Kuo'an  |  The ox and his herdsman; a Chinese Zen text;


remove?  n


0.446  |  Wells, H. G. (Herbert George)  |  What are we to do with our lives?


remove?  y


0.439  |  Allapichai, A. M  |  The seven sages and the world order


remove?  n


0.433  |  Leggett, Mortimer Dormer  |  Dream of a modest prophet


remove?  n


0.43  |  Loori, John Daido  |  Mountain record of Zen talks


remove?  n


0.43  |  Hagen, Steve  |  Buddhism, plain and simple


remove?  y


In [95]:
alreadyremoved

{'inu.30000082137286',
 'inu.30000120170752',
 'mdp.39015005630002',
 'mdp.39015008901053',
 'mdp.39015009304042',
 'mdp.39015011804856',
 'mdp.39015019194730',
 'mdp.39015028446519',
 'mdp.39015032936745',
 'mdp.39015048719093',
 'mdp.39015048853934',
 'mdp.39015059449317',
 'mdp.39015081835764',
 'njp.32101078173828',
 'nnc2.ark+=13960=t6nz9cv81',
 'nyp.33433020487124',
 'pst.000043376155',
 'uc1.b3937989',
 'uc1.b3955246',
 'uc1.b4395331',
 'uc1.b5023357',
 'uc2.ark+=13960=t71v5pj5d',
 'wu.89097453831'}

In [96]:
suspicious = [20, 22, 95]
alreadyremoved, alreadycleared = docstoremove(suspicious, 100, alreadyremoved, alreadycleared)


TOPIC  20
0.446  |  McNerney, Kathryn  |  American oak furniture


remove?  y


0.36  |  nan  |  Official guides


remove?  y


0.31  |  Hansen, Henny Harald  |  Costumes and styles


remove?  y


0.288  |  Sammarco, Anthony Mitchell  |  The Great Boston Fire of 1872


remove?  y


0.287  |  nan  |  The seventeenth century / Valerie Cumming


remove?  y


0.259  |  nan  |  How to make your windows beautiful


remove?  y


0.225  |  Prior, Joanne  |  Soft furnishing: a practical introduction;


remove?  y


0.214  |  Robbe-Grillet, Alain  |  In the labyrinth, a novel


remove?  n


0.207  |  Penfield, Edward  |  Holland sketches


remove?  n


0.203  |  Robbe-Grillet, Alain  |  Snapshots


remove?  n


0.202  |  Coover, Robert  |  After Lazarus : a filmscript


remove?  n


0.196  |  Southern, Richard  |  Stage-setting for amateurs and professionals


remove?  y


0.19  |  Adcock, Arthur St. John  |  The booklover's London


remove?  y


0.184  |  Smith, Francis Hopkinson  |  A white umbrella in Mexico


remove?  n


0.178  |  nan  |  Wood-turning


remove?  y


0.171  |  Glantz, Evelyn  |  Scrap fun for everyone; 401 things anyone can make


remove?  y


0.163  |  Hansen, Edith  |  Counted thread embroidery


remove?  y


0.159  |  Sellers, Tom  |  nan


remove?  n


0.15  |  Straub, Peter  |  Mrs. God


remove?  n


0.148  |  Winter, Janet  |  Victorian costuming


remove?  y


0.145  |  Smith, Francis Hopkinson  |  Laguerre's and Well worn roads


remove?  n


0.144  |  Kharrāṭ, Idwār  |  City of saffron


remove?  n


0.143  |  Mottram, R. H. (Ralph Hale)  |  Our Mr. Dormer


remove?  n


0.143  |  Saint-Exupéry, Antoine de  |  The wild garden


remove?  n


0.142  |  Smith, Francis Hopkinson  |  The novels, stories and sketches of F. Hopkinson Smith


remove?  n



TOPIC  22
0.709  |  Batcher, Ralph R  |  The electronic control handbook


remove?  y


0.646  |  nan  |  CSE permissible digital methanometer


remove?  y


0.589  |  Hukkoo, R. K  |  Experimental re-evaluation of two-crystal scanning geometry for whole-body counting with log-shape placement of crystals


remove?  y


0.558  |  nan  |  Sectional catalogue of special machinery for manufacturing military rifle stocks and accessories


remove?  y


0.49  |  Weerts, Theodore Charles  |  The effects of eye position and expectation on sound localization


remove?  y


0.431  |  nan  |  The Planets


remove?  n


0.415  |  Clarke, Arthur C.  (Arthur Charles)  |  2001: a space odyssey


remove?  n


0.413  |  Herbert, Frank  |  Destination, void


remove?  n


0.386  |  Blankenship, John R  |  Articulator effects of Bennett movement and Bennett angle on cusp inclines : a dissertation [sic] submitted in partial fulfillment ... in denture prosthesis


remove?  y


0.371  |  Forward, Robert L  |  Dragon's egg


remove?  n


0.368  |  nan  |  Safety in woodworking


remove?  y


0.367  |  Crichton, Michael  |  The Andromeda strain


remove?  n


0.363  |  Caidin, Martin  |  Marooned, a novel


remove?  n


0.361  |  nan  |  US-50, southwest corner of Newton to I-35, Harvey County : environmental impact statement


remove?  y


0.347  |  Zelazny, Roger  |  Flare


remove?  n


0.335  |  McCollum, Michael  |  Life probe : Michael McCollum


remove?  n


0.334  |  Smith, E. E. (Edward Elmer)  |  Skylark three


remove?  n


0.331  |  Gamow, George  |  Mr Tompkins explores the atom


remove?  n


0.33  |  nan  |  SR-8 rest area construction, Elma : environmental impact statement


remove?  y


0.33  |  Niven, Larry  |  Ringworld : a novel


remove?  n


0.324  |  Leinster, Murray  |  The wailing asteroid


remove?  n


0.323  |  Chaudhuri, Tarini Charan  |  Sir William Ramsay as a scientist and man


remove?  y


0.318  |  Leinster, Murray  |  Men into space


remove?  n


0.315  |  Sutton, Jeff  |  Bombs in orbit


remove?  n


0.313  |  nan  |  The Science fictional solar system


remove?  n



TOPIC  95
0.924  |  nan  |  Pamphlets on forestry. Fish and game


remove?  y


0.904  |  Torre-Bueno, J. R. de la (José Rollin)  |  A synopsis of the Hemiptera-Heteroptera of America north of Mexico


remove?  y


0.686  |  nan  |  A catalogue of the Greek coins in the British Museum. $p: Italy


remove?  y


0.611  |  Edgar, C. C. (Campbell Cowan)  |  Sculptors' studies and unfinished works


remove?  y


0.561  |  Donaldson, Thomas E  |  Hindu temple art of Orissa


remove?  y


0.499  |  Sars, G. O (Georg Ossian)  |  An account of the Crustacea of Norway, with short descriptions and figures of all the species


remove?  y


0.256  |  Holder, Charles Frederick  |  The boy anglers; their adventures in the Gulf of Mexico, California, the Pacific and Atlantic Oceans, and the lakes and streams of Canada


remove?  y


0.253  |  Choudhury, R. D (Rabin Dev)  |  The sculptures of Assam


remove?  y


0.251  |  nan  |  Some specimens of the Roman, Oriental, and foreign types now in use in the offices of William Clowes & sons, limited


remove?  y


0.24  |  Grey, Zane  |  Tales of fishing virgin seas


remove?  n


0.232  |  nan  |  McKinney's consolidated laws of New York annotated


remove?  y


0.226  |  Beebe, William  |  Zaca venture


remove?  n


0.22  |  Witelo  |  Witelo on the principles of reflection : a critical edition and English translation with notes and commentary of Book V of Witelo's Perspectiva


remove?  y


0.214  |  Grey, Zane  |  Tales of Tahitian waters


remove?  n


0.213  |  D'Amelio, Joseph  |  Perspective drawing handbook


remove?  y


0.206  |  Haig-Brown, Roderick Langmere  |  Return to the river; a story of the Chinook run


remove?  n


0.206  |  Williamson, Henry  |  Salar the salmon


remove?  n


0.205  |  Holder, Charles Frederick  |  Along the Florida reef


remove?  n


0.204  |  Grey, Zane  |  Tales of swordfish and tuna


remove?  n


0.203  |  Grey, Zane  |  Tales of the angler's Eldorado, New Zealand


remove?  n


0.201  |  Hering, Ewald  |  Spatial sense and movements of the eye


remove?  y


0.187  |  Hopkins, Francis Powell  |  Fishing experiences of half a century, with instructions in the use of the fast reel


remove?  y


0.18  |  Hertz, Marguerite Rosenberg  |  Frequency tables for scoring Rorschach responses; code charts, normal and rare details, F+ and F- responses, popular responses, original responses


remove?  y


0.179  |  Beebe, William  |  The Arcturus adventure; an account of the New York zoological so


remove?  n


In [97]:
alreadyremoved

{'coo.31924003587999',
 'ien.35556030137137',
 'ien.35556030796155',
 'inu.30000082137286',
 'inu.30000120170752',
 'loc.ark+=13960=t7kp8vh4x',
 'mdp.35112203985603',
 'mdp.39015001116238',
 'mdp.39015005630002',
 'mdp.39015006310232',
 'mdp.39015007548012',
 'mdp.39015008901053',
 'mdp.39015009240063',
 'mdp.39015009304042',
 'mdp.39015009856363',
 'mdp.39015011804856',
 'mdp.39015014718244',
 'mdp.39015014846086',
 'mdp.39015019194730',
 'mdp.39015028446519',
 'mdp.39015031013892',
 'mdp.39015032936745',
 'mdp.39015033591671',
 'mdp.39015037792010',
 'mdp.39015048719093',
 'mdp.39015048853934',
 'mdp.39015059449317',
 'mdp.39015062453520',
 'mdp.39015066348353',
 'mdp.39015081835764',
 'mdp.39076005163303',
 'mdp.39076005640987',
 'mdp.49015002057033',
 'njp.32101078173828',
 'nnc2.ark+=13960=t6nz9cv81',
 'nyp.33433020487124',
 'nyp.33433066630348',
 'pst.000008276803',
 'pst.000010587188',
 'pst.000029752102',
 'pst.000043376155',
 'uc1.$b288145',
 'uc1.31822000601435',
 'uc1.321060

In [98]:
suspicious = [41]

In [99]:
alreadyremoved, alreadycleared = docstoremove(suspicious, 300, alreadyremoved, alreadycleared)


TOPIC  41
0.329  |  nan  |  POST course certification and presentation guidelines


remove?  y


0.212  |  nan  |  Course evaluations


remove?  y


0.21  |  nan  |  [SPEC kit on the systems function in ARL libraries


remove?  y


0.184  |  Upson, William Hazlett  |  Alexander Botts : great stories from the Saturday evening post


remove?  n


0.158  |  Johnson, Alvin Saunders  |  A report to Carnegie Corporation of New York on the policy of donations to free public libraries


remove?  y


0.15  |  Boulle, Pierre  |  William Conrad;


remove?  n


0.147  |  Pilhes, René Victor  |  The provocateur : a novel


remove?  n


0.136  |  Boulle, Pierre  |  The bridge over the River Kwai


remove?  n


0.134  |  Newman, Bernard  |  Second front, first spy


remove?  n


0.133  |  Boulle, Pierre  |  Not the glory;


remove?  n


0.133  |  Kalimugogo, Godfrey  |  The prodigal chairman


remove?  n


0.126  |  Fearing, Kenneth  |  The Crozart story


remove?  n


0.126  |  Ogunyẹmi, Michael Ade  |  The D.O


remove?  n


0.121  |  Kirst, Hans Hellmut  |  Damned to success


remove?  n


0.119  |  nan  |  SR-25 realignment, Litchfield : environmental impact statement


remove?  y


0.118  |  Kirst, Hans Hellmut  |  No fatherland


remove?  n


0.118  |  nan  |  Fuel shortages hearings, pursuant to S. Res. 45, a national fuels and energy policy study, Ninety-third Congress, first session


remove?  y


0.114  |  Fallick, J. L  |  The provision of tertiary courses in labour economics in Australia


remove?  y


0.111  |  Lathen, Emma  |  Murder against the grain


remove?  n


0.11  |  Brunner, John  |  The long result


remove?  n


0.108  |  Boulle, Pierre  |  A noble profession


remove?  n


0.105  |  Kimbrough, Emily  |  Through Charley's door


remove?  n


In [101]:
len(alreadyremoved)

68

In [102]:
corpus.shape

(29611, 23)

In [103]:
corpus = corpus.loc[~corpus.docid.isin(alreadyremoved), : ]
corpus.shape

(29543, 23)

In [104]:
pwd

'/Users/tunder/Dropbox/python/cohort/modelselection'

In [105]:
corpus.to_csv('../metadata/corpus3.tsv', sep = '\t', index = False)

In [106]:
oldcorpus = pd.read_csv('../metadata/modelcorpus.tsv', sep = '\t')

  interactivity=interactivity, compiler=compiler, result=result)


In [107]:
len(corpus.loc[corpus.firstpub == 1998, : ])

230

In [108]:
len(oldcorpus.loc[(oldcorpus.firstpub == 1999) & (oldcorpus.nonficprob < .95), :])

299

In [120]:
supplement = oldcorpus.loc[(oldcorpus.firstpub == 1999) & (oldcorpus.nonficprob < .95), :].sample(225)

In [121]:
supplement.head()

Unnamed: 0,docid,hathi_author,hathi_title,authordate,birthyear,deathyear,chi_date,ukw_date,copyright_date,firstpub,datewrong,nationality,nonficprob,distances,copyright_corpus,manual_corpus,us_national,pubdate_known,authof3ormore
38922,uc1.32106016419647,"Jooste, Pamela",Frieda and Min,,,,,,,1999,,,0.172311,,False,False,False,False,False
39064,mdp.39015048930781,"Nissen, Thisbe",Out of the girls' room and into the night,1972-,1972.0,,,,,1999,,,0.398568,,False,False,False,False,True
38960,mdp.39015045985358,"Ellis, Trey","Right here, right now",,,,,,,1999,,,0.409676,,False,False,False,False,False
39124,mdp.39015047609089,"Mayo, Wendell",B. Horror : and other stories,,,,,,,1999,,,0.363226,,False,False,False,False,False
38953,mdp.39015048735289,"Staffel, Megan",The notebook of lost things,1952-,1952.0,,,,,1999,,,0.243718,,False,False,False,False,False


In [110]:
set(corpus.columns) - set(supplement.columns)

{'pagesinchunk', 'skipped_pages', 'tokens', 'trimmed_pages'}

In [122]:
parsed = pd.read_csv('../getEF/parsing_metadata.tsv', sep = '\t')
parsed.head()

Unnamed: 0,id,tokens,skipped_pages,trimmed_pages,pagesinchunk
0,nyp.33433082289871_0,10150,0,36,75
1,nyp.33433082289871_1,9632,0,36,73
2,nyp.33433075765176_0,6921,0,61,62
3,nyp.33433075765176_1,6884,0,61,62
4,nyp.33433075765176_2,6927,0,61,62


In [123]:
def get_docid(astring):
    """Return the part of ``astring`` that precedes its first underscore.

    A string without an underscore is returned unchanged.
    """
    docid, _, _ = astring.partition('_')
    return docid

parsed = parsed.assign(docid = parsed['id'].apply(get_docid))

In [127]:
# Total the per-chunk counts for each document. numeric_only=True keeps the
# string column 'id' out of the sum explicitly; pandas 2.x no longer drops
# non-numeric columns from a groupby sum on its own.
docsums = parsed.groupby('docid').sum(numeric_only = True).reset_index()
docsums.head()

Unnamed: 0,docid,tokens,skipped_pages,trimmed_pages,pagesinchunk
0,aeu.ark+=13960=t22c08w6m,46847,0,248,256
1,aeu.ark+=13960=t2p575j24,50724,0,260,266
2,aeu.ark+=13960=t3126wn1n,17734,0,58,122
3,aeu.ark+=13960=t3gx5k62v,40509,0,264,269
4,aeu.ark+=13960=t4qj90j8c,32848,0,304,305


In [128]:
supplement = supplement.merge(docsums.loc[:, ['docid', 'pagesinchunk', 'tokens']], on= 'docid')

In [129]:
supplement.head()

Unnamed: 0,docid,hathi_author,hathi_title,authordate,birthyear,deathyear,chi_date,ukw_date,copyright_date,firstpub,...,nationality,nonficprob,distances,copyright_corpus,manual_corpus,us_national,pubdate_known,authof3ormore,pagesinchunk,tokens
0,uc1.32106016419647,"Jooste, Pamela",Frieda and Min,,,,,,,1999,...,,0.172311,,False,False,False,False,False,286,57387
1,mdp.39015048930781,"Nissen, Thisbe",Out of the girls' room and into the night,1972-,1972.0,,,,,1999,...,,0.398568,,False,False,False,False,True,217,52567
2,mdp.39015045985358,"Ellis, Trey","Right here, right now",,,,,,,1999,...,,0.409676,,False,False,False,False,False,238,42974
3,mdp.39015047609089,"Mayo, Wendell",B. Horror : and other stories,,,,,,,1999,...,,0.363226,,False,False,False,False,False,123,28058
4,mdp.39015048735289,"Staffel, Megan",The notebook of lost things,1952-,1952.0,,,,,1999,...,,0.243718,,False,False,False,False,False,198,36106


In [133]:
corpus = pd.read_csv('../metadata/corpus3.tsv', sep = '\t', low_memory = False)
corpus.drop(columns = ['skipped_pages', 'trimmed_pages'], inplace = True)

In [134]:
corpus.shape

(29482, 21)

In [135]:
corpus = pd.concat([corpus, supplement])
corpus.shape

(29707, 21)

In [138]:
corpus.to_csv('../metadata/corpus4.tsv',sep = '\t', index = False)

In [137]:
corpus = corpus.assign(tokensperpage = np.round(corpus.tokens / corpus.pagesinchunk, 3))

In [139]:
allowed = corpus.docid.tolist()
len(allowed)

29707

In [141]:
paths = pd.read_csv('../getEF/pathlistwithauthors.tsv', sep = '\t')
subpaths = paths.loc[paths.docid.isin(allowed), : ]
subpaths.shape

(29707, 3)

In [142]:
subpaths.head()

Unnamed: 0,docid,path,author
3,uc1.b4975632,uc1/pairtree_root/b4/97/56/32/b4975632/uc1.b49...,"Aakhus, Patricia"
4,inu.30000112046630,inu/pairtree_root/30/00/01/12/04/66/30/3000011...,"Abbe, George"
5,uc1.$b799882,uc1/pairtree_root/$b/79/98/82/$b799882/uc1.$b7...,"Abbe, George"
6,uc1.$b149331,uc1/pairtree_root/$b/14/93/31/$b149331/uc1.$b1...,"Abbe, George"
7,uc1.32106007981415,uc1/pairtree_root/32/10/60/07/98/14/15/3210600...,"Abbey, Edward"


In [145]:
subpaths = subpaths.merge(corpus.loc[: , ['docid', 'tokensperpage']], on = 'docid')
subpaths.head()

Unnamed: 0,docid,path,author,tokensperpage
0,uc1.b4975632,uc1/pairtree_root/b4/97/56/32/b4975632/uc1.b49...,"Aakhus, Patricia",189.321
1,inu.30000112046630,inu/pairtree_root/30/00/01/12/04/66/30/3000011...,"Abbe, George",191.688
2,uc1.$b799882,uc1/pairtree_root/$b/79/98/82/$b799882/uc1.$b7...,"Abbe, George",211.281
3,uc1.$b149331,uc1/pairtree_root/$b/14/93/31/$b149331/uc1.$b1...,"Abbe, George",215.292
4,uc1.32106007981415,uc1/pairtree_root/32/10/60/07/98/14/15/3210600...,"Abbey, Edward",238.725


In [146]:
sum(pd.isnull(subpaths.tokensperpage))

0

In [148]:
subpaths.to_csv('../getEF/cohort3_pathlist.tsv', sep = '\t', index = False)

In [149]:
np.median(subpaths.tokensperpage)

185.76

In [160]:
c4 = pd.read_csv('../metadata/corpus4.tsv', sep = '\t', low_memory = False)

In [161]:
len(alreadyremoved)

68

In [162]:
alreadyremoved.intersection(set(c4.docid.tolist()))

{'nyp.33433066630348'}