In [74]:
import pickle
import re
import sklearn
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np

In [36]:
def csr_vappend(a,b):
    """ Takes in 2 csr_matrices and appends the second one to the bottom of the first one. 
    Much faster than scipy.sparse.vstack but assumes the type to be csr and overwrites
    the first matrix instead of copying it. The data, indices, and indptr still get copied."""

    a.data = np.hstack((a.data,b.data))
    a.indices = np.hstack((a.indices,b.indices))
    a.indptr = np.hstack((a.indptr,(b.indptr + a.nnz)[1:]))
    a._shape = (a.shape[0]+b.shape[0],b.shape[1])
    return a


## User Example

In [107]:
vocabulary = set(['hewitt','southwest corner','lawyer','said','said meyer','pu','nmb'])
binary=CountVectorizer(binary=True,vocabulary = vocabulary, stop_words = 'english',ngram_range=(1,4))
corpus = ['hewitt southwest corner local lawyer',
         'said meyer pulley heyhey post',
         'nmb, london apt']
binary.transform(corpus).todense()

matrix([[1, 1, 0, 0, 0, 0, 1],
        [0, 0, 0, 0, 1, 1, 0],
        [0, 0, 1, 0, 0, 0, 0]])

In [108]:
corpus2 = ['hewit southwest local lawyer',
         'meyer pulley heyhey post',
         'nmb, london apt']
binary.transform(corpus2).todense()

matrix([[0, 1, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0],
        [0, 0, 1, 0, 0, 0, 0]])

In [31]:
import scipy
import numpy as np
sparse = np.hstack((sparse,sparse2))

In [112]:
def csr_vappend(a,b):
    """ Takes in 2 csr_matrices and appends the second one to the bottom of the first one. 
    Much faster than scipy.sparse.vstack but assumes the type to be csr and overwrites
    the first matrix instead of copying it. The data, indices, and indptr still get copied."""

    a.data = np.hstack((a.data,b.data))
    a.indices = np.hstack((a.indices,b.indices))
    a.indptr = np.hstack((a.indptr,(b.indptr + a.nnz)[1:]))
    a._shape = (a.shape[0]+b.shape[0],b.shape[1])
    return a
csr_vappend(binary.transform(corpus),binary.transform(corpus2)).todense()

matrix([[1, 1, 0, 0, 0, 0, 1],
        [0, 0, 0, 0, 1, 1, 0],
        [0, 0, 1, 0, 0, 0, 0],
        [0, 1, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0],
        [0, 0, 1, 0, 0, 0, 0]])

## Load Vocabulary Dictionary

In [122]:
vocab = pickle.load(open('finalfeature.p','rb'))
vocab = set(vocab)
for w in vocab:
    vocab.remove(w)
    vocab.add(re.sub(r'_',' ',w))
n = len(vocab)

## Build Count Vectorizer

In [102]:
count_vectorizer=CountVectorizer(binary=False,vocabulary = vocab,ngram_range=(1,4))

In [103]:
len(binary_vectorizer.vocabulary)

25215

## Read in Corpus

In [92]:
import os
from glob import glob
from zipfile import ZipFile
from collections import Counter
import pickle
import re
import shutil

In [95]:
zfile = ZipFile('/Users/muriel820/Downloads/ml-project/cleaned_1880/1880.zip')
members = zfile.namelist()
for fname in members:
    if fname.endswith('-maj.p'):
        docid = fname.split('/')[-1][:-2]                     
        text = pickle.load(zfile.open(fname,'r'))

In [96]:
docid

'XOKHH7QNB5G0-maj'

In [97]:
text

[" HUGHES, J. This is a suit in chancery, brought by the executors, under letters taken out in Ohio, of Calvin Giddings, deceased, who was a citizen of that state, and whose will was proved there; the executors, of course, being also citizens of Ohio. The object of the suit is to subject a certain piece of land near the town of Hampton, in this state, to the lien for part of the purchase money of the land evidenced by a negotiable note which had been indorsed to the testator in his life-time by the vendor of the land, and which matured some eight months after the death of the testator, and after the qualification of the complainants as his executors in Ohio. The note was found by the executors among the testator's effects in Ohio. The vendee of the land, who is the principal defendant in the bill, is not a resident of this state, but is a resident of New Jersey; nor has process been served upon him, but he has appeared by counsel, and pleads that the complainants ought not to be heard 

In [104]:
sparse = count_vectorizer.transform(text)

In [105]:
sparse.todense()

matrix([[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]])

In [126]:
doc_count = scipy.sparse.csr_matrix((1, len(vocab)))
for i in range(sparse.shape[0]):
    doc_count = doc_count+sparse[0]
all_ = []
all_.append((docid.split('-')[0],doc_count))
all_ = dict(all_)
all_

{'XOKHH7QNB5G0': <1x25215 sparse matrix of type '<class 'numpy.float64'>'
 	with 36 stored elements in Compressed Sparse Row format>}

In [127]:
os.chdir('/Users/muriel820/Downloads/ml-project/cleaned_1880')
zipfiles = glob('*zip')

for zfname in zipfiles:        
    print(zfname)
    zfile = ZipFile(zfname)    
    year = zfname.split('/')[-1][:-4]
 
    members = zfile.namelist()        
    threshold = len(members) / 200    
    docfreqs = Counter()

    zip_name = '/Users/muriel820/Downloads/ml-project/cleaned_1880/frequency_count_by_paragraph/' + zfname[0:4]+'/'
    zip_name_2 = '/Users/muriel820/Downloads/ml-project/cleaned_1880/frequency_count_by_doc/' + zfname[0:4]+'/'
    #maj ='maj'
    os.makedirs(zip_name)
    #os.chdir(zip_name)
    os.makedirs(zip_name+'maj/')
    os.makedirs(zip_name+'con/')
    os.makedirs(zip_name+'dis/')
    os.makedirs(zip_name+'condis/')
    os.makedirs(zip_name_2)
    os.makedirs(zip_name_2+'maj/')
    os.makedirs(zip_name_2+'con/')
    os.makedirs(zip_name_2+'dis/')
    os.makedirs(zip_name_2+'condis/')
    all_ = []
    
    for fname in members:
        if fname.endswith('-maj.p'):
            docid = fname.split('/')[-1][:-2]                     
            text = pickle.load(zfile.open(fname,'r'))
            para_count = count_vectorizer.transform(text)
            name = docid + '.p'        
            pickle.dump(para_count, open(zip_name+'maj/'+name, "wb"))
            doc_count = scipy.sparse.csr_matrix((1, n))
            for i in range(para_count.shape[0]):
                doc_count = doc_count + para_count[i]
            pickle.dump(doc_count, open(zip_name_2+'maj/'+name, "wb"))
            all_.append((docid.split('-')[0],doc_count))
        
        elif fname.endswith('.p')==False:
            continue 
        elif fname.endswith('dis/.p')==True:
            continue
        else:
            optype = fname.split('-')[-1][:-2]
            docid = fname.split('/')[-1][:-2]
            text = pickle.load(zfile.open(fname,'r'))

            if len(optype) ==7:        
                para_count = count_vectorizer.transform(text)
                name = docid + '.p'        
                pickle.dump(para_count, open(zip_name+'condis/'+name, "wb"))
                doc_count = scipy.sparse.csr_matrix((1, n))
                for i in range(para_count.shape[0]):
                    doc_count = doc_count + para_count[i]
                pickle.dump(doc_count, open(zip_name_2+'condis/'+name, "wb"))
                all_.append((docid.split('-')[0],doc_count))

            elif len(optype) == 4 and optype[0] =='c':
                para_count = count_vectorizer.transform(text)
                name = docid + '.p'        
                pickle.dump(para_count, open(zip_name+'con/'+name, "wb"))
                doc_count = scipy.sparse.csr_matrix((1, n))
                for i in range(para_count.shape[0]):
                    doc_count = doc_count + para_count[i]
                pickle.dump(doc_count, open(zip_name_2+'con/'+name, "wb"))
                all_.append((docid.split('-')[0],doc_count))

            else:
                para_count = count_vectorizer.transform(text)
                name = docid + '.p'        
                pickle.dump(para_count, open(zip_name+'dis/'+name, "wb"))
                doc_count = scipy.sparse.csr_matrix((1, n))
                for i in range(para_count.shape[0]):
                    doc_count = doc_count + para_count[i]
                pickle.dump(doc_count, open(zip_name_2+'dis/'+name, "wb"))
                all_.append((docid.split('-')[0],doc_count))
    all_ = dict(all_)
    pickle.dump(all_,open('all.p', "wb"))
    shutil.make_archive(zip_name, 'zip', zip_name)
    shutil.rmtree(zip_name,ignore_errors=True, onerror=None)
    shutil.make_archive(zip_name_2, 'zip', zip_name_2)
    shutil.rmtree(zip_name_2,ignore_errors=True, onerror=None)

1880.zip
1881.zip


In [133]:
pickle.load(open('/Users/muriel820/Downloads/ml-project/cleaned_1880/all.p','rb'))

{'X17IFNQNB5G0': <1x25215 sparse matrix of type '<class 'numpy.float64'>'
 	with 328 stored elements in Compressed Sparse Row format>,
 'X18KAJQNB5G0': <1x25215 sparse matrix of type '<class 'numpy.float64'>'
 	with 79 stored elements in Compressed Sparse Row format>,
 'X18KANQNB5G0': <1x25215 sparse matrix of type '<class 'numpy.float64'>'
 	with 26 stored elements in Compressed Sparse Row format>,
 'X18KARQNB5G0': <1x25215 sparse matrix of type '<class 'numpy.float64'>'
 	with 158 stored elements in Compressed Sparse Row format>,
 'X18KAVQNB5G0': <1x25215 sparse matrix of type '<class 'numpy.float64'>'
 	with 93 stored elements in Compressed Sparse Row format>,
 'X191HJQNB5G0': <1x25215 sparse matrix of type '<class 'numpy.float64'>'
 	with 27 stored elements in Compressed Sparse Row format>,
 'X191MFQNB5G0': <1x25215 sparse matrix of type '<class 'numpy.float64'>'
 	with 148 stored elements in Compressed Sparse Row format>,
 'X191MNQNB5G0': <1x25215 sparse matrix of type '<class 'nu

## Altogether

In [135]:
import pickle
import re
import sklearn
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
import os
from glob import glob
from zipfile import ZipFile
from collections import Counter
import shutil

vocab = pickle.load(open('/Users/muriel820/Downloads/ml-project/finalfeature.p','rb'))
vocab = set(vocab)
for w in vocab:
    vocab.remove(w)
    vocab.add(re.sub(r'_',' ',w))
n = len(vocab)
count_vectorizer=CountVectorizer(binary=False,vocabulary = vocab,ngram_range=(1,4))

os.chdir('/Users/muriel820/Downloads/ml-project/cleaned_1880')
zipfiles = glob('*zip')

for zfname in zipfiles:        
    print(zfname)
    zfile = ZipFile(zfname)    
    year = zfname.split('/')[-1][:-4]
 
    members = zfile.namelist()        
    threshold = len(members) / 200    
    docfreqs = Counter()

    zip_name = '/Users/muriel820/Downloads/ml-project/cleaned_1880/frequency_count_by_paragraph/' + zfname[0:4]+'/'
    zip_name_2 = '/Users/muriel820/Downloads/ml-project/cleaned_1880/frequency_count_by_doc/' + zfname[0:4]+'/'
    #maj ='maj'
    os.makedirs(zip_name)
    #os.chdir(zip_name)
    os.makedirs(zip_name+'maj/')
    os.makedirs(zip_name+'con/')
    os.makedirs(zip_name+'dis/')
    os.makedirs(zip_name+'condis/')
    os.makedirs(zip_name_2)
    os.makedirs(zip_name_2+'maj/')
    os.makedirs(zip_name_2+'con/')
    os.makedirs(zip_name_2+'dis/')
    os.makedirs(zip_name_2+'condis/')
    all_ = []
    
    for fname in members:
        if fname.endswith('-maj.p'):
            docid = fname.split('/')[-1][:-2]                     
            text = pickle.load(zfile.open(fname,'r'))
            para_count = count_vectorizer.transform(text)
            name = docid + '.p'        
            pickle.dump(para_count, open(zip_name+'maj/'+name, "wb"))
            doc_count = scipy.sparse.csr_matrix((1, n))
            for i in range(para_count.shape[0]):
                doc_count = doc_count + para_count[i]
            pickle.dump(doc_count, open(zip_name_2+'maj/'+name, "wb"))
            all_.append((docid.split('-')[0],doc_count))
        
        elif fname.endswith('.p')==False:
            continue 
        elif fname.endswith('dis/.p')==True:
            continue
        else:
            optype = fname.split('-')[-1][:-2]
            docid = fname.split('/')[-1][:-2]
            text = pickle.load(zfile.open(fname,'r'))

            if len(optype) ==7:        
                para_count = count_vectorizer.transform(text)
                name = docid + '.p'        
                pickle.dump(para_count, open(zip_name+'condis/'+name, "wb"))
                doc_count = scipy.sparse.csr_matrix((1, n))
                for i in range(para_count.shape[0]):
                    doc_count = doc_count + para_count[i]
                pickle.dump(doc_count, open(zip_name_2+'condis/'+name, "wb"))
                all_.append((docid.split('-')[0],doc_count))

            elif len(optype) == 4 and optype[0] =='c':
                para_count = count_vectorizer.transform(text)
                name = docid + '.p'        
                pickle.dump(para_count, open(zip_name+'con/'+name, "wb"))
                doc_count = scipy.sparse.csr_matrix((1, n))
                for i in range(para_count.shape[0]):
                    doc_count = doc_count + para_count[i]
                pickle.dump(doc_count, open(zip_name_2+'con/'+name, "wb"))
                all_.append((docid.split('-')[0],doc_count))

            else:
                para_count = count_vectorizer.transform(text)
                name = docid + '.p'        
                pickle.dump(para_count, open(zip_name+'dis/'+name, "wb"))
                doc_count = scipy.sparse.csr_matrix((1, n))
                for i in range(para_count.shape[0]):
                    doc_count = doc_count + para_count[i]
                pickle.dump(doc_count, open(zip_name_2+'dis/'+name, "wb"))
                all_.append((docid.split('-')[0],doc_count))
    all_ = dict(all_)
    pickle.dump(all_,open('all.p', "wb"))
    shutil.make_archive(zip_name, 'zip', zip_name)
    shutil.rmtree(zip_name,ignore_errors=True, onerror=None)
    shutil.make_archive(zip_name_2, 'zip', zip_name_2)
    shutil.rmtree(zip_name_2,ignore_errors=True, onerror=None)

1880.zip
1881.zip
