In [17]:
import os, sys, re, collections, string
from operator import itemgetter as at
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from tqdm import tqdm
from ipywidgets import interact
sys.path.append("../python")
import data
%matplotlib inline

In [2]:
from sklearn.feature_extraction import text
X,y = data.preprocessed()
sample_doc = X[0]

## Building a vocabulary by distribution

In [26]:
word_count = collections.Counter()
for x in tqdm(X):
    word_count += collections.Counter(x.split())
total_word_count = sum(word_count.values())
word_count_hist = collections.Counter(word_count.values())
print ("Total word count: "+str(total_word_count))

100%|██████████| 4427/4427 [00:09<00:00, 449.05it/s]

Total word count: 14886405





In [41]:
@interact(lb = (1,10000), ub =(1000,1e7))
def vocab_coverage(lb=10,ub=10000):
    words_covered = sum([wc*n for wc, n in word_count_hist.items() if lb<wc<ub])
    corpus_percentage = words_covered/total_word_count
    vector_size = len([1 for wc, n in word_count_hist.items() if lb<wc<ub])
    return "Corpus Coverage: {c:.2f}%\n Vector Size: {v}".format(c=corpus_percentage*100,v=vector_size)

Term frequency Vector
---

In [3]:
#http://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html
cv=text.CountVectorizer(min_df=1e-06, max_df=0.05)
cv.fit(X[:100])

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0.05, max_features=None, min_df=1e-06,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [4]:
sample_vec = cv.transform([sample_doc])

Vector to bag of words
---

In [5]:
{k:v for k,v in zip(cv.get_feature_names(), sample_vec.toarray()[0]) if v>0}

{'adjourn': 2,
 'adjourned': 5,
 'adjournment': 5,
 'adjournments': 2,
 'adopting': 1,
 'affix': 1,
 'affixed': 1,
 'announcement': 1,
 'appointing': 1,
 'arranged': 1,
 'arrival': 1,
 'assent': 1,
 'attempts': 1,
 'attorneyinfact': 2,
 'ballots': 2,
 'bankshares': 2,
 'bearing': 1,
 'begins': 1,
 'calling': 2,
 'candidates': 2,
 'capacities': 1,
 'casting': 1,
 'checks': 1,
 'choose': 1,
 'circular': 1,
 'closed': 2,
 'complying': 1,
 'conference': 3,
 'considering': 1,
 'conspicuously': 1,
 'correctly': 1,
 'countersigned': 1,
 'coupled': 1,
 'dating': 1,
 'deeds': 2,
 'defective': 1,
 'delegation': 2,
 'destruction': 1,
 'disqualification': 1,
 'drafts': 1,
 'duplicate': 3,
 'establishing': 2,
 'ex': 2,
 'facsimiles': 2,
 'far': 1,
 'fee': 1,
 'fewer': 1,
 'fit': 1,
 'flatface': 1,
 'format': 1,
 'herself': 1,
 'holds': 1,
 'implementation': 1,
 'inspectors': 4,
 'institutions': 1,
 'instruct': 1,
 'lack': 1,
 'leases': 1,
 'maintaining': 1,
 'manually': 1,
 'meting': 1,
 'mortgages

# Stemming

In [6]:
import nltk
from nltk.stem.porter import PorterStemmer

In [42]:
stemmer = PorterStemmer()
stemmer.stem("factorization")

'factor'

In [8]:
analyzer = text.CountVectorizer().build_analyzer()
def my_analyzer(txt):
    return [stemmer.stem(w) for w in analyzer(txt)]
cv=text.CountVectorizer(min_df=1e-06, max_df=0.05, analyzer=my_analyzer)
cv.fit(X[:100])
sample_vec = cv.transform([sample_doc])
{k:v for k,v in zip(cv.get_feature_names(), sample_vec.toarray()[0]) if v>0}

{'adjourn': 14,
 'arriv': 1,
 'assent': 1,
 'attorneyinfact': 2,
 'bankshar': 2,
 'candid': 2,
 'circular': 1,
 'conspicu': 1,
 'correctli': 1,
 'countersign': 1,
 'coupl': 1,
 'deed': 2,
 'defect': 1,
 'destruct': 1,
 'disqualif': 1,
 'duplic': 3,
 'ex': 2,
 'far': 1,
 'fewer': 1,
 'fit': 1,
 'flatfac': 1,
 'herself': 1,
 'inspector': 4,
 'lack': 1,
 'mete': 1,
 'mortgag': 1,
 'mutil': 1,
 'occup': 1,
 'officio': 2,
 'old': 1,
 'peopl': 2,
 'poll': 1,
 'primarili': 1,
 'see': 3,
 'simultan': 1,
 'stagger': 1,
 'text': 1,
 'thu': 2,
 'unfinish': 1,
 'vacant': 1,
 'virginia': 3,
 'voter': 1}

In [9]:
#http://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html
tfidf=text.TfidfVectorizer(min_df=1e-06, max_df=0.05)
tfidf.fit(X[:100])
sample_vec = tfidf.transform([sample_doc])
{k:v for k,v in zip(cv.get_feature_names(), sample_vec.toarray()[0]) if v>0}

{'alexand': 0.09866589599169956,
 'alfr': 0.2584272439225444,
 'algeria': 0.27282338526978295,
 'algonquin': 0.10912935410791318,
 'am': 0.05456467705395659,
 'anglais': 0.05456467705395659,
 'angola': 0.05456467705395659,
 'beacon': 0.04933294799584978,
 'bimonthli': 0.051685448784508876,
 'bought': 0.051685448784508876,
 'boyertown': 0.06350837695208168,
 'brokerdeal': 0.06350837695208168,
 'cambridgepark': 0.05456467705395659,
 'cann': 0.10912935410791318,
 'cfo': 0.12701675390416337,
 'channel': 0.12701675390416337,
 'cin': 0.04933294799584978,
 'circl': 0.05456467705395659,
 'deceas': 0.10337089756901775,
 'delet': 0.10337089756901775,
 'demot': 0.05456467705395659,
 'diagnost': 0.06350837695208168,
 'diversifi': 0.051685448784508876,
 'domain': 0.04933294799584978,
 'driver': 0.06350837695208168,
 'eclin': 0.10912935410791318,
 'england': 0.04933294799584978,
 'esq': 0.14799884398754934,
 'exculp': 0.04933294799584978,
 'exhaust': 0.058276647893974874,
 'florida': 0.0635083769520

Pretrained word vectors
---

In [13]:
import spacy
nlp = spacy.load('en')

In [14]:
doc = nlp(data.readFile(data.listFiles()[0]))
doc

  
  


Exhibit 3.1

  


  


  


  


  




OF

NEW PEOPLES BANKSHARES, INC.

  


(restated in electronic format as of March 17, 2004)

  


  


  
  
  


  


  


* * *

  
  


TABLE OF CONTENTS

  


  


ARTICLE 1 – SHARES 

4

  


Section 1.

Certificates 

4

Section 2.

Signatures 

4

Section 3.

Duplicate Certificates 

4

Section 4.

Transfer of Shares 

4

Section 5.

Restrictions on Transfer 

4

  


ARTICLE II – SHAREHOLDERS 

4

  


Section 1.

Holders of Shares 

4

Section 2.

Meetings Generally 

4

Section 3.

Annual Meetings 

5

Section 4.

Special Meetings 

5

Section 5.

Notice 

5

Section 6.

Determination of Shareholders of Record 

5

Section 7.

Conduct of Meetings 

5

Section 8.

Proxies 

6

Section 9.

Procedure at Meetings 

6

Section 10.

Shareholder Proposals 

6

Section 11.

Quorum and Voting 

7

Section 12.

Inspectors 

7

Section 13.

Adjournments 

7

  


ARTICLE III – DIRECTORS 

7

  


Section 1.

General Powers 

7

Section 2.


In [15]:
v = None
for w in doc:
    if not any(w.vector):
        continue
    if v is not None:
        v+=w.vector
        n+=1
    else:
        v=w.vector
        n=1
v/=n
v

  


array([-inf,  inf, -inf, -inf,  inf,  inf,  inf, -inf,  inf,  inf, -inf,
        inf,  inf, -inf, -inf,  inf, -inf,  inf, -inf, -inf, -inf,  inf,
       -inf,  inf,  inf,  inf, -inf, -inf,  inf,  inf,  inf,  inf, -inf,
        inf,  inf, -inf,  inf,  inf, -inf,  inf,  inf, -inf,  inf, -inf,
       -inf,  inf, -inf,  inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf,
        inf,  inf,  inf,  inf, -inf,  inf, -inf,  inf,  inf, -inf, -inf,
        inf,  inf, -inf,  inf,  inf, -inf,  inf,  inf,  inf,  inf,  inf,
        inf,  inf,  inf,  inf,  inf, -inf,  inf, -inf, -inf,  inf, -inf,
        inf,  inf, -inf, -inf,  inf, -inf,  inf, -inf,  inf,  inf, -inf,
       -inf,  inf, -inf, -inf, -inf,  inf,  inf,  inf,  inf,  inf,  inf,
        inf, -inf,  inf, -inf, -inf,  inf,  inf, -inf, -inf, -inf,  inf,
        inf, -inf,  inf, -inf,  inf,  inf, -inf,  inf, -inf,  inf, -inf,
       -inf, -inf, -inf,  inf,  inf,  inf,  inf,  inf, -inf,  inf,  inf,
       -inf, -inf,  inf, -inf, -inf,  inf, -inf, -i

In [16]:
any([0,0,0,0])

False

## Phrase detection

In [10]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /Users/urigoren/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [11]:
from nltk import collocations
bigram_measures = collocations.BigramAssocMeasures()
trigram_measures = collocations.TrigramAssocMeasures()

tokenized_corpus = nltk.word_tokenize(' '.join(X[:1000]))

# change this to read in your data
finder = collocations.BigramCollocationFinder.from_words(tokenized_corpus)

finder.apply_freq_filter(4) 

# return the 10 n-grams with the highest PMI
finder.nbest(bigram_measures.student_t, 100)  

[('<', 'NUM'),
 ('NUM', '>'),
 ('of', 'the'),
 ('the', 'company'),
 ('this', 'agreement'),
 ('shall', 'be'),
 ('to', 'the'),
 ('section', '<'),
 ('by', 'the'),
 ('of', 'this'),
 ('the', 'board'),
 ('>', '<'),
 ('may', 'be'),
 ('in', 'the'),
 ('of', 'directors'),
 ('the', 'corporation'),
 ('with', 'the'),
 ('subject', 'to'),
 ('to', 'be'),
 ('the', 'plan'),
 ('the', 'executive'),
 ('of', 'any'),
 ('board', 'of'),
 ('pursuant', 'to'),
 ('the', 'date'),
 ('on', 'the'),
 ('shall', 'not'),
 ('set', 'forth'),
 ('with', 'respect'),
 ('restricted', 'stock'),
 ('date', 'of'),
 ('common', 'stock'),
 ('the', 'companys'),
 ('respect', 'to'),
 ('any', 'other'),
 ('accordance', 'with'),
 ('in', 'accordance'),
 ('or', 'other'),
 ('of', 'such'),
 ('shall', 'have'),
 ('the', 'terms'),
 ('entitled', 'to'),
 ('not', 'be'),
 ('agreement', 'shall'),
 ('under', 'this'),
 ('shares', 'of'),
 ('the', 'participant'),
 ('termination', 'of'),
 ('the', 'parties'),
 ('or', 'any'),
 ('number', 'of'),
 ('company', 'o

In [12]:
from gensim.models import Phrases
X_words = [x.split() for x in X[:1000]]
bigram = Phrases(X_words, min_count=10, threshold=2)
bigram[X_words[0]]



['exhibit_<NUM>',
 'of',
 'new',
 'peoples',
 'bankshares_inc',
 'restated',
 'in',
 'electronic',
 'format',
 'as',
 'of',
 'march_<NUM>',
 '<NUM>',
 'table',
 'of',
 'contents',
 'article',
 '<NUM>',
 'shares',
 '<NUM>',
 'section_<NUM>',
 'certificates',
 '<NUM>',
 'section_<NUM>',
 'signatures',
 '<NUM>',
 'section_<NUM>',
 'duplicate',
 'certificates',
 '<NUM>',
 'section_<NUM>',
 'transfer',
 'of',
 'shares',
 '<NUM>',
 'section_<NUM>',
 'restrictions_on',
 'transfer',
 '<NUM>',
 'article_ii',
 'shareholders',
 '<NUM>',
 'section_<NUM>',
 'holders',
 'of',
 'shares',
 '<NUM>',
 'section_<NUM>',
 'meetings',
 'generally',
 '<NUM>',
 'section_<NUM>',
 'annual_meetings',
 '<NUM>',
 'section_<NUM>',
 'special_meetings',
 '<NUM>',
 'section_<NUM>',
 'notice',
 '<NUM>',
 'section_<NUM>',
 'determination',
 'of',
 'shareholders',
 'of',
 'record',
 '<NUM>',
 'section_<NUM>',
 'conduct',
 'of',
 'meetings',
 '<NUM>',
 'section_<NUM>',
 'proxies',
 '<NUM>',
 'section_<NUM>',
 'procedure',