In [1]:
import os, sys, re, collections, string
from operator import itemgetter as at
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from tqdm import tqdm
sys.path.append("../python")
import data
%matplotlib inline

In [2]:
import spacy
# Notebook-wide spaCy pipeline; AvgWordVectors.transform calls it on every
# document to obtain per-token vectors.
# NOTE(review): 'en' is a model shortcut name, so it presumably only resolves
# where a model has been installed/linked under that name — confirm for the
# spaCy version in use.
nlp = spacy.load('en')

In [3]:
from sklearn.feature_extraction import text
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
# Load the inputs (X) and targets (y) from the project-local `data` module;
# presumably X holds stemmed document text and y the document-type labels
# (X is later fed to a text vectorizer, y to the classifiers).
# NOTE: the recorded progress bar for this cell shows 44277 items taking ~55 min,
# so avoid re-running it unnecessarily.
X,y = data.stemmed()

100%|██████████| 44277/44277 [55:17<00:00, 13.35it/s]


In [4]:
# Hold out 30% of the samples for evaluation; the fixed seed makes the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=20180301,
)

Vectorization
---

In [5]:
class AvgWordVectors:
    """Vectorizer that represents a document as the weighted mean of its token vectors.

    Exposes the same ``fit`` / ``transform`` surface as the scikit-learn
    vectorizers used in this notebook so it can be swapped in for them.

    Parameters
    ----------
    weights : dict, optional
        Maps a token's text to its weight in the average. Tokens that are not
        listed get weight 1.0.
    nlp : callable, optional
        Callable that turns a document string into an iterable of tokens
        exposing ``.text`` and ``.vector``. When omitted, the notebook-level
        ``nlp`` pipeline is used.
    """

    def __init__(self, weights=None, nlp=None):
        # Build a fresh dict per instance: a mutable ``{}`` default would be
        # shared (and mutated) across every instance of the class.
        self.weights = {} if weights is None else weights
        self.nlp = nlp

    def fit(self, docs, y=None):
        """Do nothing (there is nothing to learn) and return ``self`` for chaining."""
        return self

    def transform(self, docs):
        """Return a list holding one averaged vector per document in ``docs``.

        Tokens whose vector is all zeros are skipped. A document without any
        usable token (empty text, only zero vectors, or a total weight of
        zero) yields a zero vector shaped like the other results instead of
        raising; if no document at all produced a vector, a zero-length
        array is used because the dimensionality is unknown.
        """
        # Fall back to the notebook-level pipeline only when none was injected.
        parse = self.nlp if self.nlp is not None else nlp
        averaged = []
        template = None  # first contribution seen; fixes shape/dtype of zero fill
        for doc in docs:
            total = None
            norm = 0.0
            for token in parse(doc):
                vec = token.vector
                if not np.any(vec):
                    continue
                # Key by the token's text: token objects are created per parse
                # and can never match the keys of a pre-built weights dict.
                weight = self.weights.get(token.text, 1.0)
                contribution = weight * vec
                if template is None:
                    template = contribution
                # Out-of-place add so no array is ever mutated behind the caller's back.
                total = contribution if total is None else total + contribution
                norm += weight
            if total is None or not norm:
                averaged.append(None)  # placeholder, filled once the shape is known
            else:
                averaged.append(total / norm)
        filler = np.zeros(0) if template is None else np.zeros_like(template)
        return [filler.copy() if vec is None else vec for vec in averaged]

In [6]:
# Alternative vectorizers that were tried here:
#vectorizer = text.CountVectorizer()
#vectorizer = AvgWordVectors()
# Fit on the training split only, then apply the fitted vectorizer to both splits.
# NOTE: X_train / X_test are rebound from raw documents to feature matrices, so
# re-running this cell requires re-running the train/test split cell first.
vectorizer = text.TfidfVectorizer(max_features=1000, max_df=0.05).fit(X_train)
X_train, X_test = vectorizer.transform(X_train), vectorizer.transform(X_test)
X_test

<13284x1000 sparse matrix of type '<class 'numpy.float64'>'
	with 311099 stored elements in Compressed Sparse Row format>

In [7]:
# Preview the learned vocabulary. Only the first 100 terms are shown: dumping
# the whole list (the vectorizer keeps up to 1000 features) floods the notebook.
vectorizer.get_feature_names()[:100]

['aaa',
 'ab',
 'abandon',
 'abus',
 'accredit',
 'accrual',
 'accumul',
 'accuraci',
 'acquiror',
 'add',
 'addendum',
 'addresse',
 'adequaci',
 'adher',
 'adjudg',
 'admiss',
 'admit',
 'advisori',
 'aesc',
 'affidavit',
 'aforement',
 'aftertax',
 'agil',
 'agreeabl',
 'aid',
 'air',
 'alcohol',
 'allianc',
 'allot',
 'along',
 'alphabet',
 'alway',
 'am',
 'ambigu',
 'amort',
 'amzg',
 'analys',
 'analysi',
 'andrew',
 'angel',
 'annex',
 'answer',
 'anthoni',
 'apart',
 'appendix',
 'apprais',
 'appreci',
 'approxim',
 'arizona',
 'arrear',
 'arrow',
 'ascertain',
 'asid',
 'assent',
 'athena',
 'atlanta',
 'atwil',
 'auditor',
 'authent',
 'authorship',
 'autom',
 'ave',
 'awarde',
 'back',
 'background',
 'bad',
 'ballot',
 'bancorp',
 'bancshar',
 'banker',
 'bar',
 'basic',
 'bbt',
 'beach',
 'becam',
 'beij',
 'beneath',
 'bequest',
 'bermuda',
 'bid',
 'bill',
 'biotim',
 'birth',
 'biweekli',
 'blackboard',
 'blackout',
 'block',
 'blvd',
 'bona',
 'bookentri',
 'borrow',


Fitting Logistic Regression
---
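$$p(\mathrm{class}\mid\mathrm{doc})=\frac{1}{1+e^{-(w_1f_1+w_2f_2+\dots+w_nf_n)}}$$
where $f_i$ are the document's TF-IDF features (one per vocabulary term) and $w_i$ are the learned weights; the intercept term is omitted for brevity. A separate weight vector is learned for each class.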

In [8]:
# Train a default-configured logistic regression on the vectorized training
# split, predict both splits, and report per-class metrics on the held-out split.
model = LogisticRegression().fit(X_train, y_train)
yh_train, yh_test = map(model.predict, (X_train, X_test))
print(classification_report(y_test, yh_test))

             precision    recall  f1-score   support

        SPA       0.89      0.80      0.84       518
     bylaws       0.98      0.90      0.94      1419
     credit       0.97      0.86      0.91       406
 employment       0.94      0.95      0.95      4265
        rra       0.97      0.84      0.90       398
        rsu       0.93      0.97      0.95      6278

avg / total       0.94      0.94      0.94     13284



In [9]:
# For each class, rank the vocabulary terms by their learned coefficient
# (largest first) and print the ten strongest positive and negative indicators.
for cls, coef in zip(model.classes_, model.coef_):
    weights = list(zip(vectorizer.get_feature_names(), coef))
    weights.sort(key=lambda pair: pair[1], reverse=True)
    print("==============\nTop ten words for {c}\n-------------------".format(c=cls))
    # Head of the ranking = most positive weights; tail = most negative ones.
    print("(+) POSITIVE: " + ",".join(word for word, _ in weights[:10]))
    print("(-) NEGATIVE: " + ",".join(word for word, _ in weights[-10:]))

Top ten words for SPA
-------------------
(+) POSITIVE: seller,finder,accredit,moratorium,insolv,investco,accuraci,buyer,answer,indentur
(-) NEGATIVE: just,me,educ,splitup,grossup,borrow,nonforfeit,posteffect,underwritten,untru
Top ten words for bylaws
-------------------
(+) POSITIVE: unanim,chapter,preemptiv,perpetu,disinterest,inspector,conven,thereat,redempt,wind
(-) NEGATIVE: disagr,nonstatutori,growth,gift,distribute,unrestrict,splitup,seller,nonforfeit,untru
Top ten words for credit
-------------------
(+) POSITIVE: borrow,guarantor,promissori,homi,rmb,revolv,matur,prepay,collater,debtor
(-) NEGATIVE: appendix,incident,bancorp,multipl,nonforfeit,transferor,unanim,seller,vehicl,issuer
Top ten words for employment
-------------------
(+) POSITIVE: frequent,biweekli,nonrenew,semimonthli,inkind,car,voucher,vision,dissimilar,unreimburs
(-) NEGATIVE: notat,unrestrict,cashier,gift,deceas,untru,unissu,nonforfeit,splitup,bookentri
Top ten words for rra
-------------------
(+) POSITIVE: u

Testing various models
---

In [10]:
# Classifier comparison: leave exactly one `model = ...` line active and re-run
# the cell to evaluate that estimator on the same features.
# References:
#http://scikit-learn.org/stable/supervised_learning.html
#http://scikit-learn.org/stable/auto_examples/classification/plot_classifier_comparison.html
#model = MultinomialNB()
model = LogisticRegression()
#model = SGDClassifier(loss="log")
#model = DecisionTreeClassifier()
#model = RandomForestClassifier()
#model = LinearSVC()
model.fit(X_train, y_train)
yh_train, yh_test = model.predict(X_train), model.predict(X_test)
# Comparing train and test accuracy gives a quick overfitting check.
print("Train Accuracy: {train}\nTest Accuracy: {test}".format(
    train=accuracy_score(y_train, yh_train),
    test=accuracy_score(y_test, yh_test),
))

Train Accuracy: 0.9481495821637144
Test Accuracy: 0.941358024691358


In [11]:
# Per-class precision / recall / F1 on the held-out split for the model fitted above.
print(classification_report(y_true=y_test, y_pred=yh_test))

             precision    recall  f1-score   support

        SPA       0.89      0.80      0.84       518
     bylaws       0.98      0.90      0.94      1419
     credit       0.97      0.86      0.91       406
 employment       0.94      0.95      0.95      4265
        rra       0.97      0.84      0.90       398
        rsu       0.93      0.97      0.95      6278

avg / total       0.94      0.94      0.94     13284

