In [1]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from nltk.corpus import stopwords
import pickle

In [2]:
import warnings
warnings.filterwarnings('ignore')
pd.set_option('display.max_colwidth', 100)

In [3]:
### create classification algorithm for property-related charges

In [4]:
corig = pd.read_csv('charges_labelled.csv', dtype={'CompanyNumber': str}, header=0, index_col=0)

print(corig.shape)
corig.head(2)

(27494, 8)


Unnamed: 0,CompanyName,CompanyNumber,SICCode,persons_entitled,desc,district,km_pred,label
0,CHG-MERIDIAN UK LIMITED,1276016,"[{""code"":""64910"",""description"":""Financial leasing""}]","[{""name"":""Bal Global Finance (UK) Limited""}]",security charge dated 31072019 bal global finance uk limited lender chgmeridian computer uk limi...,RUNNYMEDE,0,1
1,CHG-MERIDIAN UK LIMITED,1276016,"[{""code"":""64910"",""description"":""Financial leasing""}]","[{""name"":""Bal Global Finance (UK) Limited""}]",security charge dated 31072019 bal global finance uk limited lender chgmeridian computer uk limi...,RUNNYMEDE,0,1


In [5]:
# remove some NaNs in desc
corig = corig[~corig.desc.isna()]

### Build features and target

In [6]:
tf = TfidfVectorizer(max_features=1000000, lowercase=True, analyzer='word',
                        stop_words='english', ngram_range=(1,1))

tfvec = tf.fit_transform(corig['desc'])

tfvec

<27488x7584 sparse matrix of type '<class 'numpy.float64'>'
	with 334633 stored elements in Compressed Sparse Row format>

In [7]:
(x_train, x_test, y_train, y_test) = train_test_split(tfvec, corig['label'], test_size=0.2)

print(x_train.shape, x_test.shape, y_train.shape, y_test.shape)

(21990, 7584) (5498, 7584) (21990,) (5498,)


### Build Classifier

In [8]:
# c == 4 yields best result
for c in [0.01, 0.5, 1.0, 2.0, 4.0, 5.0, 7.5, 1]:
    
    lreg = LogisticRegression(C=c)
    lreg.fit(x_train, y_train)
    print (f'Logistic Regression Accuracy: C={c} {accuracy_score(y_test, lreg.predict(x_test))}')

Logistic Regression Accuracy: C=0.01 0.9070571116769734
Logistic Regression Accuracy: C=0.5 0.9588941433248453
Logistic Regression Accuracy: C=1.0 0.9623499454347035
Logistic Regression Accuracy: C=2.0 0.9618042924699891
Logistic Regression Accuracy: C=4.0 0.9969079665332848
Logistic Regression Accuracy: C=5.0 0.9972717351764278
Logistic Regression Accuracy: C=7.5 0.9969079665332848
Logistic Regression Accuracy: C=1 0.9623499454347035


In [9]:
lreg = LogisticRegression(C=4.0)
lreg.fit(x_train, y_train)

print("Lreg Train Score: %f" % lreg.score(x_train, y_train))
print("Lreg Test Score: %f" % lreg.score(x_test, y_test))

y_pred=lreg.predict(x_test)
print(confusion_matrix(y_test, y_pred))

Lreg Train Score: 0.998545
Lreg Test Score: 0.996908
[[ 510    1]
 [  16 4971]]


In [10]:
### Naive Bayes does perform poorly
mnb = MultinomialNB()
mnb = mnb.fit(x_train, y_train)

print (f'NB Accuracy: {accuracy_score(y_test, mnb.predict(x_test))}')

print("NB Train Score: %f" % mnb.score(x_train, y_train))
print("NB Test Score: %f" % mnb.score(x_test, y_test))

y_pred=mnb.predict(x_test)
print(confusion_matrix(y_test, y_pred))

NB Accuracy: 0.953073845034558
NB Train Score: 0.957162
NB Test Score: 0.953074
[[ 260  251]
 [   7 4980]]


### Test Classifer

In [11]:
test_charges = [
    '64 wilton road salisbury sp2 7es',
    'general pledge',
    'whole subjects known forming road uddingston glasgow 5na whole subjects registered land register',
    'intellectual property number 12345',
    'none',
    'this is applicable',
    'please see instructions on page 5',
    'please see instructions on page five',
    'freehold property on camberwell road in se13',
]

In [12]:
stop = stopwords.words('english')

def clean_charge(x):
    x = x.lower()
    new_str = ''
    for c in x:
        if (c.isalpha() == True) or (c == ' '):
            new_str += c
    new_str = " ".join(w for w in str(new_str).split() if w not in stop)
                       
    return new_str

In [13]:
test_clean = []

for i in test_charges:
    test_clean.append(clean_charge(i))
    
print(test_clean)

['wilton road salisbury sp es', 'general pledge', 'whole subjects known forming road uddingston glasgow na whole subjects registered land register', 'intellectual property number', 'none', 'applicable', 'please see instructions page', 'please see instructions page five', 'freehold property camberwell road se']


In [14]:
test_vec = tf.transform(test_clean)

test_vec

<9x7584 sparse matrix of type '<class 'numpy.float64'>'
	with 24 stored elements in Compressed Sparse Row format>

In [15]:
test_lreg = lreg.predict(test_vec)

for phrase, pred in zip(test_charges, test_lreg):
    print(f'Lreg Prediction: {pred}  {phrase}')

Lreg Prediction: 1  64 wilton road salisbury sp2 7es
Lreg Prediction: 0  general pledge
Lreg Prediction: 1  whole subjects known forming road uddingston glasgow 5na whole subjects registered land register
Lreg Prediction: 1  intellectual property number 12345
Lreg Prediction: 0  none
Lreg Prediction: 0  this is applicable
Lreg Prediction: 0  please see instructions on page 5
Lreg Prediction: 0  please see instructions on page five
Lreg Prediction: 1  freehold property on camberwell road in se13


In [16]:
### pickle algo
# cc_out = open("classify_charges.pickle","wb")
# pickle.dump(lreg, cc_out)
# cc_out.close()