# We will classify the text in KP Document corpus into the 6 categories using Logistic regression model

In [52]:
os.chdir('C://Users//L833377//Desktop//ClassificationEngineIdeas//')

### Approach 1: In the first run we will use Word2vec for text pre-processing. Word2VEC converts in a text document into vectors where words with similar meaning have similar vector representation. It uses the surrounding words to represent target words with a Neural network that has a hidden layer and includes an encoding for word representation. We will first load the pre-trained Word2Vec library from Google which has been trained on billion word Google News Corpus.

In [53]:
%%time
import gensim
from gensim.models import Word2Vec

wv = gensim.models.KeyedVectors.load_word2vec_format("GoogleNews-vectors-negative300.bin.gz", binary=True)
wv.init_sims(replace=True)

Wall time: 1min 2s


In [54]:
# View some of the available vocabularies
from itertools import islice
list(islice(wv.vocab, 1030, 1045))


['Memorial_Hospital',
 'Seniors',
 'memorandum',
 'elephant',
 'Trump',
 'Census',
 'pilgrims',
 'De',
 'Dogs',
 '###-####_ext',
 'chaotic',
 'forgive',
 'scholar',
 'Lottery',
 'decreasing',
 'Supervisor',
 'fundamentally',
 'Fitness',
 'abundance',
 'Hold']

In [55]:
# Here we are using the bad of words approach 
def word_averaging(wv, words):
    all_words, mean = set(), []
    
    for word in words:
        if isinstance(word, np.ndarray):
            mean.append(word)
        elif word in wv.vocab:
            mean.append(wv.syn0norm[wv.vocab[word].index])
            all_words.add(wv.vocab[word].index)

    if not mean:
        logging.warning("cannot compute similarity with no input %s", words)
        # FIXME: remove these examples in pre-processing
        return np.zeros(wv.vector_size,)

    mean = gensim.matutils.unitvec(np.array(mean).mean(axis=0)).astype(np.float32)
    return mean

def  word_averaging_list(wv, text_list):
    return np.vstack([word_averaging(wv, post) for post in text_list ])

def w2v_tokenize_text(text):
    tokens = []
    for sent in nltk.sent_tokenize(text, language='english'):
        for word in nltk.word_tokenize(sent, language='english'):
            if len(word) < 2:
                continue
            tokens.append(word)
    return tokens

In [56]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix
import nltk

In [57]:
list(df2)

['Label', 'Text', 'DocName', 'DocType', 'cleanText', 'Length']

In [58]:
train, test = train_test_split(df1, test_size=0.2, random_state = 42)

test_tokenized = test.apply(lambda r: w2v_tokenize_text(r['LemmatizedcleanText']), axis=1).values
train_tokenized = train.apply(lambda r: w2v_tokenize_text(r['LemmatizedcleanText']), axis=1).values

In [59]:
X_train_word_average = word_averaging_list(wv,train_tokenized)
X_test_word_average = word_averaging_list(wv,test_tokenized)



Reference: https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html

Usage: 
class sklearn.linear_model.LogisticRegression(penalty='l2', *, dual=False, tol=0.0001, C=1.0, fit_intercept=True, intercept_scaling=1, class_weight=None, random_state=None, solver='lbfgs', max_iter=100, multi_class='auto', verbose=0, warm_start=False, n_jobs=None, l1_ratio=None)[source]

n_jobsint, default=None
Number of CPU cores used when parallelizing over classes if multi_class=’ovr’”. This parameter is ignored when the solver is set to ‘liblinear’ regardless of whether ‘multi_class’ is specified or not. None means 1 unless in a joblib.parallel_backend context. -1 means using all processors. 

Cfloat, default=1.0
Inverse of regularization strength; must be a positive float. Like in support vector machines, smaller values specify stronger regularization.

In [60]:

from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression(n_jobs=1, C=1e5)
logreg = logreg.fit(X_train_word_average, train['Label'])
y_pred = logreg.predict(X_test_word_average)



In [61]:
print('accuracy %s' % accuracy_score(y_pred, test.Label))
from sklearn.metrics import classification_report
print(classification_report(test.Label, y_pred))

accuracy 0.9310344827586207
                    precision    recall  f1-score   support

            Claims       1.00      1.00      1.00         4
HealthCareDelivery       1.00      1.00      1.00         3
        Membership       0.67      1.00      0.80         4
          Pharmacy       1.00      1.00      1.00         4
ProductandBenefits       1.00      0.80      0.89        10
ProviderandNetwork       1.00      1.00      1.00         4

          accuracy                           0.93        29
         macro avg       0.94      0.97      0.95        29
      weighted avg       0.95      0.93      0.93        29



### Approach 2: Next we will run the Logistic Regression model using TfIdfVectorizer. 

In [62]:
X = df1.LemmatizedcleanText
y = df1.Label
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state = 42)

In [63]:
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer

logreg = Pipeline([('vect', CountVectorizer()),
                ('tfidf', TfidfTransformer()),
                ('clf', LogisticRegression(n_jobs=1, C=1e5)),
               ])
logreg.fit(X_train, y_train)



Pipeline(memory=None,
         steps=[('vect',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=None)),
                ('tfidf',
                 TfidfTransformer(norm='l2', smooth_idf=True,
                                  sublinear_tf=False, use_idf=True)),
                ('clf',
                 LogisticRegression(C=100000.0, class_weight=None, dual=False,
                                    fit_intercept=True, intercept_scaling=

In [64]:

%%time

y_pred = logreg.predict(X_test)

print('accuracy %s' % accuracy_score(y_pred, y_test))
print(classification_report(y_test, y_pred))

accuracy 0.8620689655172413
                    precision    recall  f1-score   support

            Claims       1.00      0.75      0.86         4
HealthCareDelivery       1.00      1.00      1.00         3
        Membership       0.80      1.00      0.89         4
          Pharmacy       0.75      0.75      0.75         4
ProductandBenefits       0.82      0.90      0.86        10
ProviderandNetwork       1.00      0.75      0.86         4

          accuracy                           0.86        29
         macro avg       0.89      0.86      0.87        29
      weighted avg       0.88      0.86      0.86        29

Wall time: 39.7 ms


In comparison to approach 1 we notice that the test accuracy came down from 93 % to 86 %. The F1 score specially reduced for Pharmacy documents substantially

### Approach 3: We will now try to solve the classification problem using a Doc2vec vectorization approach. Doc2vec is based on Word2vec model, with the addition of another vector Doc ID to the input. Genisms Doc2vec requires each document to be have a label associated with it. This is done using TaggedDocument. 

Reference: https://towardsdatascience.com/multi-class-text-classification-with-doc2vec-logistic-regression-9da9947b43f4

In [65]:
from tqdm import tqdm
tqdm.pandas(desc="progress-bar")
from gensim.models import Doc2Vec
from sklearn import utils
import gensim
from gensim.models.doc2vec import TaggedDocument
import re

In [66]:

def label_sentences(corpus, label_type):
    """
    Gensim's Doc2Vec implementation requires each document/paragraph to have a label associated with it.
    We do this by using the TaggedDocument method. The format will be "TRAIN_i" or "TEST_i" where "i" is
    a dummy index of the post.
    """
    labeled = []
    for i, v in enumerate(corpus):
        label = label_type + '_' + str(i)
        labeled.append(TaggedDocument(v.split(), [label]))
    return labeled

In [67]:
X_train, X_test, y_train, y_test = train_test_split(df1.LemmatizedcleanText, df1.Label, random_state=0, test_size=0.2)
X_train = label_sentences(X_train, 'Train')
X_test = label_sentences(X_test, 'Test')
all_data = X_train + X_test

In [68]:
all_data[:2]

[TaggedDocument(words=['cobi', 'claim', 'technical', 'debt', 'colorado', 'business', 'intelligence', 'cobi', 'claim', 'technical', 'debt', 'acquisition', 'layer', 'insert', 'update', 'historical', 'data', 'addressed', 'exception', 'eedm', 'delta', 'cdw', 'crds', 'exist', 'difficult', 'quantify', 'inconsistent', 'use', 'metadata', 'column', 'etl', 'follow', 'cobi', 'standard', 'conform', 'layerschema', 'provide', 'value', 'extra', 'layer', 'maintain', 'situation', 'background', 'claim', 'built', 'crds', 'time', 'cdw', 'went', 'live', 'predates', 'cobi', 'current', 'standard', 'cdw', 'utilize', 'traditional', 'type', 'logic', 'us', 'post', 'processing', 'track', 'update', 'much', 'crds', 'claim', 'design', 'done', 'without', 'data', 'etl', 'architect', 'oversight', 'cobi', 'claim', 'technical', 'debt', 'assessment', 'confirmed', 'delta', 'crds', 'cdw', 'net', 'delta', 'claim', 'line', 'time', 'delta', 'attribution', 'difficult', 'detect', 'quantify', 'extent', 'unknown', 'high', 'impact'

In [69]:
model_dbow = Doc2Vec(dm=0, vector_size=300, negative=5, min_count=1, alpha=0.065, min_alpha=0.065)
model_dbow.build_vocab([x for x in tqdm(all_data)])

100%|██████████| 141/141 [00:00<00:00, 283507.60it/s]


In [70]:
for epoch in range(30):
    model_dbow.train(utils.shuffle([x for x in tqdm(all_data)]), total_examples=len(all_data), epochs=1)
    model_dbow.alpha -= 0.002
    model_dbow.min_alpha = model_dbow.alpha

100%|██████████| 141/141 [00:00<?, ?it/s]
100%|██████████| 141/141 [00:00<00:00, 283507.60it/s]
100%|██████████| 141/141 [00:00<?, ?it/s]
100%|██████████| 141/141 [00:00<00:00, 283779.69it/s]
100%|██████████| 141/141 [00:00<?, ?it/s]
100%|██████████| 141/141 [00:00<00:00, 313738.39it/s]
100%|██████████| 141/141 [00:00<00:00, 283779.69it/s]
100%|██████████| 141/141 [00:00<00:00, 284325.42it/s]
100%|██████████| 141/141 [00:00<?, ?it/s]
100%|██████████| 141/141 [00:00<?, ?it/s]
100%|██████████| 141/141 [00:00<00:00, 283779.69it/s]
100%|██████████| 141/141 [00:00<?, ?it/s]
100%|██████████| 141/141 [00:00<00:00, 284052.29it/s]
100%|██████████| 141/141 [00:00<00:00, 284052.29it/s]
100%|██████████| 141/141 [00:00<00:00, 284325.42it/s]
100%|██████████| 141/141 [00:00<00:00, 284599.07it/s]
100%|██████████| 141/141 [00:00<?, ?it/s]
100%|██████████| 141/141 [00:00<00:00, 282559.42it/s]
100%|██████████| 141/141 [00:00<00:00, 283779.69it/s]
100%|██████████| 141/141 [00:00<00:00, 283915.92it/s]
100%

In [71]:
def get_vectors(model, corpus_size, vectors_size, vectors_type):
    """
    Get vectors from trained doc2vec model
    :param doc2vec_model: Trained Doc2Vec model
    :param corpus_size: Size of the data
    :param vectors_size: Size of the embedding vectors
    :param vectors_type: Training or Testing vectors
    :return: list of vectors
    """
    vectors = np.zeros((corpus_size, vectors_size))
    for i in range(0, corpus_size):
        prefix = vectors_type + '_' + str(i)
        vectors[i] = model.docvecs[prefix]
    return vectors

In [72]:
train_vectors_dbow = get_vectors(model_dbow, len(X_train), 300, 'Train')
test_vectors_dbow = get_vectors(model_dbow, len(X_test), 300, 'Test')

In [73]:
logreg = LogisticRegression(n_jobs=1, C=1e5)
logreg.fit(train_vectors_dbow, y_train)



LogisticRegression(C=100000.0, class_weight=None, dual=False,
                   fit_intercept=True, intercept_scaling=1, l1_ratio=None,
                   max_iter=100, multi_class='warn', n_jobs=1, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [74]:

logreg = logreg.fit(train_vectors_dbow, y_train)
y_pred = logreg.predict(test_vectors_dbow)



In [75]:
print('accuracy %s' % accuracy_score(y_pred, y_test))
print(classification_report(y_test, y_pred))

accuracy 0.5172413793103449
                    precision    recall  f1-score   support

            Claims       0.00      0.00      0.00         5
HealthCareDelivery       0.75      0.60      0.67         5
        Membership       0.30      0.60      0.40         5
          Pharmacy       1.00      0.50      0.67         4
ProductandBenefits       0.33      0.50      0.40         4
ProviderandNetwork       0.83      0.83      0.83         6

          accuracy                           0.52        29
         macro avg       0.54      0.51      0.49        29
      weighted avg       0.54      0.52      0.50        29



In [None]:
We can see above that our accuracy substantially reduced with the Doc2Vec approach. This was expected because we have a small document corpus and Doc2vec works better for large corpus sizes with thousand of documents so that we can differentiate based on Document characteristics. 

Reference: https://github.com/susanli2016/NLP-with-Python/blob/master/Text%20Classification%20model%20selection.ipynb
        