In [1]:
import pandas as pd
import numpy as np
import argparse
import csv
import json
import os
import random
import sys

In [2]:
# Read data/dataset.csv into a DataFrame and display its column names.
cleandataset = pd.read_csv('data/dataset.csv')
cleandataset.columns

Index(['merchant_name', 'cleanName2', 'start_idx', 'end_idx'], dtype='object')

In [3]:
# Show the distinct values that occur in the start_idx column.
cleandataset['start_idx'].unique()

array([ 5,  4, 16, 32, 26, 27, 39,  3,  9,  7,  8, 21,  2, 15, 23, 29, 20,
       25, 22, 30, 24, 10,  6, 14, 31,  0, 13, 12,  1, 11, 17, 19, 34, 28,
       33, 18, 35, 36, 37], dtype=int64)

In [4]:
# Build NER examples of the form (text, {'entities': [(start, end, label)]}),
# every span carrying the single label 'BRD'.
# Columns are picked by name rather than by position so that an extra column
# in the CSV (for example a saved index such as 'Unnamed: 0') cannot shift
# which values end up as the text and the span offsets.
TRAIN_DATA = [
    (text, {'entities': [(start, end, 'BRD')]})
    for text, start, end in zip(
        cleandataset['merchant_name'],
        cleandataset['start_idx'],
        cleandataset['end_idx'],
    )
]
TRAIN_DATA[:5]

[('0 04 amc creve coeur', {'entities': [(5, 8, 'BRD')]}),
 ('0 05 amc security sq', {'entities': [(5, 8, 'BRD')]}),
 ('0 1 forever 21', {'entities': [(4, 14, 'BRD')]}),
 ('0 2 forever 21', {'entities': [(4, 14, 'BRD')]}),
 ('0 3 amc grand island', {'entities': [(4, 7, 'BRD')]})]

In [5]:
import spacy
import random
from random import sample
from sklearn.model_selection import train_test_split
import warnings

# Suppress all Python warnings raised from this point on.
warnings.filterwarnings("ignore")

def train_spacy(data,iterations):
    """Train a blank English spaCy pipeline whose only trained pipe is NER.

    Args:
        data: list of ``(text, annotations)`` pairs where ``annotations`` is a
            dict with an ``'entities'`` list of ``(start, end, label)`` tuples.
            The list is not modified.
        iterations: number of passes made over ``data``.

    Returns:
        The trained ``nlp`` object.
    """
    # NOTE(review): create_pipe / add_pipe(component) / update(texts, annotations)
    # is the spaCy v2 training API - confirm the installed spaCy version.

    # Shuffle a private copy each iteration; shuffling `data` itself would
    # silently reorder the caller's list.
    examples = list(data)

    nlp = spacy.blank('en')  # create blank Language class
    # Reuse an existing NER pipe if there is one, otherwise create and append it,
    # so `ner` is always bound before labels are added.
    if 'ner' in nlp.pipe_names:
        ner = nlp.get_pipe('ner')
    else:
        ner = nlp.create_pipe('ner')
        nlp.add_pipe(ner, last=True)

    # Register every label that occurs in the annotations.
    for _, annotations in examples:
        for ent in annotations.get('entities', []):
            ner.add_label(ent[2])

    # Disable all other pipes so that only NER is updated.
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
    with nlp.disable_pipes(*other_pipes):
        optimizer = nlp.begin_training()
        for itn in range(iterations):
            print("Starting iteration " + str(itn))
            random.shuffle(examples)
            losses = {}
            for text, annotations in examples:
                nlp.update(
                    [text],  # batch of texts
                    [annotations],  # batch of annotations
                    drop=0.2,  # dropout - make it harder to memorise data
                    sgd=optimizer,  # callable to update weights
                    losses=losses)
            print(losses)
    return nlp

# Shuffle the full example list in place, then keep the first 1000 examples for training.
random.shuffle(TRAIN_DATA)
train = TRAIN_DATA[:1000]
# train = TRAIN_DATA[:int(len(TRAIN_DATA) *0.7)]
# test = TRAIN_DATA[int(len(TRAIN_DATA) *0.7):]

# Train for 20 iterations over the training subset.
prdnlp = train_spacy(train, 20)
# Save our trained Model
# NOTE(review): input() waits for a typed name, so this cell cannot finish under an unattended "Run All".
modelfile = input("Enter your Model Name: ")
prdnlp.to_disk(modelfile)


Starting iteration 0
{'ner': 679.0596929758201}
Starting iteration 1
{'ner': 347.8885497782279}
Starting iteration 2
{'ner': 218.77194210659124}
Starting iteration 3
{'ner': 182.31397867414307}
Starting iteration 4
{'ner': 168.8610008848184}
Starting iteration 5
{'ner': 167.533809333152}
Starting iteration 6
{'ner': 99.81080790196178}
Starting iteration 7
{'ner': 64.98547894595171}
Starting iteration 8
{'ner': 42.0636345749937}
Starting iteration 9
{'ner': 74.13608511613619}
Starting iteration 10
{'ner': 88.43316794159757}
Starting iteration 11
{'ner': 53.66041874493271}
Starting iteration 12
{'ner': 43.94755223043633}
Starting iteration 13
{'ner': 71.71231055686705}
Starting iteration 14
{'ner': 54.02206059437969}
Starting iteration 15
{'ner': 45.36082394173235}
Starting iteration 16
{'ner': 35.11150637072986}
Starting iteration 17
{'ner': 30.977853328374515}
Starting iteration 18
{'ner': 27.159490019437403}
Starting iteration 19
{'ner': 60.90726052075518}


In [55]:
#Test your text
# Draw one random example from TRAIN_DATA, run the trained model on its text
# and print the text of every predicted entity.
# NOTE(review): TRAIN_DATA is the list the training subset was sliced from,
# so the sampled example may be one the model was trained on.
test_text = sample(TRAIN_DATA, 1)

print('test case: ', test_text)
doc = prdnlp(test_text[0][0])
for ent in doc.ents:
    print(ent.text)

test case:  [('sheetz 0518 mechanicsburgpa usa', {'entities': [(0, 6, 'BRD')]})]
sheetz


In [None]:
# Read the merchant training file and keep only the merchant_name and cleanName2 columns.
tempdata = pd.read_csv('data/cleanedmerchant_training.csv')
data = tempdata[['merchant_name', 'cleanName2']]
# Keep rows that have a cleanName2 value. The explicit copy makes validName an
# independent frame, so the columns assigned to it in later cells are not
# writes into a slice of `data`. The index is renumbered 0..n-1 by returning
# a new frame instead of mutating in place.
validName = data[data['cleanName2'].notnull()].copy().reset_index(drop=True)
validName.head(50)

In [None]:
from nltk import word_tokenize

In [None]:
# import matplotlib.pyplot as plt
# import numpy as np
# mu, sigma = 100, 10 # mean and standard deviation
# s = np.random.normal(mu, sigma, 1000)
# s_2 = np.random.uniform(70, 130, 1000)

# count, bins, ignored = plt.hist(s, 30, density=False, color = 'r')
# # plt.plot(bins, 1/(sigma * np.sqrt(2 * np.pi)) *
# #                np.exp( - (bins - mu)**2 / (2 * sigma**2) )
# #          )
# plt.show()

In [None]:
# raw_docs = ["Here are some very simple basic sentences.",
# "They won't be very interesting, I'm afraid.",
# "The point of these examples is to _learn how basic text cleaning works_ on *very simple* data."]
# Tokenize every merchant_name string with word_tokenize and print the third token list.
raw_docs = validName['merchant_name'].to_list()
tokenized_docs = [word_tokenize(doc) for doc in raw_docs]
print(tokenized_docs[2])

In [None]:
import re
import string
# Character class that matches any single character from string.punctuation.
regex = re.compile('[%s]' % re.escape(string.punctuation))

# For every document, delete punctuation characters from each token and keep
# only the tokens that still have content afterwards.
tokenized_docs_no_punctuation = [
    [stripped
     for stripped in (regex.sub(u'', token) for token in review)
     if stripped != u'']
    for review in tokenized_docs
]

print(tokenized_docs_no_punctuation[2])

In [None]:
from nltk.corpus import stopwords

# Load the English stopword list once and keep it as a set: the original
# re-read the list and scanned it linearly for every single token.
english_stopwords = set(stopwords.words('english'))

# Drop stopwords from every token list; token order is preserved.
tokenized_docs_no_stopwords = [
    [word for word in doc if word not in english_stopwords]
    for doc in tokenized_docs_no_punctuation
]

print(tokenized_docs_no_stopwords[2])

In [None]:
# Re-join each token list into one space-separated string.
cleaned_sentences = [' '.join(words) for words in tokenized_docs_no_stopwords]
# Displays whether there is exactly one cleaned string per row of validName.
len(cleaned_sentences) == len(validName)

In [None]:
# Overwrite the merchant_name column with the cleaned strings.
validName['merchant_name'] = cleaned_sentences

In [None]:
def shuffle_dataset(dataset):
    """
    Return the samples of ``dataset`` in a random order.

    The input list itself is left untouched; a new list is built.

    Args:
        dataset: (list) samples, e.g. tuples like (transaction, start_idx, end_idx)

    Returns:
        (list) the same samples, reordered via ``random.shuffle``

    """
    order = [position for position in range(len(dataset))]
    random.shuffle(order)

    reordered = []
    for position in order:
        reordered.append(dataset[position])
    return reordered
def get_merchant_indices_in_sentence(sentence, merchant):
    """
    Locate a merchant name inside a transaction string, ignoring case.

    Both strings are upper-cased before searching; the first occurrence wins.

    Args:
        sentence: (string) transaction string
        merchant: (string) merchant name

    Returns:
        sentence: (string) the transaction string converted to upper case
        start: (int) index where the merchant starts, or -1 if it is not found
        end: (int) index just past the merchant, or -1 if it is not found

    Examples:
        sentence, start, end = get_merchant_indices_in_sentence("Target 00014423 WATERTOWN MA", "target")

        sentence: "TARGET 00014423 WATERTOWN MA"
        start: 0
        end: 6
    """
    upper_sentence = sentence.upper()
    upper_merchant = merchant.upper()

    position = upper_sentence.find(upper_merchant)
    if position == -1:
        return upper_sentence, -1, -1

    return upper_sentence, position, position + len(upper_merchant)

In [None]:
# Display 10 randomly chosen rows of validName.
validName.sample(10)

In [None]:
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords

# For every row, find where the clean merchant name sits inside the
# merchant_name string and record the span as start_idx / end_idx.
repaired_names, start_indices, end_indices = [], [], []
for transaction, merchant in zip(validName['merchant_name'], validName['cleanName2']):
    first_word_at = transaction.find(merchant.split(' ')[0])
    # If the clean name starts inside the string but would run past its end,
    # replace the tail with the full clean name. Skip this when the first word
    # is absent (find returned -1): slicing with -1 would only chop the last
    # character and append the name to an unrelated string.
    if first_word_at != -1 and first_word_at + len(merchant) > len(transaction):
        transaction = transaction[:first_word_at] + merchant

    _, start, end = get_merchant_indices_in_sentence(transaction, merchant)
    repaired_names.append(transaction)
    start_indices.append(start)
    end_indices.append(end)

# Store the (possibly repaired) text together with its offsets: the offsets
# were computed against the repaired text and can point past the end of the
# unrepaired one.
validName['merchant_name'] = repaired_names
validName['start_idx'] = start_indices
validName['end_idx'] = end_indices

print(validName.sample(50))
#validName.to_csv('data/dataset.csv')

In [None]:
# Write validName to data/dataset.csv.
# NOTE(review): index=False is not passed, so the row index is written as an extra unnamed leading column.
validName.to_csv('data/dataset.csv')

In [None]:
# Drop the 'Unnamed: 0' column (presumably the row index that to_csv wrote
# into the file). errors='ignore' keeps this cell re-runnable: without it a
# second run, or a CSV that never had the column, raises KeyError.
cleandataset = cleandataset.drop(columns='Unnamed: 0', errors='ignore')
cleandataset.columns

In [None]:
# Display the row index of cleandataset.
cleandataset.index

In [None]:
# Keep only rows whose start_idx is not -1 (the value
# get_merchant_indices_in_sentence returns when the merchant is not found),
# then show how many rows remain.
cleandataset = cleandataset[cleandataset['start_idx']!=-1]
len(cleandataset)

In [None]:
# reset_index returns a new frame; assign it back so the renumbered index is
# actually kept (the bare call only displayed a renumbered copy and left
# cleandataset unchanged).
cleandataset = cleandataset.reset_index(drop=True)
cleandataset

In [None]:
# Overwrite data/dataset.csv with the filtered cleandataset.
# NOTE(review): index=False is not passed, so the row index is written as an extra unnamed leading column.
cleandataset.to_csv('data/dataset.csv')

In [None]:
def character_embedding(text, max_len=300, emb_dim=8):
    """
    Encode a string as a grid of bits, one column per character.

    Column ``j`` holds the ``emb_dim`` lowest bits of ``ord(text[j])`` with the
    least significant bit in row 0; any higher bits are dropped. Text longer
    than ``max_len`` is cut to ``max_len`` characters, and columns past the end
    of the text stay 0.

    Args:
        text: (string) text to embed
        max_len: (int) maximum number of characters that are encoded
        emb_dim: (int) number of bits (rows) kept per character

    Returns:
        (list) ``emb_dim`` lists of ints (0 or 1), one entry per column

    """
    clipped = text[:max_len] if len(text) > max_len else text
    width = max(len(clipped), max_len)

    # Start from an all-zero grid so unused columns are already padded.
    bit_rows = np.zeros((emb_dim, width), dtype=np.int32).tolist()

    for column, char in enumerate(clipped):
        code_point = ord(char)
        for row in range(emb_dim):
            bit_rows[row][column] = (code_point >> row) & 1

    return bit_rows

In [None]:
import pprint
# NOTE(review): pp is created here but not used in this cell.
pp = pprint.PrettyPrinter(indent=4)
# Embed a sample string with the default max_len and emb_dim and display the result.
emb = character_embedding('Martin')
emb

In [None]:
# Shuffle all rows (sample with frac=1) and renumber the index from 0.
# NOTE(review): no random_state is given, so the row order differs between runs.
df = cleandataset.sample(frac=1).reset_index(drop=True)

In [None]:
# Split the shuffled frame by position: first 70% train, next 15% validation,
# remaining rows test.
n_rows = len(df)
train_end = int(0.7 * n_rows)
val_end = int(0.85 * n_rows)

train_dataset = df[:train_end]
val_dataset = df[train_end:val_end]
test_dataset = df[val_end:]

In [None]:
# Display the first 50 rows of the training split.
train_dataset.head(50)

#### Rule matching

In [None]:
import spacy
from spacy.matcher import Matcher

# Load the small English pipeline and create a token Matcher on its vocabulary.
nlp = spacy.load("en_core_web_sm")
matcher = Matcher(nlp.vocab)
# Register one pattern under the match ID "sID" (no callback): a token whose
# lowercase form is "hello", then a punctuation token, then "world".
pattern = [{"LOWER": "hello"}, {"IS_PUNCT": True}, {"LOWER": "world"}]
matcher.add("sID", [pattern])

doc = nlp("Hello, world! Hello world!")
matches = matcher(doc)

# Print every match: numeric ID, its string form, token start/end and the matched text.
for match_id, start, end in matches:
    string_id = nlp.vocab.strings[match_id]  # Get string representation
    span = doc[start:end]  # The matched span
    print(match_id, string_id, start, end, span.text)

In [None]:
import spacy
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
import re

# Run the pretrained small English pipeline on one merchant-style string and
# print the entities it finds.
nlp = spacy.load("en_core_web_sm")
doc = nlp("dollar general w main a")
print(doc.ents)
# for idx, row in sample_10.iterrows():
#     # print('processing row: ', idx)
#     # print(row[0])
#     doc = nlp(row[0])
#     all_entities = doc.ents
#     print(all_entities)
#     # print(all_entities)
#     #extraction_temp = str(process.extractOne(row[1], all_entities)[0])
#     #extraction =  re.sub(r"inc|mktp|\d+", "", extraction_temp) 
#     #print('extraction result: ', extraction_temp)

In [None]:
import pandas as pd
import numpy as np
# Read data/cleanedmerchant_training.csv into a DataFrame and display its column names.
cleanData = pd.read_csv('data/cleanedmerchant_training.csv')
cleanData.columns

In [None]:
# stopwordFreeds[(stopwordFreeds['cleanName2'] != stopwordFreeds['CleanName3']) & (stopwordFreeds['CleanName3'] != 'chick fil')].sample(1000)

In [None]:
# Display the first rows of cleanData.
cleanData.head()