In [3]:
# Import libraries
from keras.models import Sequential
from keras.layers import Dense
from keras.preprocessing import sequence
from IPython.display import clear_output
from IPython.display import display
import time, sys
import numpy as np
import pandas as pd
import re
import nltk
nltk.download('stopwords')
nltk.download('punkt')
from nltk.corpus import stopwords
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.metrics import confusion_matrix, accuracy_score, auc
# tensorflow is needed as a dependency for something else

'''
To run the following code from another module, use these 3 lines
(the instance gets its own name so the class name is not overwritten):

from model import review_invoices
ri = review_invoices()
ri.run()
'''

class review_invoices:
    '''
    End-to-end pipeline that classifies work orders by liability.

    The pipeline loads two CSV files into one dataframe, cleans the free-text
    work_order column, lemmatizes and normalizes the vocabulary, converts the
    text to TF-IDF features, splits train/test data and trains a small dense
    Keras network.

    Each stage is its own method and stores its result on the instance
    (attributes prefixed with 'self.'), so the stages can be run one at a
    time for inspection or all at once through run().
    '''

    # Canonical word -> words that should be rewritten to it. Only whole
    # words/phrases are rewritten, never fragments inside a longer word.
    EQUIVALENCE_CLASSES = {
        'resident': ['tenant', 'renter', 'occupant'],
        'landlord': ['owner', 'manager'],
        'technician': ['tech'],
        'house': ['home', 'property'],
        'refrigerator': ['fridge'],
        'air': ['ac', 'air conditioning'],
        'bath': ['tub', 'bathtub'],
        'heater': ['furnace'],
        'temperature': ['temp'],
        'roof': ['roofing', 'shingles', 'shingle'],
    }
    # Raw strings keep the backslashes literal (no invalid-escape warnings)
    PHONE_PATTERN = r'(\(?\d\d\d\)?-? ?\.?\d\d\d-?\.? ?\d\d\d\d?)'
    EMAIL_PATTERN = r'(\S+@\S+)'
    # A word of one or two characters plus any non-word characters before it
    SHORT_WORD_PATTERN = re.compile(r'\W*\b\w{1,2}\b')

    def __init__(self, path_1='Data/data.csv', path_2='Data/data2.csv'):
        '''
        Load both CSV files and combine them into a single dataframe.

        Parameters
        ----------
        path_1, path_2 : str
            Locations of the two CSV files. The defaults are the paths the
            project has always used, so review_invoices() keeps working.

        Attributes set here
        -------------------
        df_1, df_2 : the two raw dataframes as read from disk
        df         : both dataframes stacked, with renamed column headers
        null       : plain bool, True when df holds at least one null cell
        '''
        print('Initializing')
        # Load the data into dataframes
        self.df_1 = pd.read_csv(path_1)
        self.df_2 = pd.read_csv(path_2)
        # DataFrame.append is gone in pandas 2.x; concat is the replacement
        df = pd.concat([self.df_1, self.df_2], ignore_index=True)
        # Rename column headers
        df = df.rename(columns={'WO #': 'work_order_id',
                                'Chargeback': 'liability',
                                'Terms': 'work_order'})
        self.df = df
        # Update Pandas settings. View full contents of each column
        # (None means "no limit"; the old value -1 is deprecated)
        pd.set_option('display.max_colwidth', None)
        # Display up to 10 columns
        pd.set_option('display.max_columns', 10)
        # A check for null values. Cast to a plain bool: the numpy bool that
        # .any() returns never satisfies an "is True" identity test.
        self.null = bool(df.isnull().values.any())

    def explore_data(self):
        '''
        Print a summary of the raw dataframe and collect duplicates.

        Attributes set here
        -------------------
        duplicate_terms : rows whose work_order text occurs more than once,
                          sorted by work_order
        duplicate_wo    : rows whose work_order_id occurs more than once,
                          sorted by work_order_id
        '''
        print('Running explore_data()')
        # Define the raw dataframe
        df = self.df
        # Print basic info about dataframe
        print('\nOriginal dataframe info')
        print('----------------------------------------')
        df.info()
        print('----------------------------------------')
        print(f'\nAre there any null values? {self.null}')
        # Print out first 5 rows of the df
        print('\nPrinting the first 5 rows of the original dataframe')
        display(df.head())
        # Collect rows with duplicated terms so they can be audited
        duplicate_terms = df[df.duplicated(subset=['work_order'], keep=False)]
        self.duplicate_terms = duplicate_terms.sort_values(by=['work_order'])
        # This count used to be computed and thrown away; report it instead
        n_ids = duplicate_terms['work_order_id'].nunique()
        print(f'\nDistinct work order IDs sharing duplicated terms: {n_ids}')
        # Collect rows with duplicated work order numbers to be audited
        duplicate_wo = df[df.duplicated(subset=['work_order_id'], keep=False)]
        self.duplicate_wo = duplicate_wo.sort_values(by=['work_order_id'])

    def clean_df(self):
        '''
        Clean the work_order text and split out embedded fields.

        Rows with nulls or the invalid text "#NAME?" are removed. Phone
        numbers, email addresses and the trailing property ID are moved into
        their own columns, and the remaining text is stripped of non-word
        characters, lower-cased and tokenized into a list of words.

        Attributes set here
        -------------------
        df_clean : the cleaned dataframe (index reset)
        X        : the tokenized work_order column
        y        : the liability column

        Returns
        -------
        The cleaned dataframe. self.df is left untouched.
        '''
        print('Running clean_df()')
        df = self.df
        # Remove any rows with a null cell
        if self.null:
            df = df.dropna()
        # Remove rows with invalid terms. Copy so the column assignments
        # below never write into self.df.
        print('\nDropping work orders with invalid text: "#NAME?"')
        df = df[df['work_order'] != '#NAME?'].copy()
        # Parse out phone numbers into a new column, phone_num
        print('Extracting and removing phone numbers')
        df['phone_num'] = df['work_order'].str.extract(
            self.PHONE_PATTERN, expand=False)
        # Remove the phone numbers from the work_order column
        df['work_order'] = df['work_order'].replace(
            self.PHONE_PATTERN, '', regex=True)
        print('Extracting and removing email addresses')
        # Extract email addresses and put into separate column
        df['email'] = df['work_order'].str.extract(
            self.EMAIL_PATTERN, expand=False)
        # Remove email addresses from work_order column
        df['work_order'] = df['work_order'].replace(
            self.EMAIL_PATTERN, '', regex=True)
        print('Removing some meaningless words from work order templates')
        # Remove "Contact:", "Email:", "Phone:" from each work order
        df['work_order'] = df['work_order'].replace(
            '(Contact:|Email:|Phone:)', '', regex=True)
        print('Extracting and removing property ID\'s')
        # Split once at the last space: the tail is the property ID and the
        # head is the work order text. n must be passed by keyword in
        # pandas 2.x.
        head_and_tail = df['work_order'].str.rsplit(' ', n=1)
        df['property_id'] = head_and_tail.str[1]
        df['work_order'] = head_and_tail.str[0]
        # Replace any non-word characters from work_order column with a space
        print('Replacing all non-word characters with a space')
        df['work_order'] = df['work_order'].str.replace(
            r'\W', ' ', regex=True)
        # Make the work_order column all lower case
        print('Making work_order column all lower case')
        df['work_order'] = df['work_order'].str.lower()
        print('Turning column of strings into column of lists (This takes some '
            'time)')
        df['work_order'] = df['work_order'].apply(word_tokenize)
        # The index was messed up after removing some rows, need to reset_index
        df = df.reset_index(drop=True)

        # Make clean dataframe callable outside of the method
        self.df_clean = df
        # Review some of the changes made to the data
        print('\nCleaned dataframe info')
        print('----------------------------------------')
        df.info()
        print('----------------------------------------')
        print('\nPrinting the first 5 rows of the clean dataframe')
        display(df.head())
        # Convert dataframe columns to series for later method use
        self.X = df["work_order"]
        self.y = df["liability"]
        return df

    @classmethod
    def _apply_equivalence_classes(cls, corpus, classes=None):
        '''
        Rewrite every synonym in each document to its canonical word.

        Matching is done on whole words in a single pass, so a synonym that
        happens to be a fragment of another word ('tech' in 'technician',
        'ac' in 'replace') is left alone. Longer synonyms are tried first so
        that 'air conditioning' wins over a shorter alternative.

        Parameters
        ----------
        corpus  : iterable of str
        classes : dict mapping canonical word -> list of synonyms. Defaults
                  to EQUIVALENCE_CLASSES.

        Returns
        -------
        A new list of rewritten strings.
        '''
        if classes is None:
            classes = cls.EQUIVALENCE_CLASSES
        canonical = {synonym: key
                     for key, synonyms in classes.items()
                     for synonym in synonyms}
        if not canonical:
            return list(corpus)
        longest_first = sorted(canonical, key=len, reverse=True)
        pattern = re.compile(
            r'\b(?:' + '|'.join(map(re.escape, longest_first)) + r')\b')
        return [pattern.sub(lambda match: canonical[match.group(0)], text)
                for text in corpus]

    @classmethod
    def _drop_short_words(cls, corpus):
        '''
        Remove words of one or two characters from each document.

        Returns a new list of strings.
        '''
        return [cls.SHORT_WORD_PATTERN.sub('', text) for text in corpus]

    def link_words(self):
        '''
        Lemmatize the tokenized work orders and normalize the vocabulary.

        Reads self.X (a column of word lists produced by clean_df). Each list
        is lemmatized word by word and joined back into one string, synonyms
        are collapsed into equivalence classes, and words shorter than three
        letters are dropped.

        Attributes set here
        -------------------
        documents : list of cleaned strings, one per work order
        '''
        # Further clean and then lemmatize
        print('Running link_words()')
        lemmatizer = WordNetLemmatizer()
        print('\nLemmatizing. This one takes some time too...')
        # Lemmatize each word from each list of words, then join those words
        # back together into strings, like they started
        documents = [
            ' '.join(lemmatizer.lemmatize(word) for word in tokens)
            for tokens in self.X
        ]

        print('Creating equivalence classes...')
        documents = self._apply_equivalence_classes(documents)

        print('Dropping words with less than 3 letters...')
        documents = self._drop_short_words(documents)

        self.documents = documents
        print('\nWe\'ve turned the work_order column into a list called '
            '"documents"')

    def vectorize(self):
        '''
        Convert self.documents into a TF-IDF feature matrix.

        Attributes set here
        -------------------
        tfidfconverter : the fitted TfidfVectorizer
        X              : dense TF-IDF array with one row per document. Note
                         that this replaces the tokenized column stored in
                         self.X by clean_df.

        Also prints the 20 terms with the highest summed TF-IDF score.
        '''
        from sklearn.feature_extraction.text import TfidfVectorizer
        tfidfconverter = TfidfVectorizer(
            max_features=2000,
            min_df=10,
            max_df=0.7,
            stop_words=stopwords.words('english'))
        # Fit once and reuse the sparse result for both the dense feature
        # matrix and the per-term score totals
        tfidf_result = tfidfconverter.fit_transform(self.documents)
        self.tfidfconverter = tfidfconverter
        self.X = tfidf_result.toarray()

        # get_feature_names() was removed in scikit-learn 1.2; fall back to
        # it only on versions that predate get_feature_names_out()
        feature_names = getattr(tfidfconverter, 'get_feature_names_out', None)
        if feature_names is None:
            feature_names = tfidfconverter.get_feature_names
        scores = zip(feature_names(),
                     np.asarray(tfidf_result.sum(axis=0)).ravel())
        sorted_scores = sorted(scores, key=lambda pair: pair[1], reverse=True)

        print('\n Printing the top 20 TFIDF scores...\n')
        for term, score in sorted_scores[0:20]:
            print ("{0:50} Score: {1}".format(term, score))

    def partition(self):
        '''
        Split self.X and self.y into train and test sets.

        20% of the rows are held out for testing; random_state is fixed so
        the split is repeatable. Sets self.X_train, self.X_test,
        self.y_train and self.y_test.
        '''
        from sklearn.model_selection import train_test_split
        split = train_test_split(self.X, self.y, test_size=.2, random_state=1)
        self.X_train, self.X_test, self.y_train, self.y_test = split

    def model(self, input_dim=None):
        '''
        Build and compile the dense binary classifier.

        Parameters
        ----------
        input_dim : int, optional
            Number of input features. When omitted it is taken from
            self.X_train if partition() has been run, because the vectorizer
            can produce fewer than its 2000-feature cap. Falls back to 2000
            otherwise.

        Returns
        -------
        A compiled, untrained keras Sequential model.
        '''
        if input_dim is None:
            X_train = getattr(self, 'X_train', None)
            input_dim = 2000 if X_train is None else X_train.shape[1]
        model = Sequential()
        model.add(Dense(2000, input_dim=input_dim, activation='relu'))
        model.add(Dense(1000, activation='relu'))
        model.add(Dense(500, activation='relu'))
        model.add(Dense(1, activation='sigmoid'))
        model.compile(optimizer='adam', loss='binary_crossentropy',
            metrics=['accuracy'])
        return model

    def run(self):
        '''
        Run the whole pipeline: clean, lemmatize, vectorize, split, train and
        evaluate.

        This is kept separate from __init__ so that creating the object stays
        cheap and each stage can still be run on its own.

        Attributes set here
        -------------------
        fitted_model      : the trained keras model
        pred              : predicted class (0 or 1) for each test row
        matrix            : confusion matrix as a dataframe
        balanced_accuracy : balanced accuracy on the test set
        '''
        # Imported here, like the other sklearn helpers in this class,
        # because the name is not imported at the top of the file
        from sklearn.metrics import balanced_accuracy_score

        self.clean_df()
        self.link_words()
        self.vectorize()
        self.partition()

        # -------- call model --------
        model = self.model()

        # -------- fit  --------
        model.fit(self.X_train, self.y_train, epochs=10, batch_size=512,
            verbose=True)
        self.fitted_model = model

        # -------- predict  --------
        # predict_classes() no longer exists in Keras; thresholding the
        # sigmoid output at 0.5 gives the same class labels
        probabilities = model.predict(self.X_test)
        pred = (probabilities > 0.5).astype('int32').ravel()
        self.pred = pred

        # -------- Confusion Matrix --------
        matrix = pd.DataFrame(confusion_matrix(self.y_test, pred,
            labels=[0, 1]))
        self.matrix = matrix
        print('\nConfusion matrix (rows: actual, columns: predicted)')
        display(matrix)

        # -------- accuracy --------
        self.balanced_accuracy = balanced_accuracy_score(self.y_test, pred)
        print(f'\nBalanced accuracy: {self.balanced_accuracy}')

        # -------- summary --------
        model.summary()

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/salisburyfamily/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/salisburyfamily/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/salisburyfamily/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [None]:
# Build the pipeline object (its __init__ reads the two CSV files), then
# execute every stage from cleaning through model evaluation.
ri = review_invoices()
ri.run()