In [1]:
import pyodbc
import pandas as pd
import numpy as np
import time
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, f1_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.model_selection import train_test_split, cross_validate, GridSearchCV
from sklearn.ensemble import AdaBoostClassifier
from sklearn.decomposition import PCA
from sklearn.metrics import confusion_matrix


from texttransformation import StringTransform, RowTextTransform, TransformDataset
from datasetbuilder import DatasetBuilder
from metrics import MetricsConfig, MetricsCalculator
from helpers import SaveModel, RestoreModel, Normalize, RemoveOutliers
from predictor import Predictor

Configure parameters 

In [2]:
# Column names applied to the input CSV, in file order.
DATASOURCE_COLUMNS = [
    'FindCode',
    'Name',
    'Phone',
    'Fax',
    'OtherPhone',
    'Email',
    'WebsiteURL',
    'MailingAddressFreeform',
    'MailingAddressCity',
    'MailingAddressPostalCode',
    'MailingAddressState',
]

# Key column; the final cell compares its '_x' and '_y' copies.
DATASOURCE_INDEX = 'FindCode'

# Categorical column -> list of category values handed to DatasetBuilder.
# NOTE(review): several values carry trailing spaces ('nunavut ', 'ontario  ', ...).
# They are kept byte-for-byte; presumably they mirror the source data — confirm.
ONE_HOT_ENCONDING_COLUMNS = {
    'MailingAddressState': [
        'nunavut ',
        'saskatchewan',
        'ontario  ',
        'alberta',
        'british columbia ',
        'prince edward island ',
        'yukon territory',
        'newfoundland',
        'northwest territories',
        'new brunswick',
        'manitoba',
        'nova scotia ',
        'quebec',
    ],
}

# Text-metric name -> the same name followed by '_'
# (presumably a feature-column prefix — confirm against DatasetBuilder).
TEXT_METRICS = {
    metric_name: metric_name + "_"
    for metric_name in (
        "Ratio",
        "TokenSetRatio",
        "TokenSortRatio",
        "distance",
        "jaro",
        "setratio",
        "seqratio",
        "longestnumericseq",
    )
}

# The identifiers below keep their original spelling because later cells reference them.
PASS_THROUGH_COMULNS = ['FindCode', 'MailingAddressState']
HIGH_IMPORTANCE_COLUMNS = ['Name']

# Three rule sets that share the same rule names and differ only in the
# arguments of 'rule_RandomTypo' and 'rule_DuplicateNumericSequence'.
# Each rule set gets its own fresh argument lists; key order is kept as-is.
# NOTE(review): the meaning of each argument is defined by the project's
# text-transformation code, which is not visible here.
alteration_rules = [
    {
        'rule_Replace': ['none', ''],
        'rule_RandomTypo': [typo_charset, typo_count, typo_mode],
        'rule_ScrambleWords': [],
        'rule_DuplicateNumericSequence': [duplicate_arg],
        'rule_RemoveSpecialSymbols': [],
        'rule_RemoveStopWords': [],
        'rule_IncreaseWeightOfShortWords': [],
    }
    for typo_charset, typo_count, typo_mode, duplicate_arg in (
        ('alpha', 2, 'replace', 2),
        ('any', 2, 'add', 4),
        ('digits', 1, 'add', 3),
    )
]

# One mapping per rule set: keys 1..9 all point at the *same* rule-set dict.
# NOTE(review): keys 1-9 presumably identify column positions — confirm
# against DatasetBuilder.
COLUMN_ALTERATIION_RULES = [
    dict.fromkeys(range(1, 10), rule_set)
    for rule_set in alteration_rules
]

Fetch data from the source. The column metadata must match the configuration above.

In [3]:
# Load the prediction input. With header=0 and names=..., the file's own
# header row is replaced by DATASOURCE_COLUMNS.
df_source = pd.read_csv('prediction_input.csv', header=0, names=DATASOURCE_COLUMNS)

# Cast every configured column to a lower-cased string, in column order.
# NOTE(review): astype(str) also stringifies missing values (NaN -> 'nan');
# confirm that this is what the downstream text metrics expect.
for column_name in DATASOURCE_COLUMNS:
    df_source[column_name] = df_source[column_name].astype(str).str.lower()

# Uncomment to restrict the run to the first 10 rows.
#df_source = df_source[0:10]

Create the prediction dataset

In [None]:
# Configure the dataset builder with the constants defined above; arguments
# are positional and their order must not change.
builder = DatasetBuilder(
    DATASOURCE_COLUMNS,
    DATASOURCE_INDEX,
    ONE_HOT_ENCONDING_COLUMNS,
    TEXT_METRICS,
    PASS_THROUGH_COMULNS,
    COLUMN_ALTERATIION_RULES,
    HIGH_IMPORTANCE_COLUMNS,
)

# Derive the dataset that is handed to the predictor from the loaded source rows.
predicting_df = builder.generatePredictionDataset(df_source)

In [None]:
# Restore the previously saved model from disk.
# NOTE(review): if RestoreModel deserializes with pickle/joblib, only load
# model files from a trusted location — confirm in helpers.
model = RestoreModel("TrainedModel.sav")

# Run the predictor over the prediction dataset and the builder's complete dataset.
# NOTE(review): the final cell reads builder.getCompleteDataset() again, so
# execute() presumably updates that dataset — confirm in predictor.
predictor = Predictor(model)
predictor.execute(
    predicting_df,
    builder.getCompleteDataset(),
)

In [None]:
# Keep only the source columns of both records in each pair: every '_x'
# column first, then every '_y' column.
# NOTE(review): .iloc receives the frame's index *labels* as positions; this
# is a plain row pass-through only when the index is a default 0..n-1 range —
# confirm, or consider .loc.
matches = builder.getCompleteDataset()[
    [column + suffix for suffix in ('_x', '_y') for column in DATASOURCE_COLUMNS]
].iloc[builder.getCompleteDataset().index]

# Display the pairs whose two sides carry different key values.
matches[matches[DATASOURCE_INDEX + '_x'] != matches[DATASOURCE_INDEX + '_y']]