In [1]:
import pyodbc
import pandas as pd
import numpy as np
import time
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, f1_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.model_selection import train_test_split, cross_validate, GridSearchCV
from sklearn.ensemble import AdaBoostClassifier
from sklearn.decomposition import PCA
from sklearn.metrics import confusion_matrix


from texttransformation import StringTransform, RowTextTransform, TransformDataset
from datasetbuilder import DatasetBuilder
from metrics import MetricsCalculator
from helpers import SaveModel, RestoreModel, Normalize, RemoveOutliers
from predictor import Predictor

Configure parameters 

In [2]:
DATASOURCE_COLUMNS = ['FindCode', 'Name', 'Phone', 'Fax', 'OtherPhone', \
                                  'Email', 'WebsiteURL', 'MailingAddressFreeform', \
                                  'MailingAddressCity', 'MailingAddressPostalCode', 'MailingAddressState']
DATASOURCE_INDEX = 'FindCode'
ONE_HOT_ENCONDING_COLUMNS = {'MailingAddressState': ['nunavut ',
                                                        'saskatchewan',
                                                        'ontario  ',
                                                        'alberta',
                                                        'british columbia ',
                                                        'prince edward island ',
                                                        'yukon territory',
                                                        'newfoundland',
                                                        'northwest territories',
                                                        'new brunswick',
                                                        'manitoba',
                                                        'nova scotia ',
                                                        'quebec']}
TEXT_METRICS = {"Ratio": "Ratio_", "TokenSetRatio": "TokenSetRatio_",
              "TokenSortRatio": "TokenSortRatio_", "distance": "distance_",
              "jaro": "jaro_", "setratio": "setratio_", 
              "seqratio": "seqratio_", "longestnumericseq": "longestnumericseq_"}
PASS_THROUGH_COMULNS = ['FindCode','MailingAddressState']
HIGH_IMPORTANCE_COLUMNS = ['Name']

alteration_rules = [
    {
        'rule_Replace': ['none',''],
        'rule_RandomTypo': ['alpha', 2, 'replace'],
        'rule_ScrambleWords': [],
        'rule_DuplicateNumericSequence': [2],
        'rule_RemoveSpecialSymbols': [],
        'rule_RemoveStopWords': [],
        'rule_IncreaseWeightOfShortWords':[]
    },
    {
        'rule_Replace': ['none',''],
        'rule_RandomTypo': ['any', 2, 'add'],
        'rule_ScrambleWords': [],
        'rule_DuplicateNumericSequence': [4],
        'rule_RemoveSpecialSymbols': [],
        'rule_RemoveStopWords': [],
        'rule_IncreaseWeightOfShortWords':[]
    },
    {
        'rule_Replace': ['none',''],
        'rule_RandomTypo': ['digits', 1, 'add'],
        'rule_ScrambleWords': [],
        'rule_DuplicateNumericSequence': [3],
        'rule_RemoveSpecialSymbols': [],
        'rule_RemoveStopWords': [],
        'rule_IncreaseWeightOfShortWords':[]
    }
]

COLUMN_ALTERATIION_RULES = []
for i in range(0, len(alteration_rules)):
    COLUMN_ALTERATIION_RULES.append(
        {
            1: alteration_rules[i],
            2: alteration_rules[i],                    
            3: alteration_rules[i],
            4: alteration_rules[i],
            5: alteration_rules[i],
            6: alteration_rules[i],
            7: alteration_rules[i],
            8: alteration_rules[i],
            9: alteration_rules[i]
        })

Fetch data from source. Meta should match configuration above

In [3]:
df_source = pd.read_csv('prediction_input.csv', header=0, names=DATASOURCE_COLUMNS)
df_source.FindCode = df_source.FindCode.astype(str).str.lower()
df_source.Name = df_source.Name.astype(str).str.lower()
df_source.Phone = df_source.Phone.astype(str).str.lower()
df_source.Fax = df_source.Fax.astype(str).str.lower()
df_source.OtherPhone = df_source.OtherPhone.astype(str).str.lower()
df_source.Email = df_source.Email.astype(str).str.lower()
df_source.WebsiteURL = df_source.WebsiteURL.astype(str).str.lower()
df_source.MailingAddressFreeform = df_source.MailingAddressFreeform.astype(str).str.lower()
df_source.MailingAddressCity = df_source.MailingAddressCity.astype(str).str.lower()
df_source.MailingAddressPostalCode = df_source.MailingAddressPostalCode.astype(str).str.lower()
df_source.MailingAddressState = df_source.MailingAddressState.astype(str).str.lower()

df_source = df_source[0:1]

Create predictiondataset

In [4]:
builder = DatasetBuilder(DATASOURCE_COLUMNS, DATASOURCE_INDEX, 
                         ONE_HOT_ENCONDING_COLUMNS, TEXT_METRICS, 
                         PASS_THROUGH_COMULNS, COLUMN_ALTERATIION_RULES,
                         HIGH_IMPORTANCE_COLUMNS, 8)
predicting_df = builder.generatePredictionDataset(df_source)

   index FindCode_x      Name_x     Phone_x Fax_x OtherPhone_x Email_x  \
0      0      aaaqs  tony groen  7054354000  none         none    none   

  WebsiteURL_x MailingAddressFreeform_x MailingAddressCity_x  ...      Name_y  \
0         none            5523 3rd line             alliston  ...  tony groen   

      Phone_y  Fax_y OtherPhone_y Email_y WebsiteURL_y  \
0  7054354000   none         none    none         none   

  MailingAddressFreeform_y MailingAddressCity_y MailingAddressPostalCode_y  \
0            5523 3rd line             alliston                     l9r1v2   

  MailingAddressState_y  
0             ontario    

[1 rows x 24 columns]


2020-04-17 20:03:06,518 - root - INFO - Column: Name_x, Ratio metric took: 0.8908901214599609 seconds
2020-04-17 20:03:07,399 - root - INFO - Column: Name_x, TokenSetRatio metric took: 0.8813481330871582 seconds
2020-04-17 20:03:08,280 - root - INFO - Column: Name_x, TokenSortRatio metric took: 0.8806478977203369 seconds
2020-04-17 20:03:09,123 - root - INFO - Column: Name_x, distance metric took: 0.8429999351501465 seconds
2020-04-17 20:03:09,996 - root - INFO - Column: Name_x, jaro metric took: 0.8573687076568604 seconds
2020-04-17 20:03:10,859 - root - INFO - Column: Name_x, setratio metric took: 0.8629961013793945 seconds
2020-04-17 20:03:11,763 - root - INFO - Column: Name_x, seqratio metric took: 0.9041616916656494 seconds
2020-04-17 20:03:12,639 - root - INFO - Column: Name_x, longestnumericseq metric took: 0.8763706684112549 seconds
2020-04-17 20:03:13,503 - root - INFO - Column: Phone_x, Ratio metric took: 0.8537321090698242 seconds
2020-04-17 20:03:14,378 - root - INFO - Colu

  Name_Ratio Name_TokenSetRatio Name_TokenSortRatio  Name_distance  Name_jaro  \
0       None               None                None              0        1.0   

   Name_setratio  Name_seqratio  Name_longestnumericseq Phone_Ratio  \
0            1.0            1.0                       0        None   

  Phone_TokenSetRatio  ... MailingAddressPostalCode_TokenSortRatio  \
0                None  ...                                    None   

   MailingAddressPostalCode_distance  MailingAddressPostalCode_jaro  \
0                                  0                            1.0   

   MailingAddressPostalCode_setratio  MailingAddressPostalCode_seqratio  \
0                                1.0                                1.0   

   MailingAddressPostalCode_longestnumericseq FindCode_x FindCode_y  \
0                                           0      aaaqs      aaaqs   

  MailingAddressState_x  MailingAddressState_y  
0             ontario                ontario    

[1 rows x 76 colu

In [5]:
predicting_df

Unnamed: 0,Name_Ratio,Name_TokenSetRatio,Name_TokenSortRatio,Name_distance,Name_jaro,Name_setratio,Name_seqratio,Name_longestnumericseq,Phone_Ratio,Phone_TokenSetRatio,...,MailingAddressPostalCode_Ratio,MailingAddressPostalCode_TokenSetRatio,MailingAddressPostalCode_TokenSortRatio,MailingAddressPostalCode_distance,MailingAddressPostalCode_jaro,MailingAddressPostalCode_setratio,MailingAddressPostalCode_seqratio,MailingAddressPostalCode_longestnumericseq,MailingAddressState_x,MailingAddressState_y
0,,,,0,1.0,1.0,1.0,0,,,...,,,,0,1.0,1.0,1.0,0,8,8


In [6]:
model = RestoreModel("TrainedModel.sav")
predictor = Predictor(model)
predictor.execute(predicting_df, builder.getCompleteDataset())

  data_min = np.nanmin(X, axis=0)
  data_max = np.nanmax(X, axis=0)


ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

In [None]:
matches = builder.getCompleteDataset()[
    [e + '_x' for e in DATASOURCE_COLUMNS] + [e + '_y' for e in DATASOURCE_COLUMNS]].iloc[builder.getCompleteDataset().index]

matches[(matches[DATASOURCE_INDEX + '_x'] != matches[DATASOURCE_INDEX + '_y'])]