In [8]:
import pyodbc
import pandas as pd
import numpy as np
import time
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, f1_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.model_selection import train_test_split, cross_validate, GridSearchCV
from sklearn.ensemble import AdaBoostClassifier
from sklearn.decomposition import PCA
from sklearn.metrics import confusion_matrix
import logging

from texttransformation import StringTransform, RowTextTransform, TransformDataset
from datasetbuilder import DatasetBuilder
from metrics import MetricsCalculator
from helpers import SaveModel, RestoreModel, Normalize, RemoveOutliers
from predictor import Predictor

Configure parameters 

In [9]:
# Column that identifies a record in the data source.
DATASOURCE_INDEX = 'FindCode'

# Column names applied to the input file, in file order (used as `names=`
# when the CSV is read).
DATASOURCE_COLUMNS = [
    'FindCode',
    'Name',
    'Phone',
    'Fax',
    'OtherPhone',
    'Email',
    'WebsiteURL',
    'MailingAddressFreeform',
    'MailingAddressCity',
    'MailingAddressPostalCode',
    'MailingAddressState',
]

# Categorical columns and the category values handed to DatasetBuilder.
# NOTE(review): several values carry trailing spaces; they are kept verbatim
# here — confirm they match the raw data exactly.
ONE_HOT_ENCONDING_COLUMNS = {
    'MailingAddressState': [
        'nunavut ',
        'saskatchewan',
        'ontario  ',
        'alberta',
        'british columbia ',
        'prince edward island ',
        'yukon territory',
        'newfoundland',
        'northwest territories',
        'new brunswick',
        'manitoba',
        'nova scotia ',
        'quebec',
    ],
}

# Names of the text-similarity metrics requested from DatasetBuilder (the same
# names show up in its per-column timing log).
TEXT_METRICS = [
    'ratio',
    'partial_ratio',
    'token_sort_ratio',
    'token_set_ratio',
    'distance',
    'l_ratio',
    'jaro',
    'jaro_winkler',
    'setratio',
    'seqratio',
    'longestnumericseq',
]

# Presumably columns forwarded without text metrics — confirm in DatasetBuilder.
PASS_THROUGH_COMULNS = ['FindCode', 'MailingAddressState']

# Presumably columns given extra weight — confirm in DatasetBuilder.
HIGH_IMPORTANCE_COLUMNS = ['Name']

# Three rule sets that share the same rule names in the same order and differ
# only in the arguments of 'rule_RandomTypo' and
# 'rule_DuplicateNumericSequence'. Each rule set gets its own fresh argument
# lists, so no list object is shared between rule sets.
alteration_rules = [
    {
        'rule_Replace': ['none', ''],
        'rule_RandomTypo': list(typo_args),
        'rule_ScrambleWords': [],
        'rule_DuplicateNumericSequence': [numeric_seq_arg],
        'rule_RemoveSpecialSymbols': [],
        'rule_RemoveStopWords': [],
        'rule_IncreaseWeightOfShortWords': [],
    }
    for typo_args, numeric_seq_arg in (
        (('alpha', 2, 'replace'), 2),
        (('any', 2, 'add'), 4),
        (('digits', 1, 'add'), 3),
    )
]

# One mapping per rule set: keys 1..9 all point at that same rule-set object.
# NOTE(review): the keys look like column positions in DATASOURCE_COLUMNS
# (position 0 being the index column) — confirm against DatasetBuilder.
COLUMN_ALTERATIION_RULES = [
    {column_key: rule_set for column_key in range(1, 10)}
    for rule_set in alteration_rules
]

Fetch the data from the source. Its columns (names and order) must match the configuration above.

In [10]:
# Load the prediction input as plain text.
#
# * dtype=str keeps phone numbers, postal codes, etc. exactly as written; with
#   type inference a numeric-looking column containing gaps is read as float
#   and would later stringify as e.g. '4165551234.0'.
# * keep_default_na=False stops pandas from converting empty cells (and, in
#   newer releases, the text 'None') to NaN. Previously NaN was turned into the
#   literal string 'nan' by astype(str) and then survived the 'none' -> ''
#   replacement below, so missing values were compared as real text.
df_source = pd.read_csv(
    'prediction_input.csv',
    header=0,                  # the file's own header row is replaced by `names`
    names=DATASOURCE_COLUMNS,
    dtype=str,
    keep_default_na=False,
)

# Lower-case every column, then blank out the 'none' placeholder.
# Non-inplace calls keep the cell idempotent on re-run.
df_source = df_source.apply(lambda column: column.str.lower())
df_source = df_source.replace(to_replace='none', value='')

# Only the first 10 records are kept.
# NOTE(review): this looks like a development-time limit — confirm before a full run.
df_source = df_source.iloc[:10]

Create the prediction dataset

In [None]:
# Configure the dataset builder from the constants defined in the configuration
# cell. All arguments are positional, so their order must not change.
builder = DatasetBuilder(
    DATASOURCE_COLUMNS,
    DATASOURCE_INDEX,
    ONE_HOT_ENCONDING_COLUMNS,
    TEXT_METRICS,
    PASS_THROUGH_COMULNS,
    COLUMN_ALTERATIION_RULES,
    HIGH_IMPORTANCE_COLUMNS,
    8,              # NOTE(review): meaning not visible here (possibly a worker count) — confirm
    logging.DEBUG,  # presumably the builder's log level — confirm
)

# Build the dataset to be scored from the loaded source records.
predicting_df = builder.generatePredictionDataset(df_source)

2020-04-17 21:38:11,393 - root - INFO - Column: Phone_x, ratio metric took: 1.1230065822601318 seconds
2020-04-17 21:38:12,577 - root - INFO - Column: Phone_x, partial_ratio metric took: 1.1830322742462158 seconds
2020-04-17 21:38:13,807 - root - INFO - Column: Phone_x, token_sort_ratio metric took: 1.2296838760375977 seconds
2020-04-17 21:38:15,007 - root - INFO - Column: Phone_x, token_set_ratio metric took: 1.1986937522888184 seconds
2020-04-17 21:38:16,178 - root - INFO - Column: Phone_x, distance metric took: 1.170060634613037 seconds
2020-04-17 21:38:17,306 - root - INFO - Column: Phone_x, l_ratio metric took: 1.126692533493042 seconds
2020-04-17 21:38:18,531 - root - INFO - Column: Phone_x, jaro metric took: 1.2239997386932373 seconds
2020-04-17 21:38:19,654 - root - INFO - Column: Phone_x, jaro_winkler metric took: 1.122619867324829 seconds
2020-04-17 21:38:20,791 - root - INFO - Column: Phone_x, setratio metric took: 1.1359992027282715 seconds
2020-04-17 21:38:21,921 - root - 

('Phone_x', 'Phone_y', 'Phone')
{'ratio': 0     100
1      30
2      30
3      40
28     40
     ... 
98     50
93     40
99    100
94     40
95      0
Length: 100, dtype: int64, 'partial_ratio': 4      30
5       0
6      40
7      20
32     50
     ... 
99    100
92     30
93     40
94     40
95      0
Length: 100, dtype: int64, 'token_sort_ratio': 0     100
1      30
2      30
3      40
32     50
     ... 
97     30
94     40
95      0
98     50
99    100
Length: 100, dtype: int64, 'token_set_ratio': 0     100
1      30
2      30
8      30
3      40
     ... 
95      0
96     20
97     30
98     50
99    100
Length: 100, dtype: int64, 'distance': 0      0
1     10
2      9
3      9
32     6
      ..
88     0
99     0
89     8
90    10
91     7
Length: 100, dtype: int64, 'l_ratio': 28    0.4
29    0.3
30    0.4
31    0.3
32    0.5
     ... 
93    0.4
98    0.5
94    0.4
99    1.0
95    0.0
Length: 100, dtype: float64, 'jaro': 28    0.704762
29    0.516667
4     0.366667
30    0.59444

2020-04-17 21:38:24,284 - root - INFO - Column: Email_x, ratio metric took: 1.2165637016296387 seconds
2020-04-17 21:38:25,439 - root - INFO - Column: Email_x, partial_ratio metric took: 1.1546962261199951 seconds
2020-04-17 21:38:26,602 - root - INFO - Column: Email_x, token_sort_ratio metric took: 1.1630001068115234 seconds
2020-04-17 21:38:27,782 - root - INFO - Column: Email_x, token_set_ratio metric took: 1.1799263954162598 seconds
2020-04-17 21:38:29,181 - root - INFO - Column: Email_x, distance metric took: 1.3985178470611572 seconds
2020-04-17 21:38:30,375 - root - INFO - Column: Email_x, l_ratio metric took: 1.1925857067108154 seconds
2020-04-17 21:38:31,575 - root - INFO - Column: Email_x, jaro metric took: 1.1985728740692139 seconds
2020-04-17 21:38:32,758 - root - INFO - Column: Email_x, jaro_winkler metric took: 1.1821892261505127 seconds
2020-04-17 21:38:34,155 - root - INFO - Column: Email_x, setratio metric took: 1.3957908153533936 seconds
2020-04-17 21:38:35,512 - root

('Email_x', 'Email_y', 'Email')
{'ratio': 16      0
17    100
18    100
19    100
20    100
     ... 
96      0
95    100
97    100
98    100
99    100
Length: 100, dtype: int64, 'partial_ratio': 12    100
13    100
14      0
15    100
32    100
     ... 
82    100
95    100
83    100
86      0
87    100
Length: 100, dtype: int64, 'token_sort_ratio': 12    100
13    100
14      0
15    100
16      0
     ... 
91    100
96      0
97    100
98    100
99    100
Length: 100, dtype: int64, 'token_set_ratio': 16    0
17    0
18    0
19    0
12    0
     ..
91    0
96    0
97    0
98    0
99    0
Length: 100, dtype: int64, 'distance': 20     0
21     0
22     0
23     0
16    13
      ..
95     0
96    13
97     0
98     0
99     0
Length: 100, dtype: int64, 'l_ratio': 16    0.0
17    1.0
18    1.0
19    1.0
32    1.0
     ... 
99    1.0
84    0.0
85    1.0
86    0.0
87    1.0
Length: 100, dtype: float64, 'jaro': 20    1.0
21    1.0
22    1.0
23    1.0
28    1.0
     ... 
96    0.0
97    1.0


2020-04-17 21:38:37,848 - root - INFO - Column: WebsiteURL_x, ratio metric took: 1.157271385192871 seconds
2020-04-17 21:38:39,010 - root - INFO - Column: WebsiteURL_x, partial_ratio metric took: 1.161316156387329 seconds
2020-04-17 21:38:40,242 - root - INFO - Column: WebsiteURL_x, token_sort_ratio metric took: 1.2309999465942383 seconds
2020-04-17 21:38:41,391 - root - INFO - Column: WebsiteURL_x, token_set_ratio metric took: 1.1479241847991943 seconds
2020-04-17 21:38:42,532 - root - INFO - Column: WebsiteURL_x, distance metric took: 1.14151930809021 seconds
2020-04-17 21:38:43,690 - root - INFO - Column: WebsiteURL_x, l_ratio metric took: 1.1565158367156982 seconds
2020-04-17 21:38:44,845 - root - INFO - Column: WebsiteURL_x, jaro metric took: 1.1551733016967773 seconds
2020-04-17 21:38:46,064 - root - INFO - Column: WebsiteURL_x, jaro_winkler metric took: 1.2192668914794922 seconds
2020-04-17 21:38:47,208 - root - INFO - Column: WebsiteURL_x, setratio metric took: 1.14296936988830

In [None]:
# Load the previously trained model from disk and wrap it in a Predictor.
# NOTE(review): if RestoreModel deserialises with pickle, only load model
# files from a trusted location — confirm in helpers.
model = RestoreModel("TrainedModel.sav")
predictor = Predictor(model)

# Run the prediction over the generated dataset, handing the predictor the
# builder's complete dataset as well.
complete_dataset = builder.getCompleteDataset()
predictor.execute(predicting_df, complete_dataset)

In [None]:
# Fetch the complete dataset once and keep only the paired source columns:
# every '<column>_x' followed by every '<column>_y'.
complete_pairs = builder.getCompleteDataset()
pair_columns = (
    [column + '_x' for column in DATASOURCE_COLUMNS]
    + [column + '_y' for column in DATASOURCE_COLUMNS]
)

# The previous version re-selected the rows with `.iloc[<index labels>]`.
# `.iloc` is positional, so feeding it labels is a no-op for a default
# 0..n-1 index and reorders rows or raises IndexError for any other index.
# Selecting the columns alone keeps every row in its existing order.
matches = complete_pairs[pair_columns]

# Show only pairs whose two sides are different records (self-pairs excluded).
matches[matches[DATASOURCE_INDEX + '_x'] != matches[DATASOURCE_INDEX + '_y']]