In [1]:
import pandas as pd
import numpy as np
import scipy.stats as stat
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
import math

```
from tqdm import tqdm

# Build the pairwise name-distance matrix and cache it on disk.
# NOTE(review): clean_name, name_distance and df_all must already exist in the
# kernel when this cell runs; they are not defined above it in the notebook.
NAMES = [clean_name(name) for name in np.unique(df_all["aka_name"].values)]
MAT = mat = np.zeros((len(NAMES), len(NAMES)))
for i in tqdm(range(len(NAMES))):
    # Only the strict upper triangle (j > i) is filled; everything else stays 0.
    for j in range(i + 1, len(NAMES)):
        mat[i][j] = name_distance(NAMES[i], NAMES[j])
# Context manager guarantees the pickle is flushed and the handle closed.
with open("jaro_winkler_mat.pkl", "wb") as fh:
    pickle.dump(MAT, fh)
```

```
# Reload the cached matrix. pickle can execute arbitrary code while loading,
# so only open files that this notebook produced itself.
with open("jaro_winkler_mat.pkl", "rb") as fh:
    mat = pickle.load(fh)
```

```mat```

```type(mat), mat.shape```

```
# Split `mat` into files of at most `size` rows each, plus a small metadata
# pickle holding (M, N, size) so the pieces can be reassembled later.
M, N = mat.shape
size = 1000
filename = "aka_name_jw_mat"
metafile = "models/distance/{}_meta.pkl".format(filename)
with open(metafile, "wb") as fh:
    pickle.dump((M, N, size), fh)
print("Wrote metadata to {}".format(metafile))
for idx, i in enumerate(range(0, M, size)):
    # Row slice; the last chunk may be shorter than `size`.
    sub_mat = mat[i:i + size]
    outfile = "models/distance/{0}_{1:02}.pkl".format(filename, idx)
    # Close each chunk file as soon as it is written instead of leaking handles.
    with open(outfile, "wb") as fh:
        pickle.dump(sub_mat, fh)
    print("Wrote {} rows to {}".format(len(sub_mat), outfile))

In [6]:
# Reassemble the chunked matrix written under models/distance/.
chunks = []
filename = "aka_name_jw_mat"
metafile = "models/distance/{}_meta.pkl".format(filename)
print("Read metadata from {}".format(metafile))
with open(metafile, "rb") as fh:
    M, N, size = pickle.load(fh)
# The writer cut the matrix along rows, so the number of chunk files is
# determined by the row count M (not the column count N).
for idx, i in enumerate(range(0, M, size)):
    outfile = "models/distance/{0}_{1:02}.pkl".format(filename, idx)
    with open(outfile, "rb") as fh:
        sub_mat = pickle.load(fh)
    chunks.append(sub_mat)
    print("Read {} rows from {}".format(len(sub_mat), outfile))
# Stack the row chunks back into a single array.
res = np.concatenate(chunks)

Read metadata from models/distance/aka_name_jw_mat_meta.pkl
Read 1000 rows from models/distance/aka_name_jw_mat_00.pkl
Read 1000 rows from models/distance/aka_name_jw_mat_01.pkl
Read 1000 rows from models/distance/aka_name_jw_mat_02.pkl
Read 1000 rows from models/distance/aka_name_jw_mat_03.pkl
Read 1000 rows from models/distance/aka_name_jw_mat_04.pkl
Read 1000 rows from models/distance/aka_name_jw_mat_05.pkl
Read 1000 rows from models/distance/aka_name_jw_mat_06.pkl
Read 1000 rows from models/distance/aka_name_jw_mat_07.pkl
Read 1000 rows from models/distance/aka_name_jw_mat_08.pkl
Read 1000 rows from models/distance/aka_name_jw_mat_09.pkl
Read 450 rows from models/distance/aka_name_jw_mat_10.pkl


In [7]:
# Display the reassembled matrix; the repr of a large array is abbreviated with "...".
res

array([[ 0.        ,  0.40277778,  0.43518519, ...,  0.375     ,
         0.51851852,  0.56547619],
       [ 0.        ,  0.        ,  0.51851852, ...,  0.51851852,
         0.44444444,  0.58994709],
       [ 0.        ,  0.        ,  0.        , ...,  0.59259259,
         0.51851852,  0.58994709],
       ..., 
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.4537037 ,  0.5952381 ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.5952381 ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ]])

In [8]:
# Report the container type and the dimensions of the reassembled matrix.
type(res), res.shape

(numpy.ndarray, (10450, 10450))

In [9]:
# Round-trip check: every element read back from the chunk files must equal
# the corresponding element of the in-memory matrix.
# NOTE(review): `mat` is only assigned in the fenced cells earlier in the
# notebook — confirm it exists when running from a fresh kernel.
(mat == res).all()

True

In [10]:
def read_distance_matrix(filename, base_dir="models/distance"):
    """Load a row-chunked matrix from disk and mirror its upper triangle.

    Reads ``<base_dir>/<filename>_meta.pkl``, which holds the tuple
    ``(M, N, size)``: the matrix shape and the number of rows stored per chunk
    file. The chunk files ``<base_dir>/<filename>_00.pkl``, ``_01.pkl``, ...
    are loaded in order and stacked row-wise. Within the leading square block,
    every entry above the main diagonal is then copied to its mirrored position
    below the diagonal, and the diagonal itself is set to 1.0.

    Parameters
    ----------
    filename : str
        Base name shared by the metadata file and the chunk files.
    base_dir : str, optional
        Directory containing the files. Defaults to ``"models/distance"``.

    Returns
    -------
    numpy.ndarray
        The reassembled array of shape ``(M, N)``.

    Raises
    ------
    ValueError
        If the stacked chunks do not have the shape recorded in the metadata.
    """
    metafile = "{}/{}_meta.pkl".format(base_dir, filename)
    print("Read metadata from {}".format(metafile))
    # pickle can execute arbitrary code while loading; only point this at
    # files produced by the matching writer.
    with open(metafile, "rb") as fh:
        M, N, size = pickle.load(fh)

    chunks = []
    # Chunks were cut along rows, so the file count follows M, not N.
    for idx, _ in enumerate(range(0, M, size)):
        outfile = "{0}/{1}_{2:02}.pkl".format(base_dir, filename, idx)
        with open(outfile, "rb") as fh:
            sub_mat = pickle.load(fh)
        chunks.append(sub_mat)
        print("Read {} rows from {}".format(len(sub_mat), outfile))
    res = np.concatenate(chunks)
    if res.shape != (M, N):
        raise ValueError(
            "Chunks for {!r} have shape {}, metadata says {}".format(
                filename, res.shape, (M, N)
            )
        )

    # Mirror one row at a time with slice assignment instead of visiting every
    # (i, j) pair in Python; no temporary copy of the matrix is needed.
    square = min(M, N)
    for i in range(square):
        res[i + 1:square, i] = res[i, i + 1:square]
        # NOTE(review): 1.0 on the diagonal is kept from the original code, but
        # it is an unusual self-distance (name_distance is 1 - similarity) —
        # confirm it is intended before clustering on this matrix.
        res[i, i] = 1.0
    return res

```
from sklearn.cluster import DBSCAN


# Cluster directly on pairwise distances (metric="precomputed") and show how
# many distinct labels were produced.
# NOTE(review): `dist_mat` is not assigned in the visible cells — presumably
# the result of read_distance_matrix; confirm.
dbscan = DBSCAN(
    metric="precomputed",
    min_samples=1,
    eps=0.15,
)
clusters = dbscan.fit_predict(dist_mat)
np.unique(clusters).size
```

```
# Group the raw names by cluster label: label -> list of names, in input order.
cluster_dict = {}
for name, cluster in zip(RAW_NAMES, clusters):
    cluster_dict.setdefault(cluster, []).append(name)
```

```
import textdistance
import string
from sklearn.cluster import DBSCAN


# Substrings deleted from every name, in this order: each punctuation
# character, a handful of filler words, and finally the space character.
# Everything is casefolded so it matches the casefolded input.
BANNED = [
    token.casefold()
    for token in (
        list(string.punctuation)
        + ["restaurant", "house", "kitchen", "original", "the", "new"]
        + [" "]
    )
]
# A name is truncated at the first occurrence of each of these characters.
CUT_CHARS = ["#", "("]


def clean_name(name):
    """Normalise a name for fuzzy comparison.

    The name is casefolded, truncated just before the first ``#`` and then
    just before the first ``(`` (unless that character is the very first one),
    and finally stripped of every ``BANNED`` substring. Removal is plain
    substring replacement, so a banned word is also removed when it occurs
    inside a longer word.
    """
    result = name.casefold()
    for marker in CUT_CHARS:
        pos = result.find(marker)
        # -1 means "absent"; 0 means the marker leads the string, in which
        # case truncating would leave nothing, so the text is kept whole.
        if pos <= 0:
            continue
        result = result[:pos]
    for token in BANNED:
        result = result.replace(token, "")
    return result


def name_distance(a, b):
    """Return one minus the Jaro-Winkler similarity of ``a`` and ``b``.

    The similarity is taken from ``textdistance.jaro_winkler``.
    """
    return 1 - textdistance.jaro_winkler.similarity(a, b)


# Distinct values of the "aka_name" column, and their cleaned counterparts in
# the same order; the cell displays how many there are.
RAW_NAMES = np.unique(df_all["aka_name"].values)
NAMES = list(map(clean_name, RAW_NAMES))
len(NAMES)
```