**TODO**
- Properly handle empty attributes

## Load Dataset

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import sys

sys.path.insert(0, '..')

In [3]:
# NOTE(review): n2 is imported in this early cell, presumably so that it is loaded
# before torch -- the original note reports a conflict with the libgomp that ships
# with PyTorch. TODO: find a cleaner way to resolve that conflict.
from n2 import HnswIndex
import os

In [4]:
import os

# Resolve the home directory portably: os.getenv('HOME') returns None when HOME is
# unset, which would silently turn the dataset glob below into 'None/Downloads/...'.
home_dir = os.path.expanduser('~')

https://dbs.uni-leipzig.de/research/projects/object_matching/benchmark_datasets_for_entity_resolution

https://www.informatik.uni-leipzig.de/~saeedi/musicBrainz_readme.txt

```
5 sources
---------- 
TID: a unique record's id (in the complete dataset).
CID: cluster id (records having the same CID are duplicate)
CTID: a unique id within a cluster (if two records belong to the same cluster they will have the same CID but different CTIDs). These ids (CTID) start with 1 and grow until cluster size.
SourceID: identifies to which source a record belongs (there are five sources). The sources are deduplicated.
Id: the original id from the source. Each source has its own Id-Format. Uniqueness is not guaranteed!! (can be ignored).
number: track or song number in the album.
length: the length of the track.
artist: the interpreter (artist or band) of the track.
year: date of publication.
language: language of the track.
```

In [5]:
import glob
import csv
import tqdm

# Load every CSV row into `row_dict`, keyed by a sequential integer id that is
# also stored on the row itself under 'id'.
current_row_id = 0
row_dict = {}
rows_total = 19375  # only used to size the progress bar
cluster_id_attr = 'CID'

with tqdm.tqdm(total=rows_total) as pbar:
    for filename in glob.glob(f'{home_dir}/Downloads/musicbrainz-20-A01.csv.dapo'):
        # newline='' lets the csv module do its own newline handling (required for
        # quoted fields that contain line breaks); the explicit encoding makes the
        # load independent of the platform locale.
        with open(filename, newline='', encoding='utf-8') as f:
            for row in csv.DictReader(f):
                row['id'] = current_row_id
                row[cluster_id_attr] = int(row[cluster_id_attr])  # convert cluster_id_attr to int
                row_dict[current_row_id] = row
                current_row_id += 1
                pbar.update(1)

100%|██████████| 19375/19375 [00:00<00:00, 186379.09it/s]


In [6]:
row_dict[1]

{'TID': '2',
 'CID': 2512,
 'CTID': '5',
 'SourceID': '4',
 'id': 1,
 'number': '7',
 'title': '007',
 'length': '1m 58sec',
 'artist': '[unknown]',
 'album': 'Cantigas de roda (unknown)',
 'year': 'null',
 'language': 'Por.'}

In [7]:
# Attributes passed to the row encoder; every one of them is also listed as multi-token.
attr_list = ['title', 'artist', 'album']
is_multitoken_attr_list = list(attr_list)

Preprocess: transliterate the selected attributes to ASCII, lowercase them, and truncate long tokens/values.

In [8]:
import unidecode
from entity_embed.data_utils.one_hot_encoders import _default_tokenizer_fn

def clean_str(s):
    """Normalize an attribute value before it is encoded.

    The value is transliterated with ``unidecode``, lowercased and stripped, then
    split with ``_default_tokenizer_fn``. Each token is cut to 30 characters and
    the space-joined result is cut to 100 characters.

    ``None`` (which ``csv.DictReader`` yields for fields missing from a short row)
    is treated as an empty string instead of raising inside ``unidecode``.
    """
    if s is None:
        return ''
    s = unidecode.unidecode(s).lower().strip()
    return ' '.join(s_part[:30] for s_part in _default_tokenizer_fn(s))[:100]

# Normalize the selected attributes of every row in place.
for record in row_dict.values():
    record.update({name: clean_str(record[name]) for name in attr_list})

In [9]:
len(set(r[cluster_id_attr] for r in row_dict.values()))

10000

In [10]:
# Rows ordered by cluster id (stable sort), so rows of one cluster are contiguous.
row_list = sorted(row_dict.values(), key=lambda row: row[cluster_id_attr])

In [11]:
from ordered_set import OrderedSet  # ensure reproducibility
import itertools

# Build every within-cluster pair of row ids; each pair is stored as a sorted tuple.
true_pair_set = OrderedSet()
for __, cluster_rows in itertools.groupby(row_list, key=lambda row: row[cluster_id_attr]):
    for member_a, member_b in itertools.combinations(cluster_rows, 2):
        true_pair_set.add(tuple(sorted((member_a['id'], member_b['id']))))
len(true_pair_set)

16250

In [12]:
[row_dict[id_] for id_ in next(iter(true_pair_set))]

[{'TID': '1',
  'CID': 1,
  'CTID': '1',
  'SourceID': '2',
  'id': 0,
  'number': '9',
  'title': "daniel balavoine - l ' enfant aux yeux d ' italie",
  'length': '219',
  'artist': '',
  'album': 'de vous a elle en passant par moi',
  'year': '75',
  'language': 'French'},
 {'TID': '15184',
  'CID': 1,
  'CTID': '2',
  'SourceID': '3',
  'id': 15183,
  'number': '9',
  'title': "l ' enfant aux yeux d ' italie - de vous a elle en passant par moi",
  'length': '3.663',
  'artist': 'daniel balavoine',
  'album': '',
  'year': "'75",
  'language': 'French'}]

In [13]:
from entity_embed.evaluation import precision_and_recall

## Pairs

In [14]:
import random

random_seed = 42
random.seed(random_seed)

In [15]:
len(true_pair_set)

16250

In [16]:
import random

train_len = 5_000
# Sample from an explicit list: random.sample() on set-like populations is
# deprecated since Python 3.9, and OrderedSet is a MutableSet. The sampled pairs
# are the same as before because sampling is done by position either way.
train_pair_set = OrderedSet(random.sample(list(true_pair_set), train_len))
# Every true pair that was not sampled for training is used for validation.
valid_pair_set = true_pair_set - train_pair_set

print(len(train_pair_set))
print(len(valid_pair_set))

5000
11250


In [17]:
# Ids of all rows that take part in at least one training pair (first-seen order).
train_id_set = OrderedSet()
for pair in train_pair_set:
    for id_ in pair:
        train_id_set.add(id_)
len(train_id_set)

7582

In [18]:
# Rows referenced by the training pairs, keyed by row id.
train_row_dict = dict((id_, row_dict[id_]) for id_ in train_id_set)
len(train_row_dict)

7582

In [19]:
# Ids of all rows that take part in at least one validation pair (first-seen order).
valid_id_set = OrderedSet()
for pair in valid_pair_set:
    for id_ in pair:
        valid_id_set.add(id_)
len(valid_id_set)

12381

In [20]:
# Rows referenced by the validation pairs, keyed by row id, in valid_id_set order.
valid_row_dict = dict((id_, row_dict[id_]) for id_ in valid_id_set)
len(valid_row_dict)

12381

## Self-training

In [21]:
import torch
import numpy as np

In [22]:
torch.manual_seed(random_seed)
np.random.seed(random_seed)

In [23]:
from entity_embed.data_utils.one_hot_encoders import RowOneHotEncoder

# Character vocabulary handed to the encoder: digits, lowercase ASCII letters,
# ASCII punctuation and the space character.
encoder_alphabet = list('0123456789abcdefghijklmnopqrstuvwxyz!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~ ')


def _whitespace_tokenizer(s):
    """Split a string on runs of whitespace."""
    return s.split()


row_encoder = RowOneHotEncoder(
    row_dict,
    attr_list=attr_list,
    is_multitoken_attr_list=is_multitoken_attr_list,
    tokenizer_fn=_whitespace_tokenizer,
    alphabet=encoder_alphabet,
    show_progress=True,
)

100%|██████████| 3/3 [00:00<00:00, 25.58it/s]


In [24]:
from entity_embed.data_utils.datasets import PairDataset

# Batch-size settings forwarded to PairDataset
# (presumably positive / negative pairs per batch -- confirm against PairDataset).
pos_pair_batch_size = 45
neg_pair_batch_size = 1225

pair_dataset_kwargs = dict(
    row_dict=train_row_dict,
    cluster_attr=cluster_id_attr,
    row_encoder=row_encoder,
    pos_pair_batch_size=pos_pair_batch_size,
    neg_pair_batch_size=neg_pair_batch_size,
)
train_dataset = PairDataset(**pair_dataset_kwargs)

In [25]:
import os

# batch_size=None disables the DataLoader's automatic batching, so each dataset
# item is passed through as-is. One worker process is started per CPU.
loader_workers = os.cpu_count()
train_loader = torch.utils.data.DataLoader(
    dataset=train_dataset,
    batch_size=None,
    shuffle=True,
    num_workers=loader_workers,
    multiprocessing_context='fork',
)

In [26]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

device

device(type='cuda')

In [27]:
epochs = 20

In [28]:
# https://github.com/pytorch/examples/blob/master/mnist/main.py
import time

from pytorch_metric_learning.distances import CosineSimilarity
from pytorch_metric_learning.losses import NTXentLoss
from pytorch_metric_learning.miners import BatchHardMiner

from entity_embed.models import BlockerNet, get_current_signature_weights
from entity_embed.trainer import train_epoch, valid_epoch

# Build the model, loss, miner and optimizer, then train for `epochs` epochs while
# reporting the running mean loss on a progress bar.
model = BlockerNet(row_encoder.attr_info_dict).to(device)
loss_func = NTXentLoss(temperature=0.1)
mining_func = BatchHardMiner(distance=CosineSimilarity())
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# The bar total counts one tick per training batch over all epochs.
with tqdm.tqdm(total=epochs * len(train_loader), desc="# training") as p_bar:
    for epoch in range(epochs):
        # Validation is currently disabled; the original two-phase loop was:
        # for phase in ['train', 'valid']:
        for phase in ['train']:
            if phase == 'valid' and epoch % 5 != 0:
                # valid only every 5 epochs
                continue
            elif phase == 'train':
                epoch_func = train_epoch
                epoch_args = dict(
                    model=model,
                    loss_func=loss_func,
                    mining_func=mining_func,
                    device=device,
                    train_loader=train_loader,
                    optimizer=optimizer,
                    epoch=epoch)
            else:
                # NOTE(review): `valid_block_loader` is not defined in the visible
                # notebook; re-enabling the 'valid' phase would raise NameError
                # unless it is created first.
                epoch_func = valid_epoch
                epoch_args = dict(
                    model=model,
                    loss_func=loss_func,
                    device=device,
                    valid_loader=valid_block_loader)
            
            loss_agg = 0.0
            start_time = time.time()
            
            # epoch_func is iterated here, one loss value per step; the values
            # are summed so the running mean can be displayed.
            for idx, loss_item in enumerate(epoch_func(**epoch_args)):
                loss_agg += loss_item
                if phase == 'train':
                    p_bar.update(1)
                    p_bar.set_description(
                        "# Train Epoch: %3d Time: %.3f Loss: %.3f"
                        % (
                            epoch,
                            time.time() - start_time,
                            loss_agg / (idx + 1),
                        )
                    )

            if phase == 'train':
                # Print the current per-attribute signature weights after each training epoch.
                print(get_current_signature_weights(row_encoder.attr_info_dict, model))
            else:
                # NOTE(review): `idx` is unbound here if epoch_func yielded nothing.
                print("Valid loss: %.5f" % (loss_agg / (idx + 1)))

# Train Epoch:   0 Time: 14.029 Loss: 0.263:   5%|▌         | 178/3560 [00:14<04:22, 12.90it/s]

[('title', tensor(0.3527, device='cuda:0')), ('artist', tensor(0.3054, device='cuda:0')), ('album', tensor(0.3419, device='cuda:0'))]


# Train Epoch:   1 Time: 13.762 Loss: 0.067:  10%|█         | 356/3560 [00:27<03:59, 13.40it/s]

[('title', tensor(0.3554, device='cuda:0')), ('artist', tensor(0.3004, device='cuda:0')), ('album', tensor(0.3442, device='cuda:0'))]


# Train Epoch:   2 Time: 13.451 Loss: 0.033:  15%|█▌        | 534/3560 [00:41<03:42, 13.58it/s]

[('title', tensor(0.3572, device='cuda:0')), ('artist', tensor(0.3017, device='cuda:0')), ('album', tensor(0.3411, device='cuda:0'))]


# Train Epoch:   3 Time: 13.702 Loss: 0.021:  20%|██        | 712/3560 [00:55<03:49, 12.41it/s]

[('title', tensor(0.3599, device='cuda:0')), ('artist', tensor(0.2993, device='cuda:0')), ('album', tensor(0.3408, device='cuda:0'))]


# Train Epoch:   4 Time: 14.267 Loss: 0.017:  25%|██▌       | 890/3560 [01:09<03:19, 13.38it/s]

[('title', tensor(0.3630, device='cuda:0')), ('artist', tensor(0.2967, device='cuda:0')), ('album', tensor(0.3403, device='cuda:0'))]


# Train Epoch:   5 Time: 13.773 Loss: 0.015:  30%|███       | 1068/3560 [01:23<03:04, 13.50it/s]

[('title', tensor(0.3639, device='cuda:0')), ('artist', tensor(0.2935, device='cuda:0')), ('album', tensor(0.3426, device='cuda:0'))]


# Train Epoch:   6 Time: 13.895 Loss: 0.014:  35%|███▌      | 1246/3560 [01:37<02:55, 13.22it/s]

[('title', tensor(0.3659, device='cuda:0')), ('artist', tensor(0.2896, device='cuda:0')), ('album', tensor(0.3445, device='cuda:0'))]


# Train Epoch:   7 Time: 13.444 Loss: 0.013:  40%|████      | 1424/3560 [01:50<02:43, 13.10it/s]

[('title', tensor(0.3678, device='cuda:0')), ('artist', tensor(0.2886, device='cuda:0')), ('album', tensor(0.3436, device='cuda:0'))]


# Train Epoch:   8 Time: 13.706 Loss: 0.013:  45%|████▌     | 1602/3560 [02:04<02:24, 13.55it/s]

[('title', tensor(0.3696, device='cuda:0')), ('artist', tensor(0.2884, device='cuda:0')), ('album', tensor(0.3420, device='cuda:0'))]


# Train Epoch:   9 Time: 13.602 Loss: 0.012:  50%|█████     | 1780/3560 [02:18<02:22, 12.53it/s]

[('title', tensor(0.3730, device='cuda:0')), ('artist', tensor(0.2829, device='cuda:0')), ('album', tensor(0.3441, device='cuda:0'))]


# Train Epoch:  10 Time: 13.586 Loss: 0.012:  55%|█████▌    | 1958/3560 [02:31<01:58, 13.46it/s]

[('title', tensor(0.3749, device='cuda:0')), ('artist', tensor(0.2786, device='cuda:0')), ('album', tensor(0.3465, device='cuda:0'))]


# Train Epoch:  11 Time: 14.004 Loss: 0.011:  60%|██████    | 2136/3560 [02:45<01:52, 12.66it/s]

[('title', tensor(0.3755, device='cuda:0')), ('artist', tensor(0.2774, device='cuda:0')), ('album', tensor(0.3471, device='cuda:0'))]


# Train Epoch:  12 Time: 13.781 Loss: 0.010:  65%|██████▌   | 2314/3560 [02:59<01:44, 11.92it/s]

[('title', tensor(0.3781, device='cuda:0')), ('artist', tensor(0.2739, device='cuda:0')), ('album', tensor(0.3480, device='cuda:0'))]


# Train Epoch:  13 Time: 13.842 Loss: 0.010:  70%|███████   | 2492/3560 [03:13<01:19, 13.48it/s]

[('title', tensor(0.3827, device='cuda:0')), ('artist', tensor(0.2688, device='cuda:0')), ('album', tensor(0.3486, device='cuda:0'))]


# Train Epoch:  14 Time: 14.136 Loss: 0.010:  75%|███████▌  | 2670/3560 [03:27<01:07, 13.19it/s]

[('title', tensor(0.3847, device='cuda:0')), ('artist', tensor(0.2700, device='cuda:0')), ('album', tensor(0.3453, device='cuda:0'))]


# Train Epoch:  15 Time: 13.952 Loss: 0.009:  80%|████████  | 2848/3560 [03:41<00:56, 12.64it/s]

[('title', tensor(0.3854, device='cuda:0')), ('artist', tensor(0.2656, device='cuda:0')), ('album', tensor(0.3491, device='cuda:0'))]


# Train Epoch:  16 Time: 13.884 Loss: 0.009:  85%|████████▌ | 3026/3560 [03:55<00:41, 12.74it/s]

[('title', tensor(0.3856, device='cuda:0')), ('artist', tensor(0.2639, device='cuda:0')), ('album', tensor(0.3504, device='cuda:0'))]


# Train Epoch:  17 Time: 14.012 Loss: 0.008:  90%|█████████ | 3204/3560 [04:09<00:29, 12.27it/s]

[('title', tensor(0.3881, device='cuda:0')), ('artist', tensor(0.2604, device='cuda:0')), ('album', tensor(0.3516, device='cuda:0'))]


# Train Epoch:  18 Time: 14.110 Loss: 0.008:  95%|█████████▌| 3382/3560 [04:23<00:13, 13.20it/s]

[('title', tensor(0.3881, device='cuda:0')), ('artist', tensor(0.2580, device='cuda:0')), ('album', tensor(0.3539, device='cuda:0'))]


# Train Epoch:  19 Time: 14.113 Loss: 0.008: 100%|██████████| 3560/3560 [04:38<00:00, 12.80it/s]

[('title', tensor(0.3892, device='cuda:0')), ('artist', tensor(0.2560, device='cuda:0')), ('album', tensor(0.3548, device='cuda:0'))]





In [29]:
# torch.save(model, "music_model.torch")

In [30]:
# model = torch.load("music_model.torch")

In [31]:
from entity_embed.data_utils.datasets import RowDataset

# Positional view of the validation ids, used later to map indices back to row ids.
valid_id_list = list(valid_id_set)

row_batch_size = 32
valid_row_dataset = RowDataset(
    row_dict=valid_row_dict,
    row_encoder=row_encoder,
    batch_size=row_batch_size,
)

# Loaded in the main process, unshuffled. To parallelize, the original left these
# options commented out: num_workers=os.cpu_count(), multiprocessing_context='fork'
valid_row_loader = torch.utils.data.DataLoader(
    dataset=valid_row_dataset,
    batch_size=None,
    shuffle=False,
)

In [32]:
import tqdm
import os
import torch.nn.functional as F

# Embed every validation row batch by batch and collect one numpy vector per row.
model.eval()
valid_embed_matrix = []
# Inference only: torch.no_grad() avoids building an autograd graph for every
# batch, so the outputs can be converted with .numpy() directly.
with torch.no_grad(), tqdm.tqdm(total=len(valid_row_loader), desc="# batch embedding") as p_bar:
    for t_dict, t_lengths_dict in valid_row_loader:
        t_dict = {attr: t.to(device) for attr, t in t_dict.items()}
        batch_embeddings = model(t_dict, t_lengths_dict).cpu()
        # unbind() splits the batch along its first dimension: one vector per row.
        valid_embed_matrix.extend(v.numpy() for v in batch_embeddings.unbind())
        p_bar.update(1)

# batch embedding: 100%|██████████| 387/387 [00:10<00:00, 36.40it/s]


In [33]:
# Map each validation row id to its position in valid_id_set.
valid_id_to_index = {}
for index, id_ in enumerate(valid_id_set):
    valid_id_to_index[id_] = index

In [34]:
%%time

# Index construction parameters, see
# https://github.com/nmslib/hnswlib/blob/master/ALGO_PARAMS.md#construction-parameters
ef_construction = 150
M = 64
metric = 'angular'

# The index dimension is taken from the first embedding vector.
embedding_dim = valid_embed_matrix[0].shape[0]
approx_knn_index = HnswIndex(dimension=embedding_dim, metric=metric)
for embedding in valid_embed_matrix:
    approx_knn_index.add_data(embedding)

build_params = dict(
    m=M,
    max_m0=M,
    ef_construction=ef_construction,
    n_threads=os.cpu_count(),
)
approx_knn_index.build(**build_params)

CPU times: user 4.59 s, sys: 40.8 ms, total: 4.64 s
Wall time: 551 ms


In [35]:
%%time

# Query the index with every indexed item, asking for `ntop` neighbours each.
ntop = 10

all_item_ids = list(range(len(valid_embed_matrix)))
search_params = dict(
    item_ids=all_item_ids,
    k=ntop,
    ef_search=-1,
    num_threads=os.cpu_count(),
    include_distances=True,
)
neighbor_distance_list = approx_knn_index.batch_search_by_ids(**search_params)

CPU times: user 8.36 s, sys: 5.94 ms, total: 8.37 s
Wall time: 726 ms


In [36]:
neighbor_distance_list[0]

[(0, 0.0),
 (1, 0.11829233169555664),
 (1475, 0.4682372212409973),
 (1474, 0.48449021577835083),
 (10464, 0.5463501811027527),
 (2039, 0.5763387680053711),
 (10463, 0.5765520334243774),
 (1332, 0.5827150344848633),
 (10466, 0.5829010009765625),
 (7618, 0.586234450340271)]

In [37]:
threshold = 0.5
distance_threshold = 1 - threshold

In [38]:
[row_dict[valid_id_list[neighbor]] for neighbor, distance in neighbor_distance_list[0] if distance <= distance_threshold]

[{'TID': '1',
  'CID': 1,
  'CTID': '1',
  'SourceID': '2',
  'id': 0,
  'number': '9',
  'title': "daniel balavoine - l ' enfant aux yeux d ' italie",
  'length': '219',
  'artist': '',
  'album': 'de vous a elle en passant par moi',
  'year': '75',
  'language': 'French'},
 {'TID': '15184',
  'CID': 1,
  'CTID': '2',
  'SourceID': '3',
  'id': 15183,
  'number': '9',
  'title': "l ' enfant aux yeux d ' italie - de vous a elle en passant par moi",
  'length': '3.663',
  'artist': 'daniel balavoine',
  'album': '',
  'year': "'75",
  'language': 'French'},
 {'TID': '2160',
  'CID': 1135,
  'CTID': '1',
  'SourceID': '5',
  'id': 2159,
  'number': '7',
  'title': "t ' as d ' beaux yeux tu sais",
  'length': '147800',
  'artist': 'serge reggiani',
  'album': 'enfants , soyez meilleurs que nous',
  'year': '2000',
  'language': 'French'},
 {'TID': '1731',
  'CID': 1135,
  'CTID': '2',
  'SourceID': '1',
  'id': 1730,
  'number': '007',
  'title': "t ' as d ' beaux yeux tu sais ( enfants ,

In [39]:
# tqdm.tqdm_notebook is deprecated in favour of tqdm.notebook.tqdm. It is imported
# under an alias so the module-level name `tqdm` used by other cells is not shadowed.
from tqdm.notebook import tqdm as notebook_tqdm

# Collect every (row id, row id) pair whose neighbour distance is within the threshold.
found_pair_set = set()

for i, neighbor_distance in notebook_tqdm(enumerate(neighbor_distance_list), total=len(neighbor_distance_list)):
    for j, distance in neighbor_distance:
        # Skip the item itself; indices are translated back to row ids and the
        # pair is stored sorted.
        if i != j and distance <= distance_threshold:
            pair = tuple(sorted([valid_id_list[i], valid_id_list[j]]))
            found_pair_set.add(pair)

len(found_pair_set)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for i, neighbor_distance in tqdm.tqdm_notebook(enumerate(neighbor_distance_list), total=len(neighbor_distance_list)):


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=12381.0), HTML(value='')))




22033

In [40]:
def pe_ratio(found_pair_set, valid_id_list):
    """Return the number of found pairs divided by the number of ids."""
    pair_count = len(found_pair_set)
    entity_count = len(valid_id_list)
    return pair_count / entity_count

pe_ratio(found_pair_set, valid_id_list)

1.7795816169937808

In [41]:
precision_and_recall(found_pair_set, valid_pair_set)

(0.5039259292878864, 0.9869333333333333)

In [42]:
false_positives = list(found_pair_set - valid_pair_set)
len(false_positives)

10930

In [43]:
false_negatives = list(valid_pair_set - found_pair_set)
len(false_negatives)

147

In [44]:
def cos_similarity(a, b):
    """Return the cosine similarity of vectors ``a`` and ``b``.

    The dot product is divided by the product of the two norms, so the result is
    a true cosine even when the inputs are not unit-length (a bare dot product
    only equals the cosine for normalized vectors). If either vector has zero
    norm, 0.0 is returned instead of dividing by zero.
    """
    denominator = np.linalg.norm(a) * np.linalg.norm(b)
    if denominator == 0:
        return 0.0
    return np.dot(a, b) / denominator

In [45]:
# Show the first ten missed pairs together with the similarity of their embeddings.
for id_left, id_right in false_negatives[:10]:
    embed_left = valid_embed_matrix[valid_id_to_index[id_left]]
    embed_right = valid_embed_matrix[valid_id_to_index[id_right]]
    display((cos_similarity(embed_left, embed_right), row_dict[id_left], row_dict[id_right]))

(0.014535156,
 {'TID': '490',
  'CID': 266,
  'CTID': '1',
  'SourceID': '3',
  'id': 489,
  'number': '5',
  'title': 'onriman - sakurada zhan di ju ge yao quan ji',
  'length': '5.715',
  'artist': 'tachibanamaria',
  'album': '',
  'year': '',
  'language': 'Japanese'},
 {'TID': '1355',
  'CID': 266,
  'CTID': '2',
  'SourceID': '4',
  'id': 1354,
  'number': '5',
  'title': '005 -',
  'length': '5m 42sec',
  'artist': '',
  'album': '( unknown )',
  'year': 'null',
  'language': 'Jap.'})

(0.09722456,
 {'TID': '561',
  'CID': 298,
  'CTID': '1',
  'SourceID': '3',
  'id': 560,
  'number': '6',
  'title': "prikhodi - zn @ menatel '",
  'length': '4.055',
  'artist': 'splin',
  'album': '',
  'year': "'00",
  'language': 'Russian'},
 {'TID': '2355',
  'CID': 298,
  'CTID': '2',
  'SourceID': '4',
  'id': 2354,
  'number': '6',
  'title': '006 -',
  'length': '4m 3sec',
  'artist': '',
  'album': '@ ( 2000 )',
  'year': 'null',
  'language': 'Rus.'})

(0.0596832,
 {'TID': '2355',
  'CID': 298,
  'CTID': '2',
  'SourceID': '4',
  'id': 2354,
  'number': '6',
  'title': '006 -',
  'length': '4m 3sec',
  'artist': '',
  'album': '@ ( 2000 )',
  'year': 'null',
  'language': 'Rus.'},
 {'TID': '3390',
  'CID': 298,
  'CTID': '3',
  'SourceID': '5',
  'id': 3389,
  'number': '6',
  'title': 'prikhodi',
  'length': '243293',
  'artist': 'splin',
  'album': "zn @ menatel '",
  'year': '2000',
  'language': 'Russian'})

(0.1449579,
 {'TID': '831',
  'CID': 442,
  'CTID': '1',
  'SourceID': '3',
  'id': 830,
  'number': '13',
  'title': '19841031',
  'length': '8.667',
  'artist': 'u2',
  'album': '',
  'year': '',
  'language': 'English'},
 {'TID': '6047',
  'CID': 442,
  'CTID': '2',
  'SourceID': '4',
  'id': 6046,
  'number': '13',
  'title': '013 - bad',
  'length': '8m 40sec',
  'artist': 'u2',
  'album': '1984 - 10 - 31 : sportpaleis ahoy , rotterdam , netherlands ( unknown )',
  'year': 'null',
  'language': 'nEg.'})

(0.22457673,
 {'TID': '831',
  'CID': 442,
  'CTID': '1',
  'SourceID': '3',
  'id': 830,
  'number': '13',
  'title': '19841031',
  'length': '8.667',
  'artist': 'u2',
  'album': '',
  'year': '',
  'language': 'English'},
 {'TID': '11882',
  'CID': 442,
  'CTID': '3',
  'SourceID': '5',
  'id': 11881,
  'number': '13',
  'title': 'bad',
  'length': '520000',
  'artist': 'u2',
  'album': '1984 - 10 - 31 : sportpaleis ahoy , rotterdam , netherlands',
  'year': '',
  'language': 'English'})

(0.24256386,
 {'TID': '831',
  'CID': 442,
  'CTID': '1',
  'SourceID': '3',
  'id': 830,
  'number': '13',
  'title': '19841031',
  'length': '8.667',
  'artist': 'u2',
  'album': '',
  'year': '',
  'language': 'English'},
 {'TID': '18004',
  'CID': 442,
  'CTID': '4',
  'SourceID': '1',
  'id': 18003,
  'number': '013',
  'title': 'bad ( 1984 - 10 - 31 : sportpaleis ahoy , rotterdam , netherlands )',
  'length': '08:40',
  'artist': 'u2',
  'album': '1984 - 10 - 31 : sportpaleis ahoy , rotterdam , netherlands',
  'year': '',
  'language': ''})

(-0.10132472,
 {'TID': '887',
  'CID': 470,
  'CTID': '1',
  'SourceID': '1',
  'id': 886,
  'number': '006',
  'title': "sweet and slow ( guy ' s all - star shoe band )",
  'length': '03:04',
  'artist': "guy ' s all star shoe band",
  'album': "guy ' s all - star shoe band",
  'year': '',
  'language': ''},
 {'TID': '17706',
  'CID': 470,
  'CTID': '2',
  'SourceID': '2',
  'id': 17705,
  'number': 'MBox10988814-HH',
  'title': '6',
  'length': "Guy's All Star Shoe Band - Sweet and Slow",
  'artist': '1184',
  'album': '',
  'year': "Guy's All-Star Shoe Band",
  'language': ''})

(0.3220865,
 {'TID': '948',
  'CID': 502,
  'CTID': '1',
  'SourceID': '3',
  'id': 947,
  'number': '16',
  'title': "let ' s get together - subarashikikonosekai original soundtrack",
  'length': '0.267',
  'artist': 'shi yuan zhang qing',
  'album': '',
  'year': "'07",
  'language': 'Japanese'},
 {'TID': '7146',
  'CID': 502,
  'CTID': '2',
  'SourceID': '4',
  'id': 7145,
  'number': '16',
  'title': '',
  'length': '0m 16sec',
  'artist': '',
  'album': 'soundtrack',
  'year': 'null',
  'language': 'Jap.'})

(0.4551689,
 {'TID': '984',
  'CID': 521,
  'CTID': '1',
  'SourceID': '2',
  'id': 983,
  'number': '11',
  'title': "al martino - you ' re the love of my life",
  'length': '178',
  'artist': '',
  'album': 'best of jose carreras gala',
  'year': '04',
  'language': '[Multiple languages]'},
 {'TID': '2803',
  'CID': 521,
  'CTID': '2',
  'SourceID': '3',
  'id': 2802,
  'number': '11',
  'title': "you ' re",
  'length': '2.967',
  'artist': 'al martino',
  'album': '',
  'year': "'04",
  'language': ''})

(0.40416634,
 {'TID': '1110',
  'CID': 587,
  'CTID': '1',
  'SourceID': '4',
  'id': 1109,
  'number': '21',
  'title': '21z -',
  'length': '0m 43sec',
  'artist': '',
  'album': '3 () ( 1999 )',
  'year': 'null',
  'language': 'Jap.'},
 {'TID': '2482',
  'CID': 587,
  'CTID': '2',
  'SourceID': '5',
  'id': 2481,
  'number': '21',
  'title': 'ran se ti sanpuru',
  'length': '41888',
  'artist': 'da gu xing',
  'album': 'gamera 3 xie shen ( irisu ) jue xing',
  'year': '1999',
  'language': 'Japanesee'})