**TODO**
- Properly handle empty attributes

## Load Dataset

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import sys

sys.path.insert(0, '..')

In [3]:
# TODO: import here, conflict with libgomp from pytorch
from n2 import HnswIndex
import os

In [4]:
import os
home_dir = os.getenv('HOME')

Dataset source: https://dbs.uni-leipzig.de/research/projects/object_matching/benchmark_datasets_for_entity_resolution

Dataset readme (quoted below): https://www.informatik.uni-leipzig.de/~saeedi/musicBrainz_readme.txt

```
5 sources
---------- 
TID: a unique record's id (in the complete dataset).
CID: cluster id (records having the same CID are duplicate)
CTID: a unique id within a cluster (if two records belong to the same cluster they will have the same CID but different CTIDs). These ids (CTID) start with 1 and grow until cluster size.
SourceID: identifies to which source a record belongs (there are five sources). The sources are deduplicated.
Id: the original id from the source. Each source has its own Id-Format. Uniqueness is not guaranteed!! (can be ignored).
number: track or song number in the album.
length: the length of the track.
artist: the interpreter (artist or band) of the track.
year: date of publication.
language: language of the track.
```

In [5]:
import glob
import csv
import tqdm

# Load every record of the dataset into row_dict, keyed by a sequential integer id.
current_row_id = 0
row_dict = {}
rows_total = 1937500  # expected number of records; only used to size the progress bar
cluster_id_attr = 'CID'

dataset_paths = glob.glob(f'{home_dir}/Downloads/musicbrainz-2000-A01.csv.dapo')
if not dataset_paths:
    # Fail early: without this check a missing file yields an empty row_dict and
    # the failure only shows up several cells later.
    raise FileNotFoundError(
        f'No dataset file found under {home_dir}/Downloads (HOME may be unset or the file missing)'
    )

with tqdm.tqdm(total=rows_total) as pbar:
    for filename in dataset_paths:
        # newline='' lets the csv module do its own newline handling, so quoted
        # fields that contain line breaks are parsed correctly.
        with open(filename, newline='') as f:
            for row in csv.DictReader(f):
                # Overwrites the source's own 'id' column with our sequential id.
                row['id'] = current_row_id
                row[cluster_id_attr] = int(row[cluster_id_attr])  # convert cluster_id_attr to int
                row_dict[current_row_id] = row
                current_row_id += 1
                pbar.update(1)

100%|██████████| 1937500/1937500 [00:08<00:00, 231309.90it/s]


In [6]:
row_dict[1]

{'TID': '2',
 'CID': 2,
 'CTID': '1',
 'SourceID': '5',
 'id': 1,
 'number': '17',
 'title': 'Mustard Gas',
 'length': '129000',
 'artist': 'Action Painting!',
 'album': 'There and Back Again Lane',
 'year': '1995',
 'language': 'English'}

In [7]:
attr_list = ['title', 'artist', 'album']
is_multitoken_attr_list = ['title', 'artist', 'album']

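Preprocess: normalize the attributes used for matching (transliterate, lowercase, strip, and truncate long tokens and values).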

In [8]:
import unidecode
from entity_embed.data_utils.one_hot_encoders import _default_tokenizer_fn

def clean_str(s):
    """Normalize one attribute value before it is encoded.

    The value is passed through ``unidecode``, lowercased and stripped, then
    tokenized with ``_default_tokenizer_fn``. Each token is cut to at most 30
    characters, the tokens are joined with single spaces, and the joined
    string is cut to at most 100 characters.
    """
    normalized = unidecode.unidecode(s)
    normalized = normalized.lower().strip()
    short_tokens = [token[:30] for token in _default_tokenizer_fn(normalized)]
    joined = ' '.join(short_tokens)
    return joined[:100]

# Normalize the matching attributes of every record in place.
# NOTE(review): this overwrites the raw values stored in row_dict, so the
# originals can only be recovered by re-running the loading cell.
for row in row_dict.values():
    for attr in attr_list:
        row[attr] = clean_str(row[attr])

In [9]:
len(set(r[cluster_id_attr] for r in row_dict.values()))

1000000

In [10]:
# Records ordered by cluster id. The sort is stable, so records that share a
# cluster keep their row_dict order.
row_list = sorted(row_dict.values(), key=lambda record: record[cluster_id_attr])

In [11]:
from ordered_set import OrderedSet  # ensure reproducibility
import itertools

def _iter_true_pairs(sorted_rows):
    """Yield an id pair (smaller id first) for every two rows sharing a cluster.

    ``sorted_rows`` must already be ordered by ``cluster_id_attr``, because
    ``itertools.groupby`` only groups adjacent rows.
    """
    clusters = itertools.groupby(sorted_rows, key=lambda row: row[cluster_id_attr])
    for _cluster_id, cluster_rows in clusters:
        for row_left, row_right in itertools.combinations(cluster_rows, 2):
            id_pair = sorted((row_left['id'], row_right['id']))
            yield tuple(id_pair)


true_pair_set = OrderedSet(_iter_true_pairs(row_list))
len(true_pair_set)

1625000

In [12]:
[row_dict[id_] for id_ in next(iter(true_pair_set))]

[{'TID': '1',
  'CID': 1,
  'CTID': '1',
  'SourceID': '5',
  'id': 0,
  'number': '9',
  'title': "l ' enfant aux yeux d ' italie",
  'length': '219800',
  'artist': 'daniel balavoine',
  'album': 'de vous a elle en passant par moi',
  'year': '1975',
  'language': 'French'},
 {'TID': '262214',
  'CID': 1,
  'CTID': '2',
  'SourceID': '1',
  'id': 262213,
  'number': '009',
  'title': "l ' enfant aux yeux d ' italie ( de vous a elle en passant par moi )",
  'length': '03:39',
  'artist': 'daniel balavoine',
  'album': 'de vous a elle en passant par moi',
  'year': '1975',
  'language': ''}]

In [13]:
from entity_embed.evaluation import precision_and_recall

## Pairs

In [14]:
import random

random_seed = 42
random.seed(random_seed)

In [15]:
len(true_pair_set)

1625000

In [16]:
import random

# Draw a random sample of true pairs for training; every remaining true pair is
# used for validation. The draw depends on the random seed set in an earlier cell.
# NOTE(review): the split is over pairs, not clusters, so a record can end up in
# both the train and the valid id sets — confirm this is intended.
train_len = 5_000
train_pair_set = OrderedSet(random.sample(true_pair_set, train_len))
valid_pair_set = true_pair_set - train_pair_set

print(len(train_pair_set))
print(len(valid_pair_set))

5000
1620000


In [17]:
train_id_set = OrderedSet(id_ for pair in train_pair_set for id_ in pair)
len(train_id_set)

9962

In [18]:
train_row_dict = {id_: row_dict[id_] for id_ in train_id_set}
len(train_row_dict)

9962

In [19]:
valid_id_set = OrderedSet(id_ for pair in valid_pair_set for id_ in pair)
len(valid_id_set)

1435997

In [20]:
valid_row_dict = {id_: row_dict[id_] for id_ in valid_id_set}
len(valid_row_dict)

1435997

## Self-training

In [21]:
import torch
import numpy as np

In [22]:
torch.manual_seed(random_seed)
np.random.seed(random_seed)

In [23]:
from entity_embed.data_utils.one_hot_encoders import RowOneHotEncoder

# Encoder for the matching attributes. It is built from the full row_dict, not
# only from the training rows.
row_encoder = RowOneHotEncoder(
    row_dict,
    attr_list=attr_list,
    is_multitoken_attr_list=is_multitoken_attr_list,
    # Values were already cleaned and space-joined by clean_str, so a plain
    # whitespace split is used to tokenize them here.
    tokenizer_fn=lambda s: s.split(),
    # Characters the encoder is given: digits, lowercase ASCII letters,
    # punctuation and the space character.
    alphabet=list('0123456789abcdefghijklmnopqrstuvwxyz!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~ '),
    show_progress=True,
)

100%|██████████| 3/3 [00:11<00:00,  3.84s/it]


In [24]:
from entity_embed.data_utils.datasets import PairDataset

# Batch sizes handed to PairDataset for positive and negative pairs.
# NOTE(review): how PairDataset interprets these values is defined in
# entity_embed, not in this notebook — confirm before tuning.
pos_pair_batch_size = 45
neg_pair_batch_size = 1225

# Training dataset built only from the rows that take part in a training pair;
# rows are grouped by the cluster id attribute.
train_dataset = PairDataset(
    row_dict=train_row_dict,
    cluster_attr=cluster_id_attr, 
    row_encoder=row_encoder,
    pos_pair_batch_size=pos_pair_batch_size,
    neg_pair_batch_size=neg_pair_batch_size
)

In [25]:
import os

# batch_size=None disables the DataLoader's automatic batching, so each item
# produced by train_dataset is yielded as-is (presumably the dataset already
# yields whole batches — confirm against PairDataset).
# Loading uses one worker process per CPU, started with the 'fork' context.
train_loader = torch.utils.data.DataLoader(
    train_dataset,
    batch_size=None,
    shuffle=True,
    num_workers=os.cpu_count(),
    multiprocessing_context='fork'
)

In [26]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

device

device(type='cuda')

In [27]:
epochs = 20

In [28]:
# https://github.com/pytorch/examples/blob/master/mnist/main.py
import time

from pytorch_metric_learning.distances import CosineSimilarity
from pytorch_metric_learning.losses import NTXentLoss
from pytorch_metric_learning.miners import BatchHardMiner

from entity_embed.models import BlockerNet, get_current_signature_weights
from entity_embed.trainer import train_epoch, valid_epoch

# Model, loss, miner and optimizer used for training. The model is created from
# the encoder's attribute info and moved to the selected device.
model = BlockerNet(row_encoder.attr_info_dict).to(device)
loss_func = NTXentLoss(temperature=0.1)
mining_func = BatchHardMiner(distance=CosineSimilarity())
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# The progress bar counts training batches over all epochs.
with tqdm.tqdm(total=epochs * len(train_loader), desc="# training") as p_bar:
    for epoch in range(epochs):
        # Validation is currently disabled: only the 'train' phase runs.
        # for phase in ['train', 'valid']:
        for phase in ['train']:
            if phase == 'valid' and epoch % 5 != 0:
                # valid only every 5 epochs
                continue
            elif phase == 'train':
                epoch_func = train_epoch
                epoch_args = dict(
                    model=model,
                    loss_func=loss_func,
                    mining_func=mining_func,
                    device=device,
                    train_loader=train_loader,
                    optimizer=optimizer,
                    epoch=epoch)
            else:
                # NOTE(review): valid_block_loader is not defined in any earlier
                # cell; define it before re-enabling the 'valid' phase above.
                epoch_func = valid_epoch
                epoch_args = dict(
                    model=model,
                    loss_func=loss_func,
                    device=device,
                    valid_loader=valid_block_loader)
            
            loss_agg = 0.0
            start_time = time.time()
            
            # epoch_func is iterated for one loss value per step; loss_agg / (idx + 1)
            # is the running mean loss for the current epoch.
            for idx, loss_item in enumerate(epoch_func(**epoch_args)):
                loss_agg += loss_item
                if phase == 'train':
                    p_bar.update(1)
                    p_bar.set_description(
                        "# Train Epoch: %3d Time: %.3f Loss: %.3f"
                        % (
                            epoch,
                            time.time() - start_time,
                            loss_agg / (idx + 1),
                        )
                    )

            if phase == 'train':
                # Print the per-attribute weights after each training epoch.
                print(get_current_signature_weights(row_encoder.attr_info_dict, model))
            else:
                # NOTE(review): idx is unbound here if epoch_func yielded nothing.
                print("Valid loss: %.5f" % (loss_agg / (idx + 1)))

# Train Epoch:   0 Time: 12.547 Loss: 0.191:   5%|▌         | 114/2280 [00:12<03:27, 10.46it/s]

[('title', tensor(0.3438, device='cuda:0')), ('artist', tensor(0.3151, device='cuda:0')), ('album', tensor(0.3411, device='cuda:0'))]


# Train Epoch:   1 Time: 12.531 Loss: 0.042:  10%|█         | 228/2280 [00:25<03:17, 10.37it/s]

[('title', tensor(0.3489, device='cuda:0')), ('artist', tensor(0.3123, device='cuda:0')), ('album', tensor(0.3388, device='cuda:0'))]


# Train Epoch:   2 Time: 12.435 Loss: 0.025:  15%|█▌        | 342/2280 [00:37<03:00, 10.71it/s]

[('title', tensor(0.3503, device='cuda:0')), ('artist', tensor(0.3108, device='cuda:0')), ('album', tensor(0.3389, device='cuda:0'))]


# Train Epoch:   3 Time: 12.545 Loss: 0.018:  20%|██        | 456/2280 [00:50<02:52, 10.55it/s]

[('title', tensor(0.3509, device='cuda:0')), ('artist', tensor(0.3092, device='cuda:0')), ('album', tensor(0.3398, device='cuda:0'))]


# Train Epoch:   4 Time: 12.508 Loss: 0.015:  25%|██▌       | 570/2280 [01:03<02:43, 10.43it/s]

[('title', tensor(0.3525, device='cuda:0')), ('artist', tensor(0.3093, device='cuda:0')), ('album', tensor(0.3382, device='cuda:0'))]


# Train Epoch:   5 Time: 12.470 Loss: 0.014:  30%|███       | 684/2280 [01:15<02:33, 10.40it/s]

[('title', tensor(0.3546, device='cuda:0')), ('artist', tensor(0.3067, device='cuda:0')), ('album', tensor(0.3387, device='cuda:0'))]


# Train Epoch:   6 Time: 12.581 Loss: 0.013:  35%|███▌      | 798/2280 [01:28<02:22, 10.37it/s]

[('title', tensor(0.3564, device='cuda:0')), ('artist', tensor(0.3048, device='cuda:0')), ('album', tensor(0.3388, device='cuda:0'))]


# Train Epoch:   7 Time: 12.549 Loss: 0.012:  40%|████      | 912/2280 [01:41<02:17,  9.95it/s]

[('title', tensor(0.3573, device='cuda:0')), ('artist', tensor(0.3022, device='cuda:0')), ('album', tensor(0.3405, device='cuda:0'))]


# Train Epoch:   8 Time: 12.534 Loss: 0.011:  45%|████▌     | 1026/2280 [01:53<02:00, 10.38it/s]

[('title', tensor(0.3592, device='cuda:0')), ('artist', tensor(0.3002, device='cuda:0')), ('album', tensor(0.3406, device='cuda:0'))]


# Train Epoch:   9 Time: 12.533 Loss: 0.010:  50%|█████     | 1140/2280 [02:06<01:54,  9.94it/s]

[('title', tensor(0.3594, device='cuda:0')), ('artist', tensor(0.3002, device='cuda:0')), ('album', tensor(0.3404, device='cuda:0'))]


# Train Epoch:  10 Time: 12.714 Loss: 0.011:  55%|█████▌    | 1254/2280 [02:19<01:36, 10.65it/s]

[('title', tensor(0.3622, device='cuda:0')), ('artist', tensor(0.2981, device='cuda:0')), ('album', tensor(0.3398, device='cuda:0'))]


# Train Epoch:  11 Time: 12.630 Loss: 0.011:  60%|██████    | 1368/2280 [02:32<01:29, 10.18it/s]

[('title', tensor(0.3632, device='cuda:0')), ('artist', tensor(0.2977, device='cuda:0')), ('album', tensor(0.3392, device='cuda:0'))]


# Train Epoch:  12 Time: 12.679 Loss: 0.009:  65%|██████▌   | 1482/2280 [02:45<01:14, 10.66it/s]

[('title', tensor(0.3656, device='cuda:0')), ('artist', tensor(0.2955, device='cuda:0')), ('album', tensor(0.3389, device='cuda:0'))]


# Train Epoch:  13 Time: 12.700 Loss: 0.009:  70%|███████   | 1596/2280 [02:57<01:07, 10.09it/s]

[('title', tensor(0.3676, device='cuda:0')), ('artist', tensor(0.2936, device='cuda:0')), ('album', tensor(0.3388, device='cuda:0'))]


# Train Epoch:  14 Time: 12.566 Loss: 0.009:  75%|███████▌  | 1710/2280 [03:10<00:55, 10.25it/s]

[('title', tensor(0.3694, device='cuda:0')), ('artist', tensor(0.2924, device='cuda:0')), ('album', tensor(0.3382, device='cuda:0'))]


# Train Epoch:  15 Time: 12.710 Loss: 0.008:  80%|████████  | 1824/2280 [03:23<00:44, 10.32it/s]

[('title', tensor(0.3695, device='cuda:0')), ('artist', tensor(0.2921, device='cuda:0')), ('album', tensor(0.3384, device='cuda:0'))]


# Train Epoch:  16 Time: 12.636 Loss: 0.008:  85%|████████▌ | 1938/2280 [03:36<00:35,  9.52it/s]

[('title', tensor(0.3712, device='cuda:0')), ('artist', tensor(0.2908, device='cuda:0')), ('album', tensor(0.3380, device='cuda:0'))]


# Train Epoch:  17 Time: 12.711 Loss: 0.007:  90%|█████████ | 2052/2280 [03:49<00:22, 10.03it/s]

[('title', tensor(0.3717, device='cuda:0')), ('artist', tensor(0.2899, device='cuda:0')), ('album', tensor(0.3384, device='cuda:0'))]


# Train Epoch:  18 Time: 12.703 Loss: 0.007:  95%|█████████▌| 2166/2280 [04:01<00:10, 10.48it/s]

[('title', tensor(0.3726, device='cuda:0')), ('artist', tensor(0.2890, device='cuda:0')), ('album', tensor(0.3385, device='cuda:0'))]


# Train Epoch:  19 Time: 12.677 Loss: 0.007: 100%|██████████| 2280/2280 [04:14<00:00,  8.94it/s]

[('title', tensor(0.3739, device='cuda:0')), ('artist', tensor(0.2870, device='cuda:0')), ('album', tensor(0.3391, device='cuda:0'))]





In [29]:
# torch.save(model, "music_model.torch")

In [30]:
# model = torch.load("music_model.torch")

In [31]:
from entity_embed.data_utils.datasets import RowDataset

# Positional list of validation ids; later cells map index positions back to
# record ids through it.
# NOTE(review): this assumes RowDataset yields rows in valid_row_dict order, which
# was built by iterating valid_id_set — confirm against RowDataset.
valid_id_list = list(valid_id_set)
row_batch_size = 32
valid_row_dataset = RowDataset(
    row_encoder=row_encoder,
    row_dict=valid_row_dict,
    batch_size=row_batch_size
)
# batch_size=None disables automatic batching in the DataLoader; shuffle=False
# keeps the dataset's own iteration order. Worker processes are not used here.
valid_row_loader = torch.utils.data.DataLoader(
    valid_row_dataset,
    batch_size=None,
    shuffle=False,
    # num_workers=os.cpu_count(),
    # multiprocessing_context='fork'
)

In [32]:
import tqdm
import os
import torch.nn.functional as F

# Embed every validation row; valid_embed_matrix ends up as a list with one
# numpy vector per row, in the order the loader yields them.
model.eval()
valid_embed_matrix = []
with tqdm.tqdm(total=len(valid_row_loader), desc="# batch embedding") as p_bar:
    # Inference only: no_grad avoids building the autograd graph for every batch,
    # and makes the legacy `.data` access unnecessary.
    with torch.no_grad():
        for t_dict, t_lengths_dict in valid_row_loader:
            t_dict = {attr: t.to(device) for attr, t in t_dict.items()}
            batch_embeddings = model(t_dict, t_lengths_dict).cpu()
            # unbind() splits the batch along its first dimension into per-row tensors.
            valid_embed_matrix.extend(v.numpy() for v in batch_embeddings.unbind())
            p_bar.update(1)

# batch embedding: 100%|██████████| 44875/44875 [21:31<00:00, 34.73it/s]


In [33]:
valid_id_to_index = {id_: index for index, id_ in enumerate(valid_id_set)}

In [34]:
%%time

# Index construction parameters.
ef_construction = 150
M = 64
metric = 'angular'

# https://github.com/nmslib/hnswlib/blob/master/ALGO_PARAMS.md#construction-parameters
# NOTE(review): the link above documents hnswlib, while HnswIndex is imported from
# n2 — confirm the parameters have the same meaning there.
approx_knn_index = HnswIndex(dimension=valid_embed_matrix[0].shape[0], metric=metric)
# Vectors are added in valid_embed_matrix order; later cells treat the ids
# returned by the index as positions in that list.
for valid_embed_vec in valid_embed_matrix:
    approx_knn_index.add_data(valid_embed_vec)

approx_knn_index.build(    
    m=M,
    max_m0=M,
    ef_construction=ef_construction,
    n_threads=os.cpu_count(),
)

CPU times: user 1h 33min 17s, sys: 12.1 s, total: 1h 33min 29s
Wall time: 8min 27s


In [35]:
%%time

# Number of neighbors requested per record.
ntop = 10

# Query the index once for every indexed vector. Entry k of the result is a list
# of (neighbor index, distance) tuples for the k-th vector.
# NOTE(review): ef_search=-1 presumably selects the library default — confirm in
# the n2 documentation.
neighbor_distance_list = approx_knn_index.batch_search_by_ids(
    item_ids=list(range(len(valid_embed_matrix))),
    k=ntop,
    ef_search=-1,
    num_threads=os.cpu_count(),
    include_distances=True
)

CPU times: user 3h 35min 59s, sys: 12.5 s, total: 3h 36min 11s
Wall time: 18min 39s


In [36]:
neighbor_distance_list[0]

[(0, 0.0),
 (1, 0.010957956314086914),
 (211793, 0.28383827209472656),
 (367956, 0.3329240679740906),
 (1347520, 0.3470045328140259),
 (1347519, 0.3863089680671692),
 (367957, 0.39578837156295776),
 (76980, 0.3996577858924866),
 (984581, 0.40467971563339233),
 (984580, 0.4055212140083313)]

In [37]:
# Similarity cut-off for treating two records as a candidate pair, and the same
# cut-off expressed as a distance.
# NOTE(review): assumes the index's 'angular' distance equals 1 - cosine
# similarity — confirm against the n2 documentation.
threshold = 0.5
distance_threshold = 1 - threshold

In [38]:
[row_dict[valid_id_list[neighbor]] for neighbor, distance in neighbor_distance_list[0] if distance <= distance_threshold]

[{'TID': '1',
  'CID': 1,
  'CTID': '1',
  'SourceID': '5',
  'id': 0,
  'number': '9',
  'title': "l ' enfant aux yeux d ' italie",
  'length': '219800',
  'artist': 'daniel balavoine',
  'album': 'de vous a elle en passant par moi',
  'year': '1975',
  'language': 'French'},
 {'TID': '262214',
  'CID': 1,
  'CTID': '2',
  'SourceID': '1',
  'id': 262213,
  'number': '009',
  'title': "l ' enfant aux yeux d ' italie ( de vous a elle en passant par moi )",
  'length': '03:39',
  'artist': 'daniel balavoine',
  'album': 'de vous a elle en passant par moi',
  'year': '1975',
  'language': ''},
 {'TID': '1197215',
  'CID': 147005,
  'CTID': '3',
  'SourceID': '3',
  'id': 1197214,
  'number': '9',
  'title': "l ' enfant aux yeux d ' italie - l ' integrale des albums originaux",
  'length': '3.663',
  'artist': 'daniel balavoine',
  'album': '',
  'year': "'10",
  'language': 'French'},
 {'TID': '494634',
  'CID': 255482,
  'CTID': '1',
  'SourceID': '1',
  'id': 494633,
  'number': '008',

In [39]:
# tqdm.tqdm_notebook is deprecated; tqdm.notebook.tqdm is its documented replacement.
from tqdm.notebook import tqdm as notebook_tqdm

# Candidate pairs: every (record, neighbor) whose distance is within the threshold,
# stored as a sorted tuple of record ids so each pair is counted once.
found_pair_set = set()

for i, neighbor_distance in notebook_tqdm(enumerate(neighbor_distance_list), total=len(neighbor_distance_list)):
    for j, distance in neighbor_distance:
        # i and j are positions in the index; skip a record matching itself.
        if i != j and distance <= distance_threshold:
            pair = tuple(sorted([valid_id_list[i], valid_id_list[j]]))
            found_pair_set.add(pair)

len(found_pair_set)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for i, neighbor_distance in tqdm.tqdm_notebook(enumerate(neighbor_distance_list), total=len(neighbor_distance_list)):


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=1435997.0), HTML(value='')))




9044656

In [40]:
def pe_ratio(found_pair_set, valid_id_list):
    """Return the number of found pairs divided by the number of ids.

    Raises ZeroDivisionError when ``valid_id_list`` is empty.
    """
    pair_count = len(found_pair_set)
    id_count = len(valid_id_list)
    return pair_count / id_count

pe_ratio(found_pair_set, valid_id_list)

6.298520122256523

In [41]:
precision_and_recall(found_pair_set, valid_pair_set)

(0.17331604430284578, 0.9676444444444444)

In [42]:
false_positives = list(found_pair_set - valid_pair_set)
len(false_positives)

7477072

In [43]:
false_negatives = list(valid_pair_set - found_pair_set)
len(false_negatives)

52416

In [44]:
def cos_similarity(a, b):
    """Return the cosine similarity of vectors ``a`` and ``b``.

    The dot product is divided by the product of the two norms, so the result
    is correct even when the inputs are not unit-length. For unit-length
    inputs this equals the plain dot product. Returns 0.0 when either vector
    has zero norm, where cosine similarity is undefined.
    """
    norm_product = np.linalg.norm(a) * np.linalg.norm(b)
    if norm_product == 0:
        return 0.0
    return np.dot(a, b) / norm_product

In [45]:
# Show the first ten missed true pairs: the similarity of their two embeddings
# followed by both records.
for (id_left, id_right) in false_negatives[:10]:
    # Map record ids back to their positions in valid_embed_matrix.
    i = valid_id_to_index[id_left]
    j = valid_id_to_index[id_right]
    display((cos_similarity(valid_embed_matrix[i], valid_embed_matrix[j]), row_dict[id_left], row_dict[id_right]))

(-0.13808128,
 {'TID': '24',
  'CID': 12,
  'CTID': '1',
  'SourceID': '4',
  'id': 23,
  'number': '1',
  'title': '001 -',
  'length': '3m 52sec',
  'artist': '',
  'album': '( 2010 )',
  'year': 'null',
  'language': 'Jap.'},
 {'TID': '1065861',
  'CID': 12,
  'CTID': '2',
  'SourceID': '5',
  'id': 1065860,
  'number': '1',
  'title': 'lian hua da luan',
  'length': '232000',
  'artist': 'ao jing ya mei',
  'album': 'lian hua da luan',
  'year': '2010',
  'language': 'Japanese'})

(-0.13808128,
 {'TID': '24',
  'CID': 12,
  'CTID': '1',
  'SourceID': '4',
  'id': 23,
  'number': '1',
  'title': '001 -',
  'length': '3m 52sec',
  'artist': '',
  'album': '( 2010 )',
  'year': 'null',
  'language': 'Jap.'},
 {'TID': '1325147',
  'CID': 12,
  'CTID': '3',
  'SourceID': '1',
  'id': 1325146,
  'number': '001',
  'title': 'lian hua da luan',
  'length': '03:52',
  'artist': 'ao jing ya mei',
  'album': 'lian hua da luan',
  'year': '2010',
  'language': ''})

(0.8296295,
 {'TID': '118',
  'CID': 60,
  'CTID': '1',
  'SourceID': '2',
  'id': 117,
  'number': '53',
  'title': 'maria starring tian zhong li hui - ci ai noyan xie ji / ri chang hui hua pian',
  'length': '6',
  'artist': '',
  'album': '[ hayatenogotoku !] kiyarakutacd 2',
  'year': '07',
  'language': 'Japanese'},
 {'TID': '873110',
  'CID': 60,
  'CTID': '2',
  'SourceID': '3',
  'id': 873109,
  'number': '53',
  'title': 'ci ai noyan xie ji / ri chang hui hua pian - [ hayatenogotoku !] kiyarakutacd 2',
  'length': '0.104',
  'artist': 'maria starring tian zhong li hui',
  'album': '',
  'year': "'07",
  'language': 'Japanese'})

(0.43151316,
 {'TID': '120',
  'CID': 62,
  'CTID': '1',
  'SourceID': '3',
  'id': 119,
  'number': '4',
  'title': 'homuwaku ( orizinarukaraoke ) - anniversary ~ wu xian nicalling you',
  'length': '',
  'artist': 'song ren gu you shi',
  'album': '',
  'year': "'89",
  'language': 'Japanese'},
 {'TID': '917678',
  'CID': 62,
  'CTID': '2',
  'SourceID': '4',
  'id': 917677,
  'number': '4',
  'title': '',
  'length': 'unknown',
  'artist': '',
  'album': 'anniversarycalling you ( 1989 )',
  'year': 'null',
  'language': 'Jap.'})

(0.32669473,
 {'TID': '238',
  'CID': 118,
  'CTID': '1',
  'SourceID': '4',
  'id': 237,
  'number': '31',
  'title': '031 -',
  'length': '1m 31sec',
  'artist': '',
  'album': '( 2009 )',
  'year': 'null',
  'language': 'Jap.'},
 {'TID': '298889',
  'CID': 118,
  'CTID': '2',
  'SourceID': '5',
  'id': 298888,
  'number': '31',
  'title': 'ashitaqing retara',
  'length': '91480',
  'artist': 'gasuto',
  'album': 'rinanoatorie ~ shiyutorarunolian jin shu shi ~ orizinarusaundotoratsuku',
  'year': '2009',
  'language': 'Japanese'})

(-0.009961842,
 {'TID': '529',
  'CID': 266,
  'CTID': '1',
  'SourceID': '3',
  'id': 528,
  'number': '5',
  'title': 'onriman - sakurada zhan di ju ge yao quan ji',
  'length': '5.715',
  'artist': 'tachibanamaria',
  'album': '',
  'year': '',
  'language': 'Japanese'},
 {'TID': '693741',
  'CID': 266,
  'CTID': '2',
  'SourceID': '4',
  'id': 693740,
  'number': '5',
  'title': '005 -',
  'length': '5m 42sec',
  'artist': '',
  'album': '( unknown )',
  'year': 'null',
  'language': 'Jap.'})

(0.38111877,
 {'TID': '555',
  'CID': 278,
  'CTID': '1',
  'SourceID': '3',
  'id': 554,
  'number': '22',
  'title': 'memoirs shopping - tatakiuri',
  'length': '0.95',
  'artist': 'jon rose & otomo yoshihide',
  'album': '',
  'year': "'95",
  'language': 'English'},
 {'TID': '1050240',
  'CID': 278,
  'CTID': '2',
  'SourceID': '4',
  'id': 1050239,
  'number': '22',
  'title': '022 - memoirs shopping',
  'length': '0m 57sec',
  'artist': 'n . a .',
  'album': 'tnatakiuri ( 1995 )',
  'year': 'null',
  'language': 'Eng.'})

(0.39144593,
 {'TID': '1050240',
  'CID': 278,
  'CTID': '2',
  'SourceID': '4',
  'id': 1050239,
  'number': '22',
  'title': '022 - memoirs shopping',
  'length': '0m 57sec',
  'artist': 'n . a .',
  'album': 'tnatakiuri ( 1995 )',
  'year': 'null',
  'language': 'Eng.'},
 {'TID': '1663895',
  'CID': 278,
  'CTID': '3',
  'SourceID': '5',
  'id': 1663894,
  'number': '22',
  'title': 'shopping',
  'length': '57000',
  'artist': 'jon rose & otomo yoshihide',
  'album': 'tatakiuri',
  'year': '1995',
  'language': 'English'})

(0.8240696,
 {'TID': '1154324',
  'CID': 495,
  'CTID': '4',
  'SourceID': '2',
  'id': 1154323,
  'number': '6',
  'title': 'j . k . rowling - chapter 10 - 8 : das haus der gaunts',
  'length': '301',
  'artist': '',
  'album': 'harry potter und der halbblutprinz ( feat . narrator : rufus beck )',
  'year': '06',
  'language': 'German'},
 {'TID': '1516559',
  'CID': 495,
  'CTID': '5',
  'SourceID': '3',
  'id': 1516558,
  'number': '6',
  'title': 'chapter 10 - 8 : das haus der gaunts - harry potter und der halbblutprinz ( feat . narrator : rufus ',
  'length': '5.026',
  'artist': 'j . k . rowling',
  'album': '',
  'year': "'06",
  'language': 'German'})

(0.73762566,
 {'TID': '875246',
  'CID': 517,
  'CTID': '3',
  'SourceID': '2',
  'id': 875245,
  'number': '10',
  'title': 'richard strauss - salome , op . 54 : " sie ist ein ungeheuer , deine tochter "',
  'length': '60',
  'artist': '',
  'album': 'salome ( wiener philharmoniker feat . conductor : sir georg solti , soprano : birgit nilsson )',
  'year': '',
  'language': 'German'},
 {'TID': '1813302',
  'CID': 517,
  'CTID': '4',
  'SourceID': '3',
  'id': 1813301,
  'number': '10',
  'title': 'salome , op . 54 : " sie ist ein ungeheuer , deine tochter " - salome ( wiener philharmoniker feat .',
  'length': '1.0',
  'artist': 'richard strauss',
  'album': '',
  'year': '',
  'language': 'German'})