# Clean Data

Main issue is we want to check if a track exists in both classes. I.e. if a track is in the proper class, it should not be in the non-proper class and vice versa.

This is difficult because the filenames are not unique and there are many different versions of the same track, so we need to check how similar the filenames are to each other. We may have cases where the YouTube video name is slightly different but the audio is the same, though potentially of different quality.


To solve this we will use a library called PyDejavu. It is a python implementation of the Dejavu audio fingerprinting library. 

In this notebook we will use PyDejavu to make sure we don't have any tracks that are in both classes.

NB: PyDejavu is also used in the backend to fingerprint the audio when we store track information in the database. We use the library to avoid re-scoring tracks that already exist in the database: when a user submits a track, we check whether it exists in the database using the fingerprint; if not, we fingerprint, score and store the track in the database, then return the score to the user.

UPDATE 26/10/2024:

I have had good success using regular string matching to find duplicates, so I will bench audio fingerprinting for now. Might need to come back to it once I start thinking about how to avoid rescoring tracks that already exist in the database. But for now, since the download process retrieved enough metadata, it's more appropriate to use string matching.

In [None]:
import os
from mutagen import File
from dejavu import Dejavu
from dejavu.recognize import FileRecognizer

# Configure Dejavu
# Connection settings can be overridden through environment variables so that
# real credentials never have to be written into the notebook. The fallbacks
# are the original placeholder values, so behaviour is unchanged when the
# variables are not set.
config = {
    "database": {
        "host": os.environ.get("DEJAVU_DB_HOST", "127.0.0.1"),
        "user": os.environ.get("DEJAVU_DB_USER", "yourusername"),
        "passwd": os.environ.get("DEJAVU_DB_PASSWORD", "yourpassword"),
        "db": os.environ.get("DEJAVU_DB_NAME", "dejavu"),
    }
}

# Initialize Dejavu
# NOTE(review): this runs at cell/module execution time and presumably opens
# the database connection immediately - confirm against the Dejavu version used.
djv = Dejavu(config)

def get_audio_metadata(file_path):
    """Read the artist and title tags of an audio file.

    Args:
        file_path: Path of the audio file handed to ``mutagen.File``.

    Returns:
        ``(artist, title)`` as strings. A missing tag is replaced by
        ``'Unknown Artist'`` / ``'Unknown Title'``. ``(None, None)`` is
        returned when ``File`` yields ``None`` for the path.
    """
    tags = File(file_path)
    if tags is None:
        return None, None

    # TPE1 / TIT2 are the tag keys for artist and title respectively.
    # NOTE(review): these look like ID3 frame ids; containers that do not use
    # ID3 (e.g. FLAC, M4A) may never expose them and would then always fall
    # back to the "Unknown" placeholders - confirm.
    artist_tag = tags.get('TPE1', 'Unknown Artist')
    title_tag = tags.get('TIT2', 'Unknown Title')
    return str(artist_tag), str(title_tag)

def list_tracks(directory):
    """Recursively collect the audio files under *directory*, keyed by metadata.

    Args:
        directory: Root directory to walk.

    Returns:
        A dict mapping ``(artist, title)`` to the file path. Files for which
        ``get_audio_metadata`` returns a falsy artist or title are left out.
        When several files share the same ``(artist, title)`` key, the one
        visited last overwrites the earlier ones.
    """
    audio_extensions = ('.mp3', '.flac', '.wav', '.m4a')  # Add more formats if needed
    tracks = {}
    for root, _, files in os.walk(directory):
        for filename in files:
            # Compare on the lower-cased name so files such as 'SONG.MP3'
            # are not silently skipped.
            if not filename.lower().endswith(audio_extensions):
                continue
            file_path = os.path.join(root, filename)
            artist, title = get_audio_metadata(file_path)
            if artist and title:
                tracks[(artist, title)] = file_path
    return tracks

def find_common_tracks(proper_dir, non_proper_dir):
    """Find tracks that appear in both the proper and the non-proper directory.

    Two independent checks are performed:

    1. Metadata: ``(artist, title)`` keys present in the ``list_tracks``
       result of both directories.
    2. Fingerprint: files under ``proper_dir`` whose recognised ``song_id``
       equals the ``song_id`` recognised for any file under
       ``non_proper_dir``.

    NOTE(review): recognition presumably only matches audio that has already
    been fingerprinted into the Dejavu database; nothing in this section
    populates it - confirm that happens elsewhere.

    Args:
        proper_dir: Root directory of the proper class.
        non_proper_dir: Root directory of the non-proper class.

    Returns:
        A ``(common_tracks, fingerprint_matches)`` tuple: the set of shared
        ``(artist, title)`` keys, and the list of file paths from
        ``proper_dir`` that matched by fingerprint.
    """
    proper_tracks = list_tracks(proper_dir)
    non_proper_tracks = list_tracks(non_proper_dir)

    # First check by metadata
    common_tracks = proper_tracks.keys() & non_proper_tracks.keys()

    # Check by fingerprint.
    # Recognise every non-proper file exactly once, instead of re-running the
    # whole recognition pass for each proper file, and skip files that were
    # not recognised (a falsy result cannot be indexed by 'song_id').
    non_proper_song_ids = set()
    for path in non_proper_tracks.values():
        match = djv.recognize(FileRecognizer, path)
        if match:
            non_proper_song_ids.add(match['song_id'])

    fingerprint_matches = []
    for track in proper_tracks.values():
        song = djv.recognize(FileRecognizer, track)
        if song and song['song_id'] in non_proper_song_ids:
            fingerprint_matches.append(track)

    return common_tracks, fingerprint_matches

def save_common_tracks_to_file(common_tracks, fingerprint_matches, output_file_path):
    """Write the duplicate-track report to a text file.

    The file starts with a header line, followed by one line per metadata
    match and then one line per fingerprint match. An existing file at
    ``output_file_path`` is overwritten.

    Args:
        common_tracks: Iterable of ``(artist, title)`` pairs matched by metadata.
        fingerprint_matches: Iterable of track paths matched by fingerprint.
        output_file_path: Destination path of the report.
    """
    # Pin the encoding: artist/title tags routinely contain non-ASCII
    # characters, which a locale-dependent default encoding may fail to write.
    with open(output_file_path, 'w', encoding='utf-8') as file:
        file.write("Tracks in both directories:\n")
        for artist, title in common_tracks:
            file.write(f"Artist: {artist}, Title: {title} - Matched by metadata\n")
        for track in fingerprint_matches:
            file.write(f"Track: {track} - Matched by fingerprint\n")

# Example usage: the two class directories to compare and the report location
proper_directory = '/path/to/proper/tracks'
non_proper_directory = '/path/to/non_proper/tracks'
output_directory = 'notebooks/output'
output_file = 'common_tracks.txt'
output_file_path = os.path.join(output_directory, output_file)

# Create the report folder up front; an already existing folder is fine
os.makedirs(output_directory, exist_ok=True)

# Look for tracks present in both classes, then persist the findings
common_tracks, fingerprint_matches = find_common_tracks(
    proper_dir=proper_directory,
    non_proper_dir=non_proper_directory,
)
save_common_tracks_to_file(
    common_tracks=common_tracks,
    fingerprint_matches=fingerprint_matches,
    output_file_path=output_file_path,
)

print(f"Common tracks have been saved to {output_file_path}")