In [1]:
# Imports
import matplotlib.pyplot as plt
import numpy as np
from pathlib import Path

In [2]:
# Read data set
# Every '.tiff' spectrogram image is read from disk and flattened into a 1D
# array so it can be used directly as a feature vector. Both dictionaries are
# keyed by the file name with its extension(s) removed.
datasetPath = Path('specgrams')

losslessData = {}
lossyData = {}

filenameList = []

# Read the lossless data
for childPath in datasetPath.joinpath('specgram_lossless').iterdir():
    if childPath.suffix == '.tiff':
        filename = childPath.name.replace('.tiff', '')
        filenameList.append(filename)
        # plt.imread only decodes the file into an array; it opens no figure,
        # so there is nothing to close afterwards.
        losslessData[filename] = plt.imread(childPath.as_posix()).flatten()
print('Lossless data loaded: ', len(losslessData))

# Read the lossy data
for childPath in datasetPath.joinpath('specgram_lossy_insanely_high').iterdir():
    if childPath.suffix == '.tiff':
        # Drop '.mp3' as well as '.tiff' so the key lines up with the key of
        # the corresponding lossless entry.
        filename = childPath.name.replace('.mp3', '').replace('.tiff', '')
        # This should not happen, but we should check if the lossy image file
        # corresponds to a lossless image file
        if filename not in losslessData:
            print('WARNING: Mismatched lossy file: ', filename)
            continue
        lossyData[filename] = plt.imread(childPath.as_posix()).flatten()
# Report the number of lossy entries actually loaded. This can be smaller than
# the lossless count when lossy files are missing or were skipped above.
print('Lossy data loaded: ', len(lossyData))

Lossless data loaded:  6877
Lossy data loaded:  6877


In [3]:
# Naive sequential train/test split: tracks are taken in the iteration order
# of losslessData, filling the training set first and the testing set after.
trainTestRatio = 0.8

# Label: lossless = 0, lossy = 1
trainingLabelList = []
trainingSetList = []
testingLabelList = []
testingSetList = []

# Every usable track contributes two samples (lossless + lossy), so the total
# sample count is twice the number of lossy entries. This target is constant
# for the whole loop.
trainingTarget = (2 * len(lossyData)) * trainTestRatio

for name, losslessFeatures in losslessData.items():
    # Tracks without a lossy counterpart cannot form a pair; ignore them
    if name not in lossyData:
        continue
    # Pick the destination once: training until the target is reached,
    # testing for everything after that
    if len(trainingSetList) < trainingTarget:
        setList, labelList = trainingSetList, trainingLabelList
    else:
        setList, labelList = testingSetList, testingLabelList
    # Always add the lossless entry followed by its lossy counterpart
    setList.extend((losslessFeatures, lossyData[name]))
    labelList.extend((0, 1))

print('Training set: ', len(trainingSetList))
print('Testing set: ', len(testingSetList))

Training set:  11004
Testing set:  2750


In [4]:
# Train decision tree classifier
from sklearn import tree

# Fix the seed: the tree builder permutes the features at every split, so
# without random_state the fitted tree (and the score reported below) can
# change from one run to the next.
clf = tree.DecisionTreeClassifier(random_state=0)
clf.fit(trainingSetList, trainingLabelList)

DecisionTreeClassifier()

In [5]:
# Evaluate the classifier on the held-out testing set; for a classifier,
# score() reports the mean accuracy of its predictions against the labels.
clf.score(X=testingSetList, y=testingLabelList)

0.9501818181818181