In [1]:
from fastbook import *
from fastai.vision.all import *
from nnAudio import features
from scipy.io import wavfile
import pathlib
from torchvision import transforms
from IPython.display import Audio

In [2]:
#torch.multiprocessing.set_start_method('spawn')

## Making a fastai class

In [3]:
# Dataset root; the class names are its immediate sub-directories.
path = Path('DeepShip')
classes = [entry for entry in os.listdir(path) if (path/entry).is_dir()]
# All .wav files that get_files finds under the dataset root.
fns = get_files(path, extensions='.wav')

def wavs(p : Path):
    """Return the `.wav` files that `get_files` finds under `p`."""
    wav_files = get_files(p, extensions='.wav')
    return wav_files

def path_to_id(p : Path):
    """Return `(class_name, recording_id)` taken from the folders above `p`.

    `class_name` is the name of the grand-parent folder. `recording_id` is the
    text after the last '-' in the parent folder's name, or the integer -1
    when that name contains no '-'.
    """
    folder = pathlib.PurePath(p).parent
    cls = folder.parent.name
    _, dash, suffix = folder.name.rpartition('-')
    # `dash` is empty when the folder name has no '-' at all.
    return (cls, suffix) if dash else (cls, -1)

def label_func(p : Path):
    """Return the multi-label target for the recording at `p`.

    Files whose parent folder is named "AmbientSE" get an empty label list;
    every other file is labelled with the name of its grand-parent folder.
    """
    folder = pathlib.PurePath(p).parent
    is_ambient = folder.name == "AmbientSE"
    return [] if is_ambient else [folder.parent.name]

def normSamp(audio):
    """Remove the DC offset from `audio` and scale it to a peak magnitude of 1.

    Args:
        audio: array-like of samples (integer or float).

    Returns:
        Array of the same shape whose mean is 0 and whose largest absolute
        value is 1. A constant (e.g. silent) signal is returned as all zeros
        and an empty input is returned as an empty float array.
    """
    if np.size(audio) == 0:
        return np.asarray(audio, dtype=float)
    ret = audio - np.mean(audio)
    # Scale by the largest magnitude, not the largest positive value, so that
    # negative peaks also stay within [-1, 1].
    peak = np.max(np.abs(ret))
    # A constant signal has no peak; dividing would give 0/0 = NaN.
    if peak == 0: return ret
    return ret / peak

def normSpec(spec):
    """Turn a magnitude spectrogram into a log-scaled image with values in [0, 1].

    The input is log10-transformed, shifted and scaled with the mean and
    standard deviation taken over the whole tensor, and clipped to [0, 1].
    The leading dimension is dropped by taking element 0.

    Args:
        spec: tensor of non-negative magnitudes with a leading dimension that
            is indexed away (callers in this file pass STFT layer output).

    Returns:
        Tensor shaped like `spec[0]` with every value in [0, 1].
    """
    if not torch.is_floating_point(spec): spec = spec.float()
    # log10(0) is -inf, which turns the mean/std -- and with them the whole
    # image -- into NaN. Lift exact zeros to the smallest normal float instead.
    floor = torch.finfo(spec.dtype).tiny
    ret = torch.log10(torch.clamp(spec, min=floor))
    mean = torch.mean(ret)
    std = torch.std(ret)
    # NOTE(review): the 0.5 offset is added *before* dividing by 4*std, so the
    # result is centred on 0.125/std rather than on 0.5 -- confirm this is intended.
    ret = ((ret - mean + 0.5) / (std*4))[0]
    return torch.clamp(ret, 0, 1)

def normCqt(cqt):
    """Turn a CQT magnitude tensor into a log-scaled image with values in [0, 1].

    Computes `log10(cqt) / 2`, shifts it so that the mean over the whole
    tensor maps to 0.5, drops the leading dimension by taking element 0 and
    clips to [0, 1].

    Args:
        cqt: tensor of non-negative magnitudes with a leading dimension that
            is indexed away (callers in this file pass CQT layer output).

    Returns:
        Tensor shaped like `cqt[0]` with every value in [0, 1].
    """
    if not torch.is_floating_point(cqt): cqt = cqt.float()
    # log10(0) is -inf and would make the mean -inf and the image NaN;
    # lift exact zeros to the smallest normal float instead.
    floor = torch.finfo(cqt.dtype).tiny
    ret = torch.log10(torch.clamp(cqt, min=floor)) / 2
    ret = (ret - torch.mean(ret) + 0.5)[0]
    return torch.clamp(ret, 0, 1)


In [4]:
# Side length of the square spectrogram images, and the sample rate passed
# as `sr` to the nnAudio layers below.
imgsize: int = 460
sr: int = 32_000

In [5]:
# STFT LF
Nfft_lf = 32768
# Hop is 10 % of Nfft_lf, truncated to whole samples.
Nskip_lf = int(Nfft_lf*0.1)
# Samples consumed per image: imgsize hops, computed from the untruncated hop.
rng_lf = int(Nfft_lf*0.1 * imgsize)

# NOTE(review): the window length passed as n_fft is Nskip_lf, not Nfft_lf
# (the HF layer passes Nfft_hf) -- confirm this is intentional.
stft_lf = features.STFT(
    sr=sr,
    n_fft=Nskip_lf,
    hop_length=Nskip_lf,
    freq_bins=None,
    fmin=0,
    fmax=1000,
    freq_scale='linear',
    window='hann',
    center=True,
    pad_mode='constant',
    output_format='Magnitude',
    verbose=False,
).to('cuda:0')

In [6]:
# STFT HF
Nfft_hf = 4096
# Hop is half a window; one image consumes imgsize hops worth of samples.
Nskip_hf = int(Nfft_hf*0.5)
rng_hf = int(Nfft_hf*0.5 * imgsize)
stft_hf = features.STFT(
    sr=sr,
    n_fft=Nfft_hf,
    hop_length=Nskip_hf,
    freq_bins=None,
    freq_scale='linear',
    window='hann',
    center=True,
    pad_mode='constant',
    output_format='Magnitude',
    verbose=False,
).to('cuda:0')

In [7]:
# CQT
# Number of samples handed to the CQT layer for one image.
rng_cqt = 235200
# Two more bins than imgsize are computed; Spectrogram.create slices rows
# 2..imgsize+1 out of the result.
cqt_ = features.CQT(
    sr=sr,
    n_bins=imgsize+2,
    bins_per_octave=64,
    verbose=False,
).to('cuda:0')

In [8]:
#features.CQT?

In [9]:
#norm = transforms.Normalize(mean=[0.485, 0.456, 0.406],std=[0.229, 0.224, 0.225])

In [10]:
#(rng_lf, rng_hf, rng_cqt)

In [11]:
# If too short, repeat the recording until it is long enough; return a GPU tensor.
def get_rand_waves(o : Path):
    """Load `o`, normalise it and return a random `rng_lf`-sample excerpt on the GPU.

    Recordings shorter than `rng_lf` samples are extended by appending the
    recording to itself (doubling) until at least one full window fits.

    Args:
        o: path of the .wav file to read.

    Returns:
        1-D float tensor of length `rng_lf` on 'cuda:0'.

    Raises:
        ValueError: if the file holds no samples; repeating an empty
            recording can never reach the required length.
    """
    _, waves = wavfile.read(o)
    if len(waves) == 0:
        raise ValueError(f'{o} contains no audio samples')
    # Multi-channel files are read as (samples, channels); mix them down to
    # mono so the excerpt below is always one-dimensional.
    if waves.ndim > 1: waves = waves.mean(axis=1)
    waves = normSamp(waves)

    # Double the signal until a full window fits.
    while len(waves) < rng_lf:
        waves = np.append(waves,waves)

    start = random.randint(0,len(waves) - rng_lf)

    return torch.tensor(waves[start:start+rng_lf]).float().to('cuda:0')

In [12]:
#stft_lf()(get_waves(fns[33]))

In [13]:
#np.flip([1,2,3,4,5])

In [14]:
#TensorImageBase??
#PILImage??


In [15]:
#torchvision.transforms?

In [16]:
class Spectrogram(TensorImageBase):
    """Three-channel spectrogram image type that knows how to show itself.

    Channels, in order: low-frequency STFT, high-frequency STFT and CQT,
    each normalised to [0, 1] and cropped to `imgsize` x `imgsize`.
    """
    @classmethod
    def create(cls, o):
        """Build a `Spectrogram` from a tensor or from the path of a .wav file.

        A tensor is wrapped unchanged. Anything else is treated as a file
        path: a random `rng_lf`-sample excerpt is loaded and the three
        channels are computed from (random sub-windows of) that excerpt.
        """
        # isinstance is required here: comparing `type(o)` against the string
        # 'torch.Tensor' is never true, which would send tensors to the file reader.
        if isinstance(o, Tensor): return cls(o)
        waves = get_rand_waves(o)

        # The HF and CQT windows are shorter than the LF window; place each
        # at a random offset inside the loaded excerpt.
        start_hf =  random.randint(0,rng_lf-rng_hf)
        start_cqt =  random.randint(0,rng_lf-rng_cqt)

        sampsLow = waves[:rng_lf]
        sampsHigh = waves[start_hf:start_hf+rng_hf]
        sampsCqt = waves[start_cqt:start_cqt+rng_cqt]

        # Skip the lowest 1 / 4 / 2 rows respectively, then crop each channel
        # to imgsize x imgsize.
        lf = normSpec(stft_lf(sampsLow))[1:imgsize+1, 0:imgsize]
        hf = normSpec(stft_hf(sampsHigh))[4:imgsize+4, 0:imgsize]
        cqt = normCqt(cqt_(sampsCqt))[2:imgsize+2, 0:imgsize]

        return cls(torch.stack((lf,hf,cqt),0))

    def show(self, figsize=None, ctx=None, **kwargs):
        """Display the spectrogram with `show_image`; `figsize` defaults to (10, 10).

        Extra keyword arguments are accepted but not forwarded.
        """
        if not isinstance(self, Tensor): return ctx
        if figsize is None: figsize=(10,10)
        return show_image(self, figsize=figsize, ctx=ctx)


In [17]:
#transforms.ToPILImage?

In [18]:
#spectrogram = Spectrogram.create(fns[12])
#spectrogram.show()

In [19]:
def SpectrogramBlock(cls=Spectrogram) :
    "`TransformBlock` whose items are created with `cls.create`"
    block = TransformBlock(type_tfms=cls.create, batch_tfms=IntToFloatTensor)
    return block

In [20]:
"""specs = DataBlock(blocks=(SpectrogramBlock, CategoryBlock),
                   splitter=RandomSplitter(),
                   get_items=wavs, 
                   get_y=label_func,
                   item_tfms=Resize(460))
"""

'specs = DataBlock(blocks=(SpectrogramBlock, CategoryBlock),\n                   splitter=RandomSplitter(),\n                   get_items=wavs, \n                   get_y=label_func,\n                   item_tfms=Resize(460))\n'

In [21]:
#dls = specs.dataloaders(path, bs=16, num_workers=0)


In [22]:
#dls.show_batch(nrows=1, ncols=1)

In [23]:
#learn = cnn_learner(dls, resnet50, metrics=error_rate)

In [24]:
RandomSplitter?

[0;31mSignature:[0m [0mRandomSplitter[0m[0;34m([0m[0mvalid_pct[0m[0;34m=[0m[0;36m0.2[0m[0;34m,[0m [0mseed[0m[0;34m=[0m[0;32mNone[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m Create function that splits `items` between train/val with `valid_pct` randomly.
[0;31mFile:[0m      ~/miniconda/lib/python3.9/site-packages/fastai/data/transforms.py
[0;31mType:[0m      function


In [25]:
# Fix the NumPy seed and the splitter seed so the 70/30 train/validation
# split is repeatable.
# NOTE(review): the crop offsets in Spectrogram.create / get_rand_waves come
# from the `random` module, which is not seeded here.
np.random.seed(43)
specs = DataBlock(
    blocks=(SpectrogramBlock, MultiCategoryBlock),
    get_items=wavs,
    get_y=label_func,
    splitter=RandomSplitter(valid_pct=0.3, seed=43),
)

In [26]:
# Build the DataLoaders from the dataset root, 16 items per batch.
# NOTE(review): num_workers=0 is presumably needed because item creation runs
# CUDA layers (stft_lf / stft_hf / cqt_) -- confirm before raising it.
dls = specs.dataloaders(path, bs=16, num_workers=0)


In [27]:
# Create the learner object: ResNet-50 trained as a multi-label classifier
# (BCE-with-logits loss, accuracy_multi metric), switched to fp16.
learn = cnn_learner(dls, resnet50, 
                    loss_func=BCEWithLogitsLossFlat(), 
                    metrics=[accuracy_multi]).to_fp16() #partial(accuracy_multi, thresh=0.95)
                    


In [None]:
# Learning-rate finder sweep before the first training run.
learn.lr_find()
#SaveModelCallback?

In [34]:
# Inspect the batch-level transforms. The attribute is not called, so the
# cell displays the repr of the bound `filter` method, which lists them.
dls.after_batch.fs.filter

<bound method L.filter of [IntToFloatTensor -- {'div': 255.0, 'div_mask': 1}:
encodes: (TensorImage,object) -> encodes
(TensorMask,object) -> encodes
decodes: (TensorImage,object) -> decodes
, Normalize -- {'mean': tensor([[[[0.4850]],

         [[0.4560]],

         [[0.4060]]]], device='cuda:0'), 'std': tensor([[[[0.2290]],

         [[0.2240]],

         [[0.2250]]]], device='cuda:0'), 'axes': (0, 2, 3)}:
encodes: (TensorImage,object) -> encodes
(Tabular,object) -> encodes
decodes: (TensorImage,object) -> decodes
(Tabular,object) -> decodes
]>

In [56]:
# Sanity check: mean of channel 0 of the first item in one batch.
torch.mean(dls.one_batch()[0][0][0,:,:])

Spectrogram(0.5159, device='cuda:0')

In [57]:
# Sanity check: shape of a single input item.
dls.one_batch()[0][0].shape

torch.Size([3, 460, 460])

In [58]:
# Fine-tune for 22 epochs; SaveModelCallback monitors accuracy_multi and
# writes its checkpoint under the name 'accuracy_multi'.
# NOTE(review): the recorded run of this cell stopped with a CUDA
# out-of-memory error before the first epoch finished.
cbs = [SaveModelCallback(fname='accuracy_multi',monitor='accuracy_multi')]
learn.fine_tune(22, base_lr=2.5e-3, cbs=cbs)

epoch,train_loss,valid_loss,accuracy_multi,time


RuntimeError: CUDA out of memory. Tried to allocate 104.00 MiB (GPU 0; 9.78 GiB total capacity; 846.48 MiB already allocated; 87.69 MiB free; 930.00 MiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [None]:
# Checkpoint the current weights, then run the learning-rate finder again.
learn.save('resnet50-90')
learn.lr_find()

In [None]:
# Unfreeze the model before the next training run.
learn.unfreeze()

In [None]:
# Four one-cycle epochs with a peak learning rate of 5.25e-5.
learn.fit_one_cycle(4, lr_max=5.25e-5)

In [None]:
# Checkpoint after the unfrozen run.
# NOTE(review): 'mode29.pkl' looks like a typo for 'model29' -- confirm the intended name.
learn.save('mode29.pkl')

In [None]:
# Learning-rate finder sweep before the long training run below.
learn.lr_find()

In [None]:
# Twenty one-cycle epochs with a peak learning rate of 3e-5.
learn.fit_one_cycle(20, lr_max=3e-5)

In [None]:
# Restore a previously saved checkpoint.
# NOTE(review): no visible cell saves 'resnet50-255' (the visible saves are
# 'resnet50-90', 'mode29.pkl' and the callback's 'accuracy_multi'), so this
# assumes the file already exists on disk.
learn.load('resnet50-255')

In [None]:
# Widgets for the small prediction UI: upload + run buttons, an output area,
# an HTML result field and an audio player.
btn_upload = widgets.FileUpload()
btn_run = widgets.Button(description='Run')
out_pl = widgets.Output()
text = widgets.HTML()
btns = widgets.HBox([btn_upload,btn_run])
audio = widgets.Audio(autoplay=True)
ui = widgets.VBox([btns,widgets.HBox([out_pl,text]),audio])


def predict(p : Path = None):
    """Classify one recording and render the result in the widget UI.

    Args:
        p: path of the .wav file to classify. Anything that is not a path --
            notably the `Button` instance that `on_click` hands to its
            handler, or `None` -- selects the most recent upload (saved as
            './upload.wav'), or './test.wav' when nothing has been uploaded.

    The predicted labels and their probabilities are written to `text`; the
    recording is loaded into the `audio` player.
    """
    out_pl.clear_output()
    if not isinstance(p, (str, os.PathLike)):
        if len(btn_upload.data) == 0:
            p = Path('./test.wav')
            text.value = 'using test.wav'
        else:
            # Persist the upload so it can be read like any other recording.
            p = Path('./upload.wav')
            p.write_bytes(btn_upload.data[-1])

    # `Audio.from_file` builds a *new* widget rather than updating an existing
    # one, so copy what it loaded into the player that is part of `ui`.
    loaded = widgets.Audio.from_file(p)
    audio.format = loaded.format
    audio.value = loaded.value

    _,mask,probs = learn.predict(p)
    # Label indices ordered from highest to lowest probability.
    l = np.argsort(-probs)
    if not mask[l[0]]:
        # Not even the top label passed the decision threshold: show the top four.
        text.value = '<B>Ingen sikre funn! Viser de med høyest score.</B><br />'
        for i in l[0:4]:
            text.value += f'{learn.dls.vocab[i]} {probs[i]}<br/>'
    else:
        text.value = '<B>Funn!</B><br/>'
        for i in l:
            if not mask[i]: continue
            # One hit per line, matching the list in the branch above.
            text.value += f'{learn.dls.vocab[i]} {probs[i]}<br/>'

btn_run.on_click(predict)

In [None]:
# Render the upload / run / result widgets.
display(ui)

In [None]:
#predict()

# Build and display the spectrogram of a single recording.
# Loading, padding and cropping are all handled by Spectrogram.create (via
# get_rand_waves), so they are not repeated or redefined here.
file = Path('./passenger.wav')

spec  = Spectrogram.create(file)
spec.show()

In [None]:
widgets.Audio??

In [None]:
# Ratio of the high-frequency window length to the low-frequency window length.
rng_hf / rng_lf