In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from tqdm import tqdm
import glob

# Identifiers of the hives whose sound recordings and sensor series are processed below.
hives_ids = ["smrpiclient7", "smrpiclient6", "smrpiclient3"]
# Number of hours added to every parsed UTC timestamp (sound file names and sensor CSV rows).
timezone_offset_hours = 2


Here we load the training sound samples and compute their spectrograms. The recordings must be named with UTC timestamps, because every timestamp is shifted by the `timezone_offset_hours` variable.

In [110]:
import struct
import math
import numpy as np
import librosa
import librosa.display

from tqdm import tqdm
from datetime import datetime, timedelta
from scipy.io import wavfile
from scipy import signal
from scipy.fftpack import fft

import matplotlib.pyplot as plt
%matplotlib widget

import os

# Nominal length of one recording in milliseconds (not referenced in this cell).
sound_time_ms = 2000
# FFT window length in samples (~93 ms at a 44.1 kHz sample rate -- TODO confirm the recorder's rate)
nfft = 4096
# Hop of ~34% of the window, i.e. neighbouring windows overlap by ~66%
hop_len = (nfft//3) + 30
# Upper frequency limit [Hz]; this can be manipulated to adjust number of bins for conv layer
fmax = 2750
# Recordings whose RMS is not below this value are discarded
# (that threshold was observed from plot_distribution() function)
rms_threshold = 0.7

# One entry per accepted recording:
# [local datetime, hive id, samples, [frequencies, times, spectrogram_db], mean MFCC vector]
hives_data = []
# RMS of every accepted recording, grouped by hive id
rmses = {}
for hive_id in hives_ids:
    # os.path.join keeps the lookup portable; sorting makes the sample order reproducible.
    sound_pattern = os.path.join("..", "measurements", "smartulav2", f"{hive_id}_*", "*.wav")
    sound_files = sorted(glob.glob(sound_pattern))
    print(f"Sound data preparation for hive: {hive_id} which has {len(sound_files)} recordings...", end=' ', flush=True)
    rmses[hive_id] = []
    for file in tqdm(sound_files):
        sample_rate, sound_samples = wavfile.read(file)
        # Multi-channel files: keep only the first channel. Mono files are used as they are.
        if sound_samples.ndim > 1:
            sound_samples = sound_samples[:, 0]
        # Scale to [-1, 1); assumes 32-bit integer PCM -- TODO confirm against the recorder settings
        sound_samples = sound_samples/(2.0**31)
        rms = np.sqrt(np.mean(sound_samples**2))
        if not rms < rms_threshold:
            continue
        rmses[hive_id].append(rms)

        # 13 MFCCs per frame, averaged over time into a single feature vector
        mfccs = librosa.feature.mfcc(y=sound_samples, sr=sample_rate, n_fft=nfft, hop_length=hop_len, n_mfcc=13)
        np_mfcc_avg = np.mean(mfccs, axis=1)

        spectrogram = librosa.stft(y=sound_samples, n_fft=nfft, hop_length=hop_len)
        # NOTE: `spectrogram_magnitude` and `sample_rate` are read again by the
        # reconstruction-plot cell further down, so these names must stay global.
        spectrogram_magnitude = np.abs(spectrogram)
        spectrogram_db = librosa.amplitude_to_db(spectrogram_magnitude, ref=np.max)
        frequencies = librosa.fft_frequencies(sr=sample_rate, n_fft=nfft)
        times = (np.arange(0, spectrogram_magnitude.shape[1])*hop_len)/sample_rate

        # Keep only the frequency rows below fmax
        freq_mask = frequencies < fmax
        frequencies = frequencies[freq_mask]
        spectrogram_db = spectrogram_db[freq_mask, :]

        # File names look like "<prefix>-<YYYY-mm-ddTHH-MM-SS>.wav"
        filename = os.path.basename(file)
        utc_timestamp = filename[filename.index('-')+1:].rsplit(".wav")[0]
        sound_datetime = datetime.strptime(utc_timestamp, '%Y-%m-%dT%H-%M-%S') + timedelta(hours=timezone_offset_hours)
        hives_data.append([sound_datetime, hive_id, sound_samples, [frequencies, times, spectrogram_db], np_mfcc_avg])
    print(" done.")

print(f"Got {len(hives_data)} sound samples")

Sound data preparation for hive: smrpiclient7 which has 3367 recordings... 

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3367/3367 [02:13<00:00, 25.29it/s]

 done.
Sound data preparation for hive: smrpiclient6 which has 3172 recordings... 


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3172/3172 [02:15<00:00, 23.41it/s]


 done.
Sound data preparation for hive: smrpiclient3 which has 602 recordings... 

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 602/602 [00:23<00:00, 25.80it/s]

 done.
Got 7033 sound samples





In [4]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler

# Both scalers are re-fitted on every single spectrogram, so each recording is
# normalised on its own: every frequency row is standardised over the time frames
# and then min-max scaled.
sc = StandardScaler()
mm = MinMaxScaler()

spectrogram_ae_data = []
standarized_ae_data = []
scaled_ae_data = []
datetimes = []
names = []
for hive_record in hives_data:
    record_spectrogram = hive_record[3][2]
    record_standardized = sc.fit_transform(record_spectrogram.T).T
    spectrogram_ae_data.append(record_spectrogram)
    standarized_ae_data.append(record_standardized)
    datetimes.append(hive_record[0])
    names.append(hive_record[1])
# Min-max scaling runs as a second pass so that `mm` is fitted in the same order as before.
for record_standardized in standarized_ae_data:
    scaled_ae_data.append(mm.fit_transform(record_standardized.T).T)

# (scaled spectrogram, local datetime, hive id) triples consumed by the sensor-merge cell
data_to_analyze = list(zip(scaled_ae_data, datetimes, names))

print(f"Got dataset of size: {len(scaled_ae_data)}")

Got dataset of size: 7141


## Train basic AE

In [None]:
from scipy import signal as sig
from sklearn.preprocessing import MinMaxScaler
from torch.utils.data import DataLoader
from torch.utils import data as tdata

# Split the data into a train part (first 90% of the rows) and a test part (the remainder).
# NOTE(review): `pd_ae_data` and `torch` are not defined above this cell -- confirm where they come from.
train_stop_idx = int(pd_ae_data.shape[0]*90/100)

pd_ae_data_train, pd_ae_data_test = pd_ae_data[:train_stop_idx], pd_ae_data[train_stop_idx:]

for split_name, split_frame in (('Train', pd_ae_data_train), ('Test', pd_ae_data_test)):
    print(f'{split_name} data size: {split_frame.shape[0]}')

# The 'periodogram' column of each part becomes one float tensor
tensor_train = torch.Tensor(pd_ae_data_train['periodogram'].values.tolist())
tensor_test = torch.Tensor(pd_ae_data_test['periodogram'].values.tolist())

train_dataset = tdata.TensorDataset(tensor_train)
test_dataset = tdata.TensorDataset(tensor_test)

# Mini-batches of 32 samples, reshuffled on every pass
loader_options = dict(batch_size=32, shuffle=True)
dataloader = DataLoader(train_dataset, **loader_options)
dataloader_test = DataLoader(test_dataset, **loader_options)

In [None]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
from torch.autograd import Variable
from torchvision import transforms
import torch.nn.functional as F

num_epochs = 1000
learning_rate = 1e-3
# Print the loss about ten times per run. Integer arithmetic with a floor of 1 keeps the
# cadence correct when num_epochs is small or not a multiple of 10 (the former float
# modulo `num_epochs/10` logged at the wrong epochs in those cases).
log_every = max(1, num_epochs // 10)

model = autoencoder_basic().to(device)
# Alternative reconstruction loss; the loop below optimises binary cross-entropy instead.
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate, weight_decay=1e-5)

# Make the training mode explicit (matters if an earlier cell left the model in eval mode).
model.train()
for epoch in range(num_epochs):
    # running sum of the batch losses of this epoch
    loss = 0
    for data in dataloader:
        periodogram = data[0].to(device)
        # ===================forward=====================
        output = model(periodogram)
        # binary_cross_entropy(input, target): the reconstruction is the input,
        # the original periodogram is the target
        train_loss = F.binary_cross_entropy(output, periodogram)
        # ===================backward====================
        optimizer.zero_grad()
        train_loss.backward()
        optimizer.step()
        loss += train_loss.item()
    # mean batch loss of the epoch
    loss = loss / len(dataloader)
    if (epoch+1) % log_every == 0:
        print(f'epoch {epoch + 1}/{num_epochs}, loss:{loss}')

In [None]:
import matplotlib.pyplot as plt

# Evaluate the basic autoencoder on the held-out set.
# Fixes compared with the previous version of this cell:
#   * binary_cross_entropy takes (input, target); the reconstruction must be the input
#     and the original periodogram the target (they were swapped),
#   * the model is switched to eval mode,
#   * the loss is accumulated as a Python float instead of a tensor.
model.eval()
loss_sum = 0.0
n_test_samples = 0
with torch.no_grad():
    for data in dataloader_test:
        periodograms_test = data[0].to(device)
        output = model(periodograms_test)
        batch_size = periodograms_test.shape[0]
        # The default 'mean' reduction averages over all elements of the batch; every
        # sample has the same number of elements, so mean * batch_size equals the sum
        # of the per-sample mean losses.
        loss_sum += F.binary_cross_entropy(output, periodograms_test).item() * batch_size
        n_test_samples += batch_size

# Average per-sample loss; NaN when the test loader is empty.
loss_test = loss_sum / n_test_samples if n_test_samples else float('nan')
print(f'Final test loss: {loss_test}')


## Train CONV AE

In [None]:
import torch 
from torch.utils import data as tdata

# 80% of the spectrograms train the convolutional autoencoder, the rest validate it.
train_data_size = len(scaled_ae_data)*80//100
val_data_size = len(scaled_ae_data) - train_data_size
# Seed of the random train/validation split, so that re-running the cell gives the same split.
split_seed = 42

# Stack into one ndarray first: building a tensor straight from a list of ndarrays is
# a very slow conversion path. float32 matches what torch.Tensor(...) produced before.
dataset_tensor = torch.as_tensor(np.stack(scaled_ae_data), dtype=torch.float32)
print(f"Dataset shape: {dataset_tensor.shape}")
print(f"Train set size: {train_data_size}")
print(f"Validation set size: {val_data_size}")

# add one extra dimension as it is required for conv layer
dataset_tensor = dataset_tensor[:, None, :, :]
dataset = tdata.TensorDataset(dataset_tensor)
train_set, val_set = tdata.random_split(
    dataset,
    [train_data_size, val_data_size],
    generator=torch.Generator().manual_seed(split_seed),
)

dataloader_train = tdata.DataLoader(train_set, batch_size=32, shuffle=True)
# Validation only averages the loss, so shuffling is unnecessary.
dataloader_val = tdata.DataLoader(val_set, batch_size=32, shuffle=False)

In [None]:
# Drop a previously trained model (if any) before retraining. Unlike a bare `del`,
# this does not raise NameError when the cell runs on a fresh kernel.
globals().pop('modelConvAE', None)

In [None]:
%matplotlib widget

import sys
import matplotlib.pyplot as plt

num_epochs = 50
learning_rate = 1e-3

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

modelConvAE = ConvAutoencoder().to(device)
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(modelConvAE.parameters(), lr=learning_rate, weight_decay=1e-6)

# per-batch training losses of the current epoch
train_loss = []
# per-batch validation losses of the current epoch
val_loss = []
# average training loss of every finished epoch (for the early stopping plot)
avg_train_loss = []
# average validation loss of every finished epoch (for the early stopping plot)
avg_val_loss = []
# number of consecutive epochs without improvement after which training stops
patience = 10
# consecutive epochs without improvement so far
patience_counter = 0
# best validation loss seen so far
best_val_loss = float('inf')
# model checkpoint filename
checkpoint_filename = 'checkpoint.pth'
# epoch of the best checkpoint (0 = no checkpoint written yet)
win_epoch = 0
# width used to align the epoch counter in the log line
epoch_len = len(str(num_epochs))

for epoch in range(1, num_epochs+1):
    # start every epoch with empty per-batch loss lists
    train_loss = []
    val_loss = []

    ###################
    # train the model #
    ###################
    modelConvAE.train()
    for data in dataloader_train:
        # transfer data to device
        periodogram = data[0].to(device)
        # clear the gradients of all optimized variables
        optimizer.zero_grad()
        # forward pass
        outputs = modelConvAE(periodogram)
        # calculate the loss
        loss = criterion(outputs, periodogram)
        # backward pass: compute gradient of the loss with respect to model parameters
        loss.backward()
        # perform a single optimization step (parameter update)
        optimizer.step()
        # record the batch loss
        train_loss.append(float(loss.item()))

    ######################
    # validate the model #
    ######################
    modelConvAE.eval()
    # no gradients are needed for validation; this avoids building the autograd graph
    with torch.no_grad():
        for val_data in dataloader_val:
            # transfer data to device
            periodogram_val = val_data[0].to(device)
            # forward pass
            voutputs = modelConvAE(periodogram_val)
            # calculate the loss
            vloss = criterion(voutputs, periodogram_val)
            # record the batch loss
            val_loss.append(float(vloss.item()))

    # average loss over the epoch
    epoch_train_loss = np.average(train_loss)
    epoch_val_loss = np.average(val_loss)
    avg_train_loss.append(epoch_train_loss)
    avg_val_loss.append(epoch_val_loss)

    print(f'[{epoch:>{epoch_len}}/{num_epochs:>{epoch_len}}] train_loss: {epoch_train_loss:.5f} valid_loss: {epoch_val_loss:.5f}', end=' ', flush=True)

    if epoch_val_loss < best_val_loss:
        # new best model: checkpoint it and reset the patience counter
        print("checkpoint!")
        best_val_loss = epoch_val_loss
        patience_counter = 0
        torch.save(modelConvAE.state_dict(), checkpoint_filename)
        win_epoch = epoch
    else:
        # Count the non-improving epoch first, so training stops after exactly
        # `patience` epochs without improvement (it used to take one more).
        patience_counter = patience_counter + 1
        if patience_counter >= patience:
            print("early stopping.")
            break
        print(".")

# Always continue with the best checkpoint, whether training stopped early or ran all
# epochs. map_location keeps this working on CPU-only machines, and `device` is no
# longer overwritten with a hard-coded "cuda".
if win_epoch:
    print(f"=> loading checkpoint {checkpoint_filename}")
    modelConvAE.load_state_dict(torch.load(checkpoint_filename, map_location=device))
    modelConvAE.eval()

epochs_run = np.arange(1, len(avg_train_loss) + 1)
fig = plt.figure(figsize=(10,5))
plt.plot(epochs_run, avg_train_loss, 'r', label="train loss")
plt.plot(epochs_run, avg_val_loss, 'b', label="validation loss")
if win_epoch:
    plt.axvline(win_epoch, linestyle='--', color='g',label='Early Stopping Checkpoint')
plt.xlabel('epoch')
plt.ylabel('MSE loss')
plt.legend(loc='best')
plt.grid(True)
plt.show()

# Encode data

In [None]:
# Run every normalised spectrogram through the encoder part of the trained convolutional autoencoder.
encoded_data = conv2d_encode(modelConvAE, scaled_ae_data)

In [None]:
%matplotlib widget
import matplotlib.pyplot as plt
import random

# Pick one recording at random and compare its spectrogram with the autoencoder's reconstruction.
idx = random.randint(0, len(hives_data) - 1)

# Use the axes stored with the sample itself instead of `sample_rate` /
# `spectrogram_magnitude`, which were only leftovers of the loading loop.
frequencies, times, _ = hives_data[idx][3]

# shape the single spectrogram as (batch, channel, frequency, time)
elem = torch.Tensor(scaled_ae_data[idx][None, None, :, :])

# eval mode so that dropout and batch-norm statistics do not disturb the reconstruction
modelConvAE.eval()
with torch.no_grad():
    reconstruction = modelConvAE(elem.to(device)).cpu().numpy().squeeze()

fig, axs = plt.subplots(2, 1)
axs[0].pcolormesh(times, frequencies, scaled_ae_data[idx])
axs[0].set_title('Input spectrogram (scaled)')
axs[0].set_ylabel('Frequency [Hz]')
axs[1].pcolormesh(times, frequencies, reconstruction)
axs[1].set_title('Autoencoder reconstruction')
axs[1].set_ylabel('Frequency [Hz]')
axs[1].set_xlabel('Time [sec]')
fig.tight_layout()

# Add temperature/humidity/gas

In [None]:
hive_under_analysis = hives_ids[0]
start_time = '2020-08-10 00:00:00'
end_time = '2020-09-16 00:00:00'
print(f"extracting data for hive under analysis: {hive_under_analysis} from {start_time} to {end_time}...")

# Sound spectrograms of the analysed hive inside the time window, indexed by timestamp
df_hives_sound = pd.DataFrame(data_to_analyze, columns=['spectrogram', 'timestamp', 'name'])
in_analysis_window = ((df_hives_sound['name'] == hive_under_analysis)
                      & (df_hives_sound['timestamp'] > start_time)
                      & (df_hives_sound['timestamp'] < end_time))
# set_index returns a new frame, so the filtered slice is never modified in place
df_hive_sound_ua = df_hives_sound[in_analysis_window].set_index('timestamp')
print(f"-> prepared base of {df_hive_sound_ua.count()['spectrogram']} numer of sound spectrum <-")

# All sensor exports share one naming scheme, so they are read in a single loop
# instead of twelve copy-pasted calls. The sensor name is both the file suffix and
# the name of the value column in the resulting frame.
sensor_file_template = '..//measurements//smartulav2//sulmin-10082020-15092020-inside-{sensor}.csv'
sensor_names = ['temperature', 'humidity', 'alcohol', 'aceton', 'jon-amonowy', 'toluen', 'co2',
                'siarkowodor', 'metanotiol', 'trimetyloamina', 'wodor', 'co']
sensor_frames = {
    sensor: read_sensor_data(sensor_file_template.format(sensor=sensor),
                             hive_under_analysis, start_time, end_time, sensor)
    for sensor in sensor_names
}

# Individual names are kept so that other cells can keep referring to them.
df_hive_temperature_ua = sensor_frames['temperature']
df_hive_humidity_ua = sensor_frames['humidity']
df_hive_alcohol_ua = sensor_frames['alcohol']
df_hive_aceton_ua = sensor_frames['aceton']
df_hive_amon_ua = sensor_frames['jon-amonowy']
df_hive_toluen_ua = sensor_frames['toluen']
df_hive_co2_ua = sensor_frames['co2']
df_hive_siarkowodor_ua = sensor_frames['siarkowodor']
df_hive_metanotiol_ua = sensor_frames['metanotiol']
df_hive_trimetyloamina_ua = sensor_frames['trimetyloamina']
df_hive_wodor_ua = sensor_frames['wodor']
df_hive_co_ua = sensor_frames['co']

# Only temperature and humidity are merged for now; the gas series are loaded but
# deliberately left out of both the merge and the feature vector.
print("merging data to sound samples...")
df_hive_data = merge_dataframes_ontimestamp(df_hive_sound_ua,
                                            df_hive_temperature_ua, df_hive_humidity_ua)

print(f"encoding sound samples with autoencoder...", end=' ')
df_hive_data['conv_ae'] = conv2d_encode(modelConvAE, df_hive_data['spectrogram'].to_list())
print(f"finish!")

# Feature vector = autoencoder code followed by humidity and temperature
df_hive_data['feature_vector'] = merge_columns(df_hive_data, ['conv_ae', 'humidity', 'temperature'])

# SVM classification 

In [144]:
# Scratch preview of the x tick labels that search_best_night_day passes to plot_6hour_shift.
# NOTE(review): `start_hours` is only assigned in the cell below, so this cell fails when
# the notebook is run top to bottom on a fresh kernel.
[str(start_hour) for start_hour in start_hours][:6]

['20', '21', '22', '23', '0', '1']

In [156]:
# Hive whose averaged MFCC vectors feed the day/night SVM search
mfcc_hive_name = hives_ids[1]
# Candidate hours at which the "night" window may start
start_hours = [20, 21, 22, 23, 0, 1, 2, 3, 4, 5]

print(f'extracting mfcc data for hive {mfcc_hive_name}... ', end='', flush=True)
# Filter the recordings once, then split them into timestamps and MFCC vectors
selected_recordings = [hive_data for hive_data in hives_data if hive_data[1] == mfcc_hive_name]
datetimes = [recording[0] for recording in selected_recordings]
mfccs = [recording[4] for recording in selected_recordings]
mfccs_data = list(zip(datetimes, mfccs))
print(f'got {len(mfccs_data)} samples')

# One row per recording, indexed by its datetime
pd_mfcc_data = pd.DataFrame(mfccs_data, columns=['datetime', 'mfcc']).set_index('datetime')

# 10 test days; 7 is passed as max_shift
search_best_night_day(pd_mfcc_data, 'mfcc', 10, start_hours, 7)

extracting mfcc data for hive smrpiclient6... got 3172 samples
learning with train data size: 2212 and test data size: 960
number of nights in train/test data: 192.0/80
new max acuuracy for 20 to 21, accuracy: 91.77
for night start at 20 and end at 21 got accuracy: 91.77
learning with train data size: 2212 and test data size: 960
number of nights in train/test data: 195.0/76
new max acuuracy for 21 to 22, accuracy: 92.08
for night start at 21 and end at 22 got accuracy: 92.08
learning with train data size: 2212 and test data size: 960
number of nights in train/test data: 201.0/74
new max acuuracy for 22 to 23, accuracy: 92.29
for night start at 22 and end at 23 got accuracy: 92.29
learning with train data size: 2212 and test data size: 960
number of nights in train/test data: 196.0/77
for night start at 23 and end at 0 got accuracy: 91.88
learning with train data size: 2212 and test data size: 960
number of nights in train/test data: 187.0/77
for night start at 0 and end at 1 got accur

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

In [None]:
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

# Project the feature vectors to two dimensions with PCA and with t-SNE.
# NOTE(review): `df_svm` and `initial_list` are not defined anywhere above this cell;
# presumably `initial_list` holds an integer class per sample (it indexes `colors`) -- confirm.
feature_vectors = df_svm['feature_vector'].values.tolist()
reduced_ae_pca = PCA(n_components=2).fit_transform(feature_vectors)
reduced_ae_tsne =  TSNE(n_components=2, perplexity=100, learning_rate=500).fit_transform(feature_vectors)

colors = ['red', 'green', 'blue', 'yellow']
labels = ['day', 'night']

fig, axs = plt.subplots(2, figsize=(10,10))

# One scatter panel per projection, coloured by the class stored in initial_list
projections = ((axs[0], reduced_ae_pca, 'PCA'), (axs[1], reduced_ae_tsne, 'TSNE'))
for panel, embedding, panel_title in projections:
    panel.scatter(x=[point[0] for point in embedding],
                  y=[point[1] for point in embedding],
                  c=[colors[data] for data in initial_list],
                  alpha=0.3)
    panel.set_title(panel_title)

plt.show()

In [None]:
# Fit a full PCA and plot the cumulative share of variance explained by the
# leading components.
# NOTE(review): `ae_standarized` is not defined anywhere above this cell -- confirm its origin.
pca = PCA().fit(ae_standarized)
cumulative_variance = np.cumsum(pca.explained_variance_ratio_)

variance_ax = plt.figure().add_subplot(111)
variance_ax.plot(cumulative_variance)
variance_ax.set_xlabel('number of components')
variance_ax.set_ylabel('cumulative explained variance');

# ----- Functions/Classes -----

Plotting a spectrogram with a helper function

In [None]:
import matplotlib.pyplot as plt

%matplotlib widget

def plot_spectrogram(frequency, time_x, spectrocgram, title):
    """ Draw a single spectrogram as a colour mesh and show it.

        Attributes
            frequency - values for the y axis [Hz]
            time_x - values for the x axis [sec]
            spectrocgram - 2D array of values to draw
            title - title of the figure
    """
    _, ax = plt.subplots(figsize=(6,4))
    ax.set_title(title)
    ax.pcolormesh(time_x, frequency, spectrocgram)
    ax.set_ylabel('Frequency [Hz]')
    ax.set_xlabel('Time [sec]')
    plt.show()

convolutional autoencoder

In [None]:
import torch.nn as nn
import torch.nn.functional as F

class View(nn.Module):
    """ Reshaping layer, so that a tensor view can be used inside nn.Sequential """
    def __init__(self, shape):
        super().__init__()
        # target shape handed to Tensor.view on every forward pass
        self.shape = shape

    def forward(self, x):
        target_shape = self.shape
        return x.view(*target_shape)

def conv2d_block(in_f, out_f, *args, **kwargs):
    """ Build a convolutional block: Conv2d -> BatchNorm2d -> ReLU -> Dropout2d(p=0.2)

        Attributes
            in_f - number of input features
            out_f - number of output features
            *args, **kwargs - forwarded unchanged to nn.Conv2d
    """
    layers = [
        nn.Conv2d(in_f, out_f, *args, **kwargs),
        nn.BatchNorm2d(out_f),
        nn.ReLU(),
        nn.Dropout2d(p=0.2),
    ]
    return nn.Sequential(*layers)

def conv2d_transpose_block(in_f, out_f, *args, **kwargs):
    """ Build a transpose convolutional block:
        ConvTranspose2d -> BatchNorm2d -> ReLU -> Dropout2d(p=0.2)

        Attributes
            in_f - number of input features
            out_f - number of output features
            *args, **kwargs - forwarded unchanged to nn.ConvTranspose2d
    """
    layers = [
        nn.ConvTranspose2d(in_f, out_f, *args, **kwargs),
        nn.BatchNorm2d(out_f),
        nn.ReLU(),
        nn.Dropout2d(p=0.2),
    ]
    return nn.Sequential(*layers)

######################################
#                                    #
#   Main convolutional autoencoder   #
#                                    #
######################################
class ConvAutoencoder(nn.Module):
    """ Convolutional autoencoder that compresses a spectrogram to a 64-element code.

        The layer sizes fix the expected input to one channel of 256 x 64 values:
        four 2x2 max-poolings reduce it to 4 channels of 16 x 4, which is exactly
        the 256 inputs of the encoder's linear layer. The decoder mirrors the
        encoder with stride-2 transpose convolutions and ends with a sigmoid, so
        the reconstruction lies in [0, 1].

        conv2d_block, conv2d_transpose_block and View are helpers defined
        elsewhere in this notebook.
    """
    def __init__(self):
        super(ConvAutoencoder, self).__init__()
        
        ## encoder layers ##
        # NOTE: the order of the modules determines the keys of the state_dict that
        # the training cell saves to and loads from its checkpoint file.
        self.encoder = nn.Sequential(
            # [1x256x64] => [64x256x64]
            conv2d_block(1, 64, kernel_size=3, padding=1),
            # [64x256x64] => [64x128x32]
            nn.MaxPool2d(2, 2),
            # [64x128x32] => [32x128x32]
            conv2d_block(64, 32, kernel_size=3, padding=1),
            # [32x128x32] => [32x64x16]
            nn.MaxPool2d(2, 2),
            # [32x64x16] => [16x64x16]
            conv2d_block(32, 16, kernel_size=3, padding=1),
            # [16x64x16] => [16x32x8]
            nn.MaxPool2d(2, 2),
            # [16x32x8] => [4x32x8]
            conv2d_block(16, 4, kernel_size=3, padding=1),
            # [4x32x8] => [4x16x4]
            nn.MaxPool2d(2, 2),
            # [4x16x4] => [1x256]
            nn.Flatten(),
            # [1x256] => [1x64]  (the 64-element code)
            nn.Linear(256, 64),
            nn.ReLU()
        )
        
        ## decoder layers ##
        self.decoder = nn.Sequential(
            # [1x64] => [1x256]
            nn.Linear(64, 256),
            nn.ReLU(),
            # [1x256] => [4x16x4]
            View([-1, 4, 16, 4]),
            # [4x16x4] => [16x32x8]
            conv2d_transpose_block(4, 16, kernel_size=2, stride=2),
            # [16x32x8] => [32x64x16]
            conv2d_transpose_block(16, 32, kernel_size=2, stride=2),
            # [32x64x16] => [64x128x32]
            conv2d_transpose_block(32, 64, kernel_size=2, stride=2),
            # [64x128x32] => [1x256x64]
            nn.ConvTranspose2d(64, 1, kernel_size=2, stride=2),
            # squash the reconstruction to [0, 1]
            nn.Sigmoid()
        )


    def forward(self, x):
        """ Encode x to the 64-element code and decode it back to the input shape """
        x = self.encoder(x)
        x = self.decoder(x)
        
        return x

basic fully connected autoencoder

In [None]:
import torch
from torch import nn

class autoencoder_basic(nn.Module):
    """ Fully connected autoencoder for 1499-element periodograms.

        Encoder: 1499 -> 512 -> 128 -> 64, with SELU activations.
        Decoder: 64 -> 128 -> 512 -> 1499, with SELU activations and a final
        sigmoid, so the reconstruction lies in [0, 1].
    """
    def __init__(self):
        super(autoencoder_basic, self).__init__()
        self.encoder = nn.Sequential(
            nn.Linear(1499, 512),
            nn.SELU(inplace=True),
            nn.Linear(512, 128),
            nn.SELU(inplace=True),
            nn.Linear(128, 64),
            nn.SELU(inplace=True))
        self.decoder = nn.Sequential(
            nn.Linear(64, 128),
            nn.SELU(inplace=True),
            nn.Linear(128, 512),
            nn.SELU(inplace=True),
            # The previous layer emits 512 features. This layer used to be declared as
            # Linear(128, 1499), which made every forward pass fail with a shape mismatch.
            nn.Linear(512, 1499),
            nn.Sigmoid())

    def forward(self, x):
        """ Return the reconstruction of x, a tensor whose last dimension is 1499 """
        x = self.encoder(x)
        x = self.decoder(x)
        return x

Function for extracting encoded data from a trained model

In [None]:
def conv2d_encode(model, data_intput):
    """ Encode 2D samples with model.encoder and return the codes in input order.

        Attributes
            model - module with an `encoder` sub-module; it is switched to eval mode
            data_intput - sequence of equally shaped 2D arrays (e.g. scaled spectrograms)

        Returns a list with one 1D numpy array per input sample; the i-th code
        belongs to the i-th input sample.
    """
    if len(data_intput) == 0:
        return []

    # Stack first: building a tensor straight from a list of ndarrays is very slow.
    stacked = np.stack([np.asarray(sample) for sample in data_intput])
    # add the channel dimension required by the convolutional layers
    dataset_tensor = torch.as_tensor(stacked, dtype=torch.float32)[:, None, :, :]
    # shuffle=False is essential: callers assign the result back to rows of a
    # dataframe, so the output order must match the input order.
    loader = tdata.DataLoader(tdata.TensorDataset(dataset_tensor), batch_size=32, shuffle=False)

    # Run on whatever device the model lives on (CPU for a parameter-free model).
    first_param = next(iter(model.parameters()), None)
    model_device = first_param.device if first_param is not None else torch.device("cpu")

    encoded_data = []
    model.eval()
    with torch.no_grad():
        for (batch,) in loader:
            # use the `model` argument, not a global model
            codes = model.encoder(batch.to(model_device)).cpu().numpy()
            # Keep one row per sample even for a final batch of size 1
            # (a bare squeeze() would collapse it into scalars).
            encoded_data.extend(codes.reshape(len(codes), -1))

    return encoded_data

In [None]:
import pandas as pd
from datetime import datetime, timedelta

def read_sensor_data(filename, hive_sn, start_time, end_time, sensor_column_name):
    """ Read a smartula sensor file (from grafana) and build a pandas dataframe.

        Attributes
            filename - path of the ';'-separated CSV export; its first line is skipped
                       and it must contain exactly three columns, the first named 'Series'
            hive_sn - hive id to keep; must be one of the global `hives_ids`
            start_time, end_time - exclusive bounds of the time window to keep
            sensor_column_name - name given to the value column

        Returns a dataframe indexed by 'timestamp' (shifted by the global
        `timezone_offset_hours`) with the columns 'name' and `sensor_column_name`,
        or an empty dataframe when hive_sn is unknown.
    """
    # Validate first, so that no file is read for an unknown hive.
    if hive_sn not in hives_ids:
        print(f"Hive {hive_sn} is not in hives_ids set! Returning empty dataframe")
        return pd.DataFrame()

    df_sensor_data = pd.read_csv(filename, skiprows=1, sep=";")

    # change series column to be coherent with sounds: a series whose label contains
    # the hive id without its first two characters is renamed to the full hive id
    # NOTE(review): substring matching would also match ids that merely start with
    # the same text (e.g. "...client1" vs "...client10") -- confirm the id scheme.
    for hive in hives_ids:
        belongs_to_hive = df_sensor_data['Series'].str.contains(hive[2:], regex=False, na=False)
        df_sensor_data.loc[belongs_to_hive, 'Series'] = hive

    # change column names to match sound
    df_sensor_data.columns = ['name', 'timestamp', sensor_column_name]
    # drop the 6-character suffix of every timestamp string (presumably the UTC offset
    # such as "+00:00" -- confirm), parse the rest and apply the timezone shift
    df_sensor_data['timestamp'] = (
        pd.to_datetime(df_sensor_data['timestamp'].str[:-6], format='%Y-%m-%dT%H:%M:%S')
        + pd.Timedelta(hours=timezone_offset_hours)
    )

    in_window = ((df_sensor_data['name'] == hive_sn)
                 & (df_sensor_data['timestamp'] > start_time)
                 & (df_sensor_data['timestamp'] < end_time))
    # set_index returns a new frame, so the filtered slice is never modified in place
    df_sensor_data = df_sensor_data[in_window].set_index('timestamp')
    print(f"got {df_sensor_data[sensor_column_name].count()} of {sensor_column_name} samples")

    return df_sensor_data

def merge_dataframes_ontimestamp(df_merge_to, *args):
    """ Merge every dataframe from args into df_merge_to and return the result.

        Each dataframe is first re-indexed to the index of the frame merged so far,
        taking the nearest row for every entry, and then merged on 'timestamp' and 'name'.
    """
    merged = df_merge_to
    for other in args:
        aligned = other.reindex(merged.index, method='nearest')
        merged = pd.merge(merged, aligned, on=['timestamp', 'name'])

    return merged

flatten util

In [None]:
import collections

def flatten(x):
    """ Recursively flatten arbitrarily nested iterables into one flat list.

        A non-iterable value is returned as a one-element list. Strings and bytes
        are treated as single values: iterating a one-character string yields the
        same string again, so recursing into them would never terminate.
    """
    # Local import: a plain `import collections` does not guarantee that the
    # `collections.abc` submodule has been loaded.
    from collections.abc import Iterable

    if isinstance(x, Iterable) and not isinstance(x, (str, bytes)):
        return [a for i in x for a in flatten(i)]
    return [x]
    
def merge_columns(dataframe, column_names):
    """ Join the given columns row by row into flat lists.

        The cells may be scalars or (nested) sequences of different sizes; every
        row is flattened with flatten(). Returns one flat list per dataframe row.
    """
    merged_rows = []
    for row_values in dataframe[column_names].values.tolist():
        merged_rows.append(flatten(row_values))
    return merged_rows
    

Functions for searching the bee-night window (start hour and length) that gives the best SVM day/night classification accuracy, and for visualizing the results

In [155]:
from sklearn.svm import SVC
import pandas as pd

  
def plot_6hour_shift(y, xticklabels):
    """ Plot SVM accuracy per start hour, one panel per bee-night length (1 to 6 hours).

        Attributes
            y - sequence of accuracy lists; y[i] holds one accuracy per start hour
                for a bee-night that is i+1 hours long. At most six entries are drawn.
            xticklabels - labels of the start hours, one per accuracy value
    """
    fig, axs  = plt.subplots(3, 2, figsize=(10,8))
    fig.subplots_adjust(hspace=0.7)

    # One tick per label. This used to be hard-coded to 10 ticks, which only
    # worked for exactly ten start hours.
    tick_positions = np.arange(0, len(xticklabels), 1)
    panels = axs.ravel()  # row-major: [0][0], [0][1], [1][0], ...

    for panel_idx, (ax, accuracies) in enumerate(zip(panels, y)):
        night_hours = panel_idx + 1
        unit = 'hour' if night_hours == 1 else 'hours'
        ax.plot(accuracies, 'ro')
        ax.grid()
        ax.set_xticks(tick_positions)
        ax.tick_params(axis='x', rotation=270)
        ax.set_xticklabels(xticklabels)
        ax.set_title(f'{night_hours} {unit} long bee-night')
        ax.set_ylabel('SVM accuracy')
        ax.set_xlabel('Hour')

    # Hide panels for which no accuracies were supplied, instead of failing with an IndexError.
    for unused_ax in panels[len(y):]:
        unused_ax.set_visible(False)

    fig.show()

def _night_mask(hours, start_hour, end_hour):
    """ Mask of the clock hours inside the inclusive window start_hour..end_hour,
        wrapping over midnight when end_hour is smaller than start_hour """
    if start_hour <= end_hour:
        return (hours >= start_hour) & (hours <= end_hour)
    return (hours >= start_hour) | (hours <= end_hour)


def search_best_night_day(input_data, feature_name, days_test, start_hours, max_shift):
    """ Search the bee-night window for which an RBF-kernel SVC separates day and night best.

        For every night length in 1 .. max_shift-1 hours and every start hour, the
        samples are labelled as night or day, an SVC is trained on the older samples
        and its accuracy on the most recent samples is printed. All accuracies are
        finally plotted with plot_6hour_shift.

        attribute: input_data - pandas dataframe with a datetime index
        attribute: feature_name - name of column from dataframe which will be used as feature
        attribute: days_test - number of last days which will be used as test data; the size
                               of a day is taken from the number of samples in the first 24 hours
        attribute: start_hours - list with start hours
        attribute: max_shift - exclusive upper bound of the night length in hours

        NOTE(review): the window includes the end hour, so a window labelled
        "20 to 21" covers the samples of both clock hours 20 and 21 -- confirm
        that this is intended.
    """
    max_accuracy = 0

    accs_per_shift = []
    final_accs = []

    # Sorting and splitting do not depend on the night window, so they are done once.
    # sort_index() returns a new frame; the caller's dataframe is left untouched.
    sorted_data = pd.DataFrame(input_data).sort_index()
    first_day_end = sorted_data.index[0] + pd.Timedelta(days=1)
    samples_in_day = int((sorted_data.index < first_day_end).sum())
    n_test = samples_in_day * days_test
    if not 0 < n_test < len(sorted_data):
        raise ValueError(f'test size {n_test} leaves no train or no test data for {len(sorted_data)} samples')

    # The most recent n_test samples form the test set, everything before is train data.
    features = sorted_data[feature_name].values.tolist()
    train_data, test_data = features[:-n_test], features[-n_test:]
    hours = sorted_data.index.hour

    for shift in range(1, max_shift):
        for start_hour in start_hours:
            end_hour = (start_hour + shift) % 24
            is_night = np.asarray(_night_mask(hours, start_hour, end_hour))
            train_labels = is_night[:-n_test].tolist()
            test_labels = is_night[-n_test:].tolist()

            print(f'learning with train data size: {len(train_data)} and test data size: {len(test_data)}')
            print(f'number of nights in train/test data: {sum(train_labels)}/{sum(test_labels)}')
            svc = SVC(kernel='rbf', class_weight='balanced', gamma='auto')
            svc.fit(train_data, train_labels)
            predicted = svc.predict(test_data)

            # share of correctly classified test samples, in percent
            accuracy = float(np.mean(np.asarray(predicted) == np.asarray(test_labels)) * 100)
            if accuracy > max_accuracy:
                print(f'new max acuuracy for {start_hour} to {end_hour}, accuracy: {accuracy:.2f}')
                max_accuracy = accuracy
            print(f'for night start at {start_hour} and end at {end_hour} got accuracy: {accuracy:.2f}')
            print('==============================================================================')
            accs_per_shift.append(accuracy)
        final_accs.append(accs_per_shift)
        accs_per_shift = []

    plot_6hour_shift(final_accs, [str(start_hour) for start_hour in start_hours])

In [116]:
import matplotlib

def plot_distribution(distribution_dict, bin_size):
    """ Plot one histogram per dictionary entry on a shared value range.

        Attributes
            distribution_dict - mapping of a label (e.g. hive id) to a sequence of values
            bin_size - width of one histogram bin, must be positive

        Entries without values are skipped. Raises ValueError when bin_size is not
        positive or when no entry contains any value.
    """
    if bin_size <= 0:
        raise ValueError('bin_size must be positive')

    colors = ['blue', 'green', 'red', 'yellow', 'black', 'pink', 'purple']
    non_empty = {k: v for k, v in distribution_dict.items() if len(v) > 0}
    if not non_empty:
        raise ValueError('distribution_dict contains no values to plot')

    # The range is taken from the argument itself (the global `rmses` used to be read here).
    rms_max = max(np.max(v) for v in non_empty.values())
    rms_min = min(np.min(v) for v in non_empty.values())
    # at least one bin, even when the value range is smaller than bin_size
    n_bins = max(1, int(np.abs(rms_max-rms_min)/bin_size))

    plt.figure()
    for idx, (k, v) in enumerate(non_empty.items()):
        # A shared range makes the bin edges identical for all entries; transparency
        # keeps overlapping histograms readable.
        plt.hist(v, color=colors[idx%len(colors)], bins=n_bins, range=(rms_min, rms_max),
                 alpha=0.5, label=str(k))
    plt.legend(loc='best')
    plt.show()