# Atmospheric Data - preparation 

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

In [2]:
# Known measurement sources (used as folder names under `measurements/`):
#   1300001 - hive
#   1300002 - not hive?
#   1400001 - hive szymanski
#   1400002 - hive szymanski
# Full set, currently disabled: hives_ids = [1300001, 1300002, 1400001, 1400002]
hives_ids = [1300001]
# When True, the guarded cells below rebuild their data from the raw CSV files.
DATA_INIT = True

In [3]:
if DATA_INIT:
    # Raw humidity / temperature series, one DataFrame per configured hive.
    dfh_hives = [pd.read_csv(f"measurements/{hive_id}/humidity.csv") for hive_id in hives_ids]
    dft_hives = [pd.read_csv(f"measurements/{hive_id}/temperature.csv") for hive_id in hives_ids]

    # keep=False removes *every* row whose timestamp occurs more than once,
    # so the timestamps that remain are unique within each series.
    dfh_hivesWithoutDuplicates = [dfh_hive.drop_duplicates(subset=['timestamp'], keep=False) for dfh_hive in dfh_hives]
    dft_hivesWithoutDuplicates = [dft_hive.drop_duplicates(subset=['timestamp'], keep=False) for dft_hive in dft_hives]

    for hive_id, dfh_unique, dft_unique in zip(hives_ids, dfh_hivesWithoutDuplicates, dft_hivesWithoutDuplicates):
        print(f"Hive no. {hive_id} | humidity temperature dataset size : {dfh_unique.shape} {dft_unique.shape}")

    df_hive = []
    total = 0
    for hive_id, dfh_unique, dft_unique in zip(hives_ids, dfh_hivesWithoutDuplicates, dft_hivesWithoutDuplicates):
        # Inner join on the timestamp; the suffixes disambiguate the value
        # columns that both CSV files share.
        merged = pd.merge(
            dfh_unique, dft_unique, on='timestamp',
            suffixes=(f"_humidity_{hive_id}", f"_temperature_{hive_id}"))

        # Parse the timestamps, index by them and order the rows by time.
        # The previous `column = to_datetime(...).sort_values()` did not sort
        # anything: assigning a Series to a column re-aligns it on the index,
        # which restores the original row order.
        atmosphere_data = (
            merged
            .assign(timestamp=pd.to_datetime(merged['timestamp'], format='%Y-%m-%dT%H-%M-%S'))
            .set_index('timestamp')
            .sort_index()
        )
        df_hive.append(atmosphere_data)

        print(f"Atmospheric data after merge: {atmosphere_data.shape}")
        total += atmosphere_data.shape[0]

    # `atmosphere_data` intentionally stays bound to the last processed hive;
    # the next cell reads it.
    print(f"Total atmoshpere dataset size: {total}")

Hive no. 1300001 | humidity temperature dataset size : (6776, 2) (6786, 2)
Atmospheric data after merge: (6696, 2)
Total atmoshpere dataset size: 6696


In [4]:
# Night is 23:00-03:30, wrapping over midnight; between_time includes both bounds.
atmosphere_night = atmosphere_data.between_time("23:00", "3:30")
# Day is everything that is not night. Taking the complement (instead of a
# second between_time("3:30", "23:00")) keeps the two subsets disjoint:
# rows stamped exactly 23:00:00 or 03:30:00 used to appear in both.
atmosphere_day = atmosphere_data.loc[~atmosphere_data.index.isin(atmosphere_night.index)]

# Sound Data - preparation

In [8]:
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import glob
import pandas as pd
from tqdm import tqdm
from sklearn import preprocessing
from functools import reduce
import pandas as pd
import librosa
import librosa.display

In [9]:
# When True, the sound cells below rebuild their data from the raw CSV files;
# otherwise they load the cached .npy files.
DATA_INIT = True
# Hive ids as strings; they are interpolated into file paths and column names below.
hives_ids = ['1300001']

In [10]:
# Cache file shared by both branches below.
SOUND_CACHE_FILE = "night_day_sound_training.npy"
EXPECTED_SAMPLE_COUNT = 3000  # recordings with any other number of rows are skipped
MAX_VALID_SAMPLE = 4500       # recordings whose maximum reaches this value are skipped
SAMPLE_SCALE = 4080           # divisor applied to the raw sample values

if DATA_INIT:
    hive_sounds = []
    hive_timestamps = []
    for hive_id in hives_ids:
        print(f"Data preparation for hive: {hive_id}")

        # Forward slashes work with glob on every platform; the previous
        # backslash pattern only matched on Windows.
        sound_files = glob.glob(f"measurements/{hive_id}/sound*.csv")
        for file in tqdm(sound_files):
            df_samples = pd.read_csv(file)
            # The recording time is encoded in the file name: sound-<timestamp>.csv
            pd_timestamp = pd.to_datetime(file.split("sound-")[1].split(".csv")[0], format='%Y-%m-%dT%H-%M-%S')
            raw_samples = df_samples['samples'].values
            if len(raw_samples) == EXPECTED_SAMPLE_COUNT and raw_samples.max() < MAX_VALID_SAMPLE:
                np_samples = raw_samples.astype("float32") / SAMPLE_SCALE
                hive_sounds.append([pd_timestamp, np_samples])
    sound_pd = pd.DataFrame(hive_sounds, columns=['timestamp', 'samples']).set_index('timestamp')
    # Store the timestamps next to the samples. Saving the indexed frame
    # directly dropped the index, so the cache could not be turned back into
    # a time-indexed DataFrame.
    np.save(SOUND_CACHE_FILE, sound_pd.reset_index().values)
else:
    # allow_pickle unpickles arbitrary objects: only load cache files that
    # this notebook wrote itself. A cache written by the older version of this
    # cell has no timestamp column and must be regenerated with DATA_INIT = True.
    cached_rows = np.load(SOUND_CACHE_FILE, allow_pickle=True)
    sound_pd = pd.DataFrame(cached_rows, columns=['timestamp', 'samples'])
    sound_pd['timestamp'] = pd.to_datetime(sound_pd['timestamp'])
    sound_pd = sound_pd.set_index('timestamp')
    # `hive_sounds` does not exist in this branch; count the loaded rows instead.
    print(f"Loaded: {len(sound_pd)} sound recordings.")

Data preparation for hive: 1300001


100%|██████████| 1664/1664 [00:14<00:00, 112.59it/s]


In [11]:
if DATA_INIT:
    # Night is 23:00-03:30 (both bounds included); day is the complement, so a
    # recording stamped exactly on a boundary is no longer labelled both ways.
    sound_night = sound_pd.between_time("23:00", "3:30")
    sound_day = sound_pd.loc[~sound_pd.index.isin(sound_night.index)]

    def _mean_mfcc(samples):
        """Return the 14 MFCCs of one recording, averaged along axis 1."""
        full_mfccs = librosa.feature.mfcc(y=samples, sr=3000, n_fft=512, hop_length=256, n_mfcc=14)
        return np.mean(full_mfccs, axis=1)

    # Label convention: 0 = night, 1 = day.
    mfccs_avg_labeled = [
        [_mean_mfcc(samples), label]
        for label, sound_frame in ((0, sound_night), (1, sound_day))
        for samples in sound_frame['samples']
    ]

    np.random.shuffle(mfccs_avg_labeled)
    # Each entry pairs a vector with a scalar, so the list is ragged; recent
    # NumPy versions refuse to convert it unless dtype=object is explicit.
    np.save("night_day_mfcc_training.npy", np.array(mfccs_avg_labeled, dtype=object))
else:
    # NOTE(review): this branch does not define sound_night / sound_day, which
    # the merge cell further down reads.
    mfccs_avg_labeled = np.load("night_day_mfcc_training.npy", allow_pickle=True)

### Merge sound with atmospheric data

In [60]:
def _attach_nearest_atmosphere(sound_frame, atmosphere_frame, hive_id):
    """Return `sound_frame` with `humidity` and `temperature` columns added.

    Each recording receives the values of the atmospheric row whose timestamp
    is nearest to its own.

    Parameters:
        sound_frame: DataFrame indexed by recording timestamp.
        atmosphere_frame: DataFrame indexed by measurement timestamp, holding
            `value_humidity_<hive_id>` and `value_temperature_<hive_id>`.
            Its index must be unique.
        hive_id: id used in the atmospheric column names.

    Raises:
        ValueError: if there are recordings but no atmospheric rows to match.
    """
    if atmosphere_frame.empty and not sound_frame.empty:
        raise ValueError("no atmospheric measurements to match the recordings against")

    # Nearest-neighbour lookup needs a monotonic index, so sort locally
    # instead of relying on the order of the incoming frame.
    atmosphere_sorted = atmosphere_frame.sort_index()
    # Index.get_loc(..., method='nearest') was removed in pandas 2.0;
    # get_indexer does the same lookup for all recordings in one call.
    nearest_positions = atmosphere_sorted.index.get_indexer(sound_frame.index, method='nearest')
    nearest_rows = atmosphere_sorted.iloc[nearest_positions]

    # assign() builds a new frame, which avoids writing row by row into the
    # slice returned by between_time.
    return sound_frame.assign(
        humidity=nearest_rows[f"value_humidity_{hive_id}"].values,
        temperature=nearest_rows[f"value_temperature_{hive_id}"].values,
    )


sound_night = _attach_nearest_atmosphere(sound_night, atmosphere_night, hives_ids[0])
sound_day = _attach_nearest_atmosphere(sound_day, atmosphere_day, hives_ids[0])

In [31]:
# Preview the first 100 night recordings, including their humidity / temperature columns.
sound_night.head(100)

Unnamed: 0_level_0,samples,humidity,temperature
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2019-08-02 23:01:37,"[0.6110294, 0.60514706, 0.6022059, 0.60490197,...",56.50,34.06
2019-08-02 23:17:37,"[0.58235294, 0.5875, 0.59460783, 0.59338236, 0...",56.31,34.06
2019-08-02 23:35:37,"[0.58235294, 0.58210784, 0.5769608, 0.58235294...",56.37,34.06
2019-08-02 23:50:37,"[0.61764705, 0.62058824, 0.6019608, 0.5872549,...",56.18,34.00
2019-08-03 00:05:37,"[0.6044118, 0.6012255, 0.60539216, 0.5953431, ...",56.06,33.93
...,...,...,...
2019-08-11 00:29:10,"[0.5995098, 0.6039216, 0.59411764, 0.58137256,...",56.68,33.96
2019-08-11 00:47:10,"[0.5948529, 0.5852941, 0.5708333, 0.5860294, 0...",56.56,33.87
2019-08-11 01:03:10,"[0.5987745, 0.5894608, 0.58235294, 0.5767157, ...",56.68,33.93
2019-08-11 01:18:10,"[0.58235294, 0.5747549, 0.5833333, 0.5953431, ...",56.68,33.87


# Basic classification PCA

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

# Collect the feature vector (first element) of every labelled entry,
# standardise the features, then project them onto two principal components.
mfcc_feature_rows = [entry[0] for entry in mfccs_avg_labeled]
mfcc_scaler = StandardScaler()
standarized_mfcc_avg = mfcc_scaler.fit_transform(mfcc_feature_rows)

pca = PCA(n_components=2)
pc_data = pca.fit_transform(standarized_mfcc_avg)

# Basic classification t-SNE

In [None]:
import numpy as np
from sklearn.manifold import TSNE

# t-SNE is stochastic; pinning random_state makes the embedding (and the plot
# built from it) reproducible across kernel restarts.
X_embedded = TSNE(n_components=2, random_state=0).fit_transform(standarized_mfcc_avg)

# Visualize

In [None]:
# Pair every embedded point with its label (0 = night, 1 = day).
pc_data_labeled = list(zip(X_embedded, [mfcc[1] for mfcc in mfccs_avg_labeled]))

colors = ['red', 'green', 'blue', 'yellow']
label_names = {0: 'night', 1: 'day'}

embedded_points = np.asarray(X_embedded)
embedded_labels = np.array([int(label) for _point, label in pc_data_labeled], dtype=int)

fig, ax = plt.subplots()

# One scatter call per class instead of one per point: the per-point loop
# created a separate artist for every recording and was needlessly slow.
for label in np.unique(embedded_labels):
    class_points = embedded_points[embedded_labels == label]
    ax.scatter(class_points[:, 0], class_points[:, 1],
               c=colors[int(label)], alpha=0.3,
               label=label_names.get(int(label), str(label)))

ax.set_xlabel("t-SNE dimension 1")
ax.set_ylabel("t-SNE dimension 2")
ax.legend()
plt.title("Mfcc scatter plot")
plt.savefig('foo.png')
plt.show()

# AUTOENCODER - BASIC

In [None]:
# Use the GPU when PyTorch reports one as available, otherwise the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Number of passes over the training loader.
num_epochs = 50
# Learning rate handed to the optimizer.
learning_rate = 1e-4

class autoencoder(nn.Module):
    """Fully connected autoencoder over 3000-value inputs.

    The encoder narrows 3000 -> 2048 -> 1024 -> 512 -> 256 with in-place ReLU
    between the linear layers; the decoder mirrors it back to 3000 values and
    finishes with Tanh.
    """

    # Layer widths from the input down to the bottleneck.
    _WIDTHS = (3000, 2048, 1024, 512, 256)

    def __init__(self):
        super().__init__()
        # Encoder is created before the decoder so that parameter
        # initialisation consumes the RNG in the same order as before.
        self.encoder = nn.Sequential(*self._linear_stack(self._WIDTHS))
        self.decoder = nn.Sequential(*self._linear_stack(self._WIDTHS[::-1]), nn.Tanh())

    @staticmethod
    def _linear_stack(widths):
        """Build Linear layers for consecutive widths with ReLU between them (none after the last)."""
        layers = []
        for fan_in, fan_out in zip(widths[:-1], widths[1:]):
            layers.append(nn.Linear(fan_in, fan_out))
            layers.append(nn.ReLU(True))
        return layers[:-1]

    def forward(self, x):
        """Encode `x` to the 256-wide bottleneck and decode it back to 3000 values."""
        return self.decoder(self.encoder(x))

In [None]:
import torch.utils.data as utils

# NOTE(review): `training_data_sound` is not defined in any cell visible in
# this notebook. It is assumed to be a sequence of (samples, label) pairs;
# confirm where it is built before running this cell.
#
# Convert through one contiguous float32 array per tensor: torch.Tensor() on a
# list of ndarrays converts element by element and is very slow.
sound_features = np.asarray([x[0] for x in training_data_sound], dtype=np.float32)
sound_labels = np.asarray([x[1] for x in training_data_sound], dtype=np.float32)
sound_dataset = utils.TensorDataset(torch.from_numpy(sound_features),
                                    torch.from_numpy(sound_labels))

print("Length of complete sound dataset is", len(sound_dataset))
sound_trainset = utils.DataLoader(sound_dataset, batch_size=20, shuffle=True)

In [None]:
# Train on `device` (it was computed earlier but never used, so training
# always ran on the CPU). The optimizer is created after the move so that it
# tracks the parameters on the right device.
model = autoencoder().to(device)
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate, weight_decay=1e-6)

model.train()
for epoch in range(num_epochs):
    epoch_loss = 0.0
    batch_count = 0
    for X, y in tqdm(sound_trainset):
        # The labels are not needed: the network is trained to reproduce X.
        X = X.to(device)
        optimizer.zero_grad()
        output = model(X)
        loss = criterion(output, X)
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
        batch_count += 1
    # ===================log========================
    # Report the mean over the epoch rather than the last batch only, and
    # number epochs from 1 so the final line reads [num_epochs/num_epochs].
    mean_loss = epoch_loss / batch_count if batch_count else float('nan')
    print(f"epoch [{epoch + 1}/{num_epochs}], loss:{mean_loss}")

# The cells after this one feed CPU batches to `model`, so move it back.
model.cpu()

In [None]:
# Persist the model's state_dict (its parameters), not the module object itself.
torch.save(model.state_dict(), 'autoencoder-basic-model.pth')

### Evaluation

In [None]:
# Load the saved state_dict into the existing `model` instance.
# torch.load unpickles the file, so only load checkpoints from a trusted source.
model.load_state_dict(torch.load('autoencoder-basic-model.pth'))

In [None]:
# The autoencoder outputs a 3000-value reconstruction of its input, so it is
# scored by reconstruction error. The previous argmax-vs-label comparison was
# a classifier metric, and it also read `correct` / `total` without
# initialising them in this cell.
eval_device = next(model.parameters()).device
squared_error_sum = 0.0
value_count = 0
total = 0

model.eval()
with torch.no_grad():
    for X, y in tqdm(sound_trainset):
        X = X.to(eval_device)
        output = model(X)
        squared_error_sum += F.mse_loss(output, X, reduction='sum').item()
        value_count += X.numel()
        total += X.shape[0]

# Guard against an empty loader instead of dividing by zero.
reconstruction_mse = squared_error_sum / value_count if value_count else float('nan')
print(f"Mean reconstruction MSE over {total} recordings: {reconstruction_mse}")

# AUTOENCODER CNN

In [None]:
# Scratch check of negative slicing: show the last three items of a list.
a = [1, 2, 3, 4, 5, 6, 7, 7, 8, 98, 90, 0]
last_three = a[len(a) - 3:]
print(last_three)

# PyTorch

# ---------------


In [None]:
import torch
import torchvision
import torch.nn as nn
import torch.nn.functional as F
from torchvision import transforms, datasets

In [None]:
# Both splits use the same ToTensor pipeline and are downloaded on demand.
mnist_transform = transforms.Compose([transforms.ToTensor()])
train, test = (
    datasets.MNIST("", train=is_train_split, download=True, transform=mnist_transform)
    for is_train_split in (True, False)
)

In [None]:
# Print the dataset object's text summary.
print(train)

In [None]:
# Identical loader settings for both splits: shuffled mini-batches of 20.
loader_options = {"batch_size": 20, "shuffle": True}
trainset = torch.utils.data.DataLoader(train, **loader_options)
testset = torch.utils.data.DataLoader(test, **loader_options)

In [None]:
class Net(nn.Module):
    """Fully connected classifier: 784 inputs -> 64 -> 64 -> 64 -> 10 log-probabilities."""

    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(28*28, 64)
        self.fc2 = nn.Linear(64, 64)
        self.fc3 = nn.Linear(64, 64)
        self.fc4 = nn.Linear(64, 10)

    def forward(self, x):
        """Map a (batch, 784) tensor to (batch, 10) log-probabilities."""
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = F.relu(self.fc3(x))
        x = self.fc4(x)
        return F.log_softmax(x, dim=1)

    def weight_reset(self):
        """Re-initialise every nn.Linear layer contained in this module.

        The earlier version only inspected the object it was called on, so
        `net.weight_reset()` checked the Net itself, found it was not an
        nn.Linear and silently did nothing. Walking `self.modules()` (the
        module itself plus all sub-modules) fixes that and keeps
        `net.apply(Net.weight_reset)` working as well.
        """
        for module in self.modules():
            if isinstance(module, nn.Linear):
                module.reset_parameters()


net = Net()


In [None]:
import torch.optim as optim

optimizer = optim.Adam(net.parameters(), lr=1e-3)
EPOCHS = 6

# Call the network's weight_reset hook before training starts.
net.weight_reset()

for epoch in range(EPOCHS):
    for X, y in trainset:
        net.zero_grad()
        # Flatten every 28x28 image into a 784-value row before the forward pass.
        flat_batch = X.view(-1, 784)
        output = net(flat_batch)
        loss = F.nll_loss(output, y)
        loss.backward()
        optimizer.step()

    # `loss` is the loss of the last batch of the epoch.
    print("Current loss is: ", loss.item())
    threshold_reached = loss < 1e-5
    if threshold_reached:
        print("Loss threshold obtained!")
        break
        

In [None]:
# Count how many test images receive the right class.
correct = 0
total = 0

with torch.no_grad():
    for X, y in testset:
        output = net(X.view(-1, 784))
        # The predicted class of a row is the position of its largest log-probability.
        predicted = output.argmax(dim=1)
        correct += int((predicted == y).sum())
        total += output.shape[0]

print("Accuracy on test data: ", round(correct/total, 3))

In [None]:
# Count how many training images receive the right class.
correct = 0
total = 0

with torch.no_grad():
    for X, y in trainset:
        output = net(X.view(-1, 784))
        # The predicted class of a row is the position of its largest log-probability.
        hits = torch.eq(output.argmax(dim=1), y)
        correct += int(hits.sum())
        total += hits.shape[0]

print("Accuracy on train data: ", round(correct/total, 3))