In [1]:
import h5py
import numpy as np
import collections
import datetime
import csv
import pickle
import sys

  from ._conv import register_converters as _register_converters


In [2]:
print(sys.version)

3.6.5 |Anaconda custom (64-bit)| (default, Mar 29 2018, 13:14:23) 
[GCC 4.2.1 Compatible Clang 4.0.1 (tags/RELEASE_401/final)]


In [5]:
f = h5py.File('features.h5', 'r')

In [6]:
cluster_recording = list(f.values())[0]
cluster_recording.shape

(5036582,)

## Raw Data Peeking: The first ten 3-seconds audio clips:

In [7]:
cluster_recording[1][0]

b'/scratch/mc6591/sonyc/features/sonycnode-b827ebb40450.sonyc/2017-08-26.hdf5'

In [8]:
cluster_recording[1][1].decode("utf-8").split('_')

['b827ebb40450', '1503787235.74']

In [9]:
embedding = cluster_recording[1][2]
embedding

array([[152,  27, 156, ..., 110, 105, 255],
       [150,  22, 137, ...,  80, 102, 255],
       [152,  26, 150, ...,  48,  62, 255],
       ..., 
       [153,  27, 151, ...,  44,  34, 255],
       [151,  25, 145, ...,   0,   0, 255],
       [147,  14, 123, ...,   0,   0, 255]], dtype=uint8)

In [24]:
embedding.shape

(10, 128)

## 1. Convert the plain data format to dictionary for fast key look-up

In [10]:
alldata = collections.defaultdict(dict)

numberOfDataPoints = cluster_recording.shape[0]

counter = 0
print('Start: '+str(datetime.datetime.now()))

for datapoint in range(numberOfDataPoints):
    sensor_and_timestamp = cluster_recording[datapoint][1].decode("utf-8")
    if len(sensor_and_timestamp.split('_')) == 2:
        sensor_id = cluster_recording[datapoint][1].decode("utf-8").split('_')[0]
        timestamp = cluster_recording[datapoint][1].decode("utf-8").split('_')[1]
        embedding = cluster_recording[datapoint][2]
        alldata[sensor_id][timestamp] = embedding
    elif len(sensor_and_timestamp.split('-')) == 2:
        sensor_id = cluster_recording[datapoint][1].decode("utf-8").split('-')[0]
        timestamp = cluster_recording[datapoint][1].decode("utf-8").split('-')[1]
        embedding = cluster_recording[datapoint][2]
        alldata[sensor_id][timestamp] = embedding
    else:
        print('Warning: Not splittable!:' + sensor_and_timestamp)
    
    counter += 1
        
    if counter % 150000 == 0:
        print(str(counter) + ' ' + str(datetime.datetime.now()))

Start: 2018-04-15 14:29:20.328806
150000 2018-04-15 14:30:30.882628
300000 2018-04-15 14:31:41.793707
450000 2018-04-15 14:32:52.866983
600000 2018-04-15 14:34:03.708763
750000 2018-04-15 14:35:14.716887
900000 2018-04-15 14:36:25.763546
1050000 2018-04-15 14:37:36.449837
1200000 2018-04-15 14:38:46.566283
1350000 2018-04-15 14:39:57.197749
1500000 2018-04-15 14:41:07.087242
1650000 2018-04-15 14:42:17.441425
1800000 2018-04-15 14:43:29.261708
1950000 2018-04-15 14:44:40.380015
2100000 2018-04-15 14:45:50.351543
2250000 2018-04-15 14:47:00.075252
2400000 2018-04-15 14:48:09.823971
2550000 2018-04-15 14:49:19.412391
2700000 2018-04-15 14:50:28.949616
2850000 2018-04-15 14:51:37.994237
3000000 2018-04-15 14:52:48.207530
3150000 2018-04-15 14:53:58.567916
3300000 2018-04-15 14:55:08.085415
3450000 2018-04-15 14:56:17.653236
3600000 2018-04-15 14:57:31.433095
3750000 2018-04-15 14:58:43.666232
3900000 2018-04-15 14:59:53.731985
4050000 2018-04-15 15:01:03.796854
4200000 2018-04-15 15:02:13

### 1.1 Save the re-formatted feature data; now in the format of DefaultDictionary, into pickle file for efficient reading

In [None]:
with open("features.pickle", 'wb') as pfile:
    pickle.dump(alldata, pfile, protocol=pickle.HIGHEST_PROTOCOL)

### 1.2 Load saved dictionary pickle! (All features data)

In [7]:
# alldata_dict = collections.defaultdict(dict)
with (open("features.pickle", "rb")) as openfile:
    while True:
        try:
            alldata_dict= pickle.load(openfile)
        except EOFError:
            print('failed')
            break

failed


### 1.3 Example: Now we have all the sensor IDs and timestamp effectively accessible in the dictionary

In [18]:
alldata_dict.keys()

NameError: name 'alldata_dict' is not defined

In [28]:
alldata_dict['74da385c6855'].keys()

dict_keys(['1484629333.39', '1486275276.49', '1486270823.64', '1484456462.61', '1484506256.94', '1484456868.87', '1485062169.07', '1485061248.92', '1485107944.57', '1485105300.84', '1484370845.88', '1484370025.38', '1484377939.68', '1484377520.33', '1484802567.76', '1484812312.72', '1484802064.4', '1484811876.07', '1484715649.85', '1485320440.89', '1485397215.11', '1486457713.33', '1486510104.53', '1486509851.17', '1485493226.56', '1485982646.26', '1484888890.45', '1484888476.2', '1484974865.71', '1485255027.89', '1485241169.45', '1485249454.09'])

## 2. Read the positive_samples.pickle file - all positive sensor_id and timestamp
positive_samples.pickle file from Yu Wang, April 6, 2018, shared on Slack group.

In [7]:
# alldata_dict = collections.defaultdict(dict)
with (open("positive_samples.pickle", "rb")) as openfile:
    while True:
        try:
            positive_samples= pickle.load(openfile)
        except EOFError:
            break

In [8]:
positive_samples

{'b827eb429cd4': ['1488836836.42', '1488810598.15'],
 'b827eb5895e9': ['1490690898.85', '1490686108.53'],
 'b827eb0d8af7': ['1485198017.91', '1485163671.07'],
 'b827eb122f0f': ['1498892374.76', '1498261062.11'],
 'b827eb86d458': ['1487000875.51', '1487028598.04'],
 'b827eb0fedda': ['1497928649.89', '1480740512.58'],
 'b827eb8e2420': ['1500149760.47', '1500173615.97'],
 'b827eb1685c7': ['1499724045.58', '1486212620.04'],
 'b827eb815321': ['1482308965.46', '1482399908.63'],
 'b827eb9bed23': ['1491297344.73', '1491323380.55'],
 'b827eb44506f': ['1486149582.98', '1499473078.4'],
 'b827eb4e7821': ['1483014176.71', '1483015339.86'],
 'b827eb2a1bce': ['1487029980.37', '1487018417.81'],
 'b827ebad073b': ['1480303515.84', '1480297488.98'],
 'b827eb42bd4a': ['1495845294.38', '1495820507.31']}

### 2.1 Select the positive embeddings according to the csv file data  - Set up positive training dataset

In [15]:
totalPositiveEmbedding = np.empty([1,128], dtype=int)
for sensorid in positive_samples.keys():
    for timestamp in positive_samples[sensorid]:
        if sensorid in alldata_dict.keys():
            if timestamp in alldata_dict[sensorid].keys():
                embedding = alldata_dict[sensorid][timestamp]
                totalPositiveEmbedding = np.vstack((totalPositiveEmbedding, embedding))
        
totalPositiveEmbedding =np.delete(totalPositiveEmbedding, 0, 0) # remove the first dummy 128 embedding

In [16]:
totalPositiveEmbedding.shape

(170, 128)

In [17]:
# add the positive label, 1
positive_xy = np.insert(totalPositiveEmbedding, 128, 1, axis=1)

In [19]:
positive_xy.shape

(170, 129)

### 2.2 Save the positive training embedding data to pickle file:

In [21]:
with open("positive_xy.pickle", 'wb') as pfile:
    pickle.dump(positive_xy, pfile, protocol=pickle.HIGHEST_PROTOCOL)

### 2.3 Load the pickle! (positive training numpy object 170 $\times$ (128+1), with 1 mark as Is-Noise):

In [22]:
with (open("positive_xy.pickle", "rb")) as openfile:
    while True:
        try:
            positive_xy= pickle.load(openfile)
        except EOFError:
            break

## 3. Read the negative csv file

In [7]:
ls

README.md                          [1m[34mmy_code_NO_COMMIT[m[m/
Sensor_data_negative.csv           negative_xy.pickle
choosing_sensors.ipynb             [1m[34mpapers[m[m/
[1m[34mcode[m[m/                              positive_samples.pickle
[1m[34mgoogle_active-learning_githubrepo[m[m/ positive_xy.pickle
[1m[34mmodal-playground[m[m/                  positive_xy_wrong.pickle


In [8]:
negativeCsvData = {}
with open('Sensor_data_negative.csv') as csvfile:
    Sensor_data_positive = csv.reader(csvfile, skipinitialspace=True, delimiter=' ')
    csvrow = -1
    for row in Sensor_data_positive:
        if csvrow != -1: # skip the first row
            sid = row[0].split(',')[0]
            ts  = row[0].split(',')[1]
            if sid in negativeCsvData.keys():
                negativeCsvData[sid].append(ts)
            else:
                negativeCsvData[sid] = []
                negativeCsvData[sid].append(ts)
        csvrow += 1   

In [9]:
print(type(negativeCsvData))
print(len(negativeCsvData))
print(negativeCsvData['b827eb4e7821'])

<class 'dict'>
15
['1491211362.82_5', '1491800203.12_8', '1482424939.17_1', '1498868810.69_4', '1488993967.2_1', '1492066899.36_6', '1484297539.06_6', '1490406200.26_1', '1494789341.92_4', '1490926970.82_4', '1488422336.29_2', '1488377458.61_6', '1486479347.85_4', '1483636861.26_6', '1484122883.92_3', '1487404654.04_8', '1492186534.34_2', '1493876183.72_2', '1483134229.53_1', '1491421469.99_7']


In [15]:
negativeCount = 0
for key in negativeCsvData.keys():
    timestamps = negativeCsvData[key]
    print(timestamps)

['1491211362.82_5', '1491800203.12_8', '1482424939.17_1', '1498868810.69_4', '1488993967.2_1', '1492066899.36_6', '1484297539.06_6', '1490406200.26_1', '1494789341.92_4', '1490926970.82_4', '1488422336.29_2', '1488377458.61_6', '1486479347.85_4', '1483636861.26_6', '1484122883.92_3', '1487404654.04_8', '1492186534.34_2', '1493876183.72_2', '1483134229.53_1', '1491421469.99_7']
['1495523629.05_6', '1480117895.75_1', '1488292369.26_8', '1492102763.57_1', '1480694468.75_4', '1493213984.51_3', '1491487510.36_4', '1464325406.85_4', '1492035667.77_7', '1488896319.7_5', '1495720875.77_4', '1482091712.59_8', '1464240195.4_6', '1479310178.0_3', '1496909426.89_4', '1482329266.43_4', '1479682002.65_8', '1479267519.49_5', '1487318170.88_4', '1482296730.24_5']
['1492046254.64_1', '1488310935.14_4', '1497759001.33_2', '1495153300.1_3', '1481478245.04_4', '1489760860.19_5', '1479413956.79_5', '1481412339.02_4', '1485885766.44_4', '1493425976.33_4', '1494547745.44_8', '1497484597.2_3', '1500126760.74_

### 3.1 Pull negative embeddings according to the negative sensor and timestamp data (negativeCsvData)

In [10]:
totalNegativeEmbedding = np.empty([1,128], dtype=int)
for sensorid in negativeCsvData.keys():
    for timestamp in negativeCsvData[sensorid]:
        if sensorid in alldata_dict.keys():
            all_timestamp_full = negativeCsvData[sensorid]
            for timestamp_full in all_timestamp_full:
                timestamp = timestamp_full.split('_')[0]
                embedding_index = timestamp_full.split('_')[1]
                if timestamp in alldata_dict[sensorid]:
                    one_negative_embedding = alldata_dict[sensorid][timestamp][int(embedding_index)]
                    totalNegativeEmbedding = np.vstack((totalNegativeEmbedding, one_negative_embedding))
            
totalNegativeEmbedding =np.delete(totalNegativeEmbedding, 0, 0) # remove the first dummy 128 embedding

NameError: name 'alldata_dict' is not defined

In [55]:
print(totalNegativeEmbedding.shape)
print(totalNegativeEmbedding)

(3980, 128)
[[146  14 129 ...   0  62 255]
 [154  30 171 ...  79 158 255]
 [153  25 150 ... 114 150 255]
 ...
 [164  33 165 ... 211 244 255]
 [164  37 180 ... 174 166 255]
 [153  28 167 ...  76 108 255]]


### 3.2 Add negative labels (0) to the negative embeddings.

In [51]:
# add the negative label, 0
negative_xy = np.insert(totalNegativeEmbedding, 128, 1, axis=1)

In [52]:
negative_xy.shape

(3980, 129)

### 3.3 Save the negative embedding and labels to a pickle file.

In [56]:
with open("negative_xy.pickle", 'wb') as pfile:
    pickle.dump(negative_xy, pfile, protocol=pickle.HIGHEST_PROTOCOL)

### 3.4 Load the negative_xy pickle

In [21]:
with (open("../negative_xy.pickle", "rb")) as openfile:
    while True:
        try:
            negative_xy= pickle.load(openfile)
        except EOFError:
            break

In [25]:
negative_xy_new = np.insert(np.delete(negative_xy, -1, axis=1), 128, 0, axis=1)

In [27]:

# negative_xy_new

with open("negative_xy.pickle", 'wb') as pfile:
    pickle.dump(negative_xy_new, pfile, protocol=pickle.HIGHEST_PROTOCOL)

## 4. Summary:
Now we have the following datasets that have been cleaned and formatted to pickles for better usage:  
1. ``features.pickle``
2. ``positive_xy.pickle``
3. ``negative_xy.pickle``

The original raw data files are:
1. ``features.h5``
2. ``positive_samples.pickle``
3. ``Sensor_data_negative.csv``

In [5]:
s = np.array([1,3,2,4,7,9,0,5])

In [7]:
uncertainty_index = np.argsort(s)[::-1]
uncertainty_index

array([5, 4, 7, 3, 1, 2, 0, 6])

In [8]:
uncertainty_index = uncertainty_index[np.in1d(uncertainty_index, unlabeled_indices)][:5]

NameError: name 'unlabeled_indices' is not defined