# extractDirectionality

## Imports

In [4]:
import pandas as pd
import re 
import json
import datetime
import os
import time
import glob
import os

%pylab inline

Populating the interactive namespace from numpy and matplotlib


## Subroutines
Subroutines neccessary for extractDirectionality
timeExtract takes the log file to be parsed, and returns the start time and end time in a tuple:<br>
    [startTime, endTime]<br>
<br>
durationinMicroseconds takes the log file to be parsed, and it returns the total duration, startTime, and endTime:<br>
    duration, startTime, endTime

In [5]:
# Extract time information of each recording from the log file
def timeExtract(filename):
    with open(filename, 'rb') as f:
        # Start counting from the last byte
        counter = 1
        # Go to the 2nd byte before the end of the last line
        f.seek(-2, 2) 
        while f.read(1) != b'\n':
            f.seek(-2, 1)
            counter=counter+1
        endTime_line = f.readline().decode()
        # Go to the 2nd byte before the end of the last second line
        f.seek(-counter-2, 2)
        while f.read(1) != b'\n':
            f.seek(-2, 1)
        startTime_line = f.readline().decode()

    return [startTime_line, endTime_line]

In [6]:
# Calculate duration of each recording in microseconds
def durationinMicroseconds(filename):
    startTime = timeExtract(filename)[0].split()[2:]
    endTime = timeExtract(filename)[1].split()[2:]
    startTimeStr = startTime[0] + ' ' + startTime[1]
    endTimeStr = endTime[0] + ' ' + endTime[1]
    T1 = datetime.datetime.strptime(startTimeStr, '%Y-%m-%d %H:%M:%S.%f')
    T2 = datetime.datetime.strptime(endTimeStr, '%Y-%m-%d %H:%M:%S.%f')
    delta = T2-T1
    duration = delta.seconds*1000000 + delta.microseconds
    
    return duration, T1, T2

## Function: extractDirectionalities
Decription:<br>
extractDirectionalities accepts a log file generated from data collected from an ODAS microphone, it will then organize the file into a dataframe, where each data point is time, direction, and strength of a single source.<br><br>
Parameters:<br>
**log file path, Microphopne Number**<br><br>
Returns a dataframe with the following columns:<br> 
**Timestamp, Time, Time In Seconds, Microphone Number, ID, X, Y, Z, Activity** <br>


In [7]:
def extractDirectionalities(filename, mic_number):

    with open(filename, 'r') as f:
        text = f.read()

        # Use repex to store blocks of data into a list
    data = re.split('(?<=})\n(?={)', text)
        # Delete the time info from the last data block
    tmp = data[-1][:(data[-1].rfind("}")+1)]
    data[-1] = tmp

    #list of src blocks 

    srcList = [json.loads(block)["src"] for block in data]
    timeList = [json.loads(block)["timeStamp"] for block in data]


    #timestamp is the initial time stamp
    #time is the datetime value converted from the timestamp and intitial time
    #source is a 4 by 6 array where the rows are the source, and the columns are the source values
    df = pd.DataFrame(columns = ['Timestamp', 'Time', 'Time In Seconds', 'Microphone Number', 'Source ID', 'X', 'Y', 'Z', 'Activity'])

    #Used for calculating timestamps -> time
    duration, startTime, endTime = durationinMicroseconds(filename)
    start_time_in_seconds = time.mktime(startTime.timetuple())
    t = duration/len(data) / 1000000.0

    index = 1.0
    ind = 0
    df_dict = {}
    for block, temp in zip(srcList, timeList):
        if block[0]["id"] != 0 or block[1]["id"] != 0 or block[2]["id"] != 0 or block[3]["id"] != 0:
            time_in_seconds = start_time_in_seconds + (index - 1.0) * t
            for i in range(0, 4):
                if block[i]['id'] != 0:
                    #to do: Fix the interns code: Timestamp != ind
                    df_dict[ind] = {"Timestamp": temp, "Time":datetime.datetime.fromtimestamp(time_in_seconds).strftime("%B %d, %Y %I:%M:%S"), "Time In Seconds": time_in_seconds, "Microphone Number":mic_number, "Source ID": block[i]["id"], "X": block[i]["x"], "Y": block[i]["y"], "Z": block[i]["z"], "Activity": block[i]["activity"]}
                    ind = ind + 1
                    #df = df.append(pd.DataFrame({"Timestamp": [index], "Time":datetime.datetime.fromtimestamp(time_in_seconds).strftime("%A, %B %d, %Y %I:%M:%S"), "Time In Seconds": time_in_seconds, "Microphone Number":mic_number, "Source ID": block[i]["id"], "X": block[i]["x"], "Y": block[i]["y"], "Z": block[i]["z"], "Activity": block[i]["activity"]}, index=[0]))
        index = index + 1.0

    df = pd.DataFrame.from_dict(df_dict, orient = 'index')
    end = time.time()

    return(df)


## Function: mergeDirectionalities
Decription: <br>
mergeDirectionalities will iterate through all the files in "data" folder, and it will use extractDirectionalities create each file into a dataframe. The dataframe created will be appended into a master dataframe consiting of all the dataframes created from using extractDirectionalities on each file in "data" folder. <br>

Prerequirements:<br>
All folders in "data" folder must be filled with desired .log files in their respective recordingx folders.To do this, run the function above to automate it, or manually download the files from google drive and insert them into the correct folders.<br><br>

Parameters:<br>
None.<br><br>

Returns a dataframe with the following columns:<br> 
**Timestamp, Time, Time In Seconds, Microphone Number, ID, X, Y, Z, Activity** <br>

In [8]:
def mergeDirectionalities():   
    #create dataframe
    df = pd.DataFrame(columns = ['Timestamp', 'Time', 'Time In Seconds', 'Microphone Number', 'Source ID', 'X', 'Y', 'Z', 'Activity'])
    for i in range(4):
        for filename in glob.glob("/Users/brian_wangst/Google Drive File Stream/My Drive/ODAS/recordings" + str(i) + "/*.log"):
        #for filename in glob.glob("notast4ing"):
            with open(filename, 'r') as f:
                firstline = f.readline()
            if firstline == "SST log contains no useful data\n":
                pass
            else:
                df1 = extractDirectionalities(filename, i)
                df = df.append(df1)
            
    df = df.sort_values(['Time In Seconds'])
    return df

## Function: directionalitiesOfMicrophone

Description:<br>
directionalitiesOfMicrophone takes the directory of all the log files produced by a single microphone, and it will create a dataframe from that data.<br><br>
Parameters:<br>
String representation of the path from the home directory to the directory of the log files of the given microphone.<br><br>
Returns a dataframe with the following columns:<br> 
**Timestamp, Time, Time In Seconds, Microphone Number, ID, X, Y, Z, Activity** <br>

In [9]:
def diectionalitiesOfMicrophone(microphone_directory):
    df = pd.DataFrame(columns = ['Timestamp', 'Time', 'Microphone Number', 'Source ID', 'X', 'Y', 'Z', 'Activity'])
    for filename in glob.glob(microphone_directory +  "*.log"):
        with open(filename, 'r') as f:
            firstline = f.readline()
        if firstline == "SST log contains no useful data\n":
            pass
        else:
            df1 = extractDirectionalities(filename, i)
            df = df.append(df1)
    return df

# K Clustering

## Imports

In [7]:
import numpy as np
from matplotlib import pyplot as plt
from sklearn.cluster import KMeans
from mpl_toolkits import mplot3d

In [None]:
data0 = extractDirectionalities("/Users/ardelalegre/CSE4223-ODAS/data/recordings0/cSSt_2019-09-09_14_55_03.log", 0)

X = data[['X', 'Y', 'Z']].values

In [None]:
wcss = []
for i in range(1, 11):
    kmeans = KMeans(n_clusters=i, init='k-means++', max_iter=300, n_init=10, random_state=0)
    kmeans.fit(X)
    wcss.append(kmeans.inertia_)
plt.plot(range(1, 11), wcss)
plt.title('Elbow Method')
plt.xlabel('Number of clusters')
plt.ylabel('WCSS')
plt.show()

In [None]:
kmeans = KMeans(n_clusters=4, init='k-means++', max_iter=300, n_init=10, random_state=0)
pred_y = kmeans.fit_predict(X)
plt.scatter(X[:,0], X[:,1], X[:,2])
plt.scatter(kmeans.cluster_centers_[:, 0], kmeans.cluster_centers_[:, 1], s=300, c='red')
plt.show()

In [None]:
ax = plt.axes(projection='3d')
ax.scatter(data['X'].values,data['Y'].values, data['Z'])
ax.scatter(kmeans.cluster_centers_[:, 0], kmeans.cluster_centers_[:, 1], kmeans.cluster_centers_[:, 2])

In [None]:
d = mergeDirectionalities()

In [None]:
x = d[['X', 'Y', 'Z']].values

wcss = []
for i in range(1, 11):
    kmeans = KMeans(n_clusters=i, init='k-means++', max_iter=3000, n_init=10, random_state=0)
    kmeans.fit(x)
    wcss.append(kmeans.inertia_)
plt.plot(range(1, 11), wcss)
plt.title('Elbow Method')
plt.xlabel('Number of clusters')
plt.ylabel('WCSS')
plt.show()

In [None]:
kmeans = KMeans(n_clusters=5, init='k-means++', max_iter=300, n_init=10, random_state=0)
pred_y = kmeans.fit_predict(x)
plt.scatter(x[:,0], x[:,1], x[:,2])
plt.scatter(kmeans.cluster_centers_[:, 0], kmeans.cluster_centers_[:, 1], s=300, c='red')
plt.show()

In [None]:
ax = plt.axes(projection='3d')
ax.scatter(kmeans.cluster_centers_[:, 0], kmeans.cluster_centers_[:, 1], kmeans.cluster_centers_[:, 2], c='red')
ax.scatter(d['X'].values,d['Y'].values, d['Z'])

2020-01-08-09-25-03
## K means for January 8

In [8]:
def to_CSV(mic_number):
    dataframe = pd.DataFrame(columns = ['Timestamp', 'Time', 'Time In Seconds', 'Microphone Number', 'Source ID', 'X', 'Y', 'Z', 'Activity'])
    for filename in glob.glob("/Users/brian_wangst/Google Drive File Stream/My Drive/ODAS/recordings" + str(mic_number) + "/*.log"):
        if("2020-01-08" in filename):
            print(filename)
            with open(filename, 'r') as f:
                firstline = f.readline()
                if firstline == "SST log contains no useful data\n":
                    pass
            try:
                dataframe = extractDirectionalities(filename, mic_number)
                dataframe.to_csv(path_or_buf='/home/ardelalegre/CSE4223-ODAS/data/recordings' + str(mic_number) + '/' + filename[48:filename.find('.log')] + '.csv')
            except KeyboardInterrupt:
                print(nothing)
            except:
                print("Error with " + filename)
    return dataframe   

In [57]:
def earliest_day(file0, file1, file2, file3):
    compare = []
    if(file0.find('cSST') != -1):
        compare.append(file0[file0.find('cSST') + 5: file0.find('cSST') + 10 + 5])
        
    if(file1.find('cSST') != -1):
        compare.append(file1[file1.find('cSST') + 5: file1.find('cSST') + 10 + 5])
    
    if(file2.find('cSST') != -1):
        compare.append(file2[file2.find('cSST') + 5: file2.find('cSST') + 10 + 5])
    
    if(file3.find('cSST') != -1):
        compare.append(file3[file3.find('cSST') + 5: file3.find('cSST') + 10 + 5])
    
    #check year 
    min_year = 2021
    for y in compare:
        if(int(y[:4]) < min_year):
            min_year = int(y[:4])
           
    for y in compare:
        if(int(y[:4])!= min_year):
            del y
    
    #check month
    min_month = 13
    for y in compare:
        if(int(y[5:7]) < min_month):
            min_month = int(y[5:7])
           
    for y in compare:
        if(int(y[5:7])!= min_month):
            del y
           
    #check day
    min_day = 32
    for y in compare:
        if(int(y[5:7]) < min_day):
            min_day = int(y[5:7])
           
    for y in compare:
        if(int(y[5:7])!= min_day):
            del y
            
    return compare[0]

In [58]:
def combine_CSV():
    master_list = []
    count = 0
    total = 0
    records0 = []
    records1 = []
    records2 = []
    records3 = []
    fridays = ["2019-09-13", "2019-09-20", "2019-09-27", 
               "2019-10-4", "2019-10-11", "2019-10-18", "2019-10-25", 
               "2019-11-1", "2019-11-8", "2019-11-15", "2019-11-22", "2019-11-29",
               "2019-12-6", "2019-12-13", "2019-12-20", "2019-12-27",
               "2020-01-03", "2020-01-10", "2020-01-17", "2020-01-24", "2020-01-31", 
               "2020-02-07", "2020-02-14", "2020-02-21", "2020-02-28",
               "2020-03-06", "2020-03-13", "2020-03-20", "2020-03-27"]
    
    temp0 = glob.glob("/Users/brian_wangst/Google Drive File Stream/My Drive/ODAS/recordings" + str(0) + "/*.log")
    temp1 = glob.glob("/Users/brian_wangst/Google Drive File Stream/My Drive/ODAS/recordings" + str(1) + "/*.log")
    temp2 = glob.glob("/Users/brian_wangst/Google Drive File Stream/My Drive/ODAS/recordings" + str(2) + "/*.log")
    temp3 = glob.glob("/Users/brian_wangst/Google Drive File Stream/My Drive/ODAS/recordings" + str(3) + "/*.log")
    
    for date in temp0:
        for day in fridays:
            if day in date:
                records0.append(date)
                
    for date in temp1:
        for day in fridays:
            if day in date:
                records1.append(date)
            
    for date in temp2:
        for day in fridays:
            if day in date:
                records2.append(date)
            
    for date in temp3:
        for day in fridays:
            if day in date:
                records3.append(date)
   
    records0.sort()
    records1.sort()
    records2.sort()
    records3.sort()

    curDay = ""
    temp = ""
    for mic0,mic1,mic2,mic3 in zip(records0,records1,records2,records3):
        curDay = mic0[mic0.find("cSST"): mic0.find("cSST") + 15]
        if temp != curDay:
            dict_to_csv(master_list, curDay)
            temp = curDay
            master_list = []
            
        if curDay in mic0:
            with open(mic0, 'r') as f:
                firstline = f.readline()
                if firstline == "SST log contains no useful data\n":
                    pass
                else:
                    df0 = extractDirectionalities(mic0,0)
                    count = count + 1
                    for index, row in df0.iterrows():
                        dic = {}
                        dic['Timestamp'] = row['Timestamp']
                        dic['Time'] = row['Time']
                        dic['Time In Seconds'] = row['Time In Seconds']
                        dic['Microphone Number'] = row['Microphone Number']
                        dic['Source ID'] = row['Source ID']
                        dic['X'] = row['X']
                        dic['Y'] = row['Y']
                        dic['Z'] = row['Z']
                        dic['Activity'] = row['Activity']
                        master_list.append(dic)


                    
        if curDay in mic1:
            with open(mic1, 'r') as f:
                firstline = f.readline()
                if firstline ==  "SST log contains no useful data\n":
                    pass
                else:
                    df1 = extractDirectionalities(mic1,1)
                    for index, row in df1.iterrows():
                        dic = {}
                        dic['Timestamp'] = row['Timestamp']
                        dic['Time'] = row['Time']
                        dic['Time In Seconds'] = row['Time In Seconds']
                        dic['Microphone Number'] = row['Microphone Number']
                        dic['Source ID'] = row['Source ID']
                        dic['X'] = row['X']
                        dic['Y'] = row['Y']
                        dic['Z'] = row['Z']
                        dic['Activity'] = row['Activity']
                        master_list.append(dic)


                    
        if curDay in mic2:
            with open(mic2, 'r') as f:
                firstline = f.readline()
                if firstline ==  "SST log contains no useful data\n":
                    pass
                else:
                    df2 = extractDirectionalities(mic2,2)
                    for index, row in df2.iterrows():
                        dic = {}
                        dic['Timestamp'] = row['Timestamp']
                        dic['Time'] = row['Time']
                        dic['Time In Seconds'] = row['Time In Seconds']
                        dic['Microphone Number'] = row['Microphone Number']
                        dic['Source ID'] = row['Source ID']
                        dic['X'] = row['X']
                        dic['Y'] = row['Y']
                        dic['Z'] = row['Z']
                        dic['Activity'] = row['Activity']
                        master_list.append(dic)
                    
        if curDay in mic3:
            with open(mic3, 'r') as f:
                firstline = f.readline()
                if firstline ==  "SST log contains no useful data\n":
                    pass
                else:
                    df3 = extractDirectionalities(mic3,3)
                    for index, row in df3.iterrows():
                        dic = {}
                        dic['Timestamp'] = row['Timestamp']
                        dic['Time'] = row['Time']
                        dic['Time In Seconds'] = row['Time In Seconds']
                        dic['Microphone Number'] = row['Microphone Number']
                        dic['Source ID'] = row['Source ID']
                        dic['X'] = row['X']
                        dic['Y'] = row['Y']
                        dic['Z'] = row['Z']
                        dic['Activity'] = row['Activity']
                        master_list.append(dic)

    dict_to_csv(master_list,curDay)
                

In [59]:
def dict_to_csv(master_list, curDay):
    if len(master_list) == 0:
        return
    print("starting for " + curDay )
    master_list = sorted(master_list, key = lambda i: i['Time In Seconds'])            
    masterDataFrame = pd.DataFrame(master_list)
    beginTime = masterDataFrame['Time In Seconds'].iloc[0]
    split = beginTime + 3600
    hour = 0
    counter = 0
    master_sorted_list = []
    for index, row in masterDataFrame.iterrows():
        beginTime = row['Time In Seconds']
        if beginTime < split:
            dic = {}
            dic['Timestamp'] = row['Timestamp']
            dic['Time'] = row['Time']
            dic['Time In Seconds'] = row['Time In Seconds']
            dic['Microphone Number'] = row['Microphone Number']
            dic['Source ID'] = row['Source ID']
            dic['X'] = row['X']
            dic['Y'] = row['Y']
            dic['Z'] = row['Z']
            master_sorted_list.append(dic)
        else:
            hourlyCSV = pd.DataFrame(master_sorted_list)
            hourlyCSV['tempMic'] = hourlyCSV['Microphone Number'].shift(-1)
            hourlyCSV['tempTime'] = hourlyCSV['Time In Seconds'].shift(-1)
            hourlyCSV['Multi Source'] = (hourlyCSV['Microphone Number'] != hourlyCSV['tempMic']) 
            hourlyCSV = hourlyCSV.drop(columns = ['tempMic','tempTime'])
            hourlyCSV.to_csv(path_or_buf='/Users/brian_wangst/Desktop/mlr/data/Fridays/' + str(hourlyCSV['Time'][0]) + "hour" + str(hour) + '.csv')
            hour = hour + 1
            master_sorted_list = []
            split = beginTime + 3600
            
    hourlyCSV = pd.DataFrame(master_sorted_list)        
    hourlyCSV['tempMic'] = hourlyCSV['Microphone Number'].shift(-1)
    hourlyCSV['tempTime'] = hourlyCSV['Time In Seconds'].shift(-1)
    hourlyCSV['Multi Source'] = (hourlyCSV['Microphone Number'] != hourlyCSV['tempMic'])
    hourlyCSV = hourlyCSV.drop(columns = ['tempMic','tempTime'])
    hourlyCSV.to_csv(path_or_buf='/Users/brian_wangst/Desktop/mlr/data/Fridays' + str(hourlyCSV['Time'][0]) + "hour" + str(hour) + '.csv')

In [60]:
combine_CSV()



starting for cSST_2019-09-27
starting for cSST_2019-10-18
starting for cSST_2019-11-15
starting for cSST_2019-11-22
starting for cSST_2019-11-22


In [None]:
one = to_CSV(1)

In [None]:
two = to_CSV(2)

In [None]:
three = to_CSV(3)

In [None]:
def csv_to_dataframe(mic_number):
    df = pd.DataFrame(columns = ['Timestamp', 'Time', 'Time In Seconds', 'Microphone Number', 'Source ID', 'X', 'Y', 'Z', 'Activity'])
    for filename in glob.glob("/home/ardelalegre/CSE4223-ODAS/data/recordings" + str(mic_number) + '/*.csv'):
        df = df.append(pd.read_csv(filename))
    return df

In [None]:
df0 = csv_to_dataframe(0)
df1 = csv_to_dataframe(1)
df2 = csv_to_dataframe(2)
df3 = csv_to_dataframe(3)

In [None]:
df0.head()


In [None]:
df = pd.concat([df0, df1, df2, df3])
df = df.sort_values(['Time In Seconds'])
df

In [None]:
arr = np.array(df['Microphone Number'])
#arr = arr[4000000:]
dff = arr[1:] - arr[:-1]
dff = dff != 0
dff = np.append(dff, 0)
df['diff']  = dff
df

In [None]:
df.to_csv("combinedFor1-8-20.csv")

In [None]:
!ls -lh combinedFor1-8-20.csv

In [None]:
index = np.where(dff)[0]
index[:100]

In [None]:
kernel = np.ones(1000)
smooth = np.convolve(dff, kernel)
plt.plot(smooth)

In [None]:
x = df3[['X', 'Y', 'Z']].values

wcss = []
for i in range(1, 11):
    kmeans = KMeans(n_clusters=i, init='k-means++', max_iter=3000, n_init=10, random_state=0)
    kmeans.fit(x)
    wcss.append(kmeans.inertia_)
plt.plot(range(1, 11), wcss)
plt.title('Elbow Method')
plt.xlabel('Number of clusters')
plt.ylabel('WCSS')
plt.show()


In [None]:
kmeans = KMeans(n_clusters=6, init='k-means++', max_iter=300, n_init=10, random_state=0)
pred_y = kmeans.fit_predict(x)
plt.scatter(x[:,0], x[:,1], x[:,2])
plt.scatter(kmeans.cluster_centers_[:, 0], kmeans.cluster_centers_[:, 1], s=300, c='red')
plt.show()

In [None]:
ax = plt.axes(projection='3d')
#ax.scatter(kmeans.cluster_centers_[:, 0], kmeans.cluster_centers_[:, 1], kmeans.cluster_centers_[:, 2], c='red')
ax.scatter(df['X'].values,df['Y'].values, df['Z'].values)

In [None]:
x = df0[['X', 'Y', 'Z']].values

wcss = []
for i in range(1, 11):
    kmeans = KMeans(n_clusters=i, init='k-means++', max_iter=3000, n_init=10, random_state=0)
    kmeans.fit(x)
    wcss.append(kmeans.inertia_)
plt.plot(range(1, 11), wcss)
plt.title('Elbow Method')
plt.xlabel('Number of clusters')
plt.ylabel('WCSS')
plt.show()

In [None]:
kmeans = KMeans(n_clusters=2, init='k-means++', max_iter=300, n_init=10, random_state=0)
pred_y = kmeans.fit_predict(x)
plt.scatter(x[:,0], x[:,1], x[:,2])
plt.scatter(kmeans.cluster_centers_[:, 0], kmeans.cluster_centers_[:, 1], s=300, c='red')
plt.show()

In [None]:
ax = plt.axes(projection='3d')
#ax.scatter(kmeans.cluster_centers_[:, 0], kmeans.cluster_centers_[:, 1], kmeans.cluster_centers_[:, 2], c='red')
ax.scatter(df['X'].values,df['Y'].values, df['Z'].values)

In [None]:
df = extractDirectionalities("/Users/brian_wangst/Google Drive File Stream/My Drive/ODAS/recordings0/cSST_2020-01-08_09:00:03.log", 0)

In [None]:
print(extractDirectionalities("/Users/brian_wangst/Google Drive File Stream/My Drive/ODAS/recordings0/cSST_2020-01-08_09:00:03.log", 0))