In [1]:
import matplotlib
matplotlib.use('TkAgg')
import pickle
import datetime
import numpy as np
%matplotlib notebook
# %pylab inline

In [2]:
# importing saved data from our July 15th experiment
# NOTE(review): pickle.load can run arbitrary code while unpickling, so this file
# must come from a trusted source.
# The with-block closes the file handle as soon as the data has been read.
with open('exp_07_15_good_data.p', 'rb') as pickle_file:
    data_with_time_array = pickle.load(pickle_file)

In [3]:
# checking end timestamp of data: column 12 of the last row is read as a POSIX
# timestamp and shown as a local-time datetime
datetime.datetime.fromtimestamp(data_with_time_array[-1,12])

datetime.datetime(2020, 7, 15, 16, 58, 59, 581090)

In [4]:
# omitting the last column which has timestamps (column index 12);
# only columns 0..11 are kept as the data to analyse
data = data_with_time_array[:,:12]

In [5]:
# sanity check of the array dimensions: (rows, columns)
data.shape

(628426, 12)

In [6]:
# calculating nanmean and subtracting it from data. Then replacing all nans by 0

# per-column mean that ignores NaN entries; keepdims=True keeps a single row
# so that it broadcasts against every row of data
data_mean = np.nanmean(data,axis = 0,keepdims = True)

# rebinds `data` to a new, mean-subtracted array
data = data - data_mean

# cdata is the centred data with NaN entries replaced by 0
# (np.nan_to_num also maps infinities to large finite values)
cdata=np.nan_to_num(data)

  This is separate from the ipykernel package so we can avoid doing imports until


In [7]:
# calculating the covariance matrix


from sklearn import decomposition
from matplotlib import pyplot as plt
import matplotlib.cm as cm
from matplotlib.pyplot import figure
import seaborn as sns
from sklearn.cluster import KMeans

from numpy import linalg as LA

# Covariance matrix of the mean-subtracted, NaN-filled data and its eigen-decomposition.
d = cdata.shape[1]  # dimension (12 columns for this recording)
n = cdata.shape[0]  # number of rows

# Sum over all rows of the outer product of the row with itself, computed as a
# single matrix product instead of a Python loop over every row.
outters = cdata.T @ cdata

_cov = outters/n

# eigen values and eigen vectors; column eigen_vectors[:, k] belongs to eigen_values[k]
# NOTE(review): _cov is symmetric, so LA.eigh may be the better fit; it returns the
# eigenvalues in ascending order, so check the downstream cells before switching.
eigen_values, eigen_vectors = LA.eig(_cov)


 620000: 

In [8]:
# eigenvalues in the order LA.eig returned them (sorting happens in the next cell)
eigen_values

array([0.15070907, 0.1069781 , 0.06510312, 0.04753102, 0.03958284,
       0.01750848, 0.0097031 , 0.00254267, 0.00416056, 0.        ,
       0.        , 0.        ])

In [9]:
# ordering the eigenvalue indices from the largest eigenvalue down to the smallest

ascending_indices = np.argsort(eigen_values)        # smallest eigenvalue first
print(ascending_indices)
eig_val_sorted_indices = ascending_indices[::-1]    # reversed: largest eigenvalue first
print(eig_val_sorted_indices)
eig_val_sorted_indices.shape

[ 9 10 11  7  8  6  5  4  3  2  1  0]
[ 0  1  2  3  4  5  6  8  7 11 10  9]


(12,)

In [12]:
# Cumulative share of the eigenvalue total covered by the k largest eigenvalues.
eig_val_ordered=eigen_values[eig_val_sorted_indices]
eig_val_cumul = np.cumsum(eig_val_ordered)
eig_val_cumul /= eig_val_cumul[-1]   # normalise so the curve ends at 1

plt.figure()   # own figure, so the curve never lands on a previously active one
plt.plot([0]+list(eig_val_cumul))    # leading 0: keeping no eigenvector covers nothing
plt.xlabel('Number of eigenvectors kept')
plt.ylabel('Cumulative fraction of eigenvalue sum')
plt.title('Cumulative eigenvalue spectrum')
plt.grid()

<IPython.core.display.Javascript object>

In [13]:
# reorder the eigenvector columns so they follow the largest-first eigenvalue order
sorted_eigvec = eigen_vectors[:,eig_val_sorted_indices]

In [14]:
# projecting the mean subtracted data onto the two eigenvectors with the largest eigenvalues

top_two_eigvec = sorted_eigvec[:, :2]
data_proj_ev1 = cdata @ top_two_eigvec

### Plotting on Eigenvectors

In [15]:
# N is the number of projected rows; time_axis is the matching 0..N-1 index array
N=data_proj_ev1.shape[0]
time_axis = np.arange(N)
# displays the type of time_axis as the cell output
type(time_axis)

numpy.ndarray

In [16]:
#setting limits - right indicates upper limit and left indicates lower limit
# (the plotting cells pass leftx/rightx to plt.xlim and lefty/righty to plt.ylim)

righty = 1.25
lefty = -1
rightx = 1.5
leftx = -1.5

In [18]:
# plotting projections for all data (not just some interval during the day)
# Up to 16 consecutive chunks of `Length` rows, one chunk per panel of a 4x4 grid.

plt.figure(figsize=[15,15])
Length=35000
# 0.008 is used as the spacing between rows in seconds — TODO confirm against the recorder
print("Each plot represents activity for " + str(Length * 0.008) + " seconds")
n_samples = data_proj_ev1.shape[0]
# range(...)[:16] keeps only as many chunks as the 4x4 grid has panels
for i, T1 in enumerate(range(0, n_samples, Length)[:16], start=1):
    # clamp the chunk end so a short final chunk gets a colour array of matching length
    stop = min(T1 + Length, n_samples)
    plt.subplot(4,4,i)

    chunk_start_time = datetime.datetime.fromtimestamp(data_with_time_array[T1,12])
    # characters 11..15 of the datetime string are the HH:MM part
    plt.title("Time:" + str(chunk_start_time)[11:16])
    # colour encodes the row index, so the order of the points within a chunk is visible
    plt.scatter(data_proj_ev1[T1:stop,0], data_proj_ev1[T1:stop,1],marker='.',s=1,c=np.arange(T1,stop))

    plt.xlim(leftx,rightx)
    plt.ylim(lefty,righty)

<IPython.core.display.Javascript object>

Each plot represents activity for 280.0 seconds


### Plotting interval data

In [19]:
# setting the start and end times in the correct format
# this cell parses the start and end time strings into datetime objects and prints them

# month abbreviation, day, year, 12-hour clock hour:minute, AM/PM
format_timestring = '%b %d %Y %I:%M%p'

start_timestring = 'Jul 15 2020 01:40PM'
end_timestring = 'Jul 15 2020 02:00PM'
# despite the "_string" suffix these are naive datetime objects (no tzinfo);
# datetime.timestamp() interprets naive datetimes as local time
start_standard_dt_string = datetime.datetime.strptime(start_timestring, format_timestring)
end_standard_dt_string = datetime.datetime.strptime(end_timestring, format_timestring)

print(start_standard_dt_string)
print(end_standard_dt_string)



2020-07-15 13:40:00
2020-07-15 14:00:00


In [20]:
# given the starting and ending datetimes, we map them to indices of the data so that
# we know our time of interest in terms of indices
# x / start_index: first row whose timestamp lies strictly inside (start, end)
# y / end_index:   first row at or after x whose timestamp is past the end

timestamps = data_with_time_array[:, 12]
start_ts = start_standard_dt_string.timestamp()
end_ts = end_standard_dt_string.timestamp()

inside_window = np.flatnonzero((timestamps > start_ts) & (timestamps < end_ts))
if inside_window.size == 0:
    # previously this case silently left x at the last row and start_index undefined
    raise ValueError("no samples between %s and %s" % (start_standard_dt_string, end_standard_dt_string))
x = start_index = int(inside_window[0])

after_window = np.flatnonzero(timestamps[x:] > end_ts)
if after_window.size:
    y = end_index = x + int(after_window[0])
else:
    # the data ends before the requested end time: fall back to the last row
    y = end_index = timestamps.shape[0] - 1

In [None]:
# we can plot the data for our time of interest (rows x..y-1 of the projections)

plt.figure(figsize=[15,15])
Length= y - x
print("Each plot represents activity for " + str(Length * 0.008) + " seconds")

window_start_time = datetime.datetime.fromtimestamp(data_with_time_array[x,12])
# characters 11..15 of the datetime string are the HH:MM part
plt.title("Time:" + str(window_start_time)[11:16])
# scatter/colorbar are called through plt: the bare names are not imported in
# this notebook (the %pylab line in the first cell is commented out)
scp = plt.scatter(data_proj_ev1[x:y,0], data_proj_ev1[x:y,1],marker='.',s=1,c=np.arange(x,y))
cb_scp = plt.colorbar(scp)   # colour = row index
plt.xlim(leftx,rightx)
plt.ylim(lefty,righty)
plt.ylabel('Projection of data on 2nd largest eigenvector')
plt.xlabel('Projection of data on largest eigenvector')


### Plotting interesting data from the July 15th experiment

In [None]:
# extracting only the relevant x and y coordinates: rows x..y-1 of the
# two-column projection array (x and y are the window's start/end row indices)
relevant_data_projections = data_proj_ev1[x:y]
relevant_data_projections.shape

In [None]:
# plotting the relevant data: up to 12 consecutive chunks of `Length` rows,
# one chunk per panel of a 4x4 grid

plt.figure(figsize=[25,25])
Length=4000
print("Each plot represents activity for " + str(Length * 0.008) + " seconds")
n_relevant = relevant_data_projections.shape[0]
# range(...)[:12] stops after twelve panels
for i, T1 in enumerate(range(0, n_relevant, Length)[:12], start=1):
    # clamp the chunk end so a short final chunk gets a colour array of matching length
    stop = min(T1 + Length, n_relevant)
    chunk = relevant_data_projections[T1:stop]
    # x + T1 maps the chunk start back to a row of the full recording
    chunk_start_time = datetime.datetime.fromtimestamp(data_with_time_array[x+T1,12])

    # subplot/scatter/colorbar go through plt: the bare names are not imported
    # in this notebook (the %pylab line in the first cell is commented out)
    plt.subplot(4,4,i)
    plt.title("Time:" + str(chunk_start_time)[11:16])   # HH:MM
    points = plt.scatter(chunk[:,0], chunk[:,1],marker='.',s=1,c=np.arange(T1,stop))
    # the original had a bare `colorbar` expression here, which never drew anything
    plt.colorbar(points)
    plt.xlim(leftx,rightx)
    plt.ylim(lefty,righty)
    # label the first point of the chunk with its minute:second
    plt.text(chunk[0,0], chunk[0,1], str(chunk_start_time)[14:19] + ' s')
    plt.xlabel('largest eigenvector')
    plt.ylabel('2nd largest eigenvector')

In [None]:
# plotting the relevant data: up to 12 consecutive chunks of `Length` rows,
# one chunk per panel of a 4x4 grid, with a time label every `label_step` rows

plt.figure(figsize=[25,25])
Length=4000
label_step = 1000   # rows between two time labels inside one panel
print("Each plot represents activity for " + str(Length * 0.008) + " seconds")
n_relevant = relevant_data_projections.shape[0]
# range(...)[:12] stops after twelve panels
for i, T1 in enumerate(range(0, n_relevant, Length)[:12], start=1):
    # clamp the chunk end so a short final chunk gets a colour array of matching length
    stop = min(T1 + Length, n_relevant)
    chunk = relevant_data_projections[T1:stop]
    # x + T1 maps the chunk start back to a row of the full recording
    chunk_start_time = datetime.datetime.fromtimestamp(data_with_time_array[x+T1,12])

    # subplot/scatter/colorbar go through plt: the bare names are not imported
    # in this notebook (the %pylab line in the first cell is commented out)
    plt.subplot(4,4,i)
    plt.title("Time:" + str(chunk_start_time)[11:16])   # HH:MM
    points = plt.scatter(chunk[:,0], chunk[:,1],marker='.',s=1,c=np.arange(T1,stop))
    # the original had a bare `colorbar` expression here, which never drew anything
    plt.colorbar(points)
    plt.xlim(leftx,rightx)
    plt.ylim(lefty,righty)

    # several minute:second labels per panel; iterating up to `stop` keeps every
    # labelled row inside the data even when the final chunk is short
    for label_idx in range(T1, stop, label_step):
        label_time = datetime.datetime.fromtimestamp(data_with_time_array[x+label_idx,12])
        plt.text(relevant_data_projections[label_idx,0],relevant_data_projections[label_idx,1], str(label_time)[14:19] + ' s')

    plt.xlabel('largest eigenvector')
    plt.ylabel('2nd largest eigenvector')

### Zooming into plot 4 (starting at 13:41)

In [None]:
# setting the start and end times in the correct format
# this cell parses the start and end time strings into datetime objects and prints them

format_timestring = '%b %d %Y %I:%M:%S%p'

start_timestring = 'Jul 15 2020 01:41:40PM'
end_timestring = 'Jul 15 2020 01:42:10PM'
# naive datetimes (no tzinfo); .timestamp() below interprets them as local time
start_standard_dt_string = datetime.datetime.strptime(start_timestring, format_timestring)
end_standard_dt_string = datetime.datetime.strptime(end_timestring, format_timestring)

print(start_standard_dt_string)
print(end_standard_dt_string)

# given the starting and ending datetimes, we map them to indices of the data so that
# we know our time of interest in terms of indices
# x / start_index: first row whose timestamp lies strictly inside (start, end)
# y / end_index:   first row at or after x whose timestamp is past the end

timestamps = data_with_time_array[:, 12]
start_ts = start_standard_dt_string.timestamp()
end_ts = end_standard_dt_string.timestamp()

inside_window = np.flatnonzero((timestamps > start_ts) & (timestamps < end_ts))
if inside_window.size == 0:
    # previously this case silently left x at the last row and start_index undefined
    raise ValueError("no samples between %s and %s" % (start_standard_dt_string, end_standard_dt_string))
x = start_index = int(inside_window[0])

after_window = np.flatnonzero(timestamps[x:] > end_ts)
if after_window.size:
    y = end_index = x + int(after_window[0])
else:
    # the data ends before the requested end time: fall back to the last row
    y = end_index = timestamps.shape[0] - 1
        


In [None]:
# start (x) and end (y) row indices of the selected time window
print(x)
print(y)

In [None]:
# we can plot the data for our time of interest (rows x..y-1 of the projections)

plt.figure(figsize=[15,15])
Length= y - x
print("Length:" + " " + str(Length))
print("Each plot represents activity for " + str(Length * 0.008) + " seconds")

window_start_time = datetime.datetime.fromtimestamp(data_with_time_array[x,12])
# characters 11..15 of the datetime string are the HH:MM part
plt.title("Time:" + str(window_start_time)[11:16])
# scatter/colorbar are called through plt: the bare names are not imported in
# this notebook (the %pylab line in the first cell is commented out)
scp = plt.scatter(data_proj_ev1[x:y,0], data_proj_ev1[x:y,1],marker='.',s=1,c=np.arange(x,y))
cb_scp = plt.colorbar(scp)   # colour = row index
plt.xlim(leftx,rightx)
plt.ylim(lefty,righty)
plt.ylabel('Projection of data on 2nd largest eigenvector')
plt.xlabel('Projection of data on largest eigenvector')


# printing timestamp after every s samples

s = 500

# range(x, y, s) only visits rows that are part of the plotted slice x..y-1,
# so no label is placed on row y when the window length is a multiple of s
for label_idx in range(x, y, s):
    label_time = datetime.datetime.fromtimestamp(data_with_time_array[label_idx,12])
    # characters 14..18 of the datetime string are the MM:SS part
    plt.text(data_proj_ev1[label_idx,0],data_proj_ev1[label_idx,1], str(label_time)[14:19] + ' s')


### Using ginput to get nearest point, highlight it and obtain its timestamp

In [25]:
# setting the start and end times in the correct format
# this cell parses the start and end time strings into datetime objects and prints them

format_timestring = '%b %d %Y %I:%M:%S%p'

start_timestring = 'Jul 15 2020 01:41:40PM'
end_timestring = 'Jul 15 2020 01:42:10PM'
# naive datetimes (no tzinfo); .timestamp() below interprets them as local time
start_standard_dt_string = datetime.datetime.strptime(start_timestring, format_timestring)
end_standard_dt_string = datetime.datetime.strptime(end_timestring, format_timestring)

print(start_standard_dt_string)
print(end_standard_dt_string)

# given the starting and ending datetimes, we map them to indices of the data so that
# we know our time of interest in terms of indices
# x / start_index: first row whose timestamp lies strictly inside (start, end)
# y / end_index:   first row at or after x whose timestamp is past the end

timestamps = data_with_time_array[:, 12]
start_ts = start_standard_dt_string.timestamp()
end_ts = end_standard_dt_string.timestamp()

inside_window = np.flatnonzero((timestamps > start_ts) & (timestamps < end_ts))
if inside_window.size == 0:
    # previously this case silently left x at the last row and start_index undefined
    raise ValueError("no samples between %s and %s" % (start_standard_dt_string, end_standard_dt_string))
x = start_index = int(inside_window[0])

after_window = np.flatnonzero(timestamps[x:] > end_ts)
if after_window.size:
    y = end_index = x + int(after_window[0])
else:
    # the data ends before the requested end time: fall back to the last row
    y = end_index = timestamps.shape[0] - 1
        


2020-07-15 13:41:40
2020-07-15 13:42:10


In [26]:
# scatter plot of the selected time window (rows x..y-1), followed by a request
# for three mouse clicks on the figure

plt.figure(figsize=[15,15])
Length= y - x
print("Length:" + " " + str(Length))
print("Each plot represents activity for " + str(Length * 0.008) + " seconds")

window_start = datetime.datetime.fromtimestamp(data_with_time_array[x,12])
window_proj = data_proj_ev1[x:y]
row_index = np.arange(x,y)   # used as the colour of each point

# characters 11..15 of the datetime string are the HH:MM part
plt.title(str("Time:") + str(window_start)[11:16])
scp = plt.scatter(window_proj[:,0], window_proj[:,1], marker='.', s=1, c=row_index)
cb_scp = plt.colorbar(scp)
plt.xlabel('Projection of data on largest eigenvector')
plt.ylabel('Projection of data on 2nd largest eigenvector')
plt.xlim(leftx,rightx)
plt.ylim(lefty,righty)

# collect the clicked points as a list of coordinates
print("Please click")
x_list = plt.ginput(3)
print("clicked", x_list)
plt.show()

<IPython.core.display.Javascript object>

Length: 3751
Each plot represents activity for 30.008 seconds
Please click
clicked []


In [28]:
# minimal ginput check on a simple sine curve
t = np.arange(10)
plt.plot(t, np.sin(t))
print("Please click")
# kept under its own name: `x` holds the start row index of the selected time
# window and later cells still slice with [x:y]
clicked_points = plt.ginput(3)
print("clicked", clicked_points)
plt.show()

Please click
clicked []


### Taking projections onto each eigenvector and plotting magnitudes of the projections

In [None]:
# projecting onto the nine leading eigenvectors (sorted by eigenvalue) instead of
# just the two with the largest eigenvalues

leading_eigvec = sorted_eigvec[:, :9]
projection_matrix = cdata @ leading_eigvec
projection_matrix.shape

In [None]:
# taking only the ones relevant to experiment: rows x..y-1 of projection_matrix
# (x and y have to be the integer start/end row indices of the time window here)
relevant_projection_matrix = projection_matrix[x:y]
relevant_projection_matrix.shape

In [None]:
# After the transpose each row belongs to one eigenvector, so the horizontal axis
# is the eigenvector index and every sample contributes one marker per eigenvector.
# Called through plt: a bare `plot` is not imported in this notebook.
plt.plot(relevant_projection_matrix.T, 'o');

### Scratchpad

In [None]:
# Scratch notes for selecting a time window by hour and minute. The four values
# assigned below are not read by any other cell visible in this notebook.
# start time:
# end time:
# find elements in data from start time to end time and plot
start_hour = 13
start_minute = 30
# convert start_hour:start_minute:0:0 to timestamp
# find value equal to or closest to that timestamp in data - set to start index
# find value equal to or closest to end timestamp in data - set to end index
# extract subsequence
end_hour = 14
end_minute = 0

In [None]:
# parse the start and end time strings into datetime objects and print them
# month abbreviation, day, year, 12-hour clock hour:minute, AM/PM
format_timestring = '%b %d %Y %I:%M%p'

start_timestring = 'Jul 15 2020 01:40PM'
end_timestring = 'Jul 15 2020 02:00PM'
# despite the "_string" suffix these are naive datetime objects (no tzinfo)
start_standard_dt_string = datetime.datetime.strptime(start_timestring, format_timestring)
end_standard_dt_string = datetime.datetime.strptime(end_timestring, format_timestring)

print(start_standard_dt_string)
print(end_standard_dt_string)

In [None]:
# convert array of lists to an array of arrays: every row of data_with_time_array
# is copied into a 13-column float array
# NOTE(review): this rebinds `data`, which earlier cells use for the 12
# mean-subtracted columns — confirm that overwriting it here is intended
n_rows = data_with_time_array.shape[0]
data = np.zeros((n_rows, 13))
print(data.shape)
for index, row in enumerate(data_with_time_array):
    data[index] = np.asarray(row)