In [None]:
import numpy as np
import matplotlib.pyplot as plt
import matplotlib
import pandas as pd

## Read in dataset

In [None]:
# Tab-separated input files, one per month.
# FIXME: the original call had no path for the first file
# (`pd.read_csv(, sep='\t')` does not parse). The value below is a
# placeholder -- replace it with the real filename before running.
MONTH1_PATH = "FIXME_first_month_decoded"
MONTH2_PATH = "mar13_decoded"

month1 = pd.read_csv(MONTH1_PATH, sep='\t')
month2 = pd.read_csv(MONTH2_PATH, sep='\t')

# merge datasets into df; ignore_index gives the combined frame a fresh
# 0..n-1 index instead of repeating each file's own row labels
df = pd.concat([month1, month2], ignore_index=True)

## Let's inspect the dataset

In [None]:
# Preview the first five rows of the combined frame.
df.head(n=5)

In [None]:
# List the column labels (DataFrame.keys() is an alias for .columns).
df.columns

## Looks like we have some timestamps in the 'Time' column (Unix epoch seconds).
## Let's convert these to human-readable timestamps

In [None]:
# 'Time' is parsed as Unix epoch seconds (unit='s'), and epoch seconds are
# defined relative to UTC. Parse them as UTC-aware timestamps first, then
# convert to Central time for display.
# Localizing the naive UTC wall-clock values directly to 'America/Chicago'
# would move every instant by the UTC offset (5-6 hours) and can raise on
# wall-clock times that do not exist during the spring DST transition.
df['datetime'] = (
    pd.to_datetime(df['Time'], unit='s', utc=True)
    .dt.tz_convert('America/Chicago')
)


In [None]:
# Earliest and latest timestamps in the data.
# Series.min()/.max() are vectorized and skip missing values (NaT), whereas
# the builtin min()/max() iterate element by element in Python and give
# order-dependent results when NaT is present.
print(df['datetime'].min())
print(df['datetime'].max())

## Hmm, we shouldn't have timestamps from 1970. This is probably bad data.
## Let's filter out these bad entries

In [None]:
# Valid-sample window as Unix epoch seconds (both bounds are exclusive).
#   1383091200 == 2013-10-30 00:00:00 UTC
#   1388599453 == 2014-01-01 18:04:13 UTC
TIME_LOWER_BOUND = 1383091200
TIME_UPPER_BOUND = 1388599453

# Rows whose timestamp lies strictly inside the window; this drops the
# bogus entries that decode to 1970.
in_window = (TIME_LOWER_BOUND < df['Time']) & (df['Time'] < TIME_UPPER_BOUND)

# If you open up the datasets in a text editor, you'll see that
# Node Type entries with 'service_not_present' have incomplete information
has_node_info = df['Node Type'] != 'service_not_present'

# .copy() makes the filtered frame independent of the original, so later
# column assignments do not trigger SettingWithCopyWarning.
df = df[in_window & has_node_info].copy()

In [None]:
# Re-check the timestamp range after filtering.
# Series.min()/.max() are vectorized and skip missing values (NaT), whereas
# the builtin min()/max() iterate element by element in Python and give
# order-dependent results when NaT is present.
print(df['datetime'].min())
print(df['datetime'].max())

## Much better

## Task 0 - Let's count some columns


In [None]:
# total number of entries (row count of the filtered frame)
df.shape[0]

In [None]:
# number of distinct values in 'Complete Node'
# dropna=False keeps the count identical to len(series.unique()),
# i.e. a missing value, if present, is counted as one entry
df['Complete Node'].nunique(dropna=False)

In [None]:
# number of distinct calendar days
# normalize() resets each timestamp to midnight (00:00:00), leaving only the
# date portion, so counting distinct values counts distinct days
calendar_days = df['datetime'].dt.normalize()
calendar_days.nunique(dropna=False)

In [None]:
# distinct node types, in order of first appearance
pd.unique(df['Node Type'])

In [None]:
# number of entries recorded for each node, most frequent first
df['Complete Node'].value_counts(sort=True, ascending=False)

# don't forget to represent this data as a box plot

In [None]:
# Sort the timestamps first so that consecutive differences are real gaps
# between successive entries, regardless of the row order in df.
# np.sort returns a new array, so df['Time'] itself is left untouched.
timevals = np.sort(df['Time'].values)

# np.diff finds the difference between consecutive values in timevals
diffs = np.diff(timevals)

# mean and standard deviation of the gaps, in seconds
diffs.mean(), diffs.std()

In [None]:
# Same gap statistics, computed separately for each node type.
for nt in ['service', 'xe', 'xk']:
    # np.sort works on a copy; an in-place .sort() on `.values` fails when
    # pandas hands back a read-only array (Copy-on-Write mode).
    timevals = np.sort(df.loc[df['Node Type'] == nt, 'Time'].values)
    diffs = np.diff(timevals)
    print(nt, diffs.mean(), diffs.std())