In [2]:
import matplotlib.pyplot as plt
from numpy import linspace, loadtxt, ones, convolve
import numpy as np
import pandas as pd
import collections
from random import randint
from matplotlib import style
style.use('fivethirtyeight')
%matplotlib inline

In [16]:
from itertools import zip_longest, count

In [3]:
# !mkdir -p dataset
# !wget -c -b http://www-personal.umich.edu/~mejn/cp/data/sunspots.txt -P dataset
data = loadtxt("dataset/sunspots.txt", float)

In [4]:
data_as_frame = pd.DataFrame(data, columns=['Months', 'SunSpots'])

In [5]:
data_as_frame.head()

Unnamed: 0,Months,SunSpots
0,0.0,58.0
1,1.0,62.6
2,2.0,70.0
3,3.0,55.7
4,4.0,85.0


In [6]:
def moving_average(data, window_size):
    # rectangular
    window = np.ones(int(window_size))/float(window_size)
    return np.convolve(data, window, 'same')

In [17]:
def explain_anomalies(y, window_size, sigma=1.0):
    avg = moving_average(y, window_size).tolist()
    residual = y - avg
    # Calculate the variation in the distribution of the residual
    std = np.std(residual)
    return {'standard_deviation': round(std, 3),
        'anomalies_dict': collections.OrderedDict([(index, y_i) for
                        index, y_i, avg_i in zip_longest(count(), y, avg)
                if (y_i > avg_i + (sigma*std)) | (y_i < avg_i - (sigma*std))])}

In [20]:
def explain_anomalies_rolling_std(y, window_size, sigma=1.0):
    avg = moving_average(y, window_size)
    avg_list = avg.tolist()
    residual = y - avg
    # Calculate the variation in the distribution of the residual
    testing_std = pd.rolling_std(residual, window_size)
    testing_std_as_df = pd.DataFrame(testing_std)
    rolling_std = testing_std_as_df.replace(np.nan,
            testing_std_as_df.ix[window_size - 1]).round(3).iloc[:,0].tolist()
    std = np.std(residual)
    return {'stationary standard_deviation': round(std, 3),
        'anomalies_dict': collections.OrderedDict([(index, y_i)
                for index, y_i, avg_i, rs_i in zip_longest(count(), y, avg_list, r)
        if (y_i > avg_i + (sigma * rs_i)) | (y_i < avg_i - (sigma * rs_i))])}

In [22]:
def plot_results(x, y, window_size, sigma_value=1,
text_xlabel="X Axis", text_ylabel="Y Axis", applying_rolling_std=False):
    plt.figure(figsize=(15, 8))
    plt.plot(x, y, "k.")
    y_av = moving_average(y, window_size)
    plt.plot(x, y_av, color='green')
    plt.xlim(0, 1000)
    plt.xlabel(text_xlabel)
    plt.ylabel(text_ylabel)
    # Query for the anomalies and plot the same
    events = {}
    if applying_rolling_std:
        events = explain_anomalies_rolling_std(y, window_size=window_size, sigma=sigma_value)
    else:
        events = explain_anomalies(y, window_size=window_size, sigma=sigma_value)
        
    x_anomaly = np.fromiter(events['anomalies_dict'].iterkeys(), dtype=int, count=len(events['anomalies_d
    y_anomaly = np.fromiter(events['anomalies_dict'].itervalues(), dtype=float,
    count=len(events['anomalies_dict']))
    plt.plot(x_anomaly, y_anomaly, "r*", markersize=12)
    # add grid and lines and enable the plot
    plt.grid(True)
    plt.show()

IndentationError: expected an indented block (<ipython-input-22-174260366424>, line 13)

In [23]:
x = data_as_frame['Months']
Y = data_as_frame['SunSpots']