### Collection of synthetic datasets

In [45]:
%%html
<style>
table,td,tr,th {border:0!important;padding:0;border-spacing:0}
</style>

In [17]:
def plotscatter(data,labels):
    """Scatter-plot a labelled 2-D data set on a fresh 6x6 figure.

    Rows of ``data`` whose label equals 0 are drawn as green crosses and
    rows whose label equals 1 as larger, semi-transparent red circles.
    The axes are hidden.

    Parameters
    ----------
    data : array indexed as ``data[rows, 0]`` / ``data[rows, 1]``
        The first two columns are used as x and y coordinates.
    labels : array with a ``flatten`` method
        One 0/1 label per row of ``data``.

    Returns
    -------
    (fig, ax) : the created figure and its single subplot axes.
    """
    fig = plt.figure(figsize=[6,6])
    ax = plt.subplot(111)

    flat_labels = labels.flatten()
    normal_rows = np.where(flat_labels == 0)
    anomaly_rows = np.where(flat_labels == 1)

    # regular observations
    plt.scatter(data[normal_rows,0], data[normal_rows,1], marker='x', c='g')
    # anomalies, drawn on top so they stay visible
    plt.scatter(data[anomaly_rows,0], data[anomaly_rows,1],
                marker='o', c='r', s=50, alpha=0.5)

    plt.axis('off')
    return fig,ax

#### Data Set 1:
<table border="0">
<tr>
<td>Single cluster with one anomaly</td>
<td><img src='example1.png' width="360"/></td>
</tr>
</table>

In [1]:
def genData1():
    """Generate data set 1: one disc-shaped cluster plus a single anomaly.

    1000 candidate points are drawn uniformly from the square [4,8] x [4,8]
    and only those with squared distance below 4 from (6,6) are kept.
    The point (4,4) is then appended as the only anomaly.

    Returns
    -------
    data : ndarray, shape (m, 2)
        Cluster points followed by the anomaly in the last row.
    labels : ndarray, shape (m, 1)
        0 for cluster points, 1 for the anomaly.
    """
    n_candidates = 1000
    xs = np.random.uniform(4,8,n_candidates)
    ys = np.random.uniform(4,8,n_candidates)
    candidates = np.vstack([xs,ys]).T

    # keep only the candidates inside the disc around (6,6)
    sq_dist = ((candidates - [6,6]) ** 2).sum(axis=1)
    cluster = candidates[sq_dist < 4]

    # the anomaly goes last, labelled 1
    data = np.vstack([cluster,[4,4]])
    labels = np.vstack([np.zeros([cluster.shape[0],1]),[1]])
    return data,labels

#data,labels = genData1()
#fig,ax = plotscatter(data,labels)
#ax.annotate('$o_1$', xy=(4.1, 4.1),size=24)
#, xytext=(3, 4),arrowprops=dict(facecolor='black', shrink=0),)
#fig.savefig('example1.png', format='png', transparent=True,dpi=360)

#### Data Set 2:
<table border="0">
<tr>
<td>Two clusters with one anomaly</td>
<td><img src='example2.png' width="360"/></td>
</tr>
</table>

In [2]:
def genData2():
    """Generate data set 2: two clusters of different density plus one anomaly.

    * A Gaussian cluster: 200 draws from N((6,6), 0.6*I), keeping those with
      squared distance below 4 from (6,6).
    * A small uniform cluster: 200 draws from [2,3] x [2,3], keeping those
      with squared distance below 0.2 from (2.5,2.5).
    * One anomaly at (3.1,3.1), just outside the small cluster.

    Returns
    -------
    data : ndarray, shape (m, 2)
        Small cluster rows, then Gaussian cluster rows, then the anomaly.
    labels : ndarray, shape (m, 1)
        0 for cluster points, 1 for the anomaly (last row).
    """
    n = 200

    # large, sparse Gaussian cluster truncated to a disc around (6,6)
    gauss = np.random.multivariate_normal([6,6],[[0.6,0],[0,0.6]],n)
    gauss = gauss[np.sum((gauss - [6,6]) ** 2,axis=1) < 4]

    # small, dense uniform cluster truncated to a disc around (2.5,2.5)
    x1 = np.random.uniform(2,3,n)
    x2 = np.random.uniform(2,3,n)
    dense = np.vstack([x1,x2]).T
    dense = dense[np.sum((dense - [2.5,2.5]) ** 2,axis=1) < 0.2]

    n_normal = dense.shape[0] + gauss.shape[0]

    # the anomaly goes last, labelled 1
    data = np.vstack([dense,gauss,[3.1,3.1]])
    labels = np.vstack([np.zeros([n_normal,1]),[1]])
    return data,labels
#data,labels = genData2()
#fig,ax = plotscatter(data,labels)
#ax.annotate('$o_1$', xy=(3.2, 3.1),size=24)
#fig.savefig('example2.png', format='png', transparent=True,dpi=360)


#### Data Set 3:
<table border="0">
<tr>
<td>2D Gaussian with one anomaly</td>
<td><img src='example3.png' width="360"/></td>
</tr>
</table>

In [39]:
def genData3():
    """Generate data set 3: a correlated 2-D Gaussian plus one anomaly.

    1000 points are drawn with ``np.random.multivariate_normal`` around
    (6,6) and the point (6.9,3.7) is appended as the only anomaly.

    Returns
    -------
    data : ndarray, shape (1001, 2)
        Gaussian samples followed by the anomaly in the last row.
    labels : ndarray, shape (1001, 1)
        0 for Gaussian samples, 1 for the anomaly.
    """
    n = 1000
    # NOTE(review): [[4,3],[1,1]] is not symmetric, so it is not a valid
    # covariance matrix and NumPy warns about it. The intended matrix cannot
    # be inferred from this file, so the value is kept unchanged -- confirm
    # and replace with a symmetric positive-semidefinite matrix.
    data = np.random.multivariate_normal([6,6],[[4,3],[1,1]],n)
    labels = np.zeros([data.shape[0],1])

    # the anomaly goes last, labelled 1
    data = np.vstack([data,[6.9,3.7]])
    labels = np.vstack([labels,[1]])
    return data,labels
#data,labels = genData3()
#fig,ax = plotscatter(data,labels)
#ax.annotate('$o_1$', xy=(7.1, 3.8),size=24)
#fig.savefig('example3.png', format='png', transparent=True,dpi=360)

#### Data Set 4:
<table border="0">
<tr>
<td>Heterogeneous clusters with several anomalies</td>
<td><img src='example4.png' width="360"/></td>
</tr>
</table>

In [40]:
def genData4():
    """Generate data set 4: heterogeneous clusters with several anomalies.

    Normal data is the union of four groups:

    * a noisy ``sin(x)**2`` curve for x in [4,12] (500 points),
    * a small uniform blob around (5.3,5.3) (squared radius below 0.08),
    * Gaussian samples around (4,10) whose density exceeds 0.05,
    * a ring around (11,8) with squared radius between 2 and 4.

    Anomalies are the fixed points (6.4,1.1), (8,0.2) and (11,8), followed
    by 4 tightly grouped Gaussian samples around (6,3).

    Returns
    -------
    data : ndarray, shape (m, 2)
        Ring, blob, curve and Gaussian rows, then the 7 anomalies.
    labels : ndarray, shape (m, 1)
        0 for normal rows, 1 for the 7 anomalies at the end.
    """
    # --- noisy sin^2 curve ---------------------------------------------
    n_curve = 500
    base_x = np.random.uniform(4,12,n_curve)
    clean_y = np.sin(base_x) ** 2
    curve_x = base_x + np.random.uniform(0,0.4,n_curve)
    curve_y = clean_y + np.random.uniform(0,0.4,n_curve)
    curve = np.vstack([curve_x,curve_y]).T

    # --- small uniform blob --------------------------------------------
    n_blob = 100
    bx = np.random.uniform(5,5.6,n_blob)
    by = np.random.uniform(5,5.6,n_blob)
    blob = np.vstack([bx,by]).T
    blob = blob[((blob - [5.3,5.3]) ** 2).sum(axis=1) < 0.08]

    # --- Gaussian cluster, truncated by density ------------------------
    n_gauss = 300
    gauss = np.random.multivariate_normal([4,10],[[2,0],[0,2]],n_gauss)
    density = scipy.stats.multivariate_normal(mean=[4,10],cov=[[2,0],[0,2]]).pdf(gauss)
    gauss = gauss[density > 0.05]

    # --- ring around (11,8) --------------------------------------------
    n_ring = 800
    rx = np.random.uniform(8,14,n_ring)
    ry = np.random.uniform(4,12,n_ring)
    ring = np.vstack([rx,ry]).T
    ring_sq_dist = ((ring - [11,8]) ** 2).sum(axis=1)
    ring = ring[(ring_sq_dist > 2) & (ring_sq_dist < 4)]

    normal = np.vstack([ring,blob,curve,gauss])

    # --- anomalies: three isolated points, then a tiny clump -----------
    isolated = [[6.4,1.1],[8,0.2],[11,8]]
    clump = np.random.multivariate_normal([6,3],[[0.01,0],[0,0.01]],4)

    data = np.vstack([normal,isolated,clump])
    labels = np.vstack([np.zeros([normal.shape[0],1]),
                        np.ones([len(isolated) + clump.shape[0],1])])
    return data,labels
#data,labels = genData4()
#fig,ax = plotscatter(data,labels)
#ax.annotate('$o_2$', xy=(11.1, 8.1),size=24)
#ax.annotate('$o_1$', xy=(6.3, 3.1),size=24)
#ax.annotate('$o_3$', xy=(6.4, 1.2),size=24)
#ax.annotate('$o_4$', xy=(8.1, 0.1),size=24)
#fig.savefig('example4.png', format='png', transparent=True,dpi=360)

#### Data Set 5:
<table border="0">
<tr>
<td>Dense anomalous region within a sparse region</td>
<td><img src='example5.png' width="360"/></td>
</tr>
</table>

In [41]:
def genData5():
    """Generate data set 5: a dense anomalous clump inside a sparse region.

    1000 background points are drawn around (3,3) and only those whose
    density under N((3,3), 2*I) exceeds 0.05 are kept. 50 anomalies are
    then drawn from a very tight Gaussian around (3.3,3.4).

    Returns
    -------
    data : ndarray, shape (m, 2)
        Background rows followed by the 50 anomalies.
    labels : ndarray, shape (m, 1)
        0 for background rows, 1 for the anomalies.
    """
    n_background = 1000
    # NOTE(review): [[4,0],[4,2]] is not symmetric, so it is not a valid
    # covariance matrix; kept as-is because the intended value is unclear.
    samples = np.random.multivariate_normal([3,3],[[4,0],[4,2]],n_background)

    # truncate the background to the high-density core of N((3,3), 2*I)
    reference = scipy.stats.multivariate_normal(mean=[3,3],cov=[[2,0],[0,2]])
    background = samples[reference.pdf(samples) > 0.05]

    n_anomalies = 50
    clump = np.random.multivariate_normal([3.3,3.4],[[0.001,0],[0,0.001]],n_anomalies)

    data = np.vstack([background,clump])
    labels = np.vstack([np.zeros([background.shape[0],1]),
                        np.ones([clump.shape[0],1])])
    return data,labels
#data,labels = genData5()
#fig,ax=plotscatter(data,labels)
#ax.annotate('$o_1$', xy=(3.4, 3.4),size=24
#, xytext=(4.4, 3.4),
#            arrowprops=dict(facecolor='black',width=1,headwidth=4),
#            )
#fig.savefig('example5.png', format='png', transparent=True,dpi=360)

#### Data Set 6:
<table border="0">
<tr>
<td>Sparse anomalies within a dense region</td>
<td><img src='example6.png' width="360"/></td>
</tr>
</table>

In [10]:
def genData6():
    """Generate data set 6: sparse anomalies inside a hole in a dense region.

    5000 points are drawn from N((3,3), I) and only those whose density
    under N((3,3), 2*I) exceeds 0.05 are kept. Every point with squared
    distance at most 0.15 from (3.3,3.4) is removed, leaving a hole. Up to
    10 anomalies are then placed in that hole: uniform draws from
    [3,4] x [3,4] that fall strictly inside it.

    Returns
    -------
    data : ndarray, shape (m, 2)
        Dense-region rows followed by the anomalies (possibly none).
    labels : ndarray, shape (m, 1)
        0 for dense-region rows, 1 for the anomalies.
    """
    hole_center = [3.3,3.4]
    hole_sq_radius = 0.15

    n = 5000
    data = np.random.multivariate_normal([3,3],[[1,0],[0,1]],n)
    dist = scipy.stats.multivariate_normal(mean=[3,3],cov=[[2,0],[0,2]]).pdf(data)
    data = data[dist > 0.05]

    # carve the hole with one vectorized mask; unlike a Python loop, this
    # also keeps the (0, 2) shape if no point survives
    outside_hole = np.sum((data - hole_center) ** 2,axis=1) > hole_sq_radius
    data = data[outside_hole]
    labels = np.zeros([data.shape[0],1])

    # sparse anomalies: uniform candidates that land inside the hole
    n = 10
    x1 = np.random.uniform(3,4,n)
    x2 = np.random.uniform(3,4,n)
    o = np.vstack([x1,x2]).T
    o = o[np.sum((o - hole_center) ** 2,axis=1) < hole_sq_radius]

    data = np.vstack([data,o])
    labels = np.vstack([labels,np.ones([o.shape[0],1])])
    return data,labels
#data,labels = genData6()
#fig,ax=plotscatter(data,labels)
#ax.annotate('$o_1$', xy=(3.4, 3.4),size=24
#, xytext=(4.4, 3.4),
#            arrowprops=dict(facecolor='black',width=1,headwidth=4),
#            )
#fig.savefig('example6.png', format='png', transparent=True,dpi=360)

In [14]:
def precAtK(true,predicted):
    """Precision at k, where k is the number of true anomalies.

    The score of the k-th highest-scored object is used as a threshold and
    every object scoring at least that much is counted as predicted
    anomalous, so ties at the threshold can select more than k objects.

    Parameters
    ----------
    true : array-like of 0/1 labels, shape (n,) or (n, 1)
    predicted : array-like of anomaly scores (higher = more anomalous),
        shape (n,) or (n, 1)

    Returns
    -------
    float
        Fraction of selected objects that are true anomalies; 0.0 when
        ``true`` contains no anomalies (including empty input).
    """
    true = np.asarray(true).ravel()
    predicted = np.asarray(predicted).ravel()

    # find number of anomalies; label arrays are usually float, and a float
    # cannot be used as an index, so cast explicitly
    k = int(np.sum(true))
    if k <= 0:
        # nothing to find: no selected object can be a true anomaly
        return 0.0

    # find the score of the k^th predicted anomaly
    threshold = np.sort(predicted)[::-1][k-1]
    # find all objects that are at or above the threshold
    selected = predicted >= threshold
    return float(np.sum(true[selected])) / int(selected.sum())

In [15]:
def averageRank(true,predicted):
    """Mean 1-based rank of the true anomalies when sorted by score.

    Objects are ranked by descending ``predicted`` score (rank 1 = highest
    score); the ranks of the objects labelled 1 are averaged.

    Parameters
    ----------
    true : array-like of 0/1 labels, shape (n,) or (n, 1)
    predicted : array-like of anomaly scores (higher = more anomalous),
        shape (n,) or (n, 1)

    Returns
    -------
    float
        Average rank of the anomalies, or nan when there are none.
    """
    # flatten first: argsort on an (n, 1) column would sort along the last
    # axis and return all zeros instead of a ranking
    true = np.asarray(true).ravel()
    predicted = np.asarray(predicted).ravel()

    anomaly_idx = np.where(true == 1)[0]
    if anomaly_idx.size == 0:
        return float('nan')

    # order[r] is the index of the object with rank r+1; invert that
    # permutation once instead of searching it for every anomaly
    order = np.argsort(predicted)[::-1]
    ranks = np.empty(order.shape[0])
    ranks[order] = np.arange(1, order.shape[0] + 1)
    return float(np.mean(ranks[anomaly_idx]))