# Seleksi Sampel

Mari kita buat beberapa sampel

In [11]:
import pandas as pd
import statistics, itertools
from IPython.display import HTML, display
from tabulate import tabulate
import scipy.spatial.distance as spad

def table(df): display(HTML(tabulate(df, tablefmt='html', headers='keys', showindex=False)))

In [17]:
df = pd.read_csv('outlier.csv', usecols=['user_id', 'pause_video', 'play_video', 'seek_video', 'stop_video'], nrows=20)
table(df)

user_id,pause_video,play_video,seek_video,stop_video
0,1,4,1,1
1,14,14,0,1
2,0,0,0,0
3,2,2,0,1
4,3,22,18,0
5,1,5,9,1
6,5,9,6,1
7,1,18,16,0
8,7,9,2,1
9,1,1,0,0


## Outlier Detection

Outlier adalah samples janggal yang keluar dari kerumuman. Mereka membuat integritas data tidak sehat.

![](https://paper-attachments.dropbox.com/s_1185AEC62427E23657579AF288686866FF5B3F65A0E36E86D1A293C6B0CCF4B4_1553405161903_sqDCqTEGAmcjqerU4VmkGaw.png)

Suatu sampel (A) dapat dikatakan sebagai outlier dalam data (D), jika 
$$  \left(\sum^n_{i=1}\left[\operatorname{dist}(A, D_i) > r\right]\right) > \pi{n} $$

dimana $r$ adalah batas normal jarak dan $\pi$ adalah rasio toleransi (antara 0...1). Kedua $r$ dan  $\pi$ dapat diatur secara empiris untuk mendapatkan data yang ideal

In [32]:
r = 10
pi = 0.5
d = df.values.tolist()
n = len(d)
outliers = []
for i in range(n):
    count = 0
    outlie = True
    for j in range(n):
        delta = spad.euclidean(d[i][1:],d[j][1:])
        if (i!=j and delta <= r):
            count += 1
            if count >= pi*(len(d[i])-1):
                #print("not outlier", i)
                outlie = False
                break
    outliers.append(outlie)

print("Deteksi outlie dengan r =",r,'dan pi =',pi)
table(pd.DataFrame([[*x,outliers[i]] for i,x in enumerate(d)], columns=list(df.columns)+["Outliers?"]))

Deteksi outlie dengan r = 10 dan pi = 0.5


user_id,pause_video,play_video,seek_video,stop_video,Outliers?
0,1,4,1,1,False
1,14,14,0,1,False
2,0,0,0,0,False
3,2,2,0,1,False
4,3,22,18,0,True
5,1,5,9,1,False
6,5,9,6,1,False
7,1,18,16,0,True
8,7,9,2,1,False
9,1,1,0,0,False


In [87]:
# Outliers 2
avgs = [df[x].mean() for x in df.columns][1:]
rows = []
for i in range(n+1):
    rows.append(sum([(c-avgs[j])**2/avgs[j]
                     for j,c in enumerate(d[i][:-1])]))
rows
    

[24.440706298789895,
 5.766179649985803,
 29.310911504014665,
 20.28380479311244,
 1348.4436221977958,
 337.13116307390675,
 152.51811684638722,
 1066.5719114275103,
 29.147084127200422,
 38.7428626914868,
 115.35303016679867,
 50.82912868051177,
 57.32025666236852,
 740.1251620192588,
 71.75352982843752,
 466.1785744954909,
 87.73298060246267,
 138.6175919001129,
 100.82034066092497,
 1207.267490806541,
 314.7197358571051,
 142.28991835484706,
 150.49079474757843,
 1204.5971689746689,
 373.44836630156107,
 674.0001814127046,
 197.10565499002522,
 217.54981019421874,
 249.91230275943587,
 402.3421724938883,
 4547.679317193522,
 2096.3048938041366,
 319.9230085906409,
 335.12932656813126,
 4914.973480383538,
 4712.328633798548,
 416.61449409586476,
 423.51652600822234,
 648.1953005006719,
 482.25273988714326,
 509.40016873516663,
 548.4914267790268,
 577.8759380548522,
 594.8606236794406,
 818.7911012746358,
 744.6802346202797,
 9017.307354414937,
 709.2493886882089,
 1225.4013361568377