In [None]:
# Mount Google Drive in the Colab runtime; the cells below read and write
# CSV files under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.ensemble import IsolationForest
from ast import literal_eval

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Read options shared by every input file: UTF-8 text, first CSV column used as the index.
csv_options = dict(encoding='utf-8', index_col=0)

# info
info_df = pd.read_csv('/content/drive/MyDrive/dmp/f_dataset/info.csv', **csv_options)

# login_1 ... login_5
login1_df = pd.read_csv('/content/drive/MyDrive/dmp/f_dataset/login_1.csv', **csv_options)
login2_df = pd.read_csv('/content/drive/MyDrive/dmp/f_dataset/login_2.csv', **csv_options)
login3_df = pd.read_csv('/content/drive/MyDrive/dmp/f_dataset/login_3.csv', **csv_options)
login4_df = pd.read_csv('/content/drive/MyDrive/dmp/f_dataset/login_4.csv', **csv_options)
login5_df = pd.read_csv('/content/drive/MyDrive/dmp/f_dataset/login_5.csv', **csv_options)

# menu_1 ... menu_4
menu1_df = pd.read_csv('/content/drive/MyDrive/dmp/f_dataset/menu_1.csv', **csv_options)
menu2_df = pd.read_csv('/content/drive/MyDrive/dmp/f_dataset/menu_2.csv', **csv_options)
menu3_df = pd.read_csv('/content/drive/MyDrive/dmp/f_dataset/menu_3.csv', **csv_options)
menu4_df = pd.read_csv('/content/drive/MyDrive/dmp/f_dataset/menu_4.csv', **csv_options)

# session1 ... session3
session1_df = pd.read_csv('/content/drive/MyDrive/dmp/f_dataset/session1.csv', **csv_options)
session2_df = pd.read_csv('/content/drive/MyDrive/dmp/f_dataset/session2.csv', **csv_options)
session3_df = pd.read_csv('/content/drive/MyDrive/dmp/f_dataset/session3.csv', **csv_options)

In [None]:
# Names of the module-level DataFrames that the later cells look up and process in turn.
dataframe_name_list = [
    'info_df',
    'login1_df', 'login2_df', 'login3_df', 'login4_df', 'login5_df',
    'menu1_df', 'menu2_df', 'menu3_df', 'menu4_df',
    'session1_df', 'session2_df', 'session3_df',
]


In [None]:
def getDistanceByPoint(data, model):
    """Return each row's Euclidean distance to the centroid of its assigned cluster.

    Parameters
    ----------
    data : pandas.DataFrame
        Numeric feature rows, in the same order as the samples `model` was
        fitted on.
    model : fitted clustering estimator
        Must expose `cluster_centers_` (one row per cluster) and `labels_`
        (one 0-based cluster index per row of `data`), as a fitted
        scikit-learn KMeans does.

    Returns
    -------
    pandas.Series
        One distance per row, indexed like `data`.

    Raises
    ------
    ValueError
        If `model.labels_` does not contain exactly one label per row of `data`.
    """
    points = data.to_numpy(dtype=float)
    labels = np.asarray(model.labels_, dtype=int)
    if len(labels) != len(points):
        raise ValueError(
            "model.labels_ has {} entries but data has {} rows".format(len(labels), len(points))
        )

    # labels_ are 0-based indices into cluster_centers_, so index with them
    # directly. Subtracting 1 would wrap label 0 around to the last centroid
    # and pair every point with the wrong cluster centre.
    own_centroids = np.asarray(model.cluster_centers_, dtype=float)[labels]

    # Row-wise Euclidean norm, computed for all points at once.
    distances = np.linalg.norm(points - own_centroids, axis=1)
    return pd.Series(distances, index=data.index)

# Share of each data set (0.2%) that is flagged as anomalous by the k-means step.
outliers_fraction = 0.002

for name in dataframe_name_list:
    # The DataFrames live as module-level variables; look each one up by name.
    frame = globals()[name]

    # Cluster on the original feature columns only, so that re-running this
    # cell does not feed previously computed anomaly flags back in as features.
    features = frame.drop(columns=['kmeans_anomaly', 'IF_anomaly'], errors='ignore')

    kmeans = KMeans(n_clusters=2, random_state=5)
    kmeans.fit(features.values)

    # Distance between each row and the centroid of its cluster; the largest
    # distances are considered anomalies.
    distance = getDistanceByPoint(features, kmeans)

    # Number of rows that corresponds to `outliers_fraction` of the data set.
    number_of_outliers = int(outliers_fraction * len(distance))

    # Threshold = smallest of the `number_of_outliers` largest distances.
    # When number_of_outliers is 0 this is NaN and no row is flagged.
    threshold = distance.nlargest(number_of_outliers).min()

    # kmeans_anomaly: 0 = normal, 1 = anomaly. Rows tied with the threshold are
    # all flagged, so the count can slightly exceed number_of_outliers.
    frame['kmeans_anomaly'] = (distance >= threshold).astype(int)

In [None]:
# Report, per data set, how many rows carry each kmeans_anomaly value.
for name in dataframe_name_list:
    kmeans_flags = locals()[name]['kmeans_anomaly']
    flag_counts = kmeans_flags.value_counts()
    print(flag_counts)

0    104909
1       210
Name: kmeans_anomaly, dtype: int64
0    104909
1       210
Name: kmeans_anomaly, dtype: int64
0    104909
1       210
Name: kmeans_anomaly, dtype: int64
0    104909
1       210
Name: kmeans_anomaly, dtype: int64
0    104909
1       210
Name: kmeans_anomaly, dtype: int64
0    104909
1       210
Name: kmeans_anomaly, dtype: int64
0    104909
1       210
Name: kmeans_anomaly, dtype: int64
0    104909
1       210
Name: kmeans_anomaly, dtype: int64
0    104909
1       210
Name: kmeans_anomaly, dtype: int64
0    104909
1       210
Name: kmeans_anomaly, dtype: int64
0    104895
1       224
Name: kmeans_anomaly, dtype: int64
0    104908
1       211
Name: kmeans_anomaly, dtype: int64
0    104908
1       211
Name: kmeans_anomaly, dtype: int64


In [None]:
# Expected share of anomalies per data set. scikit-learn requires a float
# contamination in (0, 0.5], so 0.0 is not a valid value; 0.002 matches the
# fraction used by the k-means cell and the ~0.2% flagged in the saved output.
outliers_fraction = 0.002

for name in dataframe_name_list:
    # The DataFrames live as module-level variables; look each one up by name.
    frame = globals()[name]

    # Select features by name rather than by position, so the anomaly columns
    # are excluded regardless of which cells have already been run.
    features = frame.drop(columns=['kmeans_anomaly', 'IF_anomaly'], errors='ignore').values

    # Fixed seed so that repeated runs flag the same rows.
    if_model = IsolationForest(contamination=outliers_fraction, random_state=5)
    if_model.fit(features)

    # Raw IsolationForest labels: 1 = inlier, -1 = outlier. The next cell
    # converts them to the 0/1 convention.
    frame['IF_anomaly'] = if_model.predict(features)

In [None]:
for name in dataframe_name_list:
    # The DataFrames live as module-level variables; look each one up by name.
    frame = globals()[name]
    raw_labels = frame['IF_anomaly']

    # Convert IsolationForest's labels (1 = inlier, -1 = outlier) to
    # 0 = normal, 1 = anomaly. Raw predictions never contain 0, so a 0 means
    # this cell has already been run; skipping the conversion then keeps the
    # cell idempotent instead of turning every anomaly (1) back into 0.
    if not (raw_labels == 0).any():
        frame['IF_anomaly'] = (raw_labels == -1).astype(int)

    print(frame['IF_anomaly'].value_counts())
    

0    104916
1       203
Name: IF_anomaly, dtype: int64
0    104911
1       208
Name: IF_anomaly, dtype: int64
0    104908
1       211
Name: IF_anomaly, dtype: int64
0    104908
1       211
Name: IF_anomaly, dtype: int64
0    104908
1       211
Name: IF_anomaly, dtype: int64
0    104909
1       210
Name: IF_anomaly, dtype: int64
0    104908
1       211
Name: IF_anomaly, dtype: int64
0    104908
1       211
Name: IF_anomaly, dtype: int64
0    104908
1       211
Name: IF_anomaly, dtype: int64
0    104908
1       211
Name: IF_anomaly, dtype: int64
0    104910
1       209
Name: IF_anomaly, dtype: int64
0    104908
1       211
Name: IF_anomaly, dtype: int64
0    104917
1       202
Name: IF_anomaly, dtype: int64


In [None]:
# Write every data set, with the anomaly columns added above, to Drive as CSV.
output_pattern = "/content/drive/MyDrive/dmp/fraction_ratio/{}.csv"
for name in dataframe_name_list:
    destination = output_pattern.format(name)
    locals()[name].to_csv(destination)