## Unsupervised Anomaly Detection:
Using: <br>
1- K-means <br>
2- SVM <br>
3- Isolation Forest <br>

### Data Preparation

In [1]:
import pandas as pd
from sklearn.cluster import KMeans # KMeans
from sklearn.decomposition import PCA # PCA
from sklearn.preprocessing import StandardScaler # StandardScale to resize the distribution of values 
from sklearn.metrics import silhouette_score # Silhouette method
from sklearn.metrics import calinski_harabasz_score # Calinski Harabasz method

# plotting dependencies and setup  
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.io as pio

In [2]:
# Seven-colour palette (yellow -> blue) handed to plotly as the `colorscale`
# for the cluster markers drawn by scatter_cluster.
SEVENSET   =["#ffd700","#ffb14e","#fa8775","#ea5f94","#cd34b5","#9d02d7","#0000ff"]

# Validation methods understood by clusters_methods.
_CLUSTER_METHODS = ("wcss_elbow", "silhouette", "calinski_harabasz")


def _elbow_k(inertias, threshold):
    """Return the elbow of a WCSS curve.

    The inertia at each k is converted to "percentage of variance explained"
    relative to the smallest k. The elbow is the last k before the gain from
    adding one more cluster drops below ``threshold`` percentage points.

    Parameters
    ----------
    inertias : pandas.Series
        Inertia values indexed by k, in increasing k order.
    threshold : float
        Minimum gain (in percentage points) that still counts as worthwhile.

    Returns
    -------
    The selected k (a label taken from ``inertias.index``).
    """
    ks = list(inertias.index)
    baseline = inertias.loc[ks[0]]
    if baseline == 0:
        # Already a perfect fit at the smallest k; dividing by it would only
        # produce NaN and no comparison below could ever succeed.
        return ks[0]

    pve = [100 * (1 - inertias.loc[k] / baseline) for k in ks]
    for pos in range(1, len(pve)):
        if abs(pve[pos] - pve[pos - 1]) < threshold:
            return ks[pos - 1]

    # The curve never flattened inside the scanned range: fall back to the
    # largest k instead of leaving the result undefined.
    return ks[-1]


def clusters_methods(df, methods, k_values=range(2, 11), elbow_threshold=11):
    """Score KMeans clusterings over a range of k and pick the best k per method.

    Parameters
    ----------
    df : pandas.DataFrame or array-like
        Feature matrix passed unchanged to ``KMeans.fit`` and to the scorers.
    methods : iterable of str
        Any of ``"wcss_elbow"`` (KMeans inertia + elbow rule), ``"silhouette"``
        or ``"calinski_harabasz"`` (both maximised).
    k_values : iterable of int, optional
        Cluster counts to evaluate, in increasing order. Defaults to 2..10.
    elbow_threshold : float, optional
        Gain, in percentage points of explained variance, below which the
        elbow rule stops. Defaults to 11.

    Returns
    -------
    (methods_list, optimal_ks) : tuple of lists
        ``methods_list[i]`` is a Series of scores indexed by k for
        ``methods[i]``; ``optimal_ks[i]`` is the k chosen by that method.

    Raises
    ------
    ValueError
        If ``methods`` contains an unsupported name or ``k_values`` is empty
        while at least one method is requested.
    """
    methods = list(methods)
    k_values = list(k_values)

    # Validate before any (expensive) fitting so a typo fails fast with a clear
    # message rather than a NameError or a stale optimal_k from a previous method.
    unknown = [m for m in methods if m not in _CLUSTER_METHODS]
    if unknown:
        raise ValueError(
            f"Unsupported method(s) {unknown}; expected any of {list(_CLUSTER_METHODS)}"
        )
    if methods and not k_values:
        raise ValueError("k_values must contain at least one cluster count")

    # The fit only depends on k (random_state is fixed), so every k is fitted
    # once and shared by all requested methods.
    fits = {}

    def fit(k):
        if k not in fits:
            km = KMeans(n_clusters=k, n_init=40, random_state=1)
            km.fit(df)
            fits[k] = (km.inertia_, km.labels_)
        return fits[k]

    def score(method, k):
        inertia, labels = fit(k)
        if method == "wcss_elbow":
            return inertia
        if method == "silhouette":
            return silhouette_score(df, labels)
        return calinski_harabasz_score(df, labels)

    methods_list = []
    optimal_ks = []
    for method in methods:
        method_series = pd.Series(
            [score(method, k) for k in k_values],
            index=k_values,
            name=method.replace("_", " ").title(),
        )

        if method == "wcss_elbow":
            optimal_k = _elbow_k(method_series, elbow_threshold)
        else:
            # Silhouette and Calinski-Harabasz are both "higher is better";
            # idxmax returns the first k that reaches the maximum.
            optimal_k = method_series.idxmax()

        methods_list.append(method_series)
        optimal_ks.append(int(optimal_k))

    return methods_list, optimal_ks

def scatter_cluster(n, df, columns, filename="temp-plot.html", auto_open=True):
    """Fit KMeans with ``n`` clusters and write a 2-D scatter plot of the result.

    Parameters
    ----------
    n : int
        Number of clusters.
    df : pandas.DataFrame
        Data to cluster; all of its columns are used for the fit.
    columns : sequence of str
        ``columns[0]`` is drawn on the x axis and ``columns[1]`` on the y axis;
        the same names are used as axis titles.
    filename : str, optional
        HTML file written by ``plotly.offline.plot``. The default matches
        plotly's own default, so repeated calls overwrite each other unless a
        distinct name is passed.
    auto_open : bool, optional
        Whether plotly opens the written file in a browser (plotly's default).

    Returns
    -------
    (plot_result, prediction) : tuple
        ``plot_result`` is whatever ``plotly.offline.plot`` returns for the
        figure; ``prediction`` is the array of cluster labels from
        ``KMeans.predict(df)``, one per row of ``df``.
    """
    # Kept local because the notebook's import cell does not import plotly.offline.
    from plotly.offline import plot

    x_col, y_col = columns[0], columns[1]

    km = KMeans(n_clusters=n, n_init=25, random_state=1)
    km.fit(df)
    cluster_centers = pd.DataFrame(km.cluster_centers_, columns=df.columns)

    # cluster label for every row of df
    prediction = km.predict(df)

    # data points, coloured by cluster label
    trace_points = go.Scatter(
        x=df[x_col],
        y=df[y_col],
        mode='markers',
        name='Data Points',
        marker=dict(
            size=7.5,
            color=km.labels_,
            colorscale=SEVENSET,
            opacity=0.9,
            line=dict(width=1, color='black'),
        ),
        text=df.index,  # hover text: the row's index value
        showlegend=False,
    )

    # cluster centroids, drawn as large translucent discs
    trace_centroids = go.Scatter(
        x=cluster_centers[x_col],
        y=cluster_centers[y_col],
        mode='markers',
        name='Cluster Centers',
        marker=dict(
            size=30,
            color=cluster_centers.index,
            colorscale=SEVENSET,
            symbol='circle',
            opacity=0.3,
            line=dict(width=1, color='black'),
        ),
        text=[f"Centroid {i}" for i in range(len(cluster_centers))],
        showlegend=False,
    )

    # The real traces are hidden from the legend (their markers are multi-coloured);
    # these empty grey traces exist only to provide neutral legend entries.
    dummy_point = go.Scatter(
        x=[None],
        y=[None],
        mode='markers',
        marker=dict(
            size=7.5,
            color="lightgray",
            colorscale=SEVENSET,
            opacity=1,
            line=dict(width=1, color='black'),
        ),
        name="Data Points",
    )

    dummy_centroids = go.Scatter(
        x=[None],
        y=[None],
        mode='markers',
        marker=dict(
            size=30,
            color="lightgray",
            colorscale=SEVENSET,
            symbol='circle',
            opacity=1,
            line=dict(width=1, color='lightgray'),
        ),
        name="Cluster Centers",
    )

    def axis(title):
        # Both axes share the same framed, grid-less look; only the title differs.
        return dict(
            title=title,
            showline=True,
            linewidth=0.5,
            linecolor='black',
            mirror=True,
            color='black',
            gridcolor='white',
        )

    layout = go.Layout(
        legend=dict(
            yanchor="top",
            y=0.99,
            xanchor="left",
            x=0.01,
            bgcolor='#ffffff',
            font=dict(color='black', size=14),
        ),
        width=700,
        height=700,
        title=dict(
            text="Clustering with k= " + str(n),
            font=dict(size=20, color='black', family="Times New Roman"),
            x=0.5,
            y=0.9,
        ),
        # Axis titles follow the plotted columns instead of fixed labels.
        xaxis=axis(str(x_col)),
        yaxis=axis(str(y_col)),
        hovermode='closest',
        plot_bgcolor='#ffffff',
        paper_bgcolor="#f7f7f7",
    )

    fig = go.Figure(
        data=[trace_points, trace_centroids, dummy_point, dummy_centroids],
        layout=layout,
    )

    return plot(fig, filename=filename, auto_open=auto_open), prediction

In [3]:
# Read the comma-separated input table; low_memory=False makes pandas parse the
# file in one pass rather than in chunks.
df_market_data = pd.read_csv("data.csv", sep=",", low_memory=False)

# preview the first rows
df_market_data.head()

Unnamed: 0,date,account_id,type,amount,sum_3,mean_3,count_3,sum_15,mean_15,count_15,sum_30,mean_30,count_30
0,1997-10-05,1,WITHDRAWAL,2452.0,2452.0,2452.0,1.0,2452.0,2452.0,1.0,2452.0,2452.0,1.0
1,1997-10-31,1,WITHDRAWAL,14.6,14.6,14.6,1.0,14.6,14.6,1.0,2466.6,1233.3,2.0
2,1997-10-03,2,WITHDRAWAL,9800.0,9800.0,9800.0,1.0,9800.0,9800.0,1.0,9800.0,9800.0,1.0
3,1997-10-05,2,WITHDRAWAL,7266.0,17066.0,8533.0,2.0,17066.0,8533.0,2.0,17066.0,8533.0,2.0
4,1997-10-14,2,WITHDRAWAL,3800.0,3800.0,3800.0,1.0,20866.0,6955.333333,3.0,20866.0,6955.333333,3.0


In [4]:
# Parse "date" into datetimes (harmless if the column already is datetime)
parsed_dates = pd.to_datetime(df_market_data['date'])
df_market_data['date'] = parsed_dates

# Day of the month of each date, stored as an integer column
df_market_data['day_number'] = parsed_dates.dt.day.astype(int)

df_market_data.head(3)

Unnamed: 0,date,account_id,type,amount,sum_3,mean_3,count_3,sum_15,mean_15,count_15,sum_30,mean_30,count_30,day_number
0,1997-10-05,1,WITHDRAWAL,2452.0,2452.0,2452.0,1.0,2452.0,2452.0,1.0,2452.0,2452.0,1.0,5
1,1997-10-31,1,WITHDRAWAL,14.6,14.6,14.6,1.0,14.6,14.6,1.0,2466.6,1233.3,2.0,31
2,1997-10-03,2,WITHDRAWAL,9800.0,9800.0,9800.0,1.0,9800.0,9800.0,1.0,9800.0,9800.0,1.0,3


In [5]:
# column dtypes, non-null counts and memory usage of the loaded table
df_market_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14455 entries, 0 to 14454
Data columns (total 14 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   date        14455 non-null  datetime64[ns]
 1   account_id  14455 non-null  int64         
 2   type        14455 non-null  object        
 3   amount      14455 non-null  float64       
 4   sum_3       14455 non-null  float64       
 5   mean_3      14455 non-null  float64       
 6   count_3     14455 non-null  float64       
 7   sum_15      14455 non-null  float64       
 8   mean_15     14455 non-null  float64       
 9   count_15    14455 non-null  float64       
 10  sum_30      14455 non-null  float64       
 11  mean_30     14455 non-null  float64       
 12  count_30    14455 non-null  float64       
 13  day_number  14455 non-null  int32         
dtypes: datetime64[ns](1), float64(10), int32(1), int64(1), object(1)
memory usage: 1.5+ MB


In [6]:
# generate the summary statistics for every column (include='all' also covers
# the non-numeric ones), rounded to two decimals
df_market_data.describe(include = 'all').round(2)

Unnamed: 0,date,account_id,type,amount,sum_3,mean_3,count_3,sum_15,mean_15,count_15,sum_30,mean_30,count_30,day_number
count,14455,14455.0,14455,14455.0,14455.0,14455.0,14455.0,14455.0,14455.0,14455.0,14455.0,14455.0,14455.0,14455.0
unique,,,1,,,,,,,,,,,
top,,,WITHDRAWAL,,,,,,,,,,,
freq,,,14455,,,,,,,,,,,
mean,1997-10-17 17:24:54.541681024,2999.73,,4835.91,6135.73,4906.67,1.24,10632.64,5388.37,1.87,14581.47,5831.64,2.56,17.73
min,1997-10-01 00:00:00,1.0,,0.1,0.9,0.9,1.0,0.9,0.9,1.0,0.9,0.9,1.0,1.0
25%,1997-10-09 00:00:00,1229.0,,14.6,100.0,100.0,1.0,1983.5,1189.75,1.0,3609.8,1929.6,1.0,9.0
50%,1997-10-14 00:00:00,2465.0,,2180.0,2935.0,2400.0,1.0,5514.6,3160.5,2.0,8507.0,3757.3,2.0,14.0
75%,1997-10-31 00:00:00,3691.5,,5893.5,7582.0,6137.0,1.0,13768.5,7100.0,2.0,18490.6,7300.0,3.0,31.0
max,1997-10-31 00:00:00,11382.0,,64900.0,148118.0,64900.0,5.0,223518.0,64700.0,8.0,223532.6,64700.0,11.0,31.0


In [7]:
# Keep only the three columns used by the clustering cells below
df_data = df_market_data.loc[:, ['count_3', 'sum_3', 'day_number']]

# Distinct values of 'count_3', in order of first appearance, as a plain list
transaction_number = df_data['count_3'].drop_duplicates().tolist()

print(transaction_number)
df_data.head(10)

[1.0, 2.0, 3.0, 4.0, 5.0]


Unnamed: 0,count_3,sum_3,day_number
0,1.0,2452.0,5
1,1.0,14.6,31
2,1.0,9800.0,3
3,2.0,17066.0,5
4,1.0,3800.0,14
5,1.0,14.6,31
6,1.0,6900.0,5
7,1.0,2078.0,6
8,1.0,1300.0,13
9,2.0,2585.0,13


In [8]:
# Two features used for clustering every row at once
df_2D = df_data.loc[:, ['sum_3', 'day_number']]

# Standardise both features (zero mean, unit variance)
data_scaled = StandardScaler().fit(df_2D).transform(df_2D)

# Wrap the scaled array in a DataFrame that keeps the original column names and row index
df_market_scaled = pd.DataFrame(data=data_scaled, index=df_2D.index, columns=df_2D.columns)

# Validation methods to run; the same names label the columns of the CSV below
column_names = ["wcss_elbow", "silhouette", "calinski_harabasz"]

# Score k = 2..10 with each method and collect the k each method prefers
cluster_results, optimal_ks = clusters_methods(df_market_scaled, column_names)

print(optimal_ks)

# One column of scores per method
data_dict = dict(zip(column_names, cluster_results))
df = pd.DataFrame(data_dict, columns=column_names)

# NOTE(review): index=False drops the k values (2..10) the rows correspond to.
df.to_csv('validation_all.csv', index=False)

# Cluster with the k chosen by the first method (the WCSS elbow) and store the labels
plot3, prediction_3 = scatter_cluster(optimal_ks[0], df_market_scaled, ['sum_3','day_number'])
df_market_data['prediction_all'] = prediction_3

[4, 3, 7]


In [9]:
# Validation methods run for every subgroup; also the column names of the per-group CSV.
column_names = ["wcss_elbow", "silhouette", "calinski_harabasz"]

# Repeat the clustering separately for each distinct value of 'count_3'
for t_num in transaction_number:
    print(f"iteration: {t_num}")
    df_2D = df_data.loc[df_data['count_3'] == t_num, ['sum_3', 'day_number']]

    # Standardise the two features within this subgroup
    data_scaled = StandardScaler().fit_transform(df_2D)

    # Keep the original row index so the labels can be written back to the matching rows
    df_market_scaled = pd.DataFrame(data_scaled, columns=df_2D.columns, index=df_2D.index)

    # Pick k for this subgroup. If the search fails (presumably when the
    # subgroup is too small for the larger k values - confirm), report why and
    # fall back to k=2 instead of silently swallowing every exception.
    try:
        cluster_results, optimal_ks = clusters_methods(df_market_scaled, column_names)
    except Exception as err:
        print(f"k selection failed ({type(err).__name__}: {err}); falling back to k=2")
        cluster_results, optimal_ks = None, [2]

    print(optimal_ks)

    # Only write validation scores that belong to this subgroup; on failure the
    # previous iteration's scores must not be saved under this group's name.
    if cluster_results is not None:
        data_dict = dict(zip(column_names, cluster_results))
        df = pd.DataFrame(data_dict, columns=column_names)
        df.to_csv(f'validation_{t_num}.csv', index=False)

    # Cluster with the first selected k (the elbow result, or the fallback)
    plot3, predictions = scatter_cluster(optimal_ks[0], df_market_scaled, ['sum_3','day_number'])

    # Store the labels in a per-group column; rows outside this subgroup stay NaN
    df_market_data.loc[df_market_scaled.index, f"prediction_{t_num}"] = predictions


iteration: 1.0
[4, 3, 10]
iteration: 2.0
[4, 3, 10]
iteration: 3.0
[4, 3, 10]
iteration: 4.0
[4, 3, 10]
iteration: 5.0
[2]


In [10]:
# inspect the first rows together with the prediction_* columns added by the clustering cells
df_market_data.head(20)

Unnamed: 0,date,account_id,type,amount,sum_3,mean_3,count_3,sum_15,mean_15,count_15,sum_30,mean_30,count_30,day_number,prediction_all,prediction_1.0,prediction_2.0,prediction_3.0,prediction_4.0,prediction_5.0
0,1997-10-05,1,WITHDRAWAL,2452.0,2452.0,2452.0,1.0,2452.0,2452.0,1.0,2452.0,2452.0,1.0,5,2,1.0,,,,
1,1997-10-31,1,WITHDRAWAL,14.6,14.6,14.6,1.0,14.6,14.6,1.0,2466.6,1233.3,2.0,31,1,0.0,,,,
2,1997-10-03,2,WITHDRAWAL,9800.0,9800.0,9800.0,1.0,9800.0,9800.0,1.0,9800.0,9800.0,1.0,3,2,1.0,,,,
3,1997-10-05,2,WITHDRAWAL,7266.0,17066.0,8533.0,2.0,17066.0,8533.0,2.0,17066.0,8533.0,2.0,5,0,,3.0,,,
4,1997-10-14,2,WITHDRAWAL,3800.0,3800.0,3800.0,1.0,20866.0,6955.333333,3.0,20866.0,6955.333333,3.0,14,2,1.0,,,,
5,1997-10-31,2,WITHDRAWAL,14.6,14.6,14.6,1.0,14.6,14.6,1.0,20880.6,5220.15,4.0,31,1,0.0,,,,
6,1997-10-05,3,WITHDRAWAL,6900.0,6900.0,6900.0,1.0,6900.0,6900.0,1.0,6900.0,6900.0,1.0,5,2,1.0,,,,
7,1997-10-06,4,WITHDRAWAL,2078.0,2078.0,2078.0,1.0,2078.0,2078.0,1.0,2078.0,2078.0,1.0,6,2,1.0,,,,
8,1997-10-13,4,WITHDRAWAL,1300.0,1300.0,1300.0,1.0,3378.0,1689.0,2.0,3378.0,1689.0,2.0,13,2,1.0,,,,
9,1997-10-13,4,WITHDRAWAL,1285.0,2585.0,1292.5,2.0,4663.0,1554.333333,3.0,4663.0,1554.333333,3.0,13,2,,1.0,,,


In [11]:
# Write the full DataFrame, including the prediction_* columns, to predictions.csv
# (index=False leaves the row index out of the file)
df_market_data.to_csv('predictions.csv', index=False)