In [2]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from category_encoders import TargetEncoder
from sklearn.ensemble import IsolationForest

In [3]:
df = pd.read_csv('data_for_unsupervised_learning.csv')

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8981 entries, 0 to 8980
Data columns (total 41 columns):
 #   Column                                                Non-Null Count  Dtype  
---  ------                                                --------------  -----  
 0   FLAG                                                  8981 non-null   int64  
 1   Avg-min-between-sent-tnx                              8981 non-null   float64
 2   Avg-min-between-received-tnx                          8981 non-null   float64
 3   Time-Diff-between-first-and-last (Mins)               8981 non-null   float64
 4   Sent-tnx                                              8981 non-null   int64  
 5   Received-Tnx                                          8981 non-null   int64  
 6   Number-of-Created-Contracts                           8981 non-null   int64  
 7   Unique-Received-From-Addresses                        8981 non-null   int64  
 8   Unique-Sent-To-Addresses                              8981

In [5]:
X = df.drop('FLAG', axis=1)
y = df['FLAG']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state = 123)

Encoding the categorical variables with TargetEncoder

In [6]:
categories = df.select_dtypes(include='object').columns
categories

Index(['ERC20-most-sent-token-type', 'ERC20_most_rec_token_type'], dtype='object')

In [7]:
encoder = TargetEncoder(cols=categories)
X_train_encoded = encoder.fit_transform(X_train, y_train)
X_test_encoded = encoder.transform(X_test)

In [10]:
train = pd.concat([X_train_encoded, y_train], axis=1)
test = pd.concat([X_test_encoded, y_test], axis=1)

In [26]:
isolation_forest = IsolationForest(contamination='auto', random_state=123)
isolation_forest.fit(train)

training_anomaly_prediction = isolation_forest.predict(train)
test_anomaly_prediction = isolation_forest.predict(test)

is_train = np.sum(training_anomaly_prediction == -1)
is_test = np.sum(test_anomaly_prediction == -1)

print("Number of anomalies in the training set: ", is_train)
#print("Actual anomalies in training: ", np.sum(train['FLAG'] == 1))
print("Number of anomalies in testing set: ", is_test)

Number of anomalies in the training set:  219
Number of anomalies in testing set:  51
