In [None]:
# Install the boto3 library for AWS interactions
!pip install boto3

In [None]:
# Import boto3 and build the S3 client used for data access.
# Credentials are read from the standard AWS environment variables instead of
# being typed into the notebook, so secrets never end up in a saved .ipynb.
import os

import boto3

# `or None` maps an unset or empty variable to None. When the values are None,
# boto3 falls back to its default credential resolution instead of signing
# requests with empty strings.
aws_access_key_id = os.environ.get('AWS_ACCESS_KEY_ID') or None
aws_secret_access_key = os.environ.get('AWS_SECRET_ACCESS_KEY') or None
# Temporary credentials are only valid together with their session token.
aws_session_token = os.environ.get('AWS_SESSION_TOKEN') or None

s3 = boto3.client(
    's3',
    aws_access_key_id=aws_access_key_id,
    aws_secret_access_key=aws_secret_access_key,
    aws_session_token=aws_session_token,
)


In [None]:
# Optional Weights & Biases (wandb) experiment tracking.
# When tracking is switched on, the API key is requested through a hidden
# prompt so it is not stored in the notebook source.
from getpass import getpass

wandb_logging = True

if wandb_logging:
    wandb_api_key = getpass(prompt="Copy your WANDB_API_KEY:")

In [None]:
# Install and import wandb for experiment tracking
!pip install wandb
import wandb

In [None]:
# Initialize a wandb project for logging and tracking the anomaly detection model.
# Authenticate with the key collected by the getpass prompt *before* the run is
# started, so wandb.init does not need to ask for credentials a second time.
if wandb_logging:
    wandb.login(key=wandb_api_key)
wandb.init(project="anomaly_detection")

In [None]:
# Download the data: for every date, list the objects under that day's BTC
# transactions prefix and save one parquet file as test/<date>.snappy.parquet.
import os

bucket_name = 'aws-public-blockchain'
date_strings = [
    "2023-10-23",
    "2023-10-24",
    "2023-10-25",
    "2023-10-26",
    "2023-10-27",
    "2023-10-28",
    "2023-10-29",
    "2023-10-30"
]

# The target directory must exist before download_file writes into it.
os.makedirs('test', exist_ok=True)

for date in date_strings:
    prefix = f'v1.0/btc/transactions/date={date}/'
    response = s3.list_objects_v2(Bucket=bucket_name, Prefix=prefix)
    keys = [item.get('Key') for item in response.get('Contents', [])]
    for key in keys:
        print(key)
    if not keys:
        # Without this guard an empty listing would either raise a NameError or
        # silently re-download the previous date's object under this date's name.
        print(f'No objects found under {prefix}; skipping {date}')
        continue
    # As before, the last listed key is the one that gets downloaded.
    file_name = keys[-1]
    s3.download_file(bucket_name, file_name, f'test/{date}.snappy.parquet')

In [None]:
# Columns of the transactions table that are selected for modelling
columns_to_use = [
    'size',
    'virtual_size',
    'input_count',
    'output_count',
    'input_value',
    'output_value',
    'fee',
]

In [None]:
import pandas as pd
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import StandardScaler

# Load one day of transactions. `date` is the loop variable left over from the
# download cell, i.e. the last entry of `date_strings`.
df = pd.read_parquet(f'test/{date}.snappy.parquet')

# Drop every row that has a missing value in any column, then keep only the
# feature columns.
df_dropped = df.dropna()
used = df_dropped[columns_to_use]

# Standardise the selected features.
scaler = StandardScaler()
np_scaled = scaler.fit_transform(used)

# Keep the row index and column labels so the scaled frame lines up
# row-for-row (and name-for-name) with df_dropped.
data = pd.DataFrame(np_scaled, index=used.index, columns=columns_to_use)

# Fraction of rows the forest is told to treat as anomalous.
outliers_fraction = 0.05
# A fixed random_state makes the fitted forest reproducible across re-runs.
model = IsolationForest(contamination=outliers_fraction, random_state=42)

In [None]:
# Train the model
model.fit(data)

# Predict on the same scaled matrix the model was fitted on; the raw
# (unscaled) columns are on a different scale than the training data.
# The prediction array is assigned directly (not wrapped in a new pd.Series):
# a fresh Series would carry a 0..n-1 index and be aligned by label against
# df_dropped's post-dropna index, misplacing values or leaving NaN.
df_dropped['anomaly_IsolationForest'] = model.predict(data)

In [None]:
import joblib

# Serialize the fitted model and the scaler to local files so they can be
# reloaded (or uploaded) together.
model_file, scaler_file = "isolation_forest_model.joblib", "scaler.joblib"

joblib.dump(value=model, filename=model_file)
joblib.dump(value=scaler, filename=scaler_file)

In [None]:
# With tracking disabled only a placeholder `logger` is defined; otherwise
# log in to wandb again using the API key entered earlier.
if not wandb_logging:
    logger = None
else:
    wandb.login(key=wandb_api_key, relogin=True)

In [None]:
# Pass the serialized model file to wandb.save.
# NOTE(review): this runs regardless of `wandb_logging` — confirm whether it
# should be skipped when tracking is disabled.
wandb.save(model_file)

In [None]:
# Pass the serialized scaler file to wandb.save.
# NOTE(review): this runs regardless of `wandb_logging` — confirm whether it
# should be skipped when tracking is disabled.
wandb.save(scaler_file)