In [None]:
# Attach Google Drive to the Colab runtime; the file paths used by the later
# cells all live under this mount point.
from google.colab import drive

drive.mount(mountpoint='/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd
import json
import requests

## Loading in the dataset and collecting random samples from it
The dataset contains over 30 million transactions.

We'll randomly sample roughly 10,000 non-fraudulent transactions (33 from each 100,000-row chunk) and combine them with all of the fraudulent transactions.

In [None]:
# Build the working dataset: every row flagged `all_malicious`, plus a random
# sample of non-fraudulent rows drawn from each chunk of the source CSV.
CHUNK_SIZE = 100000      # rows per chunk; the CSV is read in pieces rather than loaded whole
SAMPLES_PER_CHUNK = 33   # non-fraudulent rows to draw from each chunk
RANDOM_STATE = 42        # fixed seed so the draw is reproducible

# A row counts as non-fraudulent only when every one of these flags is 0
FLAG_COLUMNS = ['in_malicious', 'out_malicious', 'is_malicious', 'out_and_tx_malicious', 'all_malicious']

# Collect the pieces in lists and concatenate once at the end; calling
# pd.concat inside the loop re-copies the accumulated frame on every chunk.
fraud_chunks = []
clean_samples = []

# Load the dataset in chunks
for chunk in pd.read_csv('/content/drive/MyDrive/final year/ml/data/DG_out.csv', chunksize=CHUNK_SIZE):

    # Keep the rows where the transaction, its inputs or its outputs are malicious
    fraud_chunks.append(chunk[chunk['all_malicious'] == 1])

    # Candidate rows for sampling: all malicious flags are 0
    clean_rows = chunk[(chunk[FLAG_COLUMNS] == 0).all(axis=1)]

    # DataFrame.sample raises ValueError when n exceeds the rows available
    # (e.g. a short final chunk), so cap n at what the chunk can supply.
    n_samples = min(SAMPLES_PER_CHUNK, len(clean_rows))
    clean_samples.append(clean_rows.sample(n=n_samples, random_state=RANDOM_STATE))

# All fraudulent transactions found in the file
fraud_txs = pd.concat(fraud_chunks) if fraud_chunks else pd.DataFrame()

# Single dataset: sampled non-fraudulent rows first, then the fraudulent rows
bitcoin_txs = pd.concat(clean_samples + fraud_chunks) if clean_samples else pd.DataFrame()

# Write file
bitcoin_txs.to_csv('/content/drive/MyDrive/final year/ml/data/bitcoin_txs.csv',index=False)

## Fetch transaction metadata from Blockchain.com

In [None]:
# Look up each transaction hash on the Blockchain.com raw-transaction endpoint,
# flatten every successful JSON response into a one-row DataFrame, and save the
# combined result to Drive.
metadata_list = []
result_df = pd.DataFrame()
count = 0  # successful API calls so far (was never initialised, causing a NameError)

# NOTE(review): `sample_df` is not defined in the cells visible here; presumably
# it is a DataFrame of transactions with a 'tx_hash' column. Confirm it is
# defined before this cell runs on a fresh kernel.
for txhash in sample_df['tx_hash']:
    # Specify the URL you want to request
    url = "https://blockchain.info/rawtx/" + txhash

    # Make the GET request. The timeout stops a stalled connection from hanging
    # the loop, and a network failure on one hash is reported and skipped
    # instead of discarding everything fetched so far.
    try:
        response = requests.get(url, timeout=30)
    except requests.RequestException as exc:
        print(f"Error: request for {txhash} failed - {exc}")
        continue

    # Check if the request was successful (status code 200)
    if response.status_code == 200:
        count += 1
        # Normalize the JSON response to a dataframe and add to the list
        metadata_df = pd.json_normalize(response.json())
        metadata_list.append(metadata_df)
        print('API CALL: ' + str(count))

    else:
        # Print an error message if the request was not successful
        print(f"Error: {response.status_code} - {response.text}")


# Concatenate list of dataframes to a single one and save it to a file.
# to_csv returns None when given a path, so keep the frame and the write as
# separate steps; pd.concat also raises on an empty list, hence the guard.
if metadata_list:
    result_df = pd.concat(metadata_list, ignore_index=True)
    result_df.to_csv('/content/drive/MyDrive/final year/ml/data/blockchaincom_data.csv')
else:
    print('No transaction metadata was retrieved; nothing written.')
