# Sentiment Dataset Verification Demo


This sample verifies a tamper-proof dataset history.<br>


## Imports

In [None]:
import pprint
import random
import sys
from dotenv import load_dotenv
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from vbase import (
    VBaseClient,
    VBaseDataset,
)
from aws_utils import (
    create_s3_client_from_env,
    init_vbase_dataset_from_s3_objects,
)

#  Install vBase requirements.
!pip install git+https://github.com/validityBase/vbase-py.git
!wget --no-clobber https://raw.githubusercontent.com/validityBase/vbase-py-samples-collab/main/samples/collab_utils.py

## Configuration

The dataset owner address.

In [None]:
# Dataset owner address; supplied as the "owner" entry of the VBaseDataset
# init_dict when the dataset object is created.
DATASET_OWNER = "0xA401F59d7190E4448Eb60691E3bc78f1Ef03e88C"

The dataset name.

In [None]:
# Dataset name; supplied as the "name" entry of the VBaseDataset init_dict and
# appended to FOLDER_NAME to form DATASET_FOLDER_NAME.
DATASET_NAME = "sentiment_dataset_20240620103503"

Additional configuration.

In [None]:
# Bucket that is passed to init_vbase_dataset_from_s3_objects.
BUCKET_NAME = "vbase-test"
# Common folder prefix for the dataset histories (note the trailing slash).
FOLDER_NAME = "samples/sentiment_dataset_history/"
# Folder of this particular dataset: the common prefix followed by its name.
DATASET_FOLDER_NAME = f"{FOLDER_NAME}{DATASET_NAME}"

## Setup

Load the information necessary to call vBase APIs.

In [None]:
# Initialize the environment using Google Colab secrets, if possible.
# NOTE(review): try_add_user_secrets_to_env is not imported anywhere in the
# visible cells; presumably it is provided by the collab_utils.py file that
# the imports cell downloads -- confirm and add the missing import.
try_add_user_secrets_to_env([
    "VBASE_API_KEY",
    "VBASE_FORWARDER_URL",
    "VBASE_COMMITMENT_SERVICE_PRIVATE_KEY",
    "AWS_ACCESS_KEY_ID",
    "AWS_SECRET_ACCESS_KEY"
])
# Load the environment settings via python-dotenv.
# The call is made outside of an assert statement on purpose: assert
# statements are stripped when Python runs with -O, which would silently
# skip loading the environment altogether.
if not load_dotenv(verbose=True, override=True):
    raise AssertionError("load_dotenv() returned False")

Connect to AWS.

In [None]:
# Create the client that is later passed to init_vbase_dataset_from_s3_objects.
# NOTE(review): presumably an S3 client configured from the AWS_* environment
# variables loaded above -- confirm in aws_utils.
boto_client = create_s3_client_from_env()

Connect to vBase.

In [None]:
# Create the vBase client that is handed to VBaseDataset below.
# NOTE(review): presumably configured from the VBASE_* environment variables
# loaded above -- confirm against the vbase documentation.
vbc = VBaseClient.create_instance_from_env()

Initialize the dataset object.

In [None]:
# Describe the dataset to verify. The record list starts out empty; the
# records themselves are loaded in a later cell.
dataset_description = dict(
    name=DATASET_NAME,
    owner=DATASET_OWNER,
    record_type_name="VBaseJsonObject",
    records=[],
)
ds = VBaseDataset(vbc, init_dict=dataset_description)

Additional setup.

In [None]:
# Select the plot backend when IPython is loaded but no ipykernel is present,
# which this sample treats as interactive (non-notebook) mode.
if "IPython" in sys.modules and "ipykernel" not in sys.modules:
    # The import deliberately lives inside this clause; pylint's
    # ungrouped-imports warning is overactive here, so it is silenced.
    # pylint: disable=ungrouped-imports
    import matplotlib

    # WebAgg is the backend that provides interactive web charts.
    matplotlib.use("WebAgg")

## Validate the Dataset History

Load the dataset records.

In [None]:
# Load the dataset records for DATASET_FOLDER_NAME in the BUCKET_NAME bucket
# using the client created above; the returned dataset replaces ds.
ds = init_vbase_dataset_from_s3_objects(
    ds, boto_client, BUCKET_NAME, DATASET_FOLDER_NAME
)

Restore timestamps using the blockchain stamps.

In [None]:
# Restore the record timestamps and stop if the restore reports failure.
# The call is made outside of an assert statement on purpose: assert
# statements are stripped when Python runs with -O, which would silently
# skip the restore altogether.
if not ds.try_restore_timestamps_from_index():
    raise AssertionError("Failed to restore timestamps from the index")

Verify the records.

In [None]:
# Verify the dataset commitments and stop if verification reports failure.
# The call is made outside of an assert statement on purpose: assert
# statements are stripped when Python runs with -O, which would silently
# disable the verification this sample exists to demonstrate.
if not ds.verify_commitments():
    raise AssertionError("Dataset commitment verification failed")

Build and display the verified records.

In [None]:
# Commitment receipts are matched to the dataset records by position.
l_receipts = ds.get_commitment_receipts()
# Collect the table fragments in a list and join them once at the end.
# NOTE(review): record values are interpolated into the markup unescaped;
# consider escaping them if the data may contain HTML.
table_parts = [
    "<table>",
    "<tr><th>num</th><th>record</th><th>record_hash</th><th>tx</th></tr>",
]
# One table row per record: index, data, content id and transaction hash.
table_parts.extend(
    f"<tr><td>{row_num}</td><td>{rec.data}</td><td>{rec.cid}</td>"
    f"<td>{l_receipts[row_num]['transactionHash']}</td></tr>"
    for row_num, rec in enumerate(ds.records)
)
table_parts.append("</table>")
html = "".join(table_parts)
# Render rich HTML unless IPython is loaded without an ipykernel; in that
# one case the raw markup is pretty-printed instead.
if "ipykernel" in sys.modules or "IPython" not in sys.modules:
    # Load support for HTML display only where it is needed.
    from IPython.display import display, HTML

    display(HTML(html))
else:
    pprint.pprint(html)

## Display Analytics

Convert dataset data to a Pandas DataFrame.

In [None]:
# Get the dataset contents as a pandas DataFrame and print it for inspection.
df_dataset = ds.get_pd_data_frame()
print("Dataset DataFrame:\n", df_dataset)

Convert data to a signal.

In [None]:
# Recenter and rescale the data: a raw value of 50 maps to 0, 0 maps to -1
# and 100 maps to +1.
# NOTE(review): assumes the raw values lie on a 0-100 scale -- confirm.
df_signal = (df_dataset - 50) / 50
print("Signal DataFrame:\n", df_signal)

Plot the cumulative return of the validated signal against simulated (random) asset returns.

In [None]:
# Seed the random number generators so the simulated returns are reproducible.
# The returns below are drawn with np.random, which random.seed() does not
# affect, so NumPy's generator must be seeded as well. The random.seed() call
# is kept for any other code that relies on the standard-library generator.
random.seed(1)
np.random.seed(1)
# Simulated asset returns: uniform in [-0.05, 0.05), with the same shape,
# index and columns as the signal.
df_asset_returns = pd.DataFrame(
    (np.random.random(size=df_signal.shape) * 2 - 1) / 20,
    index=df_signal.index,
    columns=df_signal.columns,
)
# Apply the previous row's signal to the current row's asset returns and
# total the result across columns.
df_signal_returns = (df_signal.shift(1) * df_asset_returns).sum(axis=1)
print("\nReturns DataFrame:\n", df_signal_returns)
# Compound the per-row returns into a cumulative growth curve and plot it.
(1 + df_signal_returns).cumprod().fillna(1).plot()
plt.show()

## Summary


Process<br>
* We used only a link to the dataset history, the dataset name, and the owner address.<br>
* We validated data integrity and timestamps using public blockchain records.<br>
* We converted the historical data to a Pandas DataFrame for easy analysis.<br>



Key Implications<br>
* The track record and all analytics can be independently calculated and verified forever.<br>
* Data can be validated with a single line of code.<br>
* vBase integrates smoothly with existing data science libraries and workflows.<br>
