In [1]:
import os
from dotenv import load_dotenv

def load_variables():
    """Load authentication details"""
    env_var=load_dotenv('./variables.env')
    auth_dict = {
            "subscription_id":os.environ['SUB_ID'],
            "resource_group":os.environ['RESOURCE_GROUP'],
            "workspace":os.environ['WORKSPACE_NAME'],
            }
    return auth_dict

auth_var = load_variables()

In [2]:
# Authentication package
from azure.identity import DefaultAzureCredential, InteractiveBrowserCredential

try:
    credential = DefaultAzureCredential()
    # Check if given credential can get token successfully.
    credential.get_token("https://management.azure.com/.default")
except Exception as ex:
    # Fall back to InteractiveBrowserCredential in case DefaultAzureCredential not work
    credential = InteractiveBrowserCredential()

In [3]:
# Handle to the workspace
from azure.ai.ml import MLClient
# Get a handle to the workspace
ml_client = MLClient(
    credential=credential,
    subscription_id=auth_var['subscription_id'],
    resource_group_name=auth_var['resource_group'],
    workspace_name=auth_var['workspace'],
)

In [4]:
from azure.ai.ml.entities import Data
from azure.ai.ml.constants import AssetTypes

web_path = "https://archive.ics.uci.edu/ml/machine-learning-databases/00350/default%20of%20credit%20card%20clients.xls"

credit_data = Data(
    name="creditcard_defaults",
    path=web_path,
    type=AssetTypes.URI_FILE,
    description="Dataset for credit card defaults",
    tags={"source_type": "web", "source": "UCI ML Repo"},
    version="1.0.0",
)

In [5]:
credit_data = ml_client.data.create_or_update(credit_data)
print(f"Dataset with name {credit_data.name} was registered to workspace, the dataset version is {credit_data.version}")

Dataset with name creditcard_defaults was registered to workspace, the dataset version is 1.0.0


In [6]:
from azure.ai.ml.entities import AmlCompute

cpu_compute_target = "cpu-cluster"

try:
    # let's see if the compute target already exists
    cpu_cluster = ml_client.compute.get(cpu_compute_target)
    print(
        f"You already have a cluster named {cpu_compute_target}, we'll reuse it as is."
    )

except Exception:
    print("Creating a new cpu compute target...")

    # Let's create the Azure ML compute object with the intended parameters
    cpu_cluster = AmlCompute(
        # Name assigned to the compute cluster
        name="cpu-cluster",
        # Azure ML Compute is the on-demand VM service
        type="amlcompute",
        # VM Family
        size="STANDARD_DS3_V2",
        # Minimum running nodes when there is no job running
        min_instances=1,
        # Nodes in cluster
        max_instances=4,
        # How many seconds will the node running after the job termination
        idle_time_before_scale_down=180,
        # Dedicated or LowPriority. The latter is cheaper but there is a chance of job termination
        tier="Dedicated",
    )

    # Now, we pass the object to MLClient's create_or_update method
    cpu_cluster = ml_client.begin_create_or_update(cpu_cluster)

Creating a new cpu compute target...


In [7]:
import os
dependencies_dir = "./dependencies"
os.makedirs(dependencies_dir, exist_ok=True)

In [8]:
%%writefile {dependencies_dir}/conda.yml
name: model-env
channels:
  - conda-forge
dependencies:
  - python=3.8
  - numpy=1.21.2
  - pip=21.2.4
  - scikit-learn=0.24.2
  - scipy=1.7.1
  - pandas>=1.1,<1.2
  - pip:
    - inference-schema[numpy-support]==1.3.0
    - xlrd==2.0.1
    - mlflow== 1.26.1
    - azureml-mlflow==1.42.0

Writing ./dependencies/conda.yml


In [9]:
from azure.ai.ml.entities import Environment

custom_env_name = "aml-scikit-learn"

pipeline_job_env = Environment(
    name=custom_env_name,
    description="Custom environment for Credit Card Defaults pipeline",
    tags={"scikit-learn": "0.24.2"},
    conda_file=os.path.join(dependencies_dir, "conda.yml"),
    image="mcr.microsoft.com/azureml/openmpi3.1.2-ubuntu18.04:latest",
    version="1.0",
)
pipeline_job_env = ml_client.environments.create_or_update(pipeline_job_env)

print(
    f"Environment with name {pipeline_job_env.name} is registered to workspace, the environment version is {pipeline_job_env.version}"
)

Environment with name aml-scikit-learn is registered to workspace, the environment version is 1.0


In [10]:
import os
data_prep_src_dir = "./components/data_prep"
os.makedirs(data_prep_src_dir, exist_ok=True)

In [11]:
%%writefile {data_prep_src_dir}/data_prep.py
import os
import argparse
import pandas as pd
from sklearn.model_selection import train_test_split
import logging
import mlflow


def main():
    """Main function of the script."""

    # input and output arguments
    parser = argparse.ArgumentParser()
    parser.add_argument("--data", type=str, help="path to input data")
    parser.add_argument("--test_train_ratio", type=float, required=False, default=0.25)
    parser.add_argument("--train_data", type=str, help="path to train data")
    parser.add_argument("--test_data", type=str, help="path to test data")
    args = parser.parse_args()

    # Start Logging
    mlflow.start_run()

    print(" ".join(f"{k}={v}" for k, v in vars(args).items()))

    print("input data:", args.data)

    credit_df = pd.read_excel(args.data, header=1, index_col=0)

    mlflow.log_metric("num_samples", credit_df.shape[0])
    mlflow.log_metric("num_features", credit_df.shape[1] - 1)

    credit_train_df, credit_test_df = train_test_split(
        credit_df,
        test_size=args.test_train_ratio,
    )

    # output paths are mounted as folder, therefore, we are adding a filename to the path
    credit_train_df.to_csv(os.path.join(args.train_data, "data.csv"), index=False)

    credit_test_df.to_csv(os.path.join(args.test_data, "data.csv"), index=False)

    # Stop Logging
    mlflow.end_run()


if __name__ == "__main__":
    main()

Writing ./components/data_prep/data_prep.py


In [12]:
from azure.ai.ml import command
from azure.ai.ml import Input, Output

data_prep_component = command(
    name="data_prep_credit_defaults",
    display_name="Data preparation for training",
    description="reads a Excel input, split the input to train and test",
    inputs={
        "data": Input(type="uri_folder"),
        "test_train_ratio": Input(type="number"),
    },
    outputs=dict(
        train_data=Output(type="uri_folder", mode="rw_mount"),
        test_data=Output(type="uri_folder", mode="rw_mount"),
    ),
    # The source folder of the component
    code=data_prep_src_dir,
    command="""python data_prep.py \
            --data ${{inputs.data}} --test_train_ratio ${{inputs.test_train_ratio}} \
            --train_data ${{outputs.train_data}} --test_data ${{outputs.test_data}} \
            """,
    environment=f"{pipeline_job_env.name}:{pipeline_job_env.version}",
)

In [13]:
import os

train_src_dir = "./components/train"
os.makedirs(train_src_dir, exist_ok=True)

In [14]:
%%writefile {train_src_dir}/train.py
import argparse
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report
import os
import pandas as pd
import mlflow


def select_first_file(path):
    """Selects first file in folder, use under assumption there is only one file in folder
    Args:
        path (str): path to directory or file to choose
    Returns:
        str: full path of selected file
    """
    files = os.listdir(path)
    return os.path.join(path, files[0])


# Start Logging
mlflow.start_run()

# enable autologging
mlflow.sklearn.autolog()

os.makedirs("./outputs", exist_ok=True)


def main():
    """Main function of the script."""

    # input and output arguments
    parser = argparse.ArgumentParser()
    parser.add_argument("--train_data", type=str, help="path to train data")
    parser.add_argument("--test_data", type=str, help="path to test data")
    parser.add_argument("--n_estimators", required=False, default=100, type=int)
    parser.add_argument("--learning_rate", required=False, default=0.1, type=float)
    parser.add_argument("--registered_model_name", type=str, help="model name")
    parser.add_argument("--model", type=str, help="path to model file")
    args = parser.parse_args()

    # paths are mounted as folder, therefore, we are selecting the file from folder
    train_df = pd.read_csv(select_first_file(args.train_data))

    # Extracting the label column
    y_train = train_df.pop("default payment next month")

    # convert the dataframe values to array
    X_train = train_df.values

    # paths are mounted as folder, therefore, we are selecting the file from folder
    test_df = pd.read_csv(select_first_file(args.test_data))

    # Extracting the label column
    y_test = test_df.pop("default payment next month")

    # convert the dataframe values to array
    X_test = test_df.values

    print(f"Training with data of shape {X_train.shape}")

    clf = GradientBoostingClassifier(
        n_estimators=args.n_estimators, learning_rate=args.learning_rate
    )
    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)

    print(classification_report(y_test, y_pred))

    # Registering the model to the workspace
    print("Registering the model via MLFlow")
    mlflow.sklearn.log_model(
        sk_model=clf,
        registered_model_name=args.registered_model_name,
        artifact_path=args.registered_model_name,
    )

    # Saving the model to a file
    mlflow.sklearn.save_model(
        sk_model=clf,
        path=os.path.join(args.model, "trained_model"),
    )

    # Stop Logging
    mlflow.end_run()


if __name__ == "__main__":
    main()

Writing ./components/train/train.py


In [15]:
%%writefile {train_src_dir}/train.yml
name: train_credit_defaults_model
display_name: Train Credit Defaults Model
version: 1 # Not specifying a version will automatically update the version
type: command
inputs:
  train_data: 
    type: uri_folder
  test_data: 
    type: uri_folder
  learning_rate:
    type: number     
  registered_model_name:
    type: string
outputs:
  model:
    type: uri_folder
code: .
environment:
  # for this step, we'll use an AzureML curate environment
  azureml:AzureML-sklearn-0.24-ubuntu18.04-py37-cpu:21
command: >-
  python train.py 
  --train_data ${{inputs.train_data}} 
  --test_data ${{inputs.test_data}} 
  --learning_rate ${{inputs.learning_rate}}
  --registered_model_name ${{inputs.registered_model_name}} 
  --model ${{outputs.model}}

Writing ./components/train/train.yml


In [16]:
# importing the Component Package
from azure.ai.ml import load_component

# Loading the component from the yml file
train_component = load_component(source=os.path.join(train_src_dir, "train.yml"))

In [17]:
# Now we register the component to the workspace
train_component = ml_client.create_or_update(train_component)

# Create (register) the component in your workspace
print(
    f"Component {train_component.name} with Version {train_component.version} is registered"
)

[32mUploading train (0.0 MBs): 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3379/3379 [00:00<00:00, 42357.72it/s][0m
[39m



Component train_credit_defaults_model with Version 1 is registered


In [18]:
# the dsl decorator tells the sdk that we are defining an Azure ML pipeline
from azure.ai.ml import dsl, Input, Output


@dsl.pipeline(
    compute=cpu_compute_target,
    description="E2E data_perp-train pipeline",
)
def credit_defaults_pipeline(
    pipeline_job_data_input,
    pipeline_job_test_train_ratio,
    pipeline_job_learning_rate,
    pipeline_job_registered_model_name,
):
    # using data_prep_function like a python call with its own inputs
    data_prep_job = data_prep_component(
        data=pipeline_job_data_input,
        test_train_ratio=pipeline_job_test_train_ratio,
    )

    # using train_func like a python call with its own inputs
    train_job = train_component(
        train_data=data_prep_job.outputs.train_data,  # note: using outputs from previous step
        test_data=data_prep_job.outputs.test_data,  # note: using outputs from previous step
        learning_rate=pipeline_job_learning_rate,  # note: using a pipeline input as parameter
        registered_model_name=pipeline_job_registered_model_name,
    )

    # a pipeline returns a dictionary of outputs
    # keys will code for the pipeline output identifier
    return {
        "pipeline_job_train_data": data_prep_job.outputs.train_data,
        "pipeline_job_test_data": data_prep_job.outputs.test_data,
    }

In [19]:
registered_model_name = "credit_defaults_model"

# Let's instantiate the pipeline with the parameters of our choice
pipeline = credit_defaults_pipeline(
    # pipeline_job_data_input=credit_data,
    pipeline_job_data_input=Input(type="uri_file", path=web_path),
    pipeline_job_test_train_ratio=0.2,
    pipeline_job_learning_rate=0.25,
    pipeline_job_registered_model_name=registered_model_name,
)

In [20]:
import webbrowser

# submit the pipeline job
pipeline_job = ml_client.jobs.create_or_update(
    pipeline,
    # Project's name
    experiment_name="e2e_registered_components",
)
# open the pipeline in web browser
webbrowser.open(pipeline_job.services["Studio"].endpoint)

[32mUploading data_prep (0.0 MBs): 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1386/1386 [00:00<00:00, 30589.10it/s][0m
[39m



True

### Get the latest model (after the training pipeline has completed)

In [21]:
# Let's pick the latest version of the model
latest_model_version = max([int(m.version) for m in ml_client.models.list(name=registered_model_name)])

# Get the latest model
model = ml_client.models.get(name=registered_model_name, version=latest_model_version)

## Create real-time endpoint and deployment

In [22]:
import uuid

# Creating a unique name for the endpoint
first_endpoint = "basic-endpoint-" + str(uuid.uuid4())[:8]

In [23]:
from azure.ai.ml.entities import (
    ManagedOnlineEndpoint,
    ManagedOnlineDeployment,
    Model,
    Environment,
)

    
# create an online endpoint
endpoint = ManagedOnlineEndpoint(
    name=first_endpoint,
    description="Real-time online endpoint",
    auth_mode="key",
    tags={
        "training_dataset": "credit_defaults",
        "model_type": "sklearn.GradientBoostingClassifier",
    },
)

endpoint = ml_client.begin_create_or_update(endpoint)

Once you've created an endpoint, you can retrieve it as below:

In [24]:
endpoint = ml_client.online_endpoints.get(name=first_endpoint)
print(f'Endpoint "{endpoint.name}" with provisioning state "{endpoint.provisioning_state}" is retrieved')

Endpoint "basic-endpoint-27ba77d2" with provisioning state "Creating" is retrieved


In [31]:
blue_deployment = ManagedOnlineDeployment(
    name="blue",
    endpoint_name=first_endpoint
    model=model,
    instance_type="Standard_DS3_v2",
    instance_count=2,
)

blue_deployment = ml_client.begin_create_or_update(blue_deployment)

Check: endpoint basic-endpoint-9ce568c0 exists
data_collector is not a known attribute of class <class 'azure.ai.ml._restclient.v2022_02_01_preview.models._models_py3.ManagedOnlineDeployment'> and will be ignored


..

### Create inference files for the real-time endpoint

In [44]:
# Remove any existing json files in the test-endpoint folder
!'./test-endpoint/remove-json.sh'

In [48]:
%run './test-endpoint/sample_data_generator.py' --number_of_records 10000
%run './test-endpoint/sample_data_generator.py' --number_of_records 100000

Number of records: 10000, number of batches: None, remaining records: None
Final dataframe length: 10000
Number of records: 100000, number of batches: 3, remaining records: 10000
Final dataframe length: 100000
Number of records: 1000000, number of batches: 33, remaining records: 10000
Final dataframe length: 1000000


In [49]:
import os

def absoluteFilePaths(directory):
    fp=[]
    for dirpath,_,filenames in os.walk(directory):
        for f in filenames:
#             yield os.path.abspath(os.path.join(dirpath, f))
            fp.append(os.path.abspath(os.path.join(dirpath, f)))
    fp = [x for x in fp if '.json' in x]
    return fp

fl = absoluteFilePaths('./test-endpoint/')
print(fl)

['/Users/userid/GithubProjects/managed-online-endpoint-testing/test-endpoint/10000_inference_data.json', '/Users/userid/GithubProjects/managed-online-endpoint-testing/test-endpoint/100000_inference_data.json', '/Users/userid/GithubProjects/managed-online-endpoint-testing/test-endpoint/1000000_inference_data.json']


In [50]:
# Filter out the largest file -> which will be used for the batch deployment
real_time_endpoint_files = [x for x in fl if '1000000_inference_data.json' not in x]
print(real_time_endpoint_files)

['/Users/userid/GithubProjects/managed-online-endpoint-testing/test-endpoint/10000_inference_data.json', '/Users/userid/GithubProjects/managed-online-endpoint-testing/test-endpoint/100000_inference_data.json']


In [51]:
# %%writefile {deploy_dir}/sample-request.json
# {
#   "input_data": {
#     "columns": [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22],
#     "index": [0, 1],
#     "data": [
#             [20000,2,2,1,24,2,2,-1,-1,-2,-2,3913,3102,689,0,0,0,0,689,0,0,0,0],
#             [10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 10, 9, 8]
#         ]
#   }
# }

### Invoke the real-time endpoint

In [52]:
import time
for item in endpoint_list:
    for file in real_time_endpoint_files:    
        ml_client.online_endpoints.invoke(
            endpoint_name=item,#online_endpoint_name,
            request_file=file,#"./deploy/inference_data.json",
        #     request_file = 'https://amlpipelstoragea4362a6ac.blob.core.windows.net/azureml-blobstore-1ce0bd5c-e972-4484-8bff-c01a8d5d7aaf/LocalUpload/inference_data.json',
            deployment_name="blue",
        )
        print(f"Invoked the endpoint: {item} for file:{file}")
        time.sleep(5)

Invoked the endpoint: basic-endpoint-9ce568c0 for file:/Users/userid/GithubProjects/managed-online-endpoint-testing/test-endpoint/10000_inference_data.json
Invoked the endpoint: basic-endpoint-9ce568c0 for file:/Users/userid/GithubProjects/managed-online-endpoint-testing/test-endpoint/100000_inference_data.json


In [53]:
# # Manual test
# ml_client.online_endpoints.invoke(
#     endpoint_name="basic-endpoint-5437043f",
#     request_file="./test-endpoint/10000_inference_data.json",
#     deployment_name="blue",
# )

## Create batch endpoint and deployment

In [161]:
from azure.ai.ml.entities import BatchEndpoint

bendpoint = BatchEndpoint(
    name="creditops-batch" + str(uuid.uuid4())[:8],
    description="Credit operations classifier for defaults",
)

In [162]:
ml_client.batch_endpoints.begin_create_or_update(bendpoint)

<azure.core.polling._poller.LROPoller at 0x7f77df0217f0>

In [163]:
import time

from azure.ai.ml.constants import BatchDeploymentOutputAction
from azure.ai.ml.entities import BatchDeployment, BatchRetrySettings

deployment = BatchDeployment(
    name="classifier-random-forest",
#     description="A classifier based on XGBoost",
    endpoint_name=bendpoint.name,
    model=model,
    compute="cpu-cluster",
    instance_count=4,
    max_concurrency_per_instance=4,
    mini_batch_size=20,
    output_action=BatchDeploymentOutputAction.APPEND_ROW,
    output_file_name="predictions.csv",
    retry_settings=BatchRetrySettings(max_retries=3, timeout=300),
    logging_level="info",
)
time.sleep(5)

In [164]:
# If this references an existing cluster that is used for training, this will go through fairly quickly
ml_client.batch_deployments.begin_create_or_update(deployment)

<azure.core.polling._poller.LROPoller at 0x7f79b1c44b50>

For the third file (`1000000_inference_data.csv`) which gets close to slightly <100Mb, it is advisable to load this into the blob store using AZCOPY since that will asynchronously copy that from a local storage drive. A SAS token for the default blobstore needs to be generated for the AZCOPY command to work. Then, manually or programmatically, mark it as a Data object for input into the batch deployment.

In [165]:
# Create the file to run for batch inferences
%run './test-endpoint/sample_data_generator.py' --number_of_records 1000000

Number of records: 1000000, number of batches: 33, remaining records: 10000
Final dataframe length: 1000000


In [166]:
# Copy the sample file to the default blob stores
# Get the full URL with a SAS token for the default blobstore, and include a generous list of privileges
# Wait for this to finish
!azcopy copy './test-endpoint/1000000_inference_data.csv' "https://creditopstorage44a6fab42.blob.core.windows.net/azureml-blobstore-dfbc852f-833b-448a-b143-a346fea2fd58?sp=racwdl&st=2022-11-17T17:49:03Z&se=2022-11-18T01:49:03Z&spr=https&sv=2021-06-08&sr=c&sig=QM9Qcl8QYl2ga7itCzqH2EmCF7tCeWNbpgJL%2FRDUO9M%3D"

INFO: Scanning...
INFO: Any empty folders will not be processed, because source and/or destination doesn't have full folder support

Job e5148886-559e-a242-74d2-82966711be80 has started
Log file is located at: /Users/userid/.azcopy/e5148886-559e-a242-74d2-82966711be80.log

100.0 %, 1 Done, 0 Failed, 0 Pending, 0 Skipped, 1 Total, 2-sec Throughput (Mb/s): 1.4413


Job e5148886-559e-a242-74d2-82966711be80 summary
Elapsed Time (Minutes): 0.7335
Number of File Transfers: 1
Number of Folder Property Transfers: 0
Total Number of Transfers: 1
Number of Transfers Completed: 1
Number of Transfers Failed: 0
Number of Transfers Skipped: 0
TotalBytesTransferred: 87937385
Final Job Status: Completed



In [167]:
# # At attempt at programmatic update of the Data object, however, this did not work with SAS tokens 
# dataset_name = "1m-inferences"

# # Give the path to the file in the default blob container
# filename = "1000000_inference_data.csv"
# data_path="https://creditopstorage44a6fab42.blob.core.windows.net/azureml-blobstore-dfbc852f-833b-448a-b143-a346fea2fd58/" + filename

# inf_data = Data(
#     path=data_path,
#     type=AssetTypes.URI_FILE,
#     description="A one million inference dataset",
#     name=dataset_name,
# )

# ml_client.data.create_or_update(inf_data)

In [168]:
## MANUALLY UPDATE A DATA ASSET THROUGH THE PORTAL

In [169]:
# After manually updating the Data object through the Portal, linking it to the source file (Data source should be 'workspaceblobstore')
# Input the name in manually
ds_name = 'khabib'
inf_dataset = ml_client.data.get(name=ds_name, label="latest")
print(inf_dataset.id)

/subscriptions/<subid>/resourceGroups/creditops1847/providers/Microsoft.MachineLearningServices/workspaces/creditops1847ws/data/khabib/versions/1


In [170]:
input_data = Input(type=AssetTypes.URI_FILE, path=inf_dataset.id)

### Invoke the batch endpoint to setup the job

In [171]:
print(f"Batch endpoint name: {bendpoint.name}")
print(f"Batch deployment name: {deployment.name}")
print(f"Initiating the job...")
job = ml_client.batch_endpoints.invoke(
    deployment_name = deployment.name,
    endpoint_name=bendpoint.name, 
    input=input_data)
print(f"Job response: {job}")

Batch endpoint name: creditops-batchc0e61b0f
Batch deployment name: classifier-random-forest
Initiating the job...
Job response: {'additional_properties': {}, 'id': '/subscriptions/<sub-id>/resourceGroups/creditops1847/providers/Microsoft.MachineLearningServices/workspaces/creditops1847ws/batchEndpoints/creditops-batchc0e61b0f/deployments/classifier-random-forest/jobs/aae7a9c8-7c17-44f6-8e20-25be4381c78a', 'name': 'aae7a9c8-7c17-44f6-8e20-25be4381c78a', 'type': 'Microsoft.MachineLearningServices/workspaces/batchEndpoints/deployments/jobs', 'properties': <azure.ai.ml._restclient.v2020_09_01_dataplanepreview.models._models_py3.BatchJob object at 0x7f79a1131310>, 'system_data': <azure.ai.ml._restclient.v2020_09_01_dataplanepreview.models._models_py3.SystemData object at 0x7f79a11318e0>}


In [173]:
ml_client.jobs.get(job.name)

Experiment,Name,Type,Status,Details Page
creditops-batchc0e61b0f,aae7a9c8-7c17-44f6-8e20-25be4381c78a,pipeline,Completed,Link to Azure Machine Learning studio


In [174]:
ml_client.jobs.download(name=job.name, download_path=".", output_name="score")

Downloading artifact azureml://datastores/workspaceblobstore/paths/azureml/2508e05f-fb88-4de2-9eb6-5173395f73f8/score/ to .


In [175]:
with open("./predictions.csv", "r") as f:
    data = f.read()

In [176]:
from ast import literal_eval
import pandas as pd

score = pd.DataFrame(
    literal_eval(data.replace("\n", ",")), columns=["file", "prediction"]
)
score

Unnamed: 0,file,prediction
0,1000000_inference_data.csv,1
1,1000000_inference_data.csv,0
2,1000000_inference_data.csv,0
3,1000000_inference_data.csv,0
4,1000000_inference_data.csv,0
...,...,...
999995,1000000_inference_data.csv,0
999996,1000000_inference_data.csv,0
999997,1000000_inference_data.csv,0
999998,1000000_inference_data.csv,0


In [177]:
score['prediction'].value_counts()

0    876033
1    123967
Name: prediction, dtype: int64