In [1]:
import os
import pandas as pd
from tqdm import tqdm
import boto3
import timeit
from typing import List
from ast import literal_eval
import pickle
import warnings
warnings.filterwarnings('ignore')

import sys
sys.path.append('../../../../')

# Star imports from the project package. DEV_BUCKET, SAGEMAKER_ROLE,
# SAGEMAKER_ROLE_ARN and MLFLOW_SERVER used below are presumably defined in
# one of these two modules — TODO confirm and import them explicitly.
from deep.constants import *
from deep.utils import *
import sagemaker

# At this point `sagemaker` is the package imported just above; its Session is
# created with the project's dev bucket as default bucket.
sess = sagemaker.Session(default_bucket=DEV_BUCKET.name)
# Short aliases for the project-level constants.
role = SAGEMAKER_ROLE
role_arn = SAGEMAKER_ROLE_ARN
tracking_uri = MLFLOW_SERVER

# NOTE: this rebinds the name `sagemaker` to MLflow's `mlflow.sagemaker`
# module, shadowing the package imported above. It must stay AFTER the
# `sagemaker.Session(...)` call; every later `sagemaker.<...>` in this
# notebook (e.g. `sagemaker.deploy`) refers to the MLflow module.
from mlflow import sagemaker

### Generate predictions

In [2]:
# Locations of the three input files, relative to the notebook's directory.
gender_csv_path = os.path.join('final_data', 'test_gender_df.csv')
minorities_csv_path = os.path.join('final_data', 'test_minorities_df.csv')
frameworks_test_path = os.path.join(
    "..", "..", "..", "..", "data", "frameworks_data", "data_v0.7.1", "new_columns_test_v0.7.1.csv.gz"
)

gender_df = pd.read_csv(gender_csv_path)
minorities_df = pd.read_csv(minorities_csv_path)

# Load the gzip-compressed test split, then keep only the English rows and the
# three columns used later in the notebook.
test_df = pd.read_csv(frameworks_test_path, compression='gzip')
test_df = test_df.loc[test_df["lang"] == 'en', ["excerpt", "entry_id", "lang"]]

In [3]:
# Deploy the MLflow model stored under the S3 artifact URI below as the
# endpoint "tmp-hum". `sagemaker` is `mlflow.sagemaker` here: the name was
# rebound by `from mlflow import sagemaker` in the setup cell.
deployment_options = {
    "execution_role_arn": SAGEMAKER_ROLE_ARN,
    "image_url": "961104659532.dkr.ecr.us-east-1.amazonaws.com/mlflow-pyfunc:latest",
    "region_name": "us-east-1",
    # alternative instance type tried previously: "ml.g4dn.xlarge"
    "instance_type": "ml.c5.2xlarge",
    "flavor": "python_function",
    "synchronous": False,
    "archive": True,
}

sagemaker.deploy(
    "tmp-hum",
    "s3://deep-mlflow-artifact/29/ad58f6c521e44c5f8cdd653f319344f3/artifacts/two_steps_models",
    **deployment_options,
)


2022/09/22 13:22:47 INFO mlflow.sagemaker: Using the python_function flavor for deployment!
2022/09/22 13:22:48 INFO mlflow.sagemaker: No model data bucket specified, using the default bucket
2022/09/22 13:22:54 INFO mlflow.sagemaker: Default bucket `mlflow-sagemaker-us-east-1-961104659532` already exists. Skipping creation.
2022/09/22 13:28:19 INFO mlflow.sagemaker: tag response: {'ResponseMetadata': {'RequestId': 'XSR3QSE3ANJMQP9T', 'HostId': '31MvJpcTcDtuDdoYuUTvp7XhHdOGeBYrbhieAD/F30uUxG581NwI2gXCTmFKPBOQP7ezOgnhZD4=', 'HTTPStatusCode': 200, 'HTTPHeaders': {'x-amz-id-2': '31MvJpcTcDtuDdoYuUTvp7XhHdOGeBYrbhieAD/F30uUxG581NwI2gXCTmFKPBOQP7ezOgnhZD4=', 'x-amz-request-id': 'XSR3QSE3ANJMQP9T', 'date': 'Thu, 22 Sep 2022 11:28:20 GMT', 'server': 'AmazonS3', 'content-length': '0'}, 'RetryAttempts': 0}}
2022/09/22 13:28:19 INFO mlflow.sagemaker: Creating new endpoint with name: tmp-hum ...
2022/09/22 13:28:20 INFO mlflow.sagemaker: Created model with arn: arn:aws:sagemaker:us-east-1:9611046

In [16]:
def flatten(t: List[List]) -> List:
    """Concatenate the items of every sub-list of ``t`` into one flat list.

    Only one level of nesting is removed; the relative order of the items is
    preserved.
    """
    flat_items = []
    for sublist in t:
        flat_items.extend(sublist)
    return flat_items

def _build_endpoint_payload(batch, interpretability, return_prediction_labels):
    """Serialise one batch of excerpts, plus the model keyword columns, to JSON.

    Args:
        batch: DataFrame slice containing at least an ``excerpt`` column.
        interpretability: value written to the ``interpretability`` column.
        return_prediction_labels: value written to the
            ``return_prediction_labels`` column.

    Returns:
        The batch as a JSON string in pandas ``split`` orientation.
    """
    # Explicit copy: the keyword columns are added to an independent frame,
    # never to a view of the caller's DataFrame.
    inputs = batch[['excerpt']].copy()

    # NOTE(review): the "analyis" spelling is kept as-is because these strings
    # are sent to the endpoint — confirm against the served model before
    # correcting them.
    inputs['return_type'] = "default_analyis"
    inputs['analyis_framework_id'] = 'all'

    # kw for interpretability
    inputs['interpretability'] = interpretability
    # minimum ratio between proba and threshold to perform interpretability
    inputs['ratio_interpreted_labels'] = 0.5
    inputs['attribution_type'] = 'Layer DeepLift'

    # predictions
    inputs['return_prediction_labels'] = return_prediction_labels

    # kw for embeddings
    inputs['output_backbone_embeddings'] = False
    inputs['pooling_type'] = "['cls', 'mean_pooling']"
    inputs['finetuned_task'] = "['first_level_tags', 'secondary_tags', 'subpillars']"
    inputs['embeddings_return_type'] = 'array'

    return inputs.to_json(orient="split")


def _invoke_endpoint_in_batches(
    df,
    batch_size,
    interpretability,
    return_prediction_labels,
    endpoint_name,
    region_name,
):
    """Send ``df`` to the endpoint batch by batch and collect the raw responses.

    Returns:
        One decoded response string per batch, in row order. An empty ``df``
        yields an empty list without creating a client.
    """
    n_rows = df.shape[0]
    if n_rows == 0:
        # Nothing to send; skip client creation entirely.
        return []

    client = boto3.session.Session().client("sagemaker-runtime", region_name=region_name)

    all_outputs = []
    for start in tqdm(range(0, n_rows, batch_size)):
        payload = _build_endpoint_payload(
            df.iloc[start : start + batch_size],
            interpretability=interpretability,
            return_prediction_labels=return_prediction_labels,
        )
        response = client.invoke_endpoint(
            EndpointName=endpoint_name,
            Body=payload,
            ContentType="application/json; format=pandas-split",
        )
        # UTF-8 is a superset of ASCII: identical result for ASCII bodies, and
        # no UnicodeDecodeError if the response contains non-ASCII characters.
        all_outputs.append(response["Body"].read().decode("utf-8"))

    return all_outputs


def get_probas(df, batch_size=128, endpoint_name='tmp-hum', region_name='us-east-1'):
    """Query the deployed endpoint and return one ``{tag: value}`` dict per row.

    Each response is parsed with ``literal_eval`` and must expose a
    ``raw_predictions`` list (one ``{tag: ratio}`` dict per entry) and a
    ``thresholds`` dict keyed by tag. Every ratio is converted to
    ``round(100 * thresholds[tag] * ratio, 3)`` — presumably the probability
    in percent, given that the ratio is described as proba / threshold.

    Args:
        df: DataFrame with an ``excerpt`` column.
        batch_size: number of rows sent per request.
        endpoint_name: name of the SageMaker endpoint to invoke.
        region_name: AWS region of the endpoint.

    Returns:
        A list with one dict per row of ``df``, in row order; ``[]`` when
        ``df`` has no rows.
    """
    all_outputs = _invoke_endpoint_in_batches(
        df,
        batch_size=batch_size,
        interpretability=False,
        return_prediction_labels=True,
        endpoint_name=endpoint_name,
        region_name=region_name,
    )
    if not all_outputs:
        # Without this guard there would be no parsed batch to read the
        # thresholds from.
        return []

    eval_batches = [literal_eval(batch) for batch in all_outputs]
    clean_outputs = flatten([eval_batch['raw_predictions'] for eval_batch in eval_batches])

    # Thresholds are read from the last response, as before.
    thresholds = eval_batches[-1]['thresholds']

    return [
        {tag: round(100 * thresholds[tag] * ratio, 3) for tag, ratio in tags_one_entry.items()}
        for tags_one_entry in clean_outputs
    ]


def interpret_models(df, batch_size=1, endpoint_name='tmp-hum', region_name='us-east-1'):
    """Run the endpoint with interpretability enabled and return raw responses.

    Args:
        df: DataFrame with an ``excerpt`` column.
        batch_size: number of rows sent per request (one row per request by
            default).
        endpoint_name: name of the SageMaker endpoint to invoke.
        region_name: AWS region of the endpoint.

    Returns:
        A list with one unparsed response string per batch, in row order;
        ``[]`` when ``df`` has no rows.
    """
    return _invoke_endpoint_in_batches(
        df,
        batch_size=batch_size,
        interpretability=True,
        return_prediction_labels=False,
        endpoint_name=endpoint_name,
        region_name=region_name,
    )


In [None]:
# Score both test sets and store each row's per-tag output of get_probas in a
# `probability` column.
for scored_df in (gender_df, minorities_df):
    scored_df['probability'] = get_probas(scored_df)

# Uncomment to persist the scored frames:
#gender_df.to_csv('final_data/gender_df_with_outputs.csv.gz', index=None, compression='gzip')
#minorities_df.to_csv('final_data/minorities_df_with_outputs.csv.gz', index=None, compression='gzip')

In [15]:
# Display the gender test set together with the `probability` column filled in
# by get_probas (the rendered output below is truncated by pandas).
gender_df

Unnamed: 0,entry_id,excerpt,kw,type,probability
0,2276.0,"Another attack in February 2017 hit a farm, wo...",['girl'],augmented,{'first_level_tags->pillars_1d->Casualties': 1...
1,2276.0,"Another attack in February 2017 hit a farm, wo...",['person'],augmented,{'first_level_tags->pillars_1d->Casualties': 1...
2,2276.0,"Another attack in February 2017 hit a farm, wo...",['boys'],original,{'first_level_tags->pillars_1d->Casualties': 1...
3,2576.0,"In Benghazi and Sirte, 78,868 children (39,667...",['male'],augmented,{'first_level_tags->pillars_1d->Casualties': 0...
4,2576.0,"In Benghazi and Sirte, 78,868 children (39,667...",['person'],augmented,{'first_level_tags->pillars_1d->Casualties': 0...
...,...,...,...,...,...
3256,498062.0,In Argentina it is almost a routine procedure ...,['person'],augmented,{'first_level_tags->pillars_1d->Casualties': 0...
3257,498062.0,In Argentina it is almost a routine procedure ...,['women'],original,{'first_level_tags->pillars_1d->Casualties': 0...
3258,498072.0,"""The fucking father, I can not go to sleep,"" w...",['father'],augmented,{'first_level_tags->pillars_1d->Casualties': 0...
3259,498072.0,"""The fucking person, I can not go to sleep,"" w...",['person'],augmented,{'first_level_tags->pillars_1d->Casualties': 0...


### Generate interpretability results with DEEPLift

In [17]:
# Run the interpretability pass over the English test excerpts; interpret_models
# sends one row per request. Long-running: the recorded run below took about
# 2h43 for 17,199 rows.
interpret_results = interpret_models(test_df)

100%|██████████| 17199/17199 [2:43:12<00:00,  1.76it/s]  


In [20]:
# Attach the raw (unparsed) endpoint response of each row to the test frame.
test_df['interpretability_col'] = interpret_results

In [21]:
#test_df.to_csv('final_data/test_df_interpretability_DEEPLift.csv.gz', index=None, compression='gzip')