# Evaluation

References for text similarity metrics: \
[BLEU Score](https://towardsdatascience.com/foundations-of-nlp-explained-bleu-score-and-wer-metrics-1a5ba06d812b) -
[ROUGE Score](https://medium.com/@eren9677/text-summarization-387836c9e178) -
[NIST Score](https://aclanthology.org/www.mt-archive.info/HLT-2002-Doddington.pdf) -
[METEOR Score](https://aclanthology.org/W05-0909.pdf) -
[BERT Score](https://arxiv.org/pdf/1904.09675) -
[BLEURT Score](https://aclanthology.org/2020.acl-main.704.pdf)

In [None]:
# BLEURT installation (kept commented out; run once if the package is missing)
#import os
#!git clone https://github.com/google-research/bleurt.git
#os.chdir('bleurt')
#!pip install .

In [None]:
import pandas as pd
from sklearn import metrics
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score, classification_report
import nltk
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from nltk.translate.meteor_score import meteor_score
from nltk.translate.nist_score import sentence_nist
from rouge_score import rouge_scorer
import bert_score
from bleurt import score as bleurt_score
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import glob
import json
import warnings
from transformers import logging

In [None]:
# Suppress every Python warning and limit the transformers logger to errors
# so that the evaluation output stays readable.
warnings.filterwarnings("ignore")
logging.set_verbosity_error()

In [None]:
# Folder with the gold test files and folder with the prediction/result files.
# Both end with '/' because paths are built by plain string concatenation.
data_folder = './data/'
result_folder = './results/'

In [None]:
# Debug switch for stage 3: when True, the similarity functions score only a
# 2-row sample of the predictions and the Excel exports are skipped.
debug_mode = True # set to True to test evaluation of stage 3

In [None]:
# Model whose results are evaluated; used as the prefix of every result file name.
model_name = 'mistral'

In [None]:
# Alternative model: uncomment to evaluate the deepseek result files instead.
#model_name = 'deepseek'

# Evaluation Stage 1

In [None]:
# Gold annotations for stage 1 (passed as gold_file to evaluate_stage_1).
test_file_stg1 = data_folder + "implicit_hate_test_stg1.csv"

In [None]:
def evaluate_stage_1(strategy, gold_file, pred_file):
    """Evaluate stage 1 (implicit hate vs. not hate) predictions against the gold labels.

    Both CSV files are restricted to the ``post_id`` values they share and
    sorted by ``post_id`` so that their ``class`` columns can be compared
    position by position. The macro-averaged scores and the per-class
    classification report are printed and the confusion matrix is displayed.

    Args:
        strategy: Name of the evaluated strategy; used in the printed report
            and in the returned row.
        gold_file: Path of the CSV file with the gold annotations
            (``post_id`` and ``class`` columns are read).
        pred_file: Path of the CSV file with the predictions
            (``post_id`` and ``class`` columns are read).

    Returns:
        A one-row ``pandas.DataFrame`` with the columns ``Strategy``,
        ``F1-macro``, ``Precision``, ``Recall`` and ``Accuracy``.
    """
    df_gold = pd.read_csv(gold_file, encoding = 'utf-8')
    df_pred = pd.read_csv(pred_file, encoding = 'utf-8')

    label_selector = ['implicit_hate', 'not_hate']

    # Keep only the posts that appear in both files.
    df_gold = df_gold[df_gold['post_id'].isin(df_pred['post_id'])]
    df_pred = df_pred[df_pred['post_id'].isin(df_gold['post_id'])]

    # Sort into new frames rather than in place: the filtered frames are
    # slices of the loaded data, and in-place mutation of a slice is fragile.
    # NOTE(review): positional alignment assumes post_id is unique in each
    # file -- confirm.
    df_gold = df_gold.sort_values("post_id", axis=0, ascending=True)
    df_pred = df_pred.sort_values("post_id", axis=0, ascending=True)

    labels_gold = df_gold['class']
    predictions = df_pred['class']

    f1_macro = f1_score(labels_gold, predictions, average = "macro")
    accuracy = accuracy_score(labels_gold, predictions)
    precision = precision_score(labels_gold, predictions, average = "macro")
    recall = recall_score(labels_gold, predictions, average = "macro")

    # Only the text report is needed here; the dict variant and the per-class
    # export string were computed before but never used, so they are gone.
    clf_report = classification_report(labels_gold, predictions, labels = label_selector, target_names = label_selector, digits=4)

    report = f"""F1-macro: {str(round(f1_macro, 4))} accuracy: {str(round(accuracy, 4))} precision: {str(round(precision, 4))} recall: {str(round(recall, 4))} \n\n{clf_report}"""

    print(strategy + ": " + report)

    confusion_matrix = metrics.confusion_matrix(labels_gold, predictions, labels = label_selector)
    cm_display = metrics.ConfusionMatrixDisplay(confusion_matrix = confusion_matrix, display_labels = label_selector)
    cm_display.plot(cmap=plt.cm.Blues)
    plt.show()

    return pd.DataFrame([{'Strategy' : strategy, "F1-macro" : f1_macro, 'Precision' : precision, 'Recall' : recall, 'Accuracy' : accuracy}])

    

### Baseline

In [None]:
# Score the baseline predictions of the selected model on stage 1.
result_file_stg1 = result_folder + model_name + "_result_baseline_stg1.csv"
df_baseline_stg1 = evaluate_stage_1(
    strategy='Baseline',
    gold_file=test_file_stg1,
    pred_file=result_file_stg1,
)

### Prompt Tree of Thoughts

In [None]:
# Score the Tree-of-Thoughts prompt predictions on stage 1.
result_file_stg1 = result_folder + model_name + "_result_prompt_tot_stg1.csv"
df_prompt_tot_stg1 = evaluate_stage_1(
    strategy="ToT Prompt",
    gold_file=test_file_stg1,
    pred_file=result_file_stg1,
)

### Graph Tree of Thoughts

In [None]:
# Score the Tree-of-Thoughts graph predictions on stage 1.
result_file_stg1 = result_folder + model_name + "_result_graph_stg1.csv"
df_graph_stg1 = evaluate_stage_1(
    strategy="ToT Graph",
    gold_file=test_file_stg1,
    pred_file=result_file_stg1,
)

### Optimizer MIPRO

In [None]:
# Score the MIPRO-optimized predictions on stage 1.
result_file_stg1 = result_folder + model_name + "_result_optimizer_stg1.csv"
df_optim_stg1 = evaluate_stage_1(
    strategy="Optimization Mipro",
    gold_file=test_file_stg1,
    pred_file=result_file_stg1,
)

## Results Stage 1

In [None]:
# Stack the one-row score tables of the four strategies into a single table.
df_result_stg1 = pd.concat([df_baseline_stg1, df_prompt_tot_stg1, df_graph_stg1, df_optim_stg1])

In [None]:
# Save the stage 1 scores next to the result files (row index not exported).
df_result_stg1.to_excel(result_folder + model_name + '_evaluation_stg1.xlsx', index = False)

In [None]:
# Read the saved stage 1 scores back from disk.
df_result_stg1 = pd.read_excel(result_folder + model_name + '_evaluation_stg1.xlsx')

In [None]:
# Display the stage 1 score table.
df_result_stg1.head()

### Qualitative analysis (MIPRO results)

In [None]:
# Load the stage 1 gold annotations and the MIPRO-optimizer predictions for
# the qualitative analysis below.
df_gold_stg1 = pd.read_csv(data_folder + "implicit_hate_test_stg1.csv", encoding = 'utf-8')
df_pred_stg1 = pd.read_csv(result_folder + model_name + "_result_optimizer_stg1.csv", encoding = 'utf-8')

#### Confidence

In [None]:
# Mean confidence over all stage 1 predictions.
print(df_pred_stg1['confidence'].mean())

In [None]:
# Rename the predicted label column so it does not clash with the gold
# 'class' column when the two frames are merged.
df_pred_stg1.rename(columns={'class': 'class_pred'}, inplace=True)

In [None]:
# Preview the renamed prediction table.
df_pred_stg1.head()

In [None]:
# Join gold annotations and predictions on post_id (pandas default: inner join).
df_merge_stg1 = pd.merge(df_gold_stg1, df_pred_stg1, on = 'post_id')

In [None]:
# Preview the merged gold/prediction table.
df_merge_stg1.head()

#### Confidence of correct predictions

In [None]:
# Mean confidence over the posts whose predicted class equals the gold class.
print(df_merge_stg1.loc[df_merge_stg1['class'] == df_merge_stg1['class_pred'], 'confidence'].mean())

#### Confidence of wrong predictions

In [None]:
# Mean confidence over the posts whose predicted class differs from the gold class.
print(df_merge_stg1.loc[df_merge_stg1['class'] != df_merge_stg1['class_pred'], 'confidence'].mean())

#### Examples of misclassified not hate posts with explanation

In [None]:
# False positives: gold label 'not_hate' but predicted 'implicit_hate'.
df_fp = df_merge_stg1.loc[
    (df_merge_stg1['class'] == 'not_hate')
    & (df_merge_stg1['class_pred'] == 'implicit_hate')
]

In [None]:
# Print up to 10 randomly chosen false positives with the model's confidence
# and explanation. min() keeps sample() from raising ValueError when fewer
# than 10 false positives exist.
for idx, row in df_fp.sample(min(10, len(df_fp))).iterrows():
    print(f"\nPost: {row['post']}")
    print(f"Confidence: {row['confidence']}")
    # f-string formatting also copes with a missing (NaN) explanation, which
    # would make plain string concatenation raise TypeError.
    print(f"Explanation: {row['explanation']}")

#### Was deepening used?

In [None]:
from pydantic import BaseModel, Field
from typing import Optional, TypedDict, Literal

class HateClassification(BaseModel):
    """Classification result for a single post with the two stage 1 labels.

    The saved response files store the ``repr`` of such objects and the
    notebook rebuilds them with ``eval``, so the class has to be defined in
    this scope under exactly this name.
    """
    # Predicted label, restricted to the two stage 1 classes.
    hate_class: Literal['implicit_hate','not_hate']
    interpretations : Optional[str]
    # Free-text justification of the label (presumably written by the model -- TODO confirm).
    explanation : Optional[str]
    confidence : Optional[float]
    # Presumably the depth reached by the "deepening" step; defaults to 0.
    recursion_level : int = 0

In [None]:
# Collect post, explanation and recursion level from every saved response file.
files = glob.glob(result_folder + '/responses/*.json')

rows = []
for filename in files:
    try:
        with open(filename, 'r', encoding='utf-8') as f:
            obj = json.load(f)
        # SECURITY: eval() executes whatever text is stored under 'repr'.
        # This is only acceptable because the response files are produced by
        # this project; never run this cell on files from an untrusted source.
        hate = eval(obj['hate_class']['repr'])
        rows.append({
            'post': obj['post'],
            'explanation': hate.explanation,
            'recursion_level': hate.recursion_level,
        })
    except Exception as exc:
        # Best effort: report the unreadable file together with the reason and
        # carry on. Unlike a bare `except:`, this lets KeyboardInterrupt through.
        print(f"{filename}: {exc!r}")

# Build the frame once instead of concatenating inside the loop (which copies
# all previous rows on every iteration).
df_resp = pd.DataFrame(rows, columns=['post', 'explanation', 'recursion_level'])


#### Deepening was never used :(

In [None]:
# Responses whose recursion level is greater than 1.
# NOTE(review): HateClassification.recursion_level defaults to 0, so `> 1`
# also hides level-1 responses -- confirm that 1 is not already "deepened".
df_resp[df_resp['recursion_level'] > 1]

# Evaluation Stage 2

In [None]:
# Gold annotations for stage 2 (passed as gold_file to evaluate_stage_2).
test_file_stg2 = data_folder + "implicit_hate_test_stg2.csv"

In [None]:
def evaluate_stage_2(strategy, gold_file, pred_file):
    """Evaluate stage 2 (implicit hate category) predictions against the gold labels.

    The two CSV files are reduced to the ``post_id`` values they have in
    common and ordered by ``post_id``; their ``implicit_class`` columns are
    then compared position by position. The function prints the macro scores,
    the classification report and a tab-separated line of per-class F1
    values, and displays an annotated confusion matrix.

    Args:
        strategy: Name of the evaluated strategy; used in the printed report
            and in the returned row.
        gold_file: Path of the CSV file with the gold annotations.
        pred_file: Path of the CSV file with the predictions.

    Returns:
        A one-row ``pandas.DataFrame`` with the columns ``Strategy``,
        ``F1-macro``, ``Precision``, ``Recall`` and ``Accuracy``.
    """
    class_names = ['incitement', 'white_grievance', 'inferiority', 'stereotypical', 'irony', 'threatening', 'other']

    gold = pd.read_csv(gold_file, encoding = "utf-8")
    pred = pd.read_csv(pred_file, encoding = "utf-8")

    # Restrict both tables to the shared posts, then line them up by post_id.
    gold = gold[gold['post_id'].isin(pred['post_id'])]
    pred = pred[pred['post_id'].isin(gold['post_id'])]
    labels_gold = gold.sort_values("post_id", axis=0, ascending=True)['implicit_class']
    predictions = pred.sort_values("post_id", axis=0, ascending=True)['implicit_class']

    # Headline scores.
    f1_macro = f1_score(labels_gold, predictions, average = "macro")
    accuracy = accuracy_score(labels_gold, predictions)
    precision = precision_score(labels_gold, predictions, average = "macro")
    recall = recall_score(labels_gold, predictions, average = "macro")

    # Per-class report: once as text for printing, once as a dict for the
    # tab-separated F1 export line.
    report_options = dict(labels = class_names, target_names = class_names, digits=4)
    clf_report = classification_report(labels_gold, predictions, **report_options)
    report_table = pd.DataFrame(
        classification_report(labels_gold, predictions, output_dict=True, **report_options)
    )
    per_class_f1 = report_table.loc["f1-score", class_names].T.values
    values_export = "\t".join(str(round(score, 4)) for score in per_class_f1)

    report = f"""F1-macro: {str(round(f1_macro, 4))} accuracy: {str(round(accuracy, 4))} precision: {str(round(precision, 4))} recall: {str(round(recall, 4))} \n\n{clf_report}
    """
    print(strategy + ": " + report)
    print(values_export)
    print("\n\n")

    # Confusion matrix drawn as a heat map with the raw counts written in the cells.
    cm = metrics.confusion_matrix(labels_gold, predictions, labels = class_names)
    n_true, n_predicted = cm.shape

    fig, ax = plt.subplots()
    heatmap = ax.imshow(cm, interpolation='nearest', cmap=plt.cm.Blues)
    ax.figure.colorbar(heatmap, ax=ax)
    ax.set(
        xticks=np.arange(n_predicted),
        yticks=np.arange(n_true),
        xticklabels=class_names,
        yticklabels=class_names,
        title='Implicit hate classification',
        ylabel='True label',
        xlabel='Predicted label',
    )
    plt.setp(ax.get_xticklabels(), rotation=45, ha="right", rotation_mode="anchor")

    # Light text on dark cells, dark text on light cells.
    half_of_max = cm.max() / 2.
    for (i, j), count in np.ndenumerate(cm):
        text_color = "white" if count > half_of_max else "#274c81"
        ax.text(j, i, format(count, '.0f'), ha="center", va="center", color=text_color)
    plt.show()

    scores = {'Strategy' : strategy, "F1-macro" : f1_macro, 'Precision' : precision, 'Recall' : recall, 'Accuracy' : accuracy}
    return pd.DataFrame([scores])


### Baseline

In [None]:
# Score the baseline predictions of the selected model on stage 2.
result_file_stg2 = result_folder + model_name + "_result_baseline_stg2.csv"
df_baseline_stg2 = evaluate_stage_2(
    strategy="Baseline",
    gold_file=test_file_stg2,
    pred_file=result_file_stg2,
)

### Prompt Tree of Thoughts

In [None]:
# Score the Tree-of-Thoughts prompt predictions on stage 2.
result_file_stg2 = result_folder + model_name + "_result_prompt_tot_stg2.csv"
df_prompt_tot_stg2 = evaluate_stage_2(
    strategy="Prompt ToT",
    gold_file=test_file_stg2,
    pred_file=result_file_stg2,
)

### Graph Tree of Thoughts

In [None]:
# Score the Tree-of-Thoughts graph predictions on stage 2.
result_file_stg2 = result_folder + model_name + "_result_graph_stg2.csv"
df_graph_stg2 = evaluate_stage_2(
    strategy="Graph ToT",
    gold_file=test_file_stg2,
    pred_file=result_file_stg2,
)

### Optimizer MIPRO

In [None]:
# Score the MIPRO-optimized predictions on stage 2.
result_file_stg2 = result_folder + model_name + "_result_optimizer_stg2.csv"
df_optim_stg2 = evaluate_stage_2(
    strategy="Optimizer",
    gold_file=test_file_stg2,
    pred_file=result_file_stg2,
)

## Final results Stage 2

In [None]:
# Stack the one-row score tables of the four strategies into a single table.
df_result_stg2 = pd.concat([df_baseline_stg2, df_prompt_tot_stg2, df_graph_stg2, df_optim_stg2])

In [None]:
# Save the stage 2 scores next to the result files (row index not exported).
df_result_stg2.to_excel(result_folder +  model_name + '_evaluation_stg2.xlsx', index = False)

In [None]:
# Read the saved stage 2 scores back from disk.
df_result_stg2 = pd.read_excel(result_folder + model_name + '_evaluation_stg2.xlsx')

In [None]:
# Display the stage 2 score table.
df_result_stg2.head()

### Qualitative analysis (results of MIPRO optimizer)

In [None]:
# Load the stage 2 gold annotations and the MIPRO-optimizer predictions for
# the qualitative analysis below.
df_gold_stg2 = pd.read_csv(data_folder + "implicit_hate_test_stg2.csv", encoding = 'utf-8')
df_pred_stg2 = pd.read_csv(result_folder + model_name + "_result_optimizer_stg2.csv", encoding = 'utf-8')

#### Overall confidence

In [None]:
# Mean confidence over all stage 2 predictions.
print(df_pred_stg2['confidence'].mean())

In [None]:
# Rename the predicted label column so it does not clash with the gold
# 'implicit_class' column when the two frames are merged.
df_pred_stg2.rename(columns={'implicit_class': 'implicit_class_pred'}, inplace=True)

In [None]:
# Preview the renamed prediction table.
df_pred_stg2.head()

In [None]:
# Join gold annotations and predictions on post_id (pandas default: inner join).
df_merge_stg2 = pd.merge(df_gold_stg2, df_pred_stg2, on = 'post_id')

In [None]:
# Preview the merged gold/prediction table.
df_merge_stg2.head()

#### Confidence of correct predictions

In [None]:
# Mean confidence over the posts whose predicted category equals the gold one.
# The prediction lives in 'implicit_class_pred'; comparing 'implicit_class'
# with itself would select every row that has a gold label.
print(df_merge_stg2[df_merge_stg2['implicit_class'] == df_merge_stg2['implicit_class_pred']]['confidence'].mean())

#### Confidence of wrong predictions

In [None]:
# Mean confidence over the posts whose predicted category differs from the gold one.
print(df_merge_stg2.loc[df_merge_stg2['implicit_class'] != df_merge_stg2['implicit_class_pred'], 'confidence'].mean())

#### Examples of misclassified posts

In [None]:
# Misclassified posts: predicted category differs from the gold category.
df_errors = df_merge_stg2.loc[
    df_merge_stg2['implicit_class'] != df_merge_stg2['implicit_class_pred']
]

In [None]:
# Print up to 10 randomly chosen misclassified posts with confidence, gold and
# predicted category, and the explanation. min() keeps sample() from raising
# ValueError when fewer than 10 errors exist.
for idx, row in df_errors.sample(min(10, len(df_errors))).iterrows():
    print(f"\nPost: {row['post']}")
    # f-string formatting also copes with missing (NaN) values, which would
    # make plain string concatenation raise TypeError.
    print(f"Confidence: {row['confidence']} - Gold class: {row['implicit_class']} - Pred class: {row['implicit_class_pred']}")
    print(f"Explanation: {row['explanation']}")

# Evaluation Stage 3

[BLEURT installation](https://github.com/google-research/bleurt)


In [None]:
# Initialize BLEURT Scorer
checkpoint =  "../bleurt/bleurt/BLEURT-20"
bleurt_scorer = bleurt_score.BleurtScorer(checkpoint)

In [None]:
# Stage 3 test file from the data folder. Despite its name, this variable is
# passed as gold_file to the similarity functions below.
result_file_stg3 = data_folder + 'implicit_hate_test_stg3.csv'

### Text similarity metrics

In [None]:
def compute_similarity_metrics(generated, reference, bleurt_scorer, bert_lang='en'):
    """Score a generated text against one reference with several similarity metrics.

    Args:
        generated: Candidate text (string) to be scored.
        reference: Gold text (string) the candidate is compared with.
        bleurt_scorer: Object whose ``score(references=..., candidates=...)``
            method returns the BLEURT scores.
        bert_lang: Language code forwarded to ``bert_score.score``.

    Returns:
        A dict with the keys ``BLEU-1``, ``BLEU-2``, ``BLEU``, ``ROUGE-1``,
        ``ROUGE-2``, ``ROUGE-L``, ``METEOR``, ``NIST``, ``BERTScore`` and
        ``BLEURTScore``.
    """
    # Token-based metrics work on lower-cased NLTK tokens.
    generated_tokens = nltk.word_tokenize(generated.lower())
    reference_tokens = nltk.word_tokenize(reference.lower())

    # Compute BLEU Score

    # Smoothing method1: adds 1 to the numerator and denominator for higher-order n-grams with no matches, akin to additive smoothing.
    # Works well for very short sentences or cases with few matches.
    smoothing = SmoothingFunction().method1

    bleu = sentence_bleu([reference_tokens], generated_tokens, smoothing_function = smoothing)
    bleu_1 = sentence_bleu([reference_tokens], generated_tokens, weights=(1, 0, 0, 0), smoothing_function = smoothing)
    bleu_2 = sentence_bleu([reference_tokens], generated_tokens, weights=(0.5, 0.5, 0, 0), smoothing_function = smoothing) # Cumulative

    # Compute ROUGE Score (F1) on the raw (untokenized) strings
    rouge_scorer_obj = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    rouge_scores = rouge_scorer_obj.score(reference, generated)
    rouge_f1 = {
        'ROUGE-1': rouge_scores['rouge1'].fmeasure,
        'ROUGE-2': rouge_scores['rouge2'].fmeasure,
        'ROUGE-L': rouge_scores['rougeL'].fmeasure
    }

    # Compute METEOR Score
    meteor = meteor_score([reference_tokens], generated_tokens)

    # Compute NIST Score; the n-gram order is capped at 2 and at the length of
    # the shorter text so that single-token texts can still be scored.
    n = min(len(reference_tokens), len(generated_tokens), 2)
    nist = sentence_nist([reference_tokens], generated_tokens, n = n)

    # Compute BERTScore (F1); precision and recall are not used.
    # NOTE(review): bert_score.score is invoked once per text pair, which
    # presumably reloads the model on every call -- confirm, and consider a
    # reusable scorer if this is too slow.
    _, _, F1 = bert_score.score([generated], [reference], lang=bert_lang, verbose=False)
    bert_f1 = F1.mean().item()

    # Compute BLEURT score
    bleurt_scores = bleurt_scorer.score(references = [reference], candidates = [generated])
    if isinstance(bleurt_scores, list) and len(bleurt_scores) == 1:
        bleurt = bleurt_scores[0]
    else:
        # Fall back to 0 when the scorer does not return exactly one score.
        # The bare expression `0` that used to stand here assigned nothing,
        # so the return statement failed with UnboundLocalError.
        bleurt = 0

    # Combine scores
    return {
        'BLEU-1' : bleu_1,
        'BLEU-2' : bleu_2,
        'BLEU': bleu,
        **rouge_f1,
        'METEOR' : meteor,
        'NIST' : nist,
        'BERTScore': bert_f1,
        'BLEURTScore' : bleurt
    }
    

## Target similarity

In [None]:
def compute_target_similarity(strategy, gold_file, pred_file, bleurt_scorer, debug_mode = False):
    """Average text-similarity scores between predicted and gold targets.

    Every (gold, predicted) pair that shares a ``post_id`` is scored with
    ``compute_similarity_metrics``. For each post the best value of every
    metric is kept (maximum over the rows of that post), and these per-post
    values are averaged over all posts.

    Args:
        strategy: Name of the evaluated strategy; stored in the ``Strategy``
            column of the result.
        gold_file: Path of the CSV file with the gold annotations
            (``post_id``, ``post`` and ``target`` columns are used).
        pred_file: Path of the CSV file with the predictions
            (``post_id`` and ``target`` columns are used).
        bleurt_scorer: BLEURT scorer forwarded to ``compute_similarity_metrics``.
        debug_mode: When true, only a random sample of at most 2 predictions
            is scored.

    Returns:
        A one-row ``pandas.DataFrame`` with a ``Strategy`` column followed by
        one column per metric.
    """
    df_gold = pd.read_csv(gold_file, encoding="utf-8")
    df_pred = pd.read_csv(pred_file, encoding="utf-8")

    # Predictions without a target cannot be scored.
    # NOTE(review): gold targets are not filtered; a missing gold target would
    # fail inside compute_similarity_metrics -- confirm the gold file has none.
    df_pred = df_pred[df_pred['target'].notna()]

    if debug_mode:
        # min() keeps sample() from raising when fewer than 2 predictions exist.
        df_pred = df_pred.sample(min(2, len(df_pred)))

    # Work on renamed copies instead of mutating filtered slices in place.
    df_gold = df_gold.rename(columns={'target': 'target_gold'})
    df_pred = df_pred.rename(columns={'target': 'target_pred'})

    df_gold = df_gold[df_gold['post_id'].isin(df_pred['post_id'])]
    df_pred = df_pred[df_pred['post_id'].isin(df_gold['post_id'])]

    df_gold = df_gold.sort_values("post_id", axis=0, ascending=True)
    df_pred = df_pred.sort_values("post_id", axis=0, ascending=True)

    df_merge = pd.merge(df_gold, df_pred, on = "post_id")
    df_target = df_merge[['post_id', 'post', 'target_gold', 'target_pred']]

    # Compute metrics
    similarity_metrics = df_target.apply(lambda row: compute_similarity_metrics(row['target_pred'], row['target_gold'], bleurt_scorer), axis=1)

    # Flatten the metrics into separate columns
    metrics_df = pd.DataFrame(similarity_metrics.tolist())

    # Combine the original DataFrame with the metrics (both carry the default
    # 0..n-1 index, so the rows line up) and keep the best score per post.
    df_target_result = pd.concat([df_target, metrics_df], axis=1)
    df_target_result_max = df_target_result.groupby('post_id')[metrics_df.columns.tolist()].max()

    # Average over ALL posts. A `[1: ]` slice used here previously selected
    # rows, not columns, and silently dropped the first post from the mean.
    df_result = df_target_result_max.mean().to_frame().T
    df_result.insert(loc = 0, column = 'Strategy', value = strategy)

    return df_result

### Baseline

In [None]:
# Target similarity of the baseline predictions (result_file_stg3 is the gold file).
target_similiarty_baseline = compute_target_similarity(
    strategy='Baseline',
    gold_file=result_file_stg3,
    pred_file=result_folder + model_name + '_result_baseline_stg3.csv',
    bleurt_scorer=bleurt_scorer,
    debug_mode=debug_mode,
)

### Prompt Tree of Thoughts

In [None]:
# Target similarity of the Tree-of-Thoughts prompt predictions.
target_similiarty_prompt_tot = compute_target_similarity(
    strategy='Prompt ToT',
    gold_file=result_file_stg3,
    pred_file=result_folder + model_name + '_result_prompt_tot_stg3.csv',
    bleurt_scorer=bleurt_scorer,
    debug_mode=debug_mode,
)

### Graph Tree of Thoughts

In [None]:
# Target similarity of the Tree-of-Thoughts graph predictions.
target_similiarty_graph = compute_target_similarity(
    strategy='Graph ToT',
    gold_file=result_file_stg3,
    pred_file=result_folder + model_name + '_result_graph_stg3.csv',
    bleurt_scorer=bleurt_scorer,
    debug_mode=debug_mode,
)

### Optimizer MIPRO

In [None]:
# Target similarity of the MIPRO-optimized predictions.
target_similiarty_optimized = compute_target_similarity(
    strategy='Optimized',
    gold_file=result_file_stg3,
    pred_file=result_folder + model_name + '_result_optimizer_stg3.csv',
    bleurt_scorer=bleurt_scorer,
    debug_mode=debug_mode,
)

## Results Target similarity

In [None]:
# Stack the one-row target-similarity tables of the four strategies.
target_similarity_result = pd.concat(
    [target_similiarty_baseline, 
     target_similiarty_prompt_tot, 
     target_similiarty_graph, 
     target_similiarty_optimized]
)

In [None]:
# Save the target-similarity table only for full runs; debug runs score just
# a small sample and must not overwrite the real results.
if not debug_mode:
    target_similarity_result.to_excel(result_folder + model_name + '_evaluation_stg3_target.xlsx', index = False)

In [None]:
# Read the saved target-similarity table back from disk.
# NOTE(review): this also runs when debug_mode is True. The export is skipped
# in that case, so the in-memory debug result is replaced by whatever file an
# earlier full run left behind (or the read fails if none exists) -- confirm
# that this is intended.
target_similarity_result = pd.read_excel(result_folder + model_name + '_evaluation_stg3_target.xlsx')

In [None]:
# Display the target-similarity table.
target_similarity_result.head()

## Meaning similarity

In [None]:
def compute_meaning_similarity(strategy, gold_file, pred_file, bleurt_scorer, debug_mode = False):
    """Average text-similarity scores between predicted and gold implied statements.

    Every (gold, predicted) pair that shares a ``post_id`` is scored with
    ``compute_similarity_metrics``. For each post the best value of every
    metric is kept (maximum over the rows of that post), and these per-post
    values are averaged over all posts.

    Args:
        strategy: Name of the evaluated strategy; stored in the ``Strategy``
            column of the result.
        gold_file: Path of the CSV file with the gold annotations
            (``post_id``, ``post`` and ``implied_statement`` columns are used).
        pred_file: Path of the CSV file with the predictions
            (``post_id`` and ``implied_statement`` columns are used).
        bleurt_scorer: BLEURT scorer forwarded to ``compute_similarity_metrics``.
        debug_mode: When true, only a random sample of at most 2 predictions
            is scored.

    Returns:
        A one-row ``pandas.DataFrame`` with a ``Strategy`` column followed by
        one column per metric.
    """
    df_gold = pd.read_csv(gold_file, encoding="utf-8")
    df_pred = pd.read_csv(pred_file, encoding="utf-8")

    # Predictions without an implied statement cannot be scored.
    # NOTE(review): gold statements are not filtered; a missing gold statement
    # would fail inside compute_similarity_metrics -- confirm the gold file
    # has none.
    df_pred = df_pred[df_pred['implied_statement'].notna()]

    if debug_mode:
        # min() keeps sample() from raising when fewer than 2 predictions exist.
        df_pred = df_pred.sample(min(2, len(df_pred)))

    # Work on renamed copies instead of mutating filtered slices in place.
    df_gold = df_gold.rename(columns={'implied_statement': 'implied_statement_gold'})
    df_pred = df_pred.rename(columns={'implied_statement': 'implied_statement_pred'})

    df_gold = df_gold[df_gold['post_id'].isin(df_pred['post_id'])]
    df_pred = df_pred[df_pred['post_id'].isin(df_gold['post_id'])]

    df_gold = df_gold.sort_values("post_id", axis=0, ascending=True)
    df_pred = df_pred.sort_values("post_id", axis=0, ascending=True)

    df_merge = pd.merge(df_gold, df_pred, on = "post_id")
    df_meaning = df_merge[['post_id', 'post', 'implied_statement_gold', 'implied_statement_pred']]

    # Compute metrics
    similarity_metrics = df_meaning.apply(lambda row: compute_similarity_metrics(row['implied_statement_pred'], row['implied_statement_gold'], bleurt_scorer), axis=1)

    # Flatten the metrics into separate columns
    metrics_df = pd.DataFrame(similarity_metrics.tolist())

    # Combine the original DataFrame with the metrics (both carry the default
    # 0..n-1 index, so the rows line up) and keep the best score per post.
    df_meaning_result = pd.concat([df_meaning, metrics_df], axis=1)
    df_meaning_result_max = df_meaning_result.groupby('post_id')[metrics_df.columns.tolist()].max()

    # Average over ALL posts. A `[1: ]` slice used here previously selected
    # rows, not columns, and silently dropped the first post from the mean.
    df_result = df_meaning_result_max.mean().to_frame().T
    df_result.insert(loc = 0, column = 'Strategy', value = strategy)

    return df_result
    

### Baseline

In [None]:
# Meaning similarity of the baseline predictions (result_file_stg3 is the gold file).
meaning_similiarty_baseline = compute_meaning_similarity(
    strategy='Baseline',
    gold_file=result_file_stg3,
    pred_file=result_folder + model_name + '_result_baseline_stg3.csv',
    bleurt_scorer=bleurt_scorer,
    debug_mode=debug_mode,
)

### Prompt Tree of Thoughts

In [None]:
# Meaning similarity of the Tree-of-Thoughts prompt predictions.
meaning_similiarty_prompt_tot = compute_meaning_similarity(
    strategy='Prompt ToT',
    gold_file=result_file_stg3,
    pred_file=result_folder + model_name + '_result_prompt_tot_stg3.csv',
    bleurt_scorer=bleurt_scorer,
    debug_mode=debug_mode,
)

### Graph Tree of Thoughts

In [None]:
# Meaning similarity of the Tree-of-Thoughts graph predictions.
meaning_similiarty_graph = compute_meaning_similarity(
    strategy='Graph ToT',
    gold_file=result_file_stg3,
    pred_file=result_folder + model_name + '_result_graph_stg3.csv',
    bleurt_scorer=bleurt_scorer,
    debug_mode=debug_mode,
)

### Optimizer MIPRO

In [None]:
# Meaning similarity of the MIPRO-optimized predictions.
meaning_similiarty_optimized = compute_meaning_similarity(
    strategy='Optimized',
    gold_file=result_file_stg3,
    pred_file=result_folder + model_name + '_result_optimizer_stg3.csv',
    bleurt_scorer=bleurt_scorer,
    debug_mode=debug_mode,
)

## Results Meaning similarity

In [None]:
# Stack the one-row meaning-similarity tables of the four strategies.
meaning_similarity_result = pd.concat(
    [meaning_similiarty_baseline,
     meaning_similiarty_prompt_tot, 
     meaning_similiarty_graph, 
     meaning_similiarty_optimized]
)

In [None]:
# Save the meaning-similarity table only for full runs; debug runs score just
# a small sample and must not overwrite the real results.
if not debug_mode:
    meaning_similarity_result.to_excel(result_folder + model_name + '_evaluation_stg3_meaning.xlsx', index = False)

In [None]:
# Read the saved meaning-similarity table back from disk.
# NOTE(review): this also runs when debug_mode is True. The export is skipped
# in that case, so the in-memory debug result is replaced by whatever file an
# earlier full run left behind (or the read fails if none exists) -- confirm
# that this is intended.
meaning_similarity_result = pd.read_excel(result_folder + model_name + '_evaluation_stg3_meaning.xlsx')

In [None]:
# Display the meaning-similarity table rounded to three decimals.
meaning_similarity_result.round(3).head()

### Qualitative analysis (target, MIPRO results)

In [None]:
# Load the stage 3 gold annotations and the MIPRO-optimizer predictions from
# the same folders the stage 3 evaluation reads them from, so the qualitative
# analysis looks at exactly the files that were scored. The previously
# hard-coded './implicit-hate-data/' and './implicit-hate-results/' folders
# did not match data_folder / result_folder.
df_gold_stg3 = pd.read_csv(data_folder + "implicit_hate_test_stg3.csv", encoding = 'utf-8')
df_pred_stg3 = pd.read_csv(result_folder + model_name + "_result_optimizer_stg3.csv", encoding = 'utf-8')

In [None]:
# Lower-case gold and predicted targets so that grouping and comparison
# ignore differences in capitalization.
df_gold_stg3['target'] = df_gold_stg3['target'].str.lower()
df_pred_stg3['target'] = df_pred_stg3['target'].str.lower()

In [None]:
# Horizontal bar chart of the 15 most frequent gold targets, expressed as a
# percentage of all gold rows.
p = (
    df_gold_stg3
    .groupby(['target'])['post_id']
    .count()
    .div(len(df_gold_stg3))
    .multiply(100)
    .sort_values()
    .tail(15)
    .plot(
        kind='barh',
        xlabel='Number of posts (%)',
        ylabel='Target',
        title='Most frequent targets \n',
    )
)

In [None]:
# Horizontal bar chart of the 15 most frequently predicted targets, expressed
# as a percentage of all prediction rows.
p = (
    df_pred_stg3
    .groupby(['target'])['post_id']
    .count()
    .div(len(df_pred_stg3))
    .multiply(100)
    .sort_values()
    .tail(15)
    .plot(
        kind='barh',
        xlabel='Number of posts (%)',
        ylabel='Target',
        title='Most predicted targets \n',
    )
)

#### Immigrants

In [None]:
# Gold rows whose (lower-cased) target is exactly 'immigrants'.
df_immigrants = df_gold_stg3.loc[df_gold_stg3['target'] == 'immigrants']

In [None]:
# Predictions for the posts whose gold target is 'immigrants' ...
df_pred_immigrants = df_pred_stg3.loc[df_pred_stg3['post_id'].isin(df_immigrants['post_id'])]
# ... keeping only those where the model predicted a different target.
df_pred_immigrants = df_pred_immigrants.loc[df_pred_immigrants['target'] != 'immigrants']

In [None]:
# Horizontal bar chart of the 10 targets most often predicted in place of
# 'immigrants' (absolute number of rows).
p = (
    df_pred_immigrants
    .groupby(['target'])['post_id']
    .count()
    .sort_values()
    .tail(10)
    .plot(
        kind='barh',
        xlabel='Number of posts',
        ylabel='Target',
        title='Most frequent targets \n',
    )
)

In [None]:
# Show up to 20 random predictions that missed the gold target 'immigrants'.
# df_pred_immigrants already excludes rows predicted as 'immigrants', so no
# further filtering is needed; min() keeps sample() from raising ValueError
# when fewer than 20 rows are available.
df_pred_immigrants.sample(min(20, len(df_pred_immigrants)))