In [1]:
import numpy as np
import joblib
from sklearn import metrics
from sklearn.linear_model import LogisticRegression

import pandas as pd 
import os

# Prepare data


## groud truth of validation set and testing seting set

In [2]:
train = pd.read_json('../../data/hateful_memes/train.jsonl', lines=True)
train['label'].value_counts()

0    5481
1    3019
Name: label, dtype: int64

In [3]:
dev_unseen = pd.read_json('../../data/hateful_memes/dev_unseen.jsonl', lines=True)
dev_seen = pd.read_json('../../data/hateful_memes/dev_seen.jsonl', lines=True)

test_unseen = pd.read_json('../../data/hateful_memes/test_unseen.jsonl', lines=True)
test_seen = pd.read_json('../../data/hateful_memes/test_seen.jsonl', lines=True)

print([dev_unseen.shape, dev_seen.shape, test_unseen.shape])

[(540, 4), (500, 4), (2000, 4)]


## Prepare training and validation data for Ensemble

### Negative Tag
Find negative tags in the text of the memes.

In [4]:
dev_unseen_tag = pd.read_csv('../keyword_search/dev_unseen.csv')
dev_seen_tag = pd.read_csv('../keyword_search/dev_seen.csv') 
test_unseen_tag = pd.read_csv('../keyword_search/test_unseen.csv')

dev_full_tag = dev_seen_tag.append(dev_unseen_tag)
print(dev_full_tag.shape)

# remove duplicates
dev_full_tag = dev_full_tag.drop_duplicates(['id'])
print(dev_full_tag.shape) 

dev_full_tag.loc[:, 'profanity':'religion'] = np.where(dev_full_tag.loc[:, 'profanity':'religion'].isnull(), 0, 1)
#print(dev_full_tag.head())

test_unseen_tag.loc[:, 'profanity':'religion'] = np.where(test_unseen_tag.loc[:, 'profanity':'religion'].isnull(), 0, 1)
# hateful speech
dev_unseen_htspch = pd.read_csv('../model-result/dev_unseen_concat.csv')
dev_seen_htspch = pd.read_csv('../model-result/dev_seen_concat.csv')
test_unseen_htsptch = pd.read_csv('../model-result/test_unseen_concat.csv')
#print(dev_unseen_htspch.head())
dev_full_htspch = dev_seen_htspch.append(dev_unseen_htspch).drop_duplicates(['id']).loc[:, ['id', 'hatespeech_prob']]
print(dev_full_htspch.shape)

test_unseen_htsptch = test_unseen_htsptch.loc[:, ['id', 'hatespeech_prob']]

(1040, 12)
(640, 12)
(640, 2)


## Prediction results from diffferent model
Combine prediction with the actual data, joining by id

In [5]:
models = ['vilbert', 'visualbert', 'visualbert_coco', 'detectron_VisualBert_cc_10pct', 
'detectron_VisualBert_cc_50pct', 'detectron_VisualBert_coco', 'detectron_VisualBert_coco_RoBERTa', 'detectron_VisualBert_coco_focalLoss','detectron_VisualBert_cc_50pct_focalLoss','uniter_36feats', 'uniter_50feats']

dev_seen_all = dev_seen.copy()
dev_unseen_all = dev_unseen.copy()

test_seen_all = test_seen.copy()
test_unseen_all = test_unseen.copy()

for model in models:

    # dev_unseen
    pred_dev_unseen = pd.read_csv('../model-result/'+model+'_dev_unseen'+'.csv')
    if(model in ['uniter_36feats', 'uniter_50feats']):
       pred_dev_unseen['proba']=np.exp(pred_dev_unseen['proba']) / (1 +np.exp(pred_dev_unseen['proba']))
    pred_dev_unseen = pred_dev_unseen.rename(columns={'proba':model+'_proba', 'label':model+'_label'})
    dev_unseen_all = dev_unseen_all.merge(pred_dev_unseen, on = 'id', how = 'inner')
   
    # dev_seen
    pred_dev_seen = pd.read_csv('../model-result/'+model+'_dev_seen'+'.csv')
    if(model in ['uniter_36feats', 'uniter_50feats']):
       pred_dev_seen['proba']=np.exp(pred_dev_seen['proba']) / (1 +np.exp(pred_dev_seen['proba']))
    pred_dev_seen = pred_dev_seen.rename(columns={'proba':model+'_proba', 'label':model+'_label'})
    dev_seen_all = dev_seen_all.merge(pred_dev_seen, on = 'id', how = 'inner') 
    #print(pred_dev_seen.shape)

    # test_unseen
    pred_test_unseen = pd.read_csv('../model-result/'+model+'_test_unseen'+'.csv')
    if(model in ['uniter_36feats', 'uniter_50feats']):
       pred_test_unseen['proba']=np.exp(pred_test_unseen['proba']) / (1 +np.exp(pred_test_unseen['proba']))
    pred_test_unseen = pred_test_unseen.rename(columns={'proba':model+'_proba', 'label':model+'_label'})
    test_unseen_all = test_unseen_all.merge(pred_test_unseen, on = 'id', how = 'inner')

print([dev_unseen_all.shape,dev_seen_all.shape, test_unseen_all.shape])
    # test_seen
    # pred_test_seen = pd.read_csv('../model-result/'+model+'_test_seen'+'.csv')
    # pred_test_seen = pred_test_seen.rename(columns={'proba':model+'_proba', 'label':model+'_label'})
    # test_seen_all = test_seen_all.merge(pred_test_seen, on = 'id', how = 'inner')  


[(540, 26), (500, 26), (2000, 26)]


In [6]:
# function report accuracy, f1_score, auc w.r.t model
def model_metrics(df, model_name):
    
    model_all = []
    acc_all = []
    f1_all = []
    auc_all = []
    for model in model_name:
        model_all.append(model)
        model_label = model + '_label'
        model_prob = model + '_proba'
        acc_all.append(metrics.accuracy_score(df.loc[:, 'label'], df.loc[:, model_label]))
        f1_all.append(metrics.f1_score(df.loc[:,'label'], df.loc[:, model_label]))
        fpr, tpr, _ = metrics.roc_curve(df.loc[:,'label'], df.loc[:, model_prob])
        auc_all.append(metrics.auc(fpr, tpr))
    
    df = pd.DataFrame()
    df['Model'] = model_all
    df['acc'] = acc_all
    df['f1'] = f1_all
    df['auc'] = auc_all
    return(df)

In [7]:
dev_seen_summary = model_metrics(dev_seen_all, models)
dev_seen_summary.to_csv('../model-result/dev_seen_summary'+'.csv')

dev_unseen_summary = model_metrics(dev_unseen_all, models)
dev_unseen_summary.to_csv('../model-result/dev_unseen_summary'+'.csv')

# test_seen_summary = model_metrics(test_seen_all, models)
# test_seen_summary.to_csv('../model-result/test_seen_summary'+'.csv')

test_unseen_summary = model_metrics(test_unseen_all, models)
test_unseen_summary.to_csv('../model-result/test_unseen_summary'+'.csv')

## Validation Set for Ensemble

In [8]:
# combine dev_unseen prediction
pred_dev_all = dev_seen_all.append(dev_unseen_all).drop_duplicates('id')
# print(pred_dev_all.shape)
pred_dev_all = pred_dev_all.drop(['img', 'label', 'text'], axis = 1)
#print(pred_dev_all.head())

# join pred_dev with the tags
dev_full = pd.merge(dev_full_tag, pred_dev_all, on = 'id')
dev_full = pd.merge(dev_full, dev_full_htspch, on = 'id')
dev_full.to_csv('../model-result/ensemble_train.csv')
dev_full.columns

Index(['Unnamed: 0', 'id', 'img', 'label', 'text', 'profanity', 'nationality',
       'racism', 'gender', 'disability', 'pregnancy', 'religion',
       'vilbert_proba', 'vilbert_label', 'visualbert_proba',
       'visualbert_label', 'visualbert_coco_proba', 'visualbert_coco_label',
       'detectron_VisualBert_cc_10pct_proba',
       'detectron_VisualBert_cc_10pct_label',
       'detectron_VisualBert_cc_50pct_proba',
       'detectron_VisualBert_cc_50pct_label',
       'detectron_VisualBert_coco_proba', 'detectron_VisualBert_coco_label',
       'detectron_VisualBert_coco_RoBERTa_proba',
       'detectron_VisualBert_coco_RoBERTa_label',
       'detectron_VisualBert_coco_focalLoss_proba',
       'detectron_VisualBert_coco_focalLoss_label',
       'detectron_VisualBert_cc_50pct_focalLoss_proba',
       'detectron_VisualBert_cc_50pct_focalLoss_label', 'uniter_36feats_proba',
       'uniter_36feats_label', 'uniter_50feats_proba', 'uniter_50feats_label',
       'hatespeech_prob'],
      dtyp

# Test Set for Ensemble

In [9]:
# combine dev_unseen prediction
pred_test_unseen_all = test_unseen_all.drop(['img', 'label', 'text'], axis = 1)

# join pred_dev with the tags
test_unseen_tag = test_unseen_tag.drop(['img', 'label','text'], axis = 1)
test_full = pd.merge(test_unseen, pred_test_unseen_all, on = 'id')
test_full = pd.merge(test_unseen_tag, test_full, on = 'id')
test_full = pd.merge(test_full, test_unseen_htsptch,on = 'id')
test_full.to_csv('../model-result/ensemble_test.csv')
test_full.columns

Index(['Unnamed: 0', 'id', 'profanity', 'nationality', 'racism', 'gender',
       'disability', 'pregnancy', 'religion', 'img', 'label', 'text',
       'vilbert_proba', 'vilbert_label', 'visualbert_proba',
       'visualbert_label', 'visualbert_coco_proba', 'visualbert_coco_label',
       'detectron_VisualBert_cc_10pct_proba',
       'detectron_VisualBert_cc_10pct_label',
       'detectron_VisualBert_cc_50pct_proba',
       'detectron_VisualBert_cc_50pct_label',
       'detectron_VisualBert_coco_proba', 'detectron_VisualBert_coco_label',
       'detectron_VisualBert_coco_RoBERTa_proba',
       'detectron_VisualBert_coco_RoBERTa_label',
       'detectron_VisualBert_coco_focalLoss_proba',
       'detectron_VisualBert_coco_focalLoss_label',
       'detectron_VisualBert_cc_50pct_focalLoss_proba',
       'detectron_VisualBert_cc_50pct_focalLoss_label', 'uniter_36feats_proba',
       'uniter_36feats_label', 'uniter_50feats_proba', 'uniter_50feats_label',
       'hatespeech_prob'],
      dtyp

# Ensemble
## Helper functions
Functions: calcualte metrics: accuary and auc

In [10]:
ensemble_model = ['visualbert_coco', 'detectron_VisualBert_cc_10pct', 'detectron_VisualBert_cc_50pct', 'detectron_VisualBert_coco','detectron_VisualBert_coco_focalLoss','detectron_VisualBert_cc_50pct_focalLoss','uniter_36feats', 'uniter_50feats']


## Max Voting
https://www.analyticsvidhya.com/blog/2018/06/comprehensive-guide-for-ensemble-models/ 

Only reading the labels from different models. The final label is the majority labels

In [11]:
def max_vote(df, model_name):
    np.random.seed(1)
    model_label = [s + '_label' for s in model_name]
    
    major_vote = np.array(df[model_label].median(axis=1).values)
    # for major half vs half, random choose one
    equal_idx = np.where(major_vote==0.5)[0]
    
    equal_idx_label = np.random.choice([0,1], size = equal_idx.size)
    major_vote[equal_idx] = equal_idx_label
    
    df['major_vote_label'] = major_vote
    major_vote_proba = np.zeros(df.shape[0])
    model_name = np.array(model_name)
    for i in np.arange(major_vote.size):
        row_i_idx = np.where(df.loc[i][model_label] == major_vote[i])[0]
        model_i = model_name[row_i_idx]
        model_proba_i = [s + '_proba' for s in model_i]
        major_vote_proba[i] = np.mean(df.loc[i][model_proba_i].values)
    df['major_vote_proba'] = major_vote_proba 
    
    return(model_metrics(df, ['major_vote']))
# val
# max_vote(dev_full, models)
max_vote(dev_unseen_all, ensemble_model)
# test

#max_vote(pred_test_tmp, models)

Unnamed: 0,Model,acc,f1,auc
0,major_vote,0.731481,0.545455,0.757897


## Average
Proba is calcuated by averaging all the model probability. New label = 1 if proba is larger than 0.5, o.w label = 0

In [12]:
def average_vote(df, model_name):
   
    model_proba = [s + '_proba' for s in model_name]
    
    average_vote_proba = np.array(df[model_proba].mean(axis=1).values)
    average_vote_label = [1 if i > 0.5 else 0 for i in average_vote_proba]
    df['average_vote_proba'] = average_vote_proba 
    df['average_vote_label'] = average_vote_label
    return(model_metrics(df, ['average_vote']))
    
# val
average_vote(dev_unseen_all, ensemble_model)



Unnamed: 0,Model,acc,f1,auc
0,average_vote,0.718519,0.486486,0.778397


In [13]:
import pickle

# model 
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint as sp_randint

# plot
from sklearn.metrics import average_precision_score, auc, roc_curve, precision_recall_curve
import matplotlib.pyplot as plt


In [21]:
train_data = pd.read_csv('../model-result/ensemble_train.csv')
test_data = pd.read_csv('../model-result/ensemble_test.csv')
vars = ['profanity', 'nationality', 'racism', 'gender', 'disability', 'pregnancy', 'religion',  'uniter_36feats_proba', 'uniter_50feats_proba','hatespeech_prob']
extra_var = ['profanity', 'nationality', 'racism', 'gender', 'disability', 'pregnancy', 'religion',  'uniter_36feats_proba', 'uniter_50feats_proba', 'hatespeech_prob', 'detectron_VisualBert_cc_10pct_proba',
       'detectron_VisualBert_cc_50pct_proba',
       'detectron_VisualBert_coco_proba',
       'detectron_VisualBert_coco_focalLoss_proba',
       'detectron_VisualBert_cc_50pct_focalLoss_proba'] 
X_train = train_data.loc[:, vars]
y_train = train_data['label']

X_test = test_data.loc[:, vars]
y_test = test_data['label']
print(X_train.shape)



(640, 10)


In [31]:
def main(train_data, test_data, vars, param_list, random_state, file_path, n_iter):

    if not os.path.exists(file_path):
        os.makedirs(file_path)
    
    X_train = train_data.loc[:, vars]
    y_train = train_data['label']

    X_test = test_data.loc[:, vars]
    y_test = test_data['label']

    rf = RandomForestClassifier()
    # Random search of parameters, using 3 fold cross validation, 
    # search across 100 different combinations, and use all available cores
    rf_random = RandomizedSearchCV(estimator = rf, param_distributions = param_list,n_iter = n_iter, cv = 3, verbose=2, random_state=random_state, n_jobs = -1)

    rf_random.fit(X_train, y_train)
    print(rf_random.best_estimator_)

    # save 
    pickle.dump(rf_random, open(file_path +'/rf'+str(random_state)+'.pkl', 'wb'))
    with open(file_path +'/rf'+str(random_state)+'.txt', 'w') as f:
        f.write(str(rf_random.best_params_))
        
    # prediction
    y_pred = rf_random.best_estimator_.predict_proba(X_test)[:,1]
    y_pred_label = rf_random.best_estimator_.predict(X_test) 
    acc = metrics.accuracy_score(y_test, y_pred_label)
    f1 = metrics.f1_score(y_test, y_pred_label)
    fpr, tpr, _ = metrics.roc_curve(y_test, y_pred)
    auc = metrics.auc(fpr, tpr)
    measure_tab = pd.DataFrame([{'Acc': acc,'F1': f1, 'auc': auc}])
    measure_tab.to_csv(file_path +'/rf'+str(random_state)+'.csv',index = False)
    return measure_tab, rf_random

### Random Forecast


In [32]:
# Modeling -----------------
# Number of trees in random forest
n_estimators = sp_randint(2, 50)
# Number of features to consider at every split
max_features = ['log2', 'sqrt']
# Maximum number of levels in tree
max_depth = sp_randint(2, 20)
# Minimum number of samples required to split a node
min_samples_split = sp_randint(2, 10)
# Minimum number of samples required at each leaf node
min_samples_leaf =sp_randint(1, 5)
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}


In [33]:
text_feature =  ['profanity', 'nationality', 'racism', 'gender', 'disability', 'pregnancy', 'religion' , 'hatespeech_prob']


## All models
['visualbert_coco','detectron_VisualBert_cc_10pct', 'detectron_VisualBert_cc_50pct', 'detectron_VisualBert_coco','detectron_VisualBert_coco_focalLoss','detectron_VisualBert_cc_50pct_focalLoss','detectron_VisualBert_coco_RoBERTa','uniter_36feats', 'uniter_50feats']

In [37]:
all_vars

['profanity',
 'nationality',
 'racism',
 'gender',
 'disability',
 'pregnancy',
 'religion',
 'hatespeech_prob',
 'visualbert_coco',
 'detectron_VisualBert_cc_10pct',
 'detectron_VisualBert_cc_50pct',
 'detectron_VisualBert_coco',
 'detectron_VisualBert_coco_focalLoss',
 'detectron_VisualBert_cc_50pct_focalLoss',
 'detectron_VisualBert_coco_RoBERTa',
 'uniter_36feats',
 'uniter_50feats']

In [38]:

version = 'all_vars2'
n_iter = 1000
file_path = '../model-result/randomForest/model'+str(version)

all_models = ['visualbert_coco','detectron_VisualBert_cc_10pct', 'detectron_VisualBert_cc_50pct', 'detectron_VisualBert_coco','detectron_VisualBert_coco_focalLoss','detectron_VisualBert_cc_50pct_focalLoss','detectron_VisualBert_coco_RoBERTa','uniter_36feats', 'uniter_50feats']
all_vars = text_feature + [s + '_proba' for s in all_models]
measure_tab, rf_random = main(train_data, test_data, all_vars, param_list=random_grid, file_path = file_path, random_state = 17, n_iter = n_iter)  
print('test_unseen')
print(max_vote(test_data, all_models))
print(average_vote(test_data, all_models))

print('dev_unseen')
print(max_vote(dev_unseen_all, all_models))
print(average_vote(dev_unseen_all, all_models))

Fitting 3 folds for each of 1000 candidates, totalling 3000 fits
RandomForestClassifier(bootstrap=False, max_depth=3, max_features='sqrt',
                       min_samples_leaf=4, min_samples_split=5,
                       n_estimators=12)
test_unseen
        Model     acc        f1       auc
0  major_vote  0.7485  0.594682  0.788378
          Model     acc        f1      auc
0  average_vote  0.7395  0.555081  0.80313
dev_unseen
        Model       acc        f1      auc
0  major_vote  0.738889  0.557994  0.75775
          Model       acc        f1       auc
0  average_vote  0.716667  0.488294  0.780882


## VisualVert 

In [39]:
version = 'visual_bert'
n_iter = 1000
file_path = '../model-result/randomForest/model_'+str(version)

all_models = ['visualbert_coco','detectron_VisualBert_cc_10pct', 'detectron_VisualBert_cc_50pct', 'detectron_VisualBert_coco','detectron_VisualBert_coco_focalLoss','detectron_VisualBert_cc_50pct_focalLoss','detectron_VisualBert_coco_RoBERTa']
all_vars = text_feature + [s + '_proba' for s in all_models]
measure_tab, rf_random = main(train_data, test_data, all_vars, param_list=random_grid, file_path = file_path, random_state = 17, n_iter = n_iter)  
print('test_unseen')
print(max_vote(test_data, all_models))
print(average_vote(test_data, all_models))

print('dev_unseen')
print(max_vote(dev_unseen_all, all_models))
print(average_vote(dev_unseen_all, all_models))

Fitting 3 folds for each of 1000 candidates, totalling 3000 fits
RandomForestClassifier(max_depth=2, max_features='sqrt', min_samples_leaf=3,
                       min_samples_split=8, n_estimators=30)
test_unseen
        Model     acc        f1       auc
0  major_vote  0.7435  0.585287  0.782558
          Model    acc        f1       auc
0  average_vote  0.746  0.588331  0.790391
dev_unseen
        Model       acc        f1       auc
0  major_vote  0.724074  0.544343  0.752015
          Model       acc      f1       auc
0  average_vote  0.724074  0.5387  0.770426


### Uniter


In [40]:
version = 'uniter2'
n_iter = 1000
file_path = '../model-result/randomForest/model_'+str(version)

all_models = ['uniter_36feats', 'uniter_50feats']
all_vars = text_feature + [s + '_proba' for s in all_models]
measure_tab, rf_random = main(train_data, test_data, all_vars, param_list=random_grid, file_path = file_path, random_state = 17, n_iter = n_iter)  
print('test_unseen')
print(max_vote(test_data, all_models))
print(average_vote(test_data, all_models))

print('dev_unseen')
print(max_vote(dev_unseen_all, all_models))
print(average_vote(dev_unseen_all, all_models))

Fitting 3 folds for each of 1000 candidates, totalling 3000 fits
RandomForestClassifier(bootstrap=False, max_depth=2, max_features='log2',
                       min_samples_leaf=2, min_samples_split=4,
                       n_estimators=26)
test_unseen
        Model    acc        f1       auc
0  major_vote  0.749  0.631424  0.795156
          Model    acc   f1       auc
0  average_vote  0.625  0.0  0.802213
dev_unseen
        Model       acc       f1       auc
0  major_vote  0.725926  0.54878  0.810588
          Model      acc   f1       auc
0  average_vote  0.62963  0.0  0.811603


In [27]:
version = 'uniter'
n_iter = 1000
file_path = '../model-result/randomForest/model_'+str(version)
      
measure_tab, rf_random = main(train_data, test_data, vars, param_list=random_grid, file_path = file_path, random_state = 17, n_iter = n_iter)  

Fitting 3 folds for each of 1000 candidates, totalling 3000 fits
RandomForestClassifier(bootstrap=False, max_depth=2, max_features='sqrt',
                       min_samples_leaf=2, min_samples_split=4, n_estimators=5)


In [29]:
vars = ['profanity', 'nationality', 'racism', 'gender', 'disability', 'pregnancy', 'religion' , 'hatespeech_prob', 
'detectron_VisualBert_cc_10pct_proba',
       'detectron_VisualBert_cc_50pct_proba',
       'detectron_VisualBert_coco_proba',
       'detectron_VisualBert_coco_focalLoss_proba',
       'detectron_VisualBert_cc_50pct_focalLoss_proba'] 

version = 'no_uniter'
file_path = '../model-result/randomForest/model_'+str(version)
measure_tab, rf_random = main(train_data, test_data, vars, param_list=random_grid, file_path = file_path, random_state = 17, n_iter = n_iter)  

Fitting 3 folds for each of 1000 candidates, totalling 3000 fits
RandomForestClassifier(max_depth=2, max_features='sqrt', min_samples_split=6,
                       n_estimators=49)


### Logistc Regression

In [61]:
def lgst(train_data, test_data, vars, file_path,random_state):
    if not os.path.exists(file_path):
        os.makedirs(file_path)
    
    X_train = train_data.loc[:, vars]
    y_train = train_data['label']

    X_test = test_data.loc[:, vars]
    y_test = test_data['label']
    # build model
    model = LogisticRegression(penalty='none', fit_intercept=False, random_state=random_state, class_weight='balanced').fit(X_train, y_train)
    y_pred = model.predict_proba(X_test)[:,1]
    y_pred_label = model.predict(X_test)
    acc = metrics.accuracy_score(y_test, y_pred_label)
    f1 = metrics.f1_score(y_test, y_pred_label)
    fpr, tpr, _ = metrics.roc_curve(y_test, y_pred)
    auc = metrics.auc(fpr, tpr)
    measure_tab = pd.DataFrame([{'Acc': acc,'F1': f1, 'auc': auc}])
    measure_tab.to_csv(file_path +'/lgst'+str(random_state)+'.csv',index = False)
    #print(y_fitted)
    return measure_tab, model


version = 'more_vars'
file_path = '../model-result/lgst/model'+str(version)
measure_tab, model = lgst(train_data, test_data, extra_var, file_path = file_path, random_state = 17)  

In [62]:
vars = ['profanity', 'nationality', 'racism', 'gender', 'disability', 'pregnancy', 'religion', 'hatespeech_prob', 'detectron_VisualBert_cc_10pct_proba',
       'detectron_VisualBert_cc_50pct_proba',
       'detectron_VisualBert_coco_proba',
       'detectron_VisualBert_coco_focalLoss_proba'] 

version = 'no_uniter'
file_path = '../model-result/lgst/model_'+str(version)
measure_tab, model = lgst(train_data, test_data, vars, file_path = file_path, random_state = 17)  

### Add-hoc
Add memes will be treated as hateful if they trigger at least 2 sensitive works

In [35]:
dev_unseen_tag.loc[:, 'profanity':'religion'] = np.where(dev_unseen_tag.loc[:, 'profanity':'religion'].isnull(), 0, 1)
#print(dev_unseen_tag)
#dev_unseen_tag.loc[:, 'profanity':'religion'].(axis = 1)

In [55]:
np.where(dev_unseen_tag.loc[:, 'profanity':'religion'].sum(axis = 1)>2)
print(np.where(test_unseen_tag.loc[:, 'profanity':'religion'].sum(axis = 1)>2)[0])

tmp = pd.read_csv('../model-result/uniter_36feats_test_unseen.csv')
tmp.loc[np.where(test_unseen_tag.loc[:, 'profanity':'religion'].sum(axis = 1)>2)[0],:]

[386 516 743]


Unnamed: 0,id,proba,label
386,87549,-0.002486,1
516,71420,-4e-05,1
743,31957,-3.4e-05,1
