In [2]:
import json
import os
import csv
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
from IPython.display import Markdown, display
def printmd(string):
    display(Markdown(string))
printmd('**bold**')

**bold**

In [4]:
data_path = "data/train.csv"
data_raw = pd.read_csv(data_path)
#data_raw = data_raw.loc[np.random.choice(data_raw.index, size=2000)]
data_raw.shape

(159571, 8)

In [5]:
categories = list(data_raw.columns.values)
categories = categories[2:]
print(categories)

['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']


In [8]:
query = "toxic&obscene"
tasks = query.split('&')
tasks

['toxic', 'obscene']

In [9]:
query.split('^')

['toxic&obscene']

## Model Repository
### From JSON to DataFrame

In [35]:
import glob
import re
files = glob.glob("model/*.json")
print(files)
re.split('/|\\.', files[0])

['model/OvR_identity_hate.json', 'model/BinaryRelevance.json', 'model/MLkNN.json', 'model/OvR_threat.json', 'model/OvR_obscene.json', 'model/OvR_severe_toxic.json', 'model/ClassifierChain.json', 'model/OvR_insult.json', 'model/LabelPowerset.json', 'model/OvR_toxic.json']


['model', 'OvR_identity_hate', 'json']

In [77]:
model_repository_accuracy = {}
model_repository_cost = {}
for file in files:
    print(file)
    model = re.split('/|\\.', file)[1]
#     print(model)
    with open(file) as json_file: 
        data = json.load(json_file)
        print(list(data['task'].keys()))
        model_repository_accuracy[model] = {}
        model_repository_cost[model] = {}
        for key in list(data['task'].keys()):
            model_repository_accuracy[model][key] = data['task'][key]['accuracy']
            model_repository_cost[model][key] = data['task'][key]['cost']

model/OvR_identity_hate.json
['identity_hate']
model/BinaryRelevance.json
['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
model/MLkNN.json
['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
model/OvR_threat.json
['threat']
model/OvR_obscene.json
['obscene']
model/OvR_severe_toxic.json
['severe_toxic']
model/ClassifierChain.json
['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
model/OvR_insult.json
['insult']
model/LabelPowerset.json
['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
model/OvR_toxic.json
['toxic']


In [81]:
from pandas import DataFrame
model_repository_acc_df = DataFrame.from_dict(model_repository_accuracy,orient='index')
model_repository_acc_df = model_repository_acc_df.fillna(0.0)
print(model_repository_acc_df)

model_repository_cost_df = DataFrame.from_dict(model_repository_cost,orient='index')
model_repository_cost_df = model_repository_cost_df.fillna(200)
print(model_repository_cost_df)

                   identity_hate  toxic  severe_toxic  obscene  threat  insult
OvR_identity_hate          0.995  0.000         0.000    0.000   0.000   0.000
BinaryRelevance            0.990  0.887         0.978    0.937   0.990   0.932
MLkNN                      0.995  0.910         0.992    0.955   0.992   0.957
ClassifierChain            0.995  0.898         0.988    0.950   0.992   0.942
LabelPowerset              0.995  0.898         0.988    0.950   0.992   0.942
OvR_toxic                  0.000  0.898         0.000    0.000   0.000   0.000
OvR_severe_toxic           0.000  0.000         0.988    0.000   0.000   0.000
OvR_obscene                0.000  0.000         0.000    0.950   0.000   0.000
OvR_threat                 0.000  0.000         0.000    0.000   0.992   0.000
OvR_insult                 0.000  0.000         0.000    0.000   0.000   0.942
                   identity_hate       toxic  severe_toxic     obscene  \
OvR_identity_hate       0.000397  200.000000    200.00000

In [80]:
model_repository_cost_df.fillna(200)
print(model_repository_cost_df)

                   identity_hate      toxic  severe_toxic    obscene  \
OvR_identity_hate       0.000397        NaN           NaN        NaN   
BinaryRelevance         3.456550   3.456550      3.456550   3.456550   
MLkNN                  51.265170  51.265170     51.265170  51.265170   
ClassifierChain         0.627300   0.627300      0.627300   0.627300   
LabelPowerset           0.137730   0.137730      0.137730   0.137730   
OvR_toxic                    NaN   0.000367           NaN        NaN   
OvR_severe_toxic             NaN        NaN      0.000361        NaN   
OvR_obscene                  NaN        NaN           NaN   0.000355   
OvR_threat                   NaN        NaN           NaN        NaN   
OvR_insult                   NaN        NaN           NaN        NaN   

                      threat     insult  
OvR_identity_hate        NaN        NaN  
BinaryRelevance     3.456550   3.456550  
MLkNN              51.265170  51.265170  
ClassifierChain     0.627300   0.627300

In [64]:
model_repository_acc_df.stack()

OvR_identity_hate  identity_hate    0.995
BinaryRelevance    identity_hate    0.990
                   toxic            0.887
                   severe_toxic     0.978
                   obscene          0.937
                   threat           0.990
                   insult           0.932
MLkNN              identity_hate    0.995
                   toxic            0.910
                   severe_toxic     0.992
                   obscene          0.955
                   threat           0.992
                   insult           0.957
ClassifierChain    identity_hate    0.995
                   toxic            0.898
                   severe_toxic     0.988
                   obscene          0.950
                   threat           0.992
                   insult           0.942
LabelPowerset      identity_hate    0.995
                   toxic            0.898
                   severe_toxic     0.988
                   obscene          0.950
                   threat         

## From DataFrame to Gurobi multidict

In [52]:
model_repository_df.columns

Index(['identity_hate', 'toxic', 'severe_toxic', 'obscene', 'threat',
       'insult'],
      dtype='object')

In [53]:
model_repository_df.index

Index(['OvR_identity_hate', 'BinaryRelevance', 'MLkNN', 'ClassifierChain',
       'LabelPowerset', 'OvR_toxic', 'OvR_severe_toxic', 'OvR_obscene',
       'OvR_threat', 'OvR_insult'],
      dtype='object')

In [82]:
repository_acc_dict = {(idx,col): model_repository_acc_df[col][idx] for col in model_repository_acc_df.columns for idx in model_repository_df.index}
repository_acc_dict

{('OvR_identity_hate', 'identity_hate'): 0.995,
 ('BinaryRelevance', 'identity_hate'): 0.99,
 ('MLkNN', 'identity_hate'): 0.995,
 ('ClassifierChain', 'identity_hate'): 0.995,
 ('LabelPowerset', 'identity_hate'): 0.995,
 ('OvR_toxic', 'identity_hate'): 0.0,
 ('OvR_severe_toxic', 'identity_hate'): 0.0,
 ('OvR_obscene', 'identity_hate'): 0.0,
 ('OvR_threat', 'identity_hate'): 0.0,
 ('OvR_insult', 'identity_hate'): 0.0,
 ('OvR_identity_hate', 'toxic'): 0.0,
 ('BinaryRelevance', 'toxic'): 0.887,
 ('MLkNN', 'toxic'): 0.91,
 ('ClassifierChain', 'toxic'): 0.898,
 ('LabelPowerset', 'toxic'): 0.898,
 ('OvR_toxic', 'toxic'): 0.898,
 ('OvR_severe_toxic', 'toxic'): 0.0,
 ('OvR_obscene', 'toxic'): 0.0,
 ('OvR_threat', 'toxic'): 0.0,
 ('OvR_insult', 'toxic'): 0.0,
 ('OvR_identity_hate', 'severe_toxic'): 0.0,
 ('BinaryRelevance', 'severe_toxic'): 0.978,
 ('MLkNN', 'severe_toxic'): 0.992,
 ('ClassifierChain', 'severe_toxic'): 0.988,
 ('LabelPowerset', 'severe_toxic'): 0.988,
 ('OvR_toxic', 'severe_toxi

### From JSON to multidict

In [84]:
multi_index = pd.MultiIndex.from_product([model_repository_df.index, model_repository_df.columns], names=['first', 'second'])
print(type(multi_index))
list_multi_index = multi_index.tolist()
print(list_multi_index)

<class 'pandas.core.indexes.multi.MultiIndex'>
[('OvR_identity_hate', 'identity_hate'), ('OvR_identity_hate', 'toxic'), ('OvR_identity_hate', 'severe_toxic'), ('OvR_identity_hate', 'obscene'), ('OvR_identity_hate', 'threat'), ('OvR_identity_hate', 'insult'), ('BinaryRelevance', 'identity_hate'), ('BinaryRelevance', 'toxic'), ('BinaryRelevance', 'severe_toxic'), ('BinaryRelevance', 'obscene'), ('BinaryRelevance', 'threat'), ('BinaryRelevance', 'insult'), ('MLkNN', 'identity_hate'), ('MLkNN', 'toxic'), ('MLkNN', 'severe_toxic'), ('MLkNN', 'obscene'), ('MLkNN', 'threat'), ('MLkNN', 'insult'), ('ClassifierChain', 'identity_hate'), ('ClassifierChain', 'toxic'), ('ClassifierChain', 'severe_toxic'), ('ClassifierChain', 'obscene'), ('ClassifierChain', 'threat'), ('ClassifierChain', 'insult'), ('LabelPowerset', 'identity_hate'), ('LabelPowerset', 'toxic'), ('LabelPowerset', 'severe_toxic'), ('LabelPowerset', 'obscene'), ('LabelPowerset', 'threat'), ('LabelPowerset', 'insult'), ('OvR_toxic', 'id

In [113]:
dict_repository_acc_from_json = {}
dict_repository_cost_from_json = {}
dict_repository_from_json = {}
list_of_model = []
for file in files:
    model = re.split('/|\\.', file)[1]
    list_of_model.append(model)
    with open(file) as json_file: 
        data = json.load(json_file)
#         print(list(data['task'].keys()))
        model_repository_cost[model] = {}
        for key in list(data['task'].keys()):
            dict_repository_acc_from_json[(model,key)] = data['task'][key]['accuracy']
            dict_repository_cost_from_json[(model,key)] = data['task'][key]['cost']
            dict_repository_from_json[(model,key)] = [data['task'][key]['accuracy'], data['task'][key]['cost'], log(data['task'][key]['accuracy'])]
            
for tuple_key in list_multi_index:
    if tuple_key not in dict_repository_from_json.keys():
        dict_repository_acc_from_json[tuple_key] = 0.0
        dict_repository_cost_from_json[tuple_key] = 200
        dict_repository_from_json[tuple_key] = [0.0,200,0.0005]
dict_repository_from_json

{('OvR_identity_hate', 'identity_hate'): [0.995,
  0.00039699999999999995,
  -0.005012541823544286],
 ('BinaryRelevance', 'toxic'): [0.887, 3.45655, -0.11991029667255755],
 ('BinaryRelevance', 'severe_toxic'): [0.978, 3.45655, -0.022245608947319737],
 ('BinaryRelevance', 'obscene'): [0.937, 3.45655, -0.0650719967437148],
 ('BinaryRelevance', 'threat'): [0.99, 3.45655, -0.01005033585350145],
 ('BinaryRelevance', 'insult'): [0.932, 3.45655, -0.07042246429654582],
 ('BinaryRelevance', 'identity_hate'): [0.99, 3.45655, -0.01005033585350145],
 ('MLkNN', 'toxic'): [0.91, 51.26517, -0.09431067947124129],
 ('MLkNN', 'severe_toxic'): [0.992, 51.26517, -0.008032171697264267],
 ('MLkNN', 'obscene'): [0.955, 51.26517, -0.046043938501406846],
 ('MLkNN', 'threat'): [0.992, 51.26517, -0.008032171697264267],
 ('MLkNN', 'insult'): [0.957, 51.26517, -0.04395188752918283],
 ('MLkNN', 'identity_hate'): [0.995, 51.26517, -0.005012541823544286],
 ('OvR_threat', 'threat'): [0.992,
  0.00046699999999999997,
 

## Gurobi Optimization

In [89]:
import gurobipy as gp
from gurobipy import GRB
from math import log, exp

In [126]:
M = list_of_model
T = ['toxic','obscene']
combinations, accuracy, speed, log_accuracy = gp.multidict(
    dict_repository_from_json
)
dict_model_acc_cost = {model:[model_repository_acc_df[t][model] for t in T]+[model_repository_cost_df[t][model] for t in T] for model in M}
Model, acc_task1, acc_task2, cost_task1, cost_task2 = gp.multidict(dict_model_acc_cost)
# Objective Constraint
Speed = 60
Accuracy = 0.85

In [124]:
dict_model_acc_cost

{'OvR_identity_hate': [0.0, 0.0, 200.0, 200.0],
 'BinaryRelevance': [0.887, 0.937, 3.45655, 3.45655],
 'MLkNN': [0.91, 0.955, 51.26517, 51.26517],
 'OvR_threat': [0.0, 0.0, 200.0, 200.0],
 'OvR_obscene': [0.0, 0.95, 200.0, 0.000355],
 'OvR_severe_toxic': [0.0, 0.0, 200.0, 200.0],
 'ClassifierChain': [0.898, 0.95, 0.6273, 0.6273],
 'OvR_insult': [0.0, 0.0, 200.0, 200.0],
 'LabelPowerset': [0.898, 0.95, 0.13773, 0.13773],
 'OvR_toxic': [0.898, 0.0, 0.00036700000000000003, 200.0]}

In [110]:
accuracy

{('OvR_identity_hate', 'identity_hate'): 0.995,
 ('BinaryRelevance', 'toxic'): 0.887,
 ('BinaryRelevance', 'severe_toxic'): 0.978,
 ('BinaryRelevance', 'obscene'): 0.937,
 ('BinaryRelevance', 'threat'): 0.99,
 ('BinaryRelevance', 'insult'): 0.932,
 ('BinaryRelevance', 'identity_hate'): 0.99,
 ('MLkNN', 'toxic'): 0.91,
 ('MLkNN', 'severe_toxic'): 0.992,
 ('MLkNN', 'obscene'): 0.955,
 ('MLkNN', 'threat'): 0.992,
 ('MLkNN', 'insult'): 0.957,
 ('MLkNN', 'identity_hate'): 0.995,
 ('OvR_threat', 'threat'): 0.992,
 ('OvR_obscene', 'obscene'): 0.95,
 ('OvR_severe_toxic', 'severe_toxic'): 0.988,
 ('ClassifierChain', 'toxic'): 0.898,
 ('ClassifierChain', 'severe_toxic'): 0.988,
 ('ClassifierChain', 'obscene'): 0.95,
 ('ClassifierChain', 'threat'): 0.992,
 ('ClassifierChain', 'insult'): 0.942,
 ('ClassifierChain', 'identity_hate'): 0.995,
 ('OvR_insult', 'insult'): 0.942,
 ('LabelPowerset', 'toxic'): 0.898,
 ('LabelPowerset', 'severe_toxic'): 0.988,
 ('LabelPowerset', 'obscene'): 0.95,
 ('LabelPo

In [111]:
m=gp.Model('RAP_document')

Using license file /Users/lizy/gurobi.lic
Academic license - for non-commercial use only


In [118]:
# decision variables
v = m.addVars(combinations, vtype=GRB.BINARY, name="assign")

v1 = m.addVars(Model, vtype=GRB.BINARY, name="toxic")
v2 = m.addVars(Model, vtype=GRB.BINARY, name="obscene")

g = m.addVars(Model, name="gap")

In [119]:
v

{('OvR_identity_hate', 'identity_hate'): <gurobi.Var *Awaiting Model Update*>,
 ('BinaryRelevance', 'toxic'): <gurobi.Var *Awaiting Model Update*>,
 ('BinaryRelevance', 'severe_toxic'): <gurobi.Var *Awaiting Model Update*>,
 ('BinaryRelevance', 'obscene'): <gurobi.Var *Awaiting Model Update*>,
 ('BinaryRelevance', 'threat'): <gurobi.Var *Awaiting Model Update*>,
 ('BinaryRelevance', 'insult'): <gurobi.Var *Awaiting Model Update*>,
 ('BinaryRelevance', 'identity_hate'): <gurobi.Var *Awaiting Model Update*>,
 ('MLkNN', 'toxic'): <gurobi.Var *Awaiting Model Update*>,
 ('MLkNN', 'severe_toxic'): <gurobi.Var *Awaiting Model Update*>,
 ('MLkNN', 'obscene'): <gurobi.Var *Awaiting Model Update*>,
 ('MLkNN', 'threat'): <gurobi.Var *Awaiting Model Update*>,
 ('MLkNN', 'insult'): <gurobi.Var *Awaiting Model Update*>,
 ('MLkNN', 'identity_hate'): <gurobi.Var *Awaiting Model Update*>,
 ('OvR_threat', 'threat'): <gurobi.Var *Awaiting Model Update*>,
 ('OvR_obscene', 'obscene'): <gurobi.Var *Awaiting

In [121]:
# Task constraints
tasks = m.addConstrs((v.sum('*',t) <= 1 for t in T), name='task')
t1 = m.addConstr(v1.sum() == 1, name='t1')
t2 = m.addConstr(v2.sum() == 1, name='t2')

In [127]:
# Accuracy constraint
budget = m.addConstr((v1.prod(acc_task1)*v2.prod(acc_task2) >= Accuracy), name='budget')
# v1.prod(acc_task1) + v2.prod(acc_task2) - v1.prod(acc_task1) * v2.prod(acc_task2)

In [129]:
# Objective
m.setObjective(v1.prod(cost_task1) + v2.prod(cost_task2),GRB.MINIMIZE)

In [130]:
m.optimize()

Gurobi Optimizer version 9.0.3 build v9.0.3rc0 (mac64)
Optimize a model with 8 rows, 150 columns and 80 nonzeros
Model fingerprint: 0xeeb6f1ab
Model has 1 quadratic constraint
Variable types: 10 continuous, 140 integer (140 binary)
Coefficient statistics:
  Matrix range     [1e+00, 1e+00]
  QMatrix range    [8e-01, 9e-01]
  Objective range  [4e-04, 2e+02]
  Bounds range     [1e+00, 1e+00]
  RHS range        [1e+00, 1e+00]
  QRHS range       [8e-01, 8e-01]
Presolve removed 6 rows and 138 columns
Presolve time: 0.00s
Presolved: 2 rows, 12 columns, 12 nonzeros
Presolved model has 1 quadratic constraint(s)
Variable types: 0 continuous, 12 integer (12 binary)

Root relaxation: objective 7.220000e-04, 2 iterations, 0.00 seconds

    Nodes    |    Current Node    |     Objective Bounds      |     Work
 Expl Unexpl |  Obj  Depth IntInf | Incumbent    BestBd   Gap | It/Node Time

*    0     0               0       0.0007220    0.00072  0.00%     -    0s

Explored 0 nodes (2 simplex iterations) 

In [131]:
# Compute total matching score from assignment variables
total_matching_score = 0
for model in M:
    if v1[model].x > 1e-6:
        print(v1[model].varName, v1[model].x, acc_task1[model], cost_task1[model]) 
        total_matching_score += cost_task1[model]*v1[model].x
    if v2[model].x > 1e-6:
        print(v2[model].varName, v2[model].x, acc_task2[model], cost_task2[model]) 
        total_matching_score += cost_task2[model]*v2[model].x

print('Total matching score: ', total_matching_score)  

obscene[OvR_obscene] 1.0 0.95
toxic[OvR_toxic] 1.0 0.898
Total matching score:  1.8479999999999999
