# Apply Trained Models to External Data

### Table of Contents

**I. Preprocessing**
* Select descriptions to include as input text for gender biased text classification
* Tokenize the text

**II. Feature Extraction**
* Represent tokens as word embeddings with the custom FastText word embedding model
* Represent descriptions as TF-IDF matrices

**III. Classification**
* Linguistic Classifier > Stereotype & Omission Classifier (Cascade 2)
* Stereotype & Omission Classifier (Baseline)
* Gendered Pronoun & Gendered Role Classifier > Stereotype & Omission Classifier
* Person & Occupation Classifier > Stereotype & Omission Classifier

Import programming libraries:

In [1]:
# Libraries for data, file, and model loading
import os, re
import joblib
import numpy as np
import pandas as pd
import scipy.sparse  # provides scipy.sparse.hstack, used to combine feature matrices

# Libraries for classification
import sklearn.metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.multiclass import OneVsRestClassifier
from skmultilearn.problem_transform import ClassifierChain

# Custom filepaths and functions
import config, utils

## I. Preprocessing

Load the external data:

In [2]:
# Read the BT sample records with the project's CSV loader.
# NOTE(review): the meaning of the second positional argument (False) is
# defined in utils.loadCSVData and is not visible here — confirm.
df = utils.loadCSVData("../data/congruence_engine/BT_sample.csv", False)
df.head(2)  # preview the first two records

Unnamed: 0,record_id,RecordType,AltRefNo,RefNo,Title,Original Date Field,Description,RelatedNameCode,Authority Type,Authority Name
0,0,Photograph,TCD 263/MUS 1079,BTA/5 PH/7/MUS 1079,Wedding Greetings (General),,Image of telegram.,,,
1,1,Photograph,TCD 263/MUS 1142,BTA/5 PH/7/MUS 1142,Baby (Blanket and tag design),,Image of telegram.,,,


In [3]:
# Free-text columns handed to the tokenizer and TF-IDF helpers in later cells
text_cols = [
    "Title",
    "Description",
]

In [4]:
# Build a token-level table from the selected text columns. The displayed
# result carries record_id, token_id and token columns.
token_df = utils.getTokenDF(df, text_cols)
token_df.tail()  # inspect the last tokens

Unnamed: 0,record_id,token_id,token
133,137,1737,.
134,138,1738,Telephone
134,138,1739,number
134,138,1740,46
134,138,1741,.


## II. Feature Extraction

In [5]:
# Label names that the linguistic MultiLabelBinarizer represents as 0 or 1
print(utils.mlb_ling.classes_)

['Gendered-Pronoun' 'Gendered-Role' 'Generalization']


In [6]:
# Label names that the Stereotype/Omission MultiLabelBinarizer represents as 0 or 1
print(utils.mlb_so.classes_)

['Omission' 'Stereotype']


In [None]:
# Token-level feature matrix; it is the input (X) of the Linguistic classifier below.
# NOTE(review): presumably FastText word embeddings, per the Table of Contents —
# confirm in utils.getFeatures.
embedding_matrix = utils.getFeatures(token_df)

In [8]:
# TF-IDF representation of the selected text columns.
# NOTE(review): tfidf_matrix is not referenced again in the visible cells; the
# Omission and Stereotype section builds its own `docs` matrix instead —
# confirm which matrix that classifier should receive.
tfidf_matrix = utils.docToTfidf(df, text_cols)

## III. Classification

### Linguistic Classifier
* Type: Multilabel token classification
* Targets: Gendered Pronoun, Gendered Role, Generalization 

In [9]:
# joblib.load unpickles the file, so only load model files from a trusted source.
trained_ling_clf = joblib.load(config.models_path+"multilabel_token/cc-rf_F-fasttext100_T-linglabels.joblib")  # Load the Linguistic classifier

In [10]:
# Predict linguistic labels from the token-level features.
# NOTE(review): X and y are generic names that are reassigned later in the
# notebook (X in the Omission and Stereotype section) — keep cell order in mind.
X = embedding_matrix
y = trained_ling_clf.predict(X)  # Run the classifier

In [11]:
# Attach the Linguistic classifier's predictions to the token table and
# export the result; the returned frame gains a "prediction" column.
filename = "BT_sample_linglabels.csv"
filepath = "../data/congruence_engine/classifier_output/multilabel_linguistic/"
token_df = utils.exportClassifiedData(
    token_df,
    y,
    utils.mlb_ling,
    filepath,
    filename,
)
token_df.head()

Unnamed: 0,record_id,token_id,token,prediction
0,0,0,Wedding,[]
0,0,1,Greetings,[]
0,0,2,(,[]
0,0,3,General,[]
0,0,4,),[]


In [12]:
# Distribution of predicted linguistic label sets across all tokens
token_df["prediction"].value_counts()

[]                  1739
[Gendered-Role]        2
[Generalization]       1
Name: prediction, dtype: int64

### Omission and Stereotype Classifier
* Type: Multilabel document classification
* Targets: Omission, Stereotype
* Optional Features: labels assigned by the Linguistic Classifier

In [13]:
# Choose the document table used by the Omission and Stereotype classifier.
#
# Option A — if not using Linguistic Classifier's labels as features,
# uncomment the next line and comment out Option B:
# doc_df = df

# Option B — if using Linguistic Classifier's labels as features:
# combine the record table with the token-level predictions. The displayed
# result has one row per record with a "document_prediction" column.
doc_df = utils.preprocessClassifiedDocs(df, token_df)
doc_df.head()

Unnamed: 0,record_id,document_prediction,RecordType,AltRefNo,RefNo,Title,Original Date Field,Description,RelatedNameCode,Authority Type,Authority Name
0.0,0,[],Photograph,TCD 263/MUS 1079,BTA/5 PH/7/MUS 1079,Wedding Greetings (General),,Image of telegram.,,,
1.0,1,[],Photograph,TCD 263/MUS 1142,BTA/5 PH/7/MUS 1142,Baby (Blanket and tag design),,Image of telegram.,,,
2.0,2,[],Photograph,TCD 263/MUS 1075,BTA/5 PH/7/MUS 1075,Wedding (General Greeting),,Image of telegram.,,,
3.0,3,[],Photograph,TCD 263/MUS 977,BTA/5 PH/7/MUS 977,Ordinary,,Designer/manufacturer: Perry,,,
4.0,4,[Generalization],Photograph,TCD 263/MUS 637,BTA/5 PH/7/MUS 637,Two girls sending telegraph message,c1910,,,,


In [30]:
# Load the trained Omission and Stereotype document classifier.
# joblib.load unpickles the file, so only load model files from a trusted source.
trained_so_clf = joblib.load(config.models_path+"multilabel_document/sgd-svm_F-tfidf-ling_T-so.joblib")

In [None]:
# Binarize each document's predicted linguistic labels into 0/1 indicator
# columns. The binarizer is reached through `utils` (as in the Feature
# Extraction section); a bare `mlb_ling` is not defined on a fresh kernel.
# NOTE(review): `bt_clf` is not created by any visible cell — presumably it is
# the document-level table with a "predicted_linguistic" column (compare
# doc_df["document_prediction"]); confirm and define it before this cell.
features = utils.mlb_ling.transform(bt_clf["predicted_linguistic"])

In [31]:
# Turn the document text into TF-IDF features for the Omission and Stereotype classifier.
# NOTE(review): `bt_clf`, `cvectorizer` and `tfidf` are not defined in any
# visible cell, so this cell fails on Restart & Run All — confirm where they
# come from (presumably the fitted vectorizer/transformer used at training time).
# NOTE(review): only "Title" is vectorized here, whereas text_cols also lists
# "Description" — confirm this matches how the classifier was trained.
doc_col = "Title"
bt_clf = bt_clf.fillna("")  # replace missing values with empty strings before vectorizing
vectorized = cvectorizer.transform(bt_clf[doc_col])
docs = tfidf.transform(vectorized)

In [32]:
# Concatenate the TF-IDF document features and the binarized linguistic
# labels column-wise into one feature matrix.
# NOTE(review): this reassigns X, which earlier held the token embeddings.
X = scipy.sparse.hstack([docs, features])

In [33]:
# Run the Omission and Stereotype classifier on the combined features
y_pred = trained_so_clf.predict(X)

In [34]:
# Map the binary prediction matrix back to label names, one tuple per
# document. The binarizer is reached through `utils` (as in the Feature
# Extraction section); a bare `mlb_so` is not defined on a fresh kernel.
pred_labels = utils.mlb_so.inverse_transform(y_pred)
print(pred_labels[:100])  # preview the first 100 documents

[(), (), (), (), (), ('Omission', 'Stereotype'), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), ('Stereotype',), (), (), (), (), (), (), (), (), (), (), (), (), (), (), ()]


In [35]:
# Convert each document's tuple of predicted labels into a list so it can be
# stored in a DataFrame cell. Empty strings and repeated labels are dropped
# and the original label order is kept. Labels are copied verbatim rather
# than round-tripped through str()/strip(), which could trim characters from
# label names.
new_pred_col = [
    list(dict.fromkeys(label for label in labels if label))
    for labels in pred_labels
]
print(new_pred_col[:100])  # preview the first 100 documents

# Plain column assignment appends the column on the first run and overwrites
# it on later runs, so re-executing this cell does not raise the
# "already exists" ValueError that DataFrame.insert would.
pred_col_name = "predicted_so"
bt_clf[pred_col_name] = new_pred_col
bt_clf.head()

[[], [], [], [], [], ['Omission', 'Stereotype'], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], ['Stereotype'], [], [], [], [], [], [], [], [], [], [], [], [], [], [], []]


Unnamed: 0_level_0,token_id,token,predicted_linguistic,Title,predicted_so
record_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,"[0, 1, 2, 3, 4]","[Greetings, Wedding, (, ), General]",[],Wedding Greetings (General),[]
1,"[5, 6, 7, 8, 9, 10, 11]","[tag, (, Baby, ), Blanket, design, and]",[],Baby (Blanket and tag design),[]
2,"[12, 13, 14, 15, 16]","[Greeting, Wedding, (, ), General]",[],Wedding (General Greeting),[]
3,[17],[Ordinary],[],Ordinary,[]
4,"[18, 19, 20, 21, 22]","[girls, Two, telegraph, sending, message]",[Generalization],Two girls sending telegraph message,[]


In [36]:
# Distribution of predicted Omission/Stereotype label sets across all documents
bt_clf[pred_col_name].value_counts()

[]                        69382
[Omission, Stereotype]        1
[Stereotype]                  1
Name: predicted_so, dtype: int64