# Classification

## Linguistic Classifier (LC) + Omission & Stereotype Classifier (OSC)

In [2]:
# Custom filepaths and functions
import config, clf_utils

# Libraries for data, file, and model loading
import pandas as pd
import joblib
from joblib import load
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize
from pathlib import Path

# Libraries for classification
from gensim.models import FastText
from gensim.utils import tokenize
from gensim import utils
from gensim.test.utils import get_tmpfile
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.feature_extraction.text import CountVectorizer
from skmultilearn.problem_transform import ClassifierChain
from sklearn.ensemble import RandomForestClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import SGDClassifier
import scipy
import sklearn.metrics
from sklearn.metrics import confusion_matrix, multilabel_confusion_matrix
from sklearn.metrics import precision_recall_fscore_support

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/lucyhavens/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


### 1. Load the Data

In [4]:
extracted_dir = "data/extracted/"
doc_df = pd.read_csv(extracted_dir+"nusc_archival_descs_for_clf.csv", index_col=0)
doc_df.head()

Unnamed: 0,description_id,eadid,rowid,field,doc
0,0,BP,BP,processinfo,This EAD description created by Ruth Sheret 27...
1,1,BP,BP,scopecontent,The papers of the distinguished public servant...
2,2,BP,BP,unittitle,Plowden (Lady Bridget) Archive
3,3,BP,BP/01,scopecontent,"Formerly the 'Working Ladies Guild', the organ..."
4,4,BP,BP/01,unittitle,Chairman (Managing Committee) of the Mary Fiel...


In [5]:
text_cols = ["doc"]
token_df = clf_utils.getTokenDF(doc_df, text_cols)
token_df.head()

Unnamed: 0,description_id,token_id,token
0,0,0,This
0,0,1,EAD
0,0,2,description
0,0,3,created
0,0,4,by


In [6]:
token_df.tail()

Unnamed: 0,description_id,token_id,token
57946,57946,1258138,of
57946,57946,1258139,Lady
57946,57946,1258140,Pauline
57946,57946,1258141,'s
57946,57946,1258142,Diaries


In [7]:
token_df.shape

(1258143, 3)

### 2. Linguistic Classification
Classify the data using the Linguistic Classifier to create *Generalization*, *Gendered Pronoun*, and *Gendered Role* features to input into the Omission and Stereotype Classifier.

Load the models.

In [8]:
ft_model = FastText.load("models/embeddings/fasttextcbow100_lowercased/fasttext_cbow_100d.model")
mlb_ling = joblib.load("models/transform_labels/mlb_targets_ling.joblib")  # formerly named mlb_linlabels.joblib
ling_clf = joblib.load("models/ling_osc/cc-rf_F-fasttextcbow100_T-linglabels.joblib")

In [9]:
print("Classification targets:", mlb_ling.classes_)

Classification targets: ['Gendered-Pronoun' 'Gendered-Role' 'Generalization']


Transform the token data into an embedding matrix to input into the classifier.

In [10]:
X = clf_utils.getFeatures(token_df, embedding_model=ft_model)

Classify the tokens.

In [11]:
y = ling_clf.predict(X)

In [12]:
predictions = mlb_ling.inverse_transform(y)
new_preds = [pred[0] if len(pred) > 0 else "O" for pred in predictions]

In [13]:
pred_col = "prediction"
token_df.insert(len(token_df.columns), pred_col, new_preds)
token_df.head()

Unnamed: 0,description_id,token_id,token,prediction
0,0,0,This,O
0,0,1,EAD,O
0,0,2,description,O
0,0,3,created,O
0,0,4,by,O


In [14]:
token_df[pred_col].value_counts()

O                   1235218
Gendered-Pronoun      18548
Gendered-Role          4015
Generalization          362
Name: prediction, dtype: int64

In [15]:
token_df.to_csv(config.classified_data_path+"baseline_lc_predictions.csv")

### 3. Omission & Stereotype Classification
Classify the data using the Omission and Stereotype Classifier to assign *Omission* and *Stereotype* codes to NUSC archival metadata descriptions.

### 3.1 Preprocessing

In [51]:
# ling_preds_file = config.classified_data_path+"baseline_lc_predictions.csv" # token_df
# token_df = pd.read_csv(ling_preds_file, index_col=0)
# token_df.head()

In [16]:
# Group the data by description so it can be combined with the description data
df_features = clf_utils.implodeDataFrameUnique(token_df[["description_id", "token_id", pred_col]], ["description_id"])

# Deduplicate lists in description_id, token_id, and prediction columns
cols = ["token_id", pred_col]
for col in cols:
    old_col = list(df_features[col])
    new_col = []
    if col == pred_col:
        for values in old_col:
            if (len(values) > 1) and ("O" in values):
                values.remove("O")
            values_list = list(values)
            values_list.sort()
            new_col += [values_list]
    else:
        new_col = old_col
    col_i = list(df_features.columns).index(col)
    df_features = df_features.drop(columns=[col])
    df_features.insert(col_i, col, new_col)

df_features.head()

Unnamed: 0_level_0,token_id,prediction
description_id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...",[O]
1,"[16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 2...","[Gendered-Pronoun, Gendered-Role, Generalization]"
2,"[267, 268, 269, 270, 271, 272]",[Gendered-Role]
3,"[273, 274, 275, 276, 277, 278, 279, 280, 281, ...","[Gendered-Pronoun, Gendered-Role, Generalization]"
4,"[352, 353, 354, 355, 342, 343, 344, 345, 346, ...",[Generalization]


In [17]:
df_features.tail()

Unnamed: 0_level_0,token_id,prediction
description_id,Unnamed: 1_level_1,Unnamed: 2_level_1
57942,[1258106],[O]
57943,"[1258107, 1258108, 1258109, 1258110, 1258111]",[O]
57944,"[1258112, 1258113, 1258114, 1258115, 1258116, ...",[Gendered-Role]
57945,"[1258118, 1258119, 1258120, 1258121, 1258122, ...",[O]
57946,"[1258136, 1258137, 1258138, 1258139, 1258140, ...",[Gendered-Role]


In [18]:
df_features[pred_col].value_counts()

[O]                                                  50746
[Gendered-Pronoun]                                    4037
[Gendered-Role]                                       1741
[Gendered-Pronoun, Gendered-Role]                     1081
[Generalization]                                       256
[Gendered-Role, Generalization]                         35
[Gendered-Pronoun, Generalization]                      26
[Gendered-Pronoun, Gendered-Role, Generalization]       25
Name: prediction, dtype: int64

### 3.2 Feature Extraction

Join the feature data (i.e., classifier-predicted *Gendered Pronoun*, *Gendered Role*, and *Generalization* labels) to the document data (i.e., NUSC archival catalog metadata descriptions).

In [19]:
df = df_features.join(doc_df.set_index("description_id"), on="description_id", how="right")

# Rename label columns for clarity
feature_col = "linguistic_prediction"
df = df.rename(columns={pred_col:feature_col})

# Remove "O" from linguistic prediction lists if other labels are present in the list
ling_pred = list(df[feature_col])
new_ling_pred = []
for pred in ling_pred:
    if len(pred) > 1:
        if "O" in pred:
            pred.remove("O")
    new_ling_pred += [pred]

df = df.drop(columns=[feature_col])
df.insert(3, feature_col, new_ling_pred)
df.head()

Unnamed: 0_level_0,token_id,eadid,rowid,linguistic_prediction,field,doc
description_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...",BP,BP,[O],processinfo,This EAD description created by Ruth Sheret 27...
1,"[16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 2...",BP,BP,"[Gendered-Pronoun, Gendered-Role, Generalization]",scopecontent,The papers of the distinguished public servant...
2,"[267, 268, 269, 270, 271, 272]",BP,BP,[Gendered-Role],unittitle,Plowden (Lady Bridget) Archive
3,"[273, 274, 275, 276, 277, 278, 279, 280, 281, ...",BP,BP/01,"[Gendered-Pronoun, Gendered-Role, Generalization]",scopecontent,"Formerly the 'Working Ladies Guild', the organ..."
4,"[352, 353, 354, 355, 342, 343, 344, 345, 346, ...",BP,BP/01,[Generalization],unittitle,Chairman (Managing Committee) of the Mary Fiel...


### 3.3 Classification

In [20]:
clf_os = joblib.load("models/ling_osc/sgd-svm_F-tfidf-ling_T-os.joblib")
mlb_os = joblib.load("models/transform_labels/mlb_targets_os.joblib")
cvectorizer = joblib.load("models/transform_docs/count_vectorizer.joblib")
tfidf_transformer = joblib.load("models/transform_docs/tfidf_transformer.joblib")

In [21]:
print(mlb_ling.classes_)
print(mlb_os.classes_)

['Gendered-Pronoun' 'Gendered-Role' 'Generalization']
['Omission' 'Stereotype']


In [22]:
docs = df["doc"]
vectorized = cvectorizer.transform(docs)
tfidf_matrix = tfidf_transformer.transform(vectorized)

In [23]:
features = mlb_ling.transform(df[feature_col])
X = scipy.sparse.hstack([tfidf_matrix, features])



In [24]:
y = clf_os.predict(X)
predictions = mlb_os.inverse_transform(y)
df.insert(len(df.columns), "os_predictions", predictions)
df.head()

Unnamed: 0_level_0,token_id,eadid,rowid,linguistic_prediction,field,doc,os_predictions
description_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...",BP,BP,[O],processinfo,This EAD description created by Ruth Sheret 27...,()
1,"[16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 2...",BP,BP,"[Gendered-Pronoun, Gendered-Role, Generalization]",scopecontent,The papers of the distinguished public servant...,"(Omission,)"
2,"[267, 268, 269, 270, 271, 272]",BP,BP,[Gendered-Role],unittitle,Plowden (Lady Bridget) Archive,()
3,"[273, 274, 275, 276, 277, 278, 279, 280, 281, ...",BP,BP/01,"[Gendered-Pronoun, Gendered-Role, Generalization]",scopecontent,"Formerly the 'Working Ladies Guild', the organ...","(Omission,)"
4,"[352, 353, 354, 355, 342, 343, 344, 345, 346, ...",BP,BP/01,[Generalization],unittitle,Chairman (Managing Committee) of the Mary Fiel...,()


In [25]:
df.os_predictions.value_counts()

()                        54648
(Omission,)                2096
(Omission, Stereotype)      918
(Stereotype,)               285
Name: os_predictions, dtype: int64

In [27]:
print("Proportion of descriptions classified:", (2096+918+285)/54648) # about 6 percent (1% more than found with baseline)

Proportion of descriptions classified: 0.06036817449860928


Save the classified data.

In [26]:
df.to_csv(config.classified_data_path+"lc-osc_predictions.csv")