# Classification

## Linguistic + Omission & Stereotype

In [1]:
# Custom filepaths and functions
import config, my_utils

# Libraries for data, file, and model loading
import pandas as pd
import joblib
from joblib import load
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize
from pathlib import Path

# Libraries for classification
from gensim.models import FastText
from gensim.utils import tokenize
from gensim import utils
from gensim.test.utils import get_tmpfile
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.feature_extraction.text import CountVectorizer
from skmultilearn.problem_transform import ClassifierChain
from sklearn.ensemble import RandomForestClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import SGDClassifier
import sklearn.metrics

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/lucyhavens/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


### 1. Preprocessing

Load the extracted archival metadata from the Newcastle University Special Collections catalog: one file of EAD records and one file of non-EAD records.

In [2]:
# Folder (relative to the working directory) that holds the extracted metadata CSVs.
extracted_dir = "data/extracted/"
ead_csv = extracted_dir + "newcastle_archival_metadata_sample.csv"
nonead_csv = extracted_dir + "bell_archival_metadata_nonead.csv"

# The first CSV column becomes each frame's index.
ead_df = pd.read_csv(ead_csv, index_col=0)
nonead_df = pd.read_csv(nonead_csv, index_col=0)

# Quick sanity check on the (rows, columns) of both frames.
print(ead_df.shape, nonead_df.shape)

(16683, 9) (200, 10)


Transform the data so all the text that will be classified is in a single column.

In [3]:
# EAD records: the listed free-text columns are handed to the project helper together
# with "unitid" (presumably the record identifier column — confirm in my_utils).
text_cols = ["unittitle", "bioghist", "scopecontent", "processinfo"]
ead_doc_df = my_utils.transformForClassification(ead_df, text_cols, "unitid")

# Non-EAD records use different column names, so they are transformed separately.
text_cols = ["title", "description", "extent-and-medium"]
nonead_doc_df = my_utils.transformForClassification(nonead_df, text_cols, "node_id")

# Stack both sets and order the rows; sort_values returns the sorted frame, so the
# result is bound once instead of being sorted in place afterwards.
sort_keys = ["eadid", "rowid", "field", "doc"]
doc_df = pd.concat([ead_doc_df, nonead_doc_df]).sort_values(sort_keys)
doc_df.head()

Unnamed: 0,eadid,rowid,field,doc
26920,BP,BP,processinfo,This EAD description created by Ruth Sheret 27...
26520,BP,BP,scopecontent,The papers of the distinguished public servant...
25720,BP,BP,unittitle,Plowden (Lady Bridget) Archive
26521,BP,BP/01,scopecontent,"Formerly the 'Working Ladies Guild', the organ..."
25721,BP,BP/01,unittitle,Chairman (Managing Committee) of the Mary Fiel...


In [4]:
# Tokenize the single "doc" text column of doc_df with the project helper; "rowid" is
# passed as the row identifier. The displayed result has the columns
# record_id, token_id and token (one row per token).
text_cols = ["doc"]
token_df = my_utils.getTokenDF(doc_df, text_cols, row_id="rowid")
token_df.head()

Unnamed: 0,record_id,token_id,token
0,BP,0,This
0,BP,1,EAD
0,BP,2,description
0,BP,3,created
0,BP,4,by


In [5]:
# Inspect the last rows to see the final record and the highest token_id.
token_df.tail()

Unnamed: 0,record_id,token_id,token
34224,WCT/9/4,1000676,of
34224,WCT/9/4,1000677,Lady
34224,WCT/9/4,1000678,Pauline
34224,WCT/9/4,1000679,'s
34224,WCT/9/4,1000680,Diaries


In [10]:
# (rows, columns) of the token table; the row count is the number of tokens to classify.
token_df.shape

(1000681, 3)

### 2. Linguistic Classification
Classify the data using the Linguistic Classifier to create *Generalization*, *Gendered Pronoun*, and *Gendered Role* features to input into the Omission and Stereotype Classifier.

Load the models.

In [None]:
# Locations of the saved artifacts, relative to the working directory.
embedding_model_path = "models/embeddings/fasttextcbow100_lowercased/fasttext_cbow_100d.model"
label_binarizer_path = "models/transform_labels/mlb_targets_ling.joblib"  # formerly named mlb_linlabels.joblib
classifier_path = "models/ling_osc/cc-rf_F-fasttextcbow100_T-linglabels.joblib"

# FastText embedding model used to turn tokens into feature vectors.
ft_model = FastText.load(embedding_model_path)

# NOTE: joblib.load unpickles its input, which can execute arbitrary code;
# only load artifact files from a trusted source.
mlb_ling = joblib.load(label_binarizer_path)
ling_clf = joblib.load(classifier_path)

In [8]:
# Show the label names the loaded binarizer knows about.
print(f"Classification targets: {mlb_ling.classes_}")

Classification targets: ['Gendered-Pronoun' 'Gendered-Role' 'Generalization']


Transform the token data into an embedding matrix to input into the classifier.

In [9]:
# Build the classifier input from the token table using the loaded FastText model
# (the project helper does the token-to-embedding lookup).
X = my_utils.getFeatures(token_df, embedding_model=ft_model)

Classify the tokens.

In [26]:
# Run the loaded linguistic classifier on the feature matrix; y is decoded into
# label names with mlb_ling in the next step.
y = ling_clf.predict(X)

In [28]:
# Decode the classifier output into label names: one collection of labels per token.
predictions = mlb_ling.inverse_transform(y)

# Keep the first label of each token, or "" when nothing was predicted.
# NOTE(review): any further labels predicted for the same token are dropped here —
# confirm that single-label output is intended.
new_preds = [next(iter(labels), "") for labels in predictions]
print(new_preds[:25])

['', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', 'Gendered-Role', '']


In [None]:
# Attach the per-token predictions as a trailing "prediction" column.
# assign() returns a new frame and overwrites an existing "prediction" column, so this
# cell can be re-run safely; DataFrame.insert raises ValueError ("already exists") on a
# second run because the column is already present.
token_df = token_df.assign(prediction=new_preds)
token_df.head()

Unnamed: 0,record_id,token_id,token,prediction
0,BP,0,This,
0,BP,1,EAD,
0,BP,2,description,
0,BP,3,created,
0,BP,4,by,


In [30]:
# Distribution of predicted labels across all tokens ("" means no label was predicted).
token_df["prediction"].value_counts()

                    981151
Gendered-Pronoun     17014
Gendered-Role         2217
Generalization         299
Name: prediction, dtype: int64

In [32]:
# Write the token-level predictions to a CSV under the path configured in config.
lc_predictions_path = config.classified_data_path + "baseline_lc_predictions.csv"
token_df.to_csv(lc_predictions_path)

### 3. Omission & Stereotype Classification
Classify the data using the Omission and Stereotype Classifier to assign *Omission* and *Stereotype* codes to Newcastle University Special Collections (NUSC) archival metadata descriptions.