# Classification

## Baseline Omission and Stereotype Classifier

In [1]:
import config, utils
import re
import pandas as pd
import joblib
from joblib import load
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.metrics import multilabel_confusion_matrix
from sklearn.metrics import precision_recall_fscore_support


### 1. Preprocessing

Load the extracted archival metadata from the Newcastle University Special Collections catalog. Two tables are read: a sample of EAD descriptions and a set of non-EAD descriptions.

In [2]:
# Folder prefix for the CSV exports of the extracted catalog metadata.
extracted_dir = "data/extracted/"

# Name both input paths first so the two reads below are easy to compare.
ead_csv_path = extracted_dir + "newcastle_archival_metadata_sample.csv"
nonead_csv_path = extracted_dir + "bell_archival_metadata_nonead.csv"

# index_col=0 uses the first CSV column as the DataFrame index.
ead_df = pd.read_csv(ead_csv_path, index_col=0)
nonead_df = pd.read_csv(nonead_csv_path, index_col=0)

# Show (rows, columns) of each table as a quick sanity check.
print(ead_df.shape, nonead_df.shape)

(16683, 8) (200, 9)


Transform the data so all the text that will be classified is in a single column.

In [6]:
# Text columns of the EAD table that should be classified.
# NOTE(review): the third argument is presumably the identifier column of the
# source table -- confirm against utils.transformForClassification.
text_cols = ["unittitle", "bioghist", "scopecontent", "processinfo"]
ead_doc_df = utils.transformForClassification(ead_df, text_cols, "unitid")

# The non-EAD table names its text and identifier columns differently.
text_cols = ["title", "description", "extent-and-medium"]
nonead_doc_df = utils.transformForClassification(nonead_df, text_cols, "node_id")

# Stack both sources into one table and order the rows by collection, record,
# field and text. Chaining returns the sorted frame directly, so no in-place
# mutation is needed.
doc_df = pd.concat([ead_doc_df, nonead_doc_df]).sort_values(
    ["eadid", "rowid", "field", "doc"]
)
doc_df.head()

Unnamed: 0,eadid,rowid,field,doc
26920,BP,BP,processinfo,This EAD description created by Ruth Sheret 27...
26520,BP,BP,scopecontent,The papers of the distinguished public servant...
25720,BP,BP,unittitle,Plowden (Lady Bridget) Archive
26521,BP,BP/01,scopecontent,"Formerly the 'Working Ladies Guild', the organ..."
25721,BP,BP/01,unittitle,Chairman (Managing Committee) of the Mary Fiel...


In [7]:
# Inspect the last five rows of the sorted table (five is also the pandas default).
doc_df.tail(n=5)

Unnamed: 0,eadid,rowid,field,doc
60652,WCT,WCT/8,unittitle,Diaries
61822,WCT,WCT/9,scopecontent,Personal diaries and notebooks.
60658,WCT,WCT/9,unittitle,Diaries and notebooks of Lady Pauline
61826,WCT,WCT/9/4,scopecontent,Microfilm reels of Pauline's diaries and sketc...
60662,WCT,WCT/9/4,unittitle,Microfilm copies of Lady Pauline's Diaries


In [8]:
# (rows, columns) of the combined table; the row count is the number of
# descriptions that will be classified.
doc_df.shape

(34225, 4)

There are 34,225 metadata descriptions to classify for *Omission* and *Stereotype*.

### 2. Classification
Load the baseline Omission and Stereotype Classifier (OSC).

In [9]:
# All four artifacts share the same path prefix from the project config.
osc_dir = config.baseline_osc_path

# NOTE(review): joblib files are pickle-based, so loading them can execute
# arbitrary code -- only load artifacts that come from a trusted source.
# These objects are used later in the notebook without being refitted, so the
# files must contain already-fitted estimators.
cvectorizer = joblib.load(osc_dir + "count_vectorizer.joblib")  # applied to the raw text
tfidf = joblib.load(osc_dir + "tfidf_transformer.joblib")  # applied to the vectorizer output
mlb = joblib.load(osc_dir + "mlb_targets_so.joblib")  # converts predictions back to label tuples
clf = joblib.load(osc_dir + "sgd-svm_F-tfidf_T-so.joblib")  # the classifier itself

Classify the descriptions.

In [10]:
# The text to classify: one description per row.
docs = doc_df["doc"]

# Apply the loaded pipeline step by step; only transform/predict are called,
# so nothing is refitted on this data.
vectorized = cvectorizer.transform(docs)
X = tfidf.transform(vectorized)
y = clf.predict(X)

In [11]:
# Convert the classifier output back into label names: each description gets a
# tuple of labels, empty when no label was predicted (see the table below).
predictions = mlb.inverse_transform(y)

In [12]:
# Attach the predicted label tuples as the last column of the table.
# Plain column assignment is used instead of DataFrame.insert so this cell can
# be re-run safely: insert raises ValueError when "prediction" already exists,
# whereas assignment overwrites the column. On a first run the result is the
# same -- the new column is appended at the end.
doc_df["prediction"] = predictions
doc_df.head()

Unnamed: 0,eadid,rowid,field,doc,prediction
26920,BP,BP,processinfo,This EAD description created by Ruth Sheret 27...,()
26520,BP,BP,scopecontent,The papers of the distinguished public servant...,"(Omission,)"
25720,BP,BP,unittitle,Plowden (Lady Bridget) Archive,()
26521,BP,BP/01,scopecontent,"Formerly the 'Working Ladies Guild', the organ...",()
25721,BP,BP/01,unittitle,Chairman (Managing Committee) of the Mary Fiel...,()


In [13]:
# Count how many descriptions received each combination of labels.
doc_df["prediction"].value_counts()

()                        32853
(Omission,)                1309
(Omission, Stereotype)       48
(Stereotype,)                15
Name: prediction, dtype: int64

Export the classified data.

In [15]:
filename = "baseline_osc_predictions.csv"

# Destination is the configured output path prefix followed by the file name.
predictions_path = config.classified_data_path + filename

# Default to_csv settings are used, so the index is written as the first column.
doc_df.to_csv(predictions_path)