# Classification

## Baseline Omission and Stereotype Classifier

In [2]:
import re
from pathlib import Path

import joblib
import pandas as pd
from joblib import load
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import multilabel_confusion_matrix
from sklearn.metrics import precision_recall_fscore_support
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import MultiLabelBinarizer

import config, my_utils


### 1. Preprocessing

Load the extracted archival metadata from the Newcastle University Special Collections catalog.

In [3]:
# Directory holding the extracted catalog descriptions (relative to the notebook).
extracted_dir = "data/extracted/"

# The first CSV column is used as the DataFrame index rather than as data.
doc_df = pd.read_csv(
    extracted_dir + "nusc_archival_descs_for_clf.csv",
    index_col=0,
)

# Preview the first rows.
doc_df.head()


Unnamed: 0,description_id,eadid,rowid,field,doc
0,0,BP,BP,processinfo,This EAD description created by Ruth Sheret 27...
1,1,BP,BP,scopecontent,The papers of the distinguished public servant...
2,2,BP,BP,unittitle,Plowden (Lady Bridget) Archive
3,3,BP,BP/01,scopecontent,"Formerly the 'Working Ladies Guild', the organ..."
4,4,BP,BP/01,unittitle,Chairman (Managing Committee) of the Mary Fiel...


In [4]:
# Preview the last rows to confirm the file was read through to the end.
doc_df.tail()

Unnamed: 0,description_id,eadid,rowid,field,doc
34220,34220,WCT,WCT/8,unittitle,Diaries
34221,34221,WCT,WCT/9,scopecontent,Personal diaries and notebooks.
34222,34222,WCT,WCT/9,unittitle,Diaries and notebooks of Lady Pauline
34223,34223,WCT,WCT/9/4,scopecontent,Microfilm reels of Pauline's diaries and sketc...
34224,34224,WCT,WCT/9/4,unittitle,Microfilm copies of Lady Pauline's Diaries


In [5]:
# (rows, columns): one row per metadata description to classify.
doc_df.shape

(34225, 5)

There are 34,225 metadata descriptions to classify for *Omission* and *Stereotype*.

### 2. Classification
Load the baseline Omission and Stereotype Classifier (OSC), together with the fitted count vectorizer, TF-IDF transformer, and label binarizer it relies on.

In [6]:
# Restore the persisted pipeline pieces in the order they are applied:
# document vectorizer, TF-IDF transformer, label binarizer, classifier.
# NOTE(review): joblib files are pickle-based, so only load artifacts from a
# trusted source.
cvectorizer, tfidf, mlb, clf = (
    load(artifact_path)
    for artifact_path in (
        "models/transform_docs/count_vectorizer.joblib",
        "models/transform_docs/tfidf_transformer.joblib",
        "models/transform_labels/mlb_targets_os.joblib",
        "models/baseline_osc/sgd-svm_F-tfidf_T-os.joblib",
    )
)

Classify the descriptions.

In [7]:
# Featurize the description text with the loaded vectorizer and TF-IDF
# transformer, then predict with the loaded classifier. Only transform() and
# predict() are called, so nothing is re-fitted on this data.
vectorized = cvectorizer.transform(doc_df.loc[:, "doc"])
X = tfidf.transform(vectorized)
y = clf.predict(X)

In [8]:
# Convert the classifier output back into label names: one tuple per
# description, empty when no label was predicted.
predictions = mlb.inverse_transform(y)

In [9]:
# Attach each description's predicted label tuple as a "prediction" column.
# Column assignment is used instead of DataFrame.insert so the cell can be
# re-run safely: insert raises "ValueError: cannot insert prediction, already
# exists" on a second run, whereas assignment overwrites the column in place.
# On the first run the column is still appended as the last column.
# dtype=object keeps every value a tuple, and passing the index makes a length
# mismatch between predictions and rows fail loudly.
doc_df["prediction"] = pd.Series(list(predictions), index=doc_df.index, dtype=object)
doc_df.head()

Unnamed: 0,description_id,eadid,rowid,field,doc,prediction
0,0,BP,BP,processinfo,This EAD description created by Ruth Sheret 27...,()
1,1,BP,BP,scopecontent,The papers of the distinguished public servant...,()
2,2,BP,BP,unittitle,Plowden (Lady Bridget) Archive,()
3,3,BP,BP/01,scopecontent,"Formerly the 'Working Ladies Guild', the organ...",()
4,4,BP,BP/01,unittitle,Chairman (Managing Committee) of the Mary Fiel...,()


In [10]:
# Tally how many descriptions received each combination of labels.
doc_df["prediction"].value_counts()

()                        32879
(Omission,)                1265
(Omission, Stereotype)       65
(Stereotype,)                16
Name: prediction, dtype: int64

Export the classified data.

In [11]:
# Write the descriptions and their predictions to the classified-data location
# defined in the project config.
filename = "baseline_osc_predictions.csv"
output_path = Path(config.classified_data_path + filename)

# Create the target directory if needed: to_csv does not create missing
# directories and would otherwise fail on a fresh checkout.
output_path.parent.mkdir(parents=True, exist_ok=True)

# NOTE: the tuple-valued "prediction" column is written as its string form
# (e.g. "('Omission',)"), so it must be parsed when the CSV is read back.
doc_df.to_csv(output_path)