# Classification

## Linguistic Classifier (LC) + Omission & Stereotype Classifier (OSC)

In [67]:
# Custom filepaths and functions
import config, clf_utils

# Libraries for data, file, and model loading
import pandas as pd
import random
import joblib
from joblib import load
import numpy as np
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize
from pathlib import Path

# Libraries for classification
from gensim.models import FastText
from gensim.utils import tokenize
from gensim import utils
from gensim.test.utils import get_tmpfile
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.feature_extraction.text import CountVectorizer
from skmultilearn.problem_transform import ClassifierChain
from sklearn.ensemble import RandomForestClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import SGDClassifier
import scipy
import sklearn.metrics
from sklearn.metrics import confusion_matrix, multilabel_confusion_matrix
from sklearn.metrics import precision_recall_fscore_support

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/lucyhavens/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


### 1. Load the Data

In [4]:
extracted_dir = "data/extracted/"
doc_df = pd.read_csv(extracted_dir+"nusc_archival_descs_for_clf.csv", index_col=0)
doc_df.head()

Unnamed: 0,description_id,eadid,rowid,field,doc
0,0,BP,BP,processinfo,This EAD description created by Ruth Sheret 27...
1,1,BP,BP,scopecontent,The papers of the distinguished public servant...
2,2,BP,BP,unittitle,Plowden (Lady Bridget) Archive
3,3,BP,BP/01,scopecontent,"Formerly the 'Working Ladies Guild', the organ..."
4,4,BP,BP/01,unittitle,Chairman (Managing Committee) of the Mary Fiel...


In [5]:
text_cols = ["doc"]
token_df = clf_utils.getTokenDF(doc_df, text_cols)
token_df.head()

Unnamed: 0,description_id,token_id,token
0,0,0,This
0,0,1,EAD
0,0,2,description
0,0,3,created
0,0,4,by


In [6]:
token_df.tail()

Unnamed: 0,description_id,token_id,token
57946,57946,1258138,of
57946,57946,1258139,Lady
57946,57946,1258140,Pauline
57946,57946,1258141,'s
57946,57946,1258142,Diaries


In [7]:
token_df.shape

(1258143, 3)

### 2. Linguistic Classification
Classify the data using the Linguistic Classifier to create *Generalization*, *Gendered Pronoun*, and *Gendered Role* features to input into the Omission and Stereotype Classifier.

Load the models.

In [8]:
ft_model = FastText.load("models/embeddings/fasttextcbow100_lowercased/fasttext_cbow_100d.model")
mlb_ling = joblib.load("models/transform_labels/mlb_targets_ling.joblib")  # formerly named mlb_linlabels.joblib
ling_clf = joblib.load("models/ling_osc/cc-rf_F-fasttextcbow100_T-linglabels.joblib")

In [9]:
print("Classification targets:", mlb_ling.classes_)

Classification targets: ['Gendered-Pronoun' 'Gendered-Role' 'Generalization']


Transform the token data into an embedding matrix to input into the classifier.

In [10]:
X = clf_utils.getFeatures(token_df, embedding_model=ft_model)

Classify the tokens.

In [11]:
y = ling_clf.predict(X)

In [12]:
predictions = mlb_ling.inverse_transform(y)
new_preds = [pred[0] if len(pred) > 0 else "O" for pred in predictions]

In [13]:
pred_col = "prediction"
token_df.insert(len(token_df.columns), pred_col, new_preds)
token_df.head()

Unnamed: 0,description_id,token_id,token,prediction
0,0,0,This,O
0,0,1,EAD,O
0,0,2,description,O
0,0,3,created,O
0,0,4,by,O


In [14]:
token_df[pred_col].value_counts()

O                   1235218
Gendered-Pronoun      18548
Gendered-Role          4015
Generalization          362
Name: prediction, dtype: int64

In [15]:
token_df.to_csv(config.classified_data_path+"baseline_lc_predictions.csv")

### 3. Omission & Stereotype Classification
Classify the data using the Omission and Stereotype Classifier to assign *Omission* and *Stereotype* codes to NUSC archival metadata descriptions.

### 3.1 Preprocessing

In [51]:
# ling_preds_file = config.classified_data_path+"baseline_lc_predictions.csv" # token_df
# token_df = pd.read_csv(ling_preds_file, index_col=0)
# token_df.head()

In [16]:
# Group the data by description so it can be combined with the description data
df_features = clf_utils.implodeDataFrameUnique(token_df[["description_id", "token_id", pred_col]], ["description_id"])

# Deduplicate lists in description_id, token_id, and prediction columns
cols = ["token_id", pred_col]
for col in cols:
    old_col = list(df_features[col])
    new_col = []
    if col == pred_col:
        for values in old_col:
            if (len(values) > 1) and ("O" in values):
                values.remove("O")
            values_list = list(values)
            values_list.sort()
            new_col += [values_list]
    else:
        new_col = old_col
    col_i = list(df_features.columns).index(col)
    df_features = df_features.drop(columns=[col])
    df_features.insert(col_i, col, new_col)

df_features.head()

Unnamed: 0_level_0,token_id,prediction
description_id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...",[O]
1,"[16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 2...","[Gendered-Pronoun, Gendered-Role, Generalization]"
2,"[267, 268, 269, 270, 271, 272]",[Gendered-Role]
3,"[273, 274, 275, 276, 277, 278, 279, 280, 281, ...","[Gendered-Pronoun, Gendered-Role, Generalization]"
4,"[352, 353, 354, 355, 342, 343, 344, 345, 346, ...",[Generalization]


In [17]:
df_features.tail()

Unnamed: 0_level_0,token_id,prediction
description_id,Unnamed: 1_level_1,Unnamed: 2_level_1
57942,[1258106],[O]
57943,"[1258107, 1258108, 1258109, 1258110, 1258111]",[O]
57944,"[1258112, 1258113, 1258114, 1258115, 1258116, ...",[Gendered-Role]
57945,"[1258118, 1258119, 1258120, 1258121, 1258122, ...",[O]
57946,"[1258136, 1258137, 1258138, 1258139, 1258140, ...",[Gendered-Role]


In [18]:
df_features[pred_col].value_counts()

[O]                                                  50746
[Gendered-Pronoun]                                    4037
[Gendered-Role]                                       1741
[Gendered-Pronoun, Gendered-Role]                     1081
[Generalization]                                       256
[Gendered-Role, Generalization]                         35
[Gendered-Pronoun, Generalization]                      26
[Gendered-Pronoun, Gendered-Role, Generalization]       25
Name: prediction, dtype: int64

### 3.2 Feature Extraction

Join the feature data (i.e., classifier-predicted *Gendered Pronoun*, *Gendered Role*, and *Generalization* labels) to the document data (i.e., NUSC archival catalog metadata descriptions).

In [19]:
df = df_features.join(doc_df.set_index("description_id"), on="description_id", how="right")

# Rename label columns for clarity
feature_col = "linguistic_prediction"
df = df.rename(columns={pred_col:feature_col})

# Remove "O" from linguistic prediction lists if other labels are present in the list
ling_pred = list(df[feature_col])
new_ling_pred = []
for pred in ling_pred:
    if len(pred) > 1:
        if "O" in pred:
            pred.remove("O")
    new_ling_pred += [pred]

df = df.drop(columns=[feature_col])
df.insert(3, feature_col, new_ling_pred)
df.head()

Unnamed: 0_level_0,token_id,eadid,rowid,linguistic_prediction,field,doc
description_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...",BP,BP,[O],processinfo,This EAD description created by Ruth Sheret 27...
1,"[16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 2...",BP,BP,"[Gendered-Pronoun, Gendered-Role, Generalization]",scopecontent,The papers of the distinguished public servant...
2,"[267, 268, 269, 270, 271, 272]",BP,BP,[Gendered-Role],unittitle,Plowden (Lady Bridget) Archive
3,"[273, 274, 275, 276, 277, 278, 279, 280, 281, ...",BP,BP/01,"[Gendered-Pronoun, Gendered-Role, Generalization]",scopecontent,"Formerly the 'Working Ladies Guild', the organ..."
4,"[352, 353, 354, 355, 342, 343, 344, 345, 346, ...",BP,BP/01,[Generalization],unittitle,Chairman (Managing Committee) of the Mary Fiel...


### 3.3 Classification

In [20]:
clf_os = joblib.load("models/ling_osc/sgd-svm_F-tfidf-ling_T-os.joblib")
mlb_os = joblib.load("models/transform_labels/mlb_targets_os.joblib")
cvectorizer = joblib.load("models/transform_docs/count_vectorizer.joblib")
tfidf_transformer = joblib.load("models/transform_docs/tfidf_transformer.joblib")

In [21]:
print(mlb_ling.classes_)
print(mlb_os.classes_)

['Gendered-Pronoun' 'Gendered-Role' 'Generalization']
['Omission' 'Stereotype']


In [22]:
docs = df["doc"]
vectorized = cvectorizer.transform(docs)
tfidf_matrix = tfidf_transformer.transform(vectorized)

In [23]:
features = mlb_ling.transform(df[feature_col])
X = scipy.sparse.hstack([tfidf_matrix, features])



In [None]:
y = clf_os.predict(X)
predictions = mlb_os.inverse_transform(y)
df.insert(len(df.columns), "os_prediction", predictions)
df.head()

Unnamed: 0_level_0,token_id,eadid,rowid,linguistic_prediction,field,doc,os_prediction
description_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...",BP,BP,[O],processinfo,This EAD description created by Ruth Sheret 27...,()
1,"[16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 2...",BP,BP,"[Gendered-Pronoun, Gendered-Role, Generalization]",scopecontent,The papers of the distinguished public servant...,"(Omission,)"
2,"[267, 268, 269, 270, 271, 272]",BP,BP,[Gendered-Role],unittitle,Plowden (Lady Bridget) Archive,()
3,"[273, 274, 275, 276, 277, 278, 279, 280, 281, ...",BP,BP/01,"[Gendered-Pronoun, Gendered-Role, Generalization]",scopecontent,"Formerly the 'Working Ladies Guild', the organ...","(Omission,)"
4,"[352, 353, 354, 355, 342, 343, 344, 345, 346, ...",BP,BP/01,[Generalization],unittitle,Chairman (Managing Committee) of the Mary Fiel...,()


In [47]:
df.os_prediction.value_counts()

()                        54648
(Omission,)                2096
(Omission, Stereotype)      918
(Stereotype,)               285
Name: os_prediction, dtype: int64

In [48]:
print("Proportion of descriptions classified:", (2096+918+285)/54648) # about 6 percent (1% more than found with baseline)

Proportion of descriptions classified: 0.06036817449860928


Save the classified data.

In [49]:
filename = "lc-osc_predictions.csv"
df.to_csv(config.classified_data_path+filename)

Deduplicate the descriptions and create a new export where each row has a unique doc-eadid-prediction combination.

In [50]:
df_dedup1 = df.drop_duplicates(subset=["eadid", "doc", "os_prediction"])
df_dedup2 = df.drop_duplicates(subset=["doc", "os_prediction"])
df_dedup3 = df.drop_duplicates(subset=["doc"])
print(df_dedup1.shape)
print(df_dedup2.shape)  # as expected, same as next
print(df_dedup3.shape)  # as expected, same as previous

(27235, 7)
(27209, 7)
(27209, 7)


In [51]:
dedup_filename = "lc-osc_predictions_deduplicated.csv"
df_dedup1.to_csv(config.classified_data_path+dedup_filename)

In [52]:
df_dedup1.os_prediction.value_counts()

()                        24330
(Omission,)                1869
(Omission, Stereotype)      779
(Stereotype,)               257
Name: os_prediction, dtype: int64

In [53]:
print("Proportion of unique descriptions classified:", (1869+779+257)/24330) # about 12%

Proportion of unique descriptions classified: 0.1193999177969585


### 3. Manual Review
Export a selection of the descriptions for manual review, including a combination of unclassified and classified descriptions.

In [54]:
unclf_docs = df_dedup1.loc[df_dedup1.os_prediction == tuple()]
unclf_docs.head()

Unnamed: 0_level_0,token_id,eadid,rowid,linguistic_prediction,field,doc,os_prediction
description_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...",BP,BP,[O],processinfo,This EAD description created by Ruth Sheret 27...,()
2,"[267, 268, 269, 270, 271, 272]",BP,BP,[Gendered-Role],unittitle,Plowden (Lady Bridget) Archive,()
4,"[352, 353, 354, 355, 342, 343, 344, 345, 346, ...",BP,BP/01,[Generalization],unittitle,Chairman (Managing Committee) of the Mary Fiel...,()
5,"[384, 356, 357, 358, 359, 360, 361, 362, 363, ...",BP,BP/01/01,[O],scopecontent,Annual reports covering the years 1959 - 1995....,()
6,"[385, 386]",BP,BP/01/01,[O],unittitle,Annual Reports,()


In [55]:
clf_docs = df_dedup1.loc[df_dedup1.os_prediction != tuple()]
clf_docs.head()

Unnamed: 0_level_0,token_id,eadid,rowid,linguistic_prediction,field,doc,os_prediction
description_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,"[16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 2...",BP,BP,"[Gendered-Pronoun, Gendered-Role, Generalization]",scopecontent,The papers of the distinguished public servant...,"(Omission,)"
3,"[273, 274, 275, 276, 277, 278, 279, 280, 281, ...",BP,BP/01,"[Gendered-Pronoun, Gendered-Role, Generalization]",scopecontent,"Formerly the 'Working Ladies Guild', the organ...","(Omission,)"
20,"[549, 550, 551, 552, 553, 554, 555, 556, 557, ...",BP,BP/01/04,"[Gendered-Role, Generalization]",scopecontent,Correspondence in relation to Lady Plowden's p...,"(Omission,)"
22,"[640, 641, 642, 643, 644, 645, 646, 647, 648, ...",BP,BP/01/05,[Gendered-Role],scopecontent,Small file of research (not carried out by Lad...,"(Omission,)"
30,"[768, 769, 770, 771, 772, 773, 774, 775, 776, ...",BP,BP/02/03,[Gendered-Role],scopecontent,Correspondence in relation to Lady Plowden's p...,"(Omission,)"


In [56]:
print(unclf_docs.shape)
print(clf_docs.shape)

(24330, 7)
(2905, 7)


In [57]:
unclf_docs_list = list(unclf_docs.doc)
clf_docs_list = list(clf_docs.doc)
print(clf_docs_list[0])

The papers of the distinguished public servant Lady Plowden (1910-2000) were generously gifted to Newcastle University Library in 2003 by the Plowden family.
Held in Special Collections, Lady Plowden's papers are an extensive and rich resource reflecting her many areas of concern. Lady Plowden held a number of high-profile public roles in the spheres of education reform and television broadcasting, most notably as Chairman of the Central Advisory Council for Education (England), 1963 – 1967, Vice-Chairman of the BBC Board of Governors, 1970 – 1975, and Chairman of the Independent Broadcasting Authority (IBA), 1975 – 1980. A large and varied number of public roles followed, many of which retained this focus on primary and pre-school education reform as well as the promotion of high-quality television broadcasting. Lady Plowden's interests were wide however and these papers also reflect Lady Plowden's roles within organisations related to Romany and Traveller education and rights, adult 

In [58]:
unclf_token_counts = [len(word_tokenize(doc)) for doc in unclf_docs_list]
clf_token_counts = [len(word_tokenize(doc)) for doc in clf_docs_list]
print(unclf_token_counts[0])
print(clf_token_counts[0])

16
251


In [60]:
print("Minimum, average, standard deviation, median, and maximum token counts in UNCLASSIFIED descriptions:")
print(np.min(unclf_token_counts))
print(np.mean(unclf_token_counts))
print(np.std(unclf_token_counts))
print(np.median(unclf_token_counts))
print(np.max(unclf_token_counts))

Minimum, average, standard deviation, median, and maximum token counts in UNCLASSIFIED descriptions:
1
16.016933826551583
22.144144438023456
11.0
706


In [61]:
print("Minimum, average, standard deviation, median, and maximum token counts in CLASSIFIED descriptions:")
print(np.min(clf_token_counts))
print(np.mean(clf_token_counts))
print(np.std(clf_token_counts))
print(np.median(clf_token_counts))
print(np.max(clf_token_counts))

Minimum, average, standard deviation, median, and maximum token counts in CLASSIFIED descriptions:
2
41.64647160068847
89.38022192312549
18.0
2478


In [69]:
clf_docs = clf_docs.reset_index()
unclf_docs = unclf_docs.reset_index()

In [70]:
o_docs = clf_docs.loc[clf_docs.os_prediction == ('Omission',)]
print(o_docs.shape)
o_docs.head()

(1869, 8)


Unnamed: 0,description_id,token_id,eadid,rowid,linguistic_prediction,field,doc,os_prediction
0,1,"[16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 2...",BP,BP,"[Gendered-Pronoun, Gendered-Role, Generalization]",scopecontent,The papers of the distinguished public servant...,"(Omission,)"
1,3,"[273, 274, 275, 276, 277, 278, 279, 280, 281, ...",BP,BP/01,"[Gendered-Pronoun, Gendered-Role, Generalization]",scopecontent,"Formerly the 'Working Ladies Guild', the organ...","(Omission,)"
2,20,"[549, 550, 551, 552, 553, 554, 555, 556, 557, ...",BP,BP/01/04,"[Gendered-Role, Generalization]",scopecontent,Correspondence in relation to Lady Plowden's p...,"(Omission,)"
3,22,"[640, 641, 642, 643, 644, 645, 646, 647, 648, ...",BP,BP/01/05,[Gendered-Role],scopecontent,Small file of research (not carried out by Lad...,"(Omission,)"
4,30,"[768, 769, 770, 771, 772, 773, 774, 775, 776, ...",BP,BP/02/03,[Gendered-Role],scopecontent,Correspondence in relation to Lady Plowden's p...,"(Omission,)"


In [71]:
os_docs = clf_docs.loc[clf_docs.os_prediction == ('Omission', 'Stereotype')]
print(os_docs.shape)
os_docs.head()

(779, 8)


Unnamed: 0,description_id,token_id,eadid,rowid,linguistic_prediction,field,doc,os_prediction
56,459,"[16129, 16130, 16131, 16132, 16133, 16134]",BP,BP/27/39,[Gendered-Role],unittitle,Research File: Women and Work,"(Omission, Stereotype)"
89,655,"[23424, 23425, 23426, 23427, 23428, 23429, 234...",BXB,BXB/1/1/AGA/1/4,[Gendered-Role],scopecontent,Consists of a photocopy of both the book cover...,"(Omission, Stereotype)"
90,724,"[24465, 24466, 24467, 24468, 24469, 24470, 244...",BXB,BXB/1/1/AGA/6/11,[Gendered-Role],scopecontent,Consists of photocopies of a poem entitled 'Ma...,"(Omission, Stereotype)"
96,840,"[27001, 27002, 27003]",BXB,BXB/1/1/ALV/1,[Gendered-Role],unittitle,Carrying My Wife,"(Omission, Stereotype)"
98,871,"[27235, 27236, 27237, 27238, 27239, 27240, 272...",BXB,BXB/1/1/ARM,"[Gendered-Pronoun, Gendered-Role]",bioghist,Simon Armitage was born in 1963 in Huddersfiel...,"(Omission, Stereotype)"


In [72]:
s_docs = clf_docs.loc[clf_docs.os_prediction == ('Stereotype',)]
print(s_docs.shape)
s_docs.head()

(257, 8)


Unnamed: 0,description_id,token_id,eadid,rowid,linguistic_prediction,field,doc,os_prediction
45,374,"[10017, 10018, 10019]",BP,BP/19/02/10,[Gendered-Role],unittitle,Women and Employment,"(Stereotype,)"
57,460,"[16135, 16136, 16137, 16138, 16139, 16140, 161...",BP,BP/27/40,[O],unittitle,Research File: A Woman's View of the Recession...,"(Stereotype,)"
108,1180,"[33428, 33429, 33430]",BXB,BXB/1/1/BOL/1,[O],unittitle,The Peepshow Girl,"(Stereotype,)"
172,3912,"[79353, 79354, 79355, 79356, 79357, 79358, 79359]",BXB,BXB/1/1/HOJ/1,[O],unittitle,The Brief History of a Disreputable Woman,"(Stereotype,)"
188,4714,"[92263, 92264, 92265, 92266, 92267]",BXB,BXB/1/1/KEB/6,[Gendered-Role],unittitle,Euripides' The Trojan Women,"(Stereotype,)"


In [73]:
sample_size = 50

In [74]:
unclf_sample_ids = random.sample(list(unclf_docs.description_id), sample_size)
unclf_sample = unclf_docs.loc[unclf_docs.description_id.isin(unclf_sample_ids)]
assert(unclf_sample.shape[0] == sample_size)
unclf_sample.head()

Unnamed: 0,description_id,token_id,eadid,rowid,linguistic_prediction,field,doc,os_prediction
405,532,"[17715, 17716, 17717, 17718, 17719, 17720, 177...",BP,BP/30/04/21,[O],unittitle,"Family Photograph Album: the Bell family, 1922...",()
417,546,"[18054, 18055, 18056, 18057, 18058, 18059, 180...",BP,BP/30/04/28,[O],unittitle,Family Photograph Album: Herbert William Richm...,()
525,699,"[24248, 24246, 24247]",BXB,BXB/1/1/AGA/4/1,[O],unittitle,Correspondence and leaflet,()
811,1183,"[33454, 33455, 33456, 33457, 33458, 33459, 334...",BXB,BXB/1/1/BOL/1/2,[O],scopecontent,Consists of a negative of the front cover imag...,()
941,1392,"[37469, 37470]",BXB,BXB/1/1/CAN,[O],unittitle,Niall Campbell,()


In [75]:
o_sample_ids = random.sample(list(o_docs.description_id), sample_size)
o_sample = o_docs.loc[o_docs.description_id.isin(o_sample_ids)]
assert(o_sample.shape[0] == sample_size)
o_sample.head()

Unnamed: 0,description_id,token_id,eadid,rowid,linguistic_prediction,field,doc,os_prediction
127,1924,"[46080, 46081, 46082, 46083, 46084, 46085, 460...",BXB,BXB/1/1/DHA,[Gendered-Pronoun],bioghist,"Born in Pakistan, Imtiaz Dharker grew up a Mus...","(Omission,)"
142,2488,"[54839, 54840, 54841, 54842, 54843, 54844, 548...",BXB,BXB/1/1/ELP/1,"[Gendered-Pronoun, Gendered-Role]",bioghist,[Taken from Bloodaxe Books website] Gilbert Bo...,"(Omission,)"
184,4471,"[88064, 88065, 88066, 88067, 88068, 88069, 880...",BXB,BXB/1/1/KAN,"[Gendered-Pronoun, Gendered-Role, Generalization]",bioghist,Sylvia Kantaris was born in 1936 in Derbyshire...,"(Omission,)"
198,5094,"[97950, 97951, 97952, 97953, 97954, 97955, 979...",BXB,BXB/1/1/LIT,"[Gendered-Pronoun, Gendered-Role]",bioghist,"S.J. Litherland's work encompasses love, polit...","(Omission,)"
220,6256,"[117771, 117772, 117773, 117774, 117775, 11777...",BXB,BXB/1/1/OSU/2/5,[Gendered-Pronoun],scopecontent,Consists of a selection of poems that do appea...,"(Omission,)"


In [76]:
os_sample_ids = random.sample(list(os_docs.description_id), sample_size)
os_sample = os_docs.loc[os_docs.description_id.isin(os_sample_ids)]
assert(os_sample.shape[0] == sample_size)
os_sample.head()

Unnamed: 0,description_id,token_id,eadid,rowid,linguistic_prediction,field,doc,os_prediction
96,840,"[27001, 27002, 27003]",BXB,BXB/1/1/ALV/1,[Gendered-Role],unittitle,Carrying My Wife,"(Omission, Stereotype)"
98,871,"[27235, 27236, 27237, 27238, 27239, 27240, 272...",BXB,BXB/1/1/ARM,"[Gendered-Pronoun, Gendered-Role]",bioghist,Simon Armitage was born in 1963 in Huddersfiel...,"(Omission, Stereotype)"
166,3719,"[76064, 76065, 76066, 76067, 76068, 76069, 760...",BXB,BXB/1/1/HIL/13/6,[Gendered-Role],scopecontent,Typescript is under working title 'Man in Unif...,"(Omission, Stereotype)"
1279,28064,"[403336, 403337, 403338, 403339, 403340, 40334...",GB,110241,[O],description,Hugo and men outside Dak Bungalow.,"(Omission, Stereotype)"
1341,28758,"[412460, 412461, 412462, 412463, 412464, 41246...",GB,110589,[O],description,Man Standing On Columned Balcony Of House,"(Omission, Stereotype)"


In [77]:
s_sample_ids = random.sample(list(s_docs.description_id), sample_size)
s_sample = s_docs.loc[s_docs.description_id.isin(s_sample_ids)]
assert(s_sample.shape[0] == sample_size)
s_sample.head()

Unnamed: 0,description_id,token_id,eadid,rowid,linguistic_prediction,field,doc,os_prediction
1137,26682,"[382560, 382561, 382562, 382563, 382564, 38256...",GB,109546,[O],description,Team of six horses and man ploughing - probabl...,"(Stereotype,)"
1166,27253,"[391309, 391310, 391311, 391312, 391313, 39131...",GB,109834,[O],description,"Royal Palace, Anup Talao Pavilion - Turkish Su...","(Stereotype,)"
1257,27849,"[400448, 400449, 400450, 400451, 400452, 40045...",GB,110133,[O],description,Detail of carved elephant from Dhamekh Stupa -...,"(Stereotype,)"
1284,28120,"[403915, 403916, 403917, 403918, 403919, 40392...",GB,110269,[O],description,Nepalese woman carrying a bedding pack.,"(Stereotype,)"
1285,28124,"[403954, 403955, 403956, 403957, 403958, 40395...",GB,110271,[O],description,Nepalese man carrying load on back and loaded ...,"(Stereotype,)"


In [78]:
sample_path = config.classified_data_path+"sample_ling-os/"
Path(sample_path).mkdir(parents=True, exist_ok=True)

In [79]:
sample_file = "baseline_osc_predictions_unclf_sample.csv"
unclf_sample.to_csv(sample_path+sample_file)

In [80]:
sample_file = "baseline_osc_predictions_omission_sample.csv"
o_sample.to_csv(sample_path+sample_file)

In [81]:
sample_file = "baseline_osc_predictions_omission-stereotype_sample.csv"
os_sample.to_csv(sample_path+sample_file)

In [82]:
sample_file = "baseline_osc_predictions_stereotype_sample.csv"
s_sample.to_csv(sample_path+sample_file)

In [83]:
df_sample = pd.concat([unclf_sample, o_sample, os_sample, s_sample])
df_sample = df_sample.sort_values(by=["description_id"])
df_sample.head()

Unnamed: 0,description_id,token_id,eadid,rowid,linguistic_prediction,field,doc,os_prediction
405,532,"[17715, 17716, 17717, 17718, 17719, 17720, 177...",BP,BP/30/04/21,[O],unittitle,"Family Photograph Album: the Bell family, 1922...",()
417,546,"[18054, 18055, 18056, 18057, 18058, 18059, 180...",BP,BP/30/04/28,[O],unittitle,Family Photograph Album: Herbert William Richm...,()
525,699,"[24248, 24246, 24247]",BXB,BXB/1/1/AGA/4/1,[O],unittitle,Correspondence and leaflet,()
96,840,"[27001, 27002, 27003]",BXB,BXB/1/1/ALV/1,[Gendered-Role],unittitle,Carrying My Wife,"(Omission, Stereotype)"
98,871,"[27235, 27236, 27237, 27238, 27239, 27240, 272...",BXB,BXB/1/1/ARM,"[Gendered-Pronoun, Gendered-Role]",bioghist,Simon Armitage was born in 1963 in Huddersfiel...,"(Omission, Stereotype)"


In [84]:
df_sample.tail()

Unnamed: 0,description_id,token_id,eadid,rowid,linguistic_prediction,field,doc,os_prediction
2751,56883,"[1242903, 1242904, 1242905, 1242906, 1242907, ...",WCT,WCT 120,[Gendered-Role],scopecontent,"Mostly relating to temperance, with reference ...","(Omission,)"
2772,56997,"[1244640, 1244641, 1244642, 1244643, 1244644, ...",WCT,WCT 154,"[Gendered-Pronoun, Gendered-Role]",scopecontent,"Regarding art, with reference to Rossetti. Fea...","(Omission,)"
23607,57045,"[1245189, 1245190, 1245191, 1245192, 1245193, ...",WCT,WCT 160,[O],unittitle,Letters to Walter and John Trevelyan from Geor...,()
2789,57105,"[1246124, 1246125, 1246126, 1246127, 1246128, ...",WCT,WCT 178,[O],scopecontent,"Regarding anti-war publications, with copies o...","(Omission,)"
2848,57553,"[1250998, 1250999, 1251000, 1251001, 1251002, ...",WCT,WCT 245,[Gendered-Role],scopecontent,Includes discussion of a manuscript autobiogra...,"(Omission,)"


In [85]:
clf_sample_file = "baseline_osc_predictions_sample_with_preds.csv"
df_sample.to_csv(sample_path+clf_sample_file)

In [87]:
df_sample_unclf = df_sample.drop(columns=["os_prediction"])
unclf_sample_file = "baseline_osc_predictions_sample_no_preds.csv"
df_sample_unclf.to_csv(sample_path+unclf_sample_file)