# Combine Manually Coded Data
## Manual Review, part II

In [1]:
import pandas as pd       # For data analysis
import numpy as np        # For numerical operations
from pathlib import Path  # For writing data and creating directories
import os                 # For reading data files
import config             # For accessing path variables

### I. Combine Manually Coded Data Files

Read all the CSV data files with manual coding.

In [3]:
coded_path = config.coded_data_path1
csvs = os.listdir(coded_path)
to_remove = ["all_manually_coded_data.csv", ".DS_Store"]
for f in to_remove:
    if f in csvs:
        csvs.remove(f)
print(csvs)

['CHE_sample_no_preds-manually-checked.csv', 'CPT_sample_no_preds-manually-checked.csv', 'SH_sample_no_preds-manually-checked.csv', 'BXB_sample_no_preds-manually-checked.csv', 'BP_unclassified-manually-checked.csv', 'HL_unclassified-manually-checked.csv', 'WCT_sample_no_preds-manually-checked.csv', 'SW_sample_no_preds-manually-checked.csv', 'OBR_sample_no_preds-manually-checked.csv', 'THS_unclassified-manually-checked.csv']


In [4]:
df1 = pd.DataFrame()
df2 = pd.DataFrame()
for csv in csvs:
    if "sample" in csv:
        subdf1 = pd.read_csv(coded_path + csv)
        df1 = pd.concat([df1, subdf1])
    else:
        subdf2 = pd.read_csv(coded_path + csv)
        df2 = pd.concat([df2, subdf2])

In [5]:
df1.rename(columns={
    "Gender bias":"gender_bias", "Unnamed: 1":"index"
    }, inplace=True)
df1.set_index("index", inplace=True)
df1.head(5)

Unnamed: 0_level_0,gender_bias,description_id,eadid,rowid,field,doc
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
11452,,11452,CHE,CHE/01/01,scopecontent,Drafts and meeting notes relating to the creat...
11467,,11467,CHE,CHE/02/04,unittitle,CHE Supporters Review
11471,,11471,CHE,CHE/02/06,unittitle,Friend Newcastle Annual Reports
11554,,11554,CHE,CHE/03/06/12,unittitle,Collection of documents on sex education in sc...
11555,,11555,CHE,CHE/03/06/13,unittitle,"Letters, newsletters and leaflets on the topic..."


In [6]:
df1.gender_bias.value_counts()

gender_bias
Omission - could we say more about her contribution?                                                                        2
Omission - could we say more about his contribution?                                                                        2
Omission - who was Herbert Samuel married to?                                                                               1
Omission - replace 'her' with name in first instance                                                                        1
Stereotype - the tone of this biography is different to the others. It is grandiose and frames Reverdy as a male genius.    1
Omission - missing Charlotte's maiden name. Do we have any more information about her?                                      1
Omission - missing Cooper's first name                                                                                      1
Omission - do we know why it was cancelled?                                                               

In [7]:
df2.rename(columns={
    "Unnamed: 0":"index", "If yeStereotype, Type?":"type", "gender_bias?":"gender_bias"
    }, inplace=True)
df2.set_index("index", inplace=True)
df2.fillna({"gender_bias": "n"}, inplace=True)
df2.fillna({"type": "NA"}, inplace=True)
df2.head(5)

Unnamed: 0_level_0,doc,gender_bias,type,eadid,description_id,rowid,field
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,'ACERT The First Hundred Days',n,,BP,[186],['BP/09/09/03'],['unittitle']
1,'British Housing: Problems and Solutions',n,,BP,[189],['BP/09/09/05'],['unittitle']
2,'Information Booklet' 1989,n,,BP,[199],['BP/09/09/13'],['unittitle']
3,'Information Booklet' 1990,n,,BP,[203],['BP/09/09/17'],['unittitle']
4,"'Looking Back Moving Forwards', 1985",n,,BP,[193],['BP/09/09/07'],['unittitle']


Convert the last three columns' values from strings to lists.

In [8]:
col_names = ["description_id", "rowid", "field"]
for col_name in col_names:
    new_col = list(df2[col_name])
    new_col = [(col[1:-1]).replace("'", "").split(", ") for col in new_col]
    col_i = list(df2.columns).index(col_name)
    df2.drop(columns=[col_name], inplace=True)
    df2.insert(col_i, col_name, new_col)
df2.head()

Unnamed: 0_level_0,doc,gender_bias,type,eadid,description_id,rowid,field
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,'ACERT The First Hundred Days',n,,BP,[186],[BP/09/09/03],[unittitle]
1,'British Housing: Problems and Solutions',n,,BP,[189],[BP/09/09/05],[unittitle]
2,'Information Booklet' 1989,n,,BP,[199],[BP/09/09/13],[unittitle]
3,'Information Booklet' 1990,n,,BP,[203],[BP/09/09/17],[unittitle]
4,"'Looking Back Moving Forwards', 1985",n,,BP,[193],[BP/09/09/07],[unittitle]


Explode the DataFrame so there's only one description_id per row.

In [9]:
df2 = df2.explode(["description_id","rowid","field"])
df2.head()

Unnamed: 0_level_0,doc,gender_bias,type,eadid,description_id,rowid,field
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,'ACERT The First Hundred Days',n,,BP,186,BP/09/09/03,unittitle
1,'British Housing: Problems and Solutions',n,,BP,189,BP/09/09/05,unittitle
2,'Information Booklet' 1989,n,,BP,199,BP/09/09/13,unittitle
3,'Information Booklet' 1990,n,,BP,203,BP/09/09/17,unittitle
4,"'Looking Back Moving Forwards', 1985",n,,BP,193,BP/09/09/07,unittitle


Standardize the manual coding, with one column for gender bias, one column marking the type of gender bias, and and one column recording any notes about the observed gender bias.

In [10]:
df1["type"] = df1["gender_bias"].str.split(" - ", expand=True)[0]
df1["note"] = df1["gender_bias"].str.rsplit(" - ", expand=True)[1]
df1 = df1[["doc", "gender_bias", "type", "note", "eadid", "description_id", "rowid", "field"]]
df1.fillna({"gender_bias": "n"}, inplace=True)
df1.fillna({"type": "NA"}, inplace=True)
df1.fillna({"note": "NA"}, inplace=True)
df1.head()

Unnamed: 0_level_0,doc,gender_bias,type,note,eadid,description_id,rowid,field
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
11452,Drafts and meeting notes relating to the creat...,n,,,CHE,11452,CHE/01/01,scopecontent
11467,CHE Supporters Review,n,,,CHE,11467,CHE/02/04,unittitle
11471,Friend Newcastle Annual Reports,n,,,CHE,11471,CHE/02/06,unittitle
11554,Collection of documents on sex education in sc...,n,,,CHE,11554,CHE/03/06/12,unittitle
11555,"Letters, newsletters and leaflets on the topic...",n,,,CHE,11555,CHE/03/06/13,unittitle


In [11]:
df1.type.value_counts()

type
NA            678
Omission       26
Stereotype      1
Name: count, dtype: int64

In [12]:
df1.note.value_counts()

note
NA                                                                                                             678
could we say more about her contribution?                                                                        2
could we say more about his contribution?                                                                        2
who was Herbert Samuel married to?                                                                               1
the tone of this biography is different to the others. It is grandiose and frames Reverdy as a male genius.      1
missing Charlotte's maiden name. Do we have any more information about her?                                      1
missing Cooper's first name                                                                                      1
do we know why it was cancelled?                                                                                 1
who are 'the contributors'?                                                

In [13]:
gender_bias_values = list(df1.loc[df1.gender_bias != "n"].gender_bias)
df1 = df1.replace(to_replace=gender_bias_values, value="y")
df1.tail()

Unnamed: 0_level_0,doc,gender_bias,type,note,eadid,description_id,rowid,field
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
43357,First draft typescript of 'Laughter When We're...,n,,,OBR,43357,OBR/3/2/6/16,unittitle
43401,Photocopy of chapter from 'Rules of the Game: ...,n,,,OBR,43401,OBR/3/2/8/6,unittitle
43414,Typescript of talk on 'Translating The Birds f...,n,,,OBR,43414,OBR/3/2/9/7,unittitle
43464,Sheffield Hallam University,n,,,OBR,43464,OBR/7/3,unittitle
43498,Includes flyers and posters for poetry reading...,n,,,OBR,43498,OBR/8/3/3,scopecontent


In [14]:
df2.insert(column="note", value="NA", loc=3)
df2.head()

Unnamed: 0_level_0,doc,gender_bias,type,note,eadid,description_id,rowid,field
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,'ACERT The First Hundred Days',n,,,BP,186,BP/09/09/03,unittitle
1,'British Housing: Problems and Solutions',n,,,BP,189,BP/09/09/05,unittitle
2,'Information Booklet' 1989,n,,,BP,199,BP/09/09/13,unittitle
3,'Information Booklet' 1990,n,,,BP,203,BP/09/09/17,unittitle
4,"'Looking Back Moving Forwards', 1985",n,,,BP,193,BP/09/09/07,unittitle


In [15]:
df = pd.concat([df1, df2])
df.head()

Unnamed: 0_level_0,doc,gender_bias,type,note,eadid,description_id,rowid,field
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
11452,Drafts and meeting notes relating to the creat...,n,,,CHE,11452,CHE/01/01,scopecontent
11467,CHE Supporters Review,n,,,CHE,11467,CHE/02/04,unittitle
11471,Friend Newcastle Annual Reports,n,,,CHE,11471,CHE/02/06,unittitle
11554,Collection of documents on sex education in sc...,n,,,CHE,11554,CHE/03/06/12,unittitle
11555,"Letters, newsletters and leaflets on the topic...",n,,,CHE,11555,CHE/03/06/13,unittitle


In [16]:
df.gender_bias.value_counts()

gender_bias
n    12287
y       50
Name: count, dtype: int64

In [17]:
df.type.value_counts()

type
NA                         12287
Omission                      45
Sterotype                      2
Stereotype and Omission        2
Stereotype                     1
Name: count, dtype: int64

...most of which were *Omission*!

In [18]:
df.shape

(12337, 8)

Replace the `type` column with two new columns to each record `"y"` or `"n"` for *Omission* and *Stereotype*.

In [19]:
type_list = list(df.type)
omission_manual = ["y" if "Omission" in t else "n" for t in type_list]
stereotype_manual = ["y" if "Stereotype" in t else "n" for t in type_list]
# print(type_list[100:150])
# print(omission_manual[100:150])
# print(stereotype_manual[100:150])

In [20]:
df.insert(column="omission", value=omission_manual, loc=2)
df.insert(column="stereotype", value=stereotype_manual, loc=3)
df.tail()

Unnamed: 0_level_0,doc,gender_bias,omission,stereotype,type,note,eadid,description_id,rowid,field
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
4861,“The Way to the Cathedral” I.,n,n,n,,,THS,32972,THS 61.1,unittitle
4862,“The Way to the Cathedral” II.,n,n,n,,,THS,32974,THS 61.2,scopecontent
4862,“The Way to the Cathedral” II.,n,n,n,,,THS,32975,THS 61.2,unittitle
4863,“Thomas Sharp – an appreciation” by Lewis Keeble.,n,n,n,,,THS,32934,THS 56.2,unittitle
4864,“Thomas Sharp – an appreciation” by Lewis Keeb...,n,n,n,,,THS,32933,THS 56.2,scopecontent


Export the combined manually coded data.

In [22]:
combined_filename = "all_manually_coded_data.csv"
df.to_csv(config.coded_data_path1+combined_filename)