# Analysis of Transcriptions of the Gertrude Bell Archive

#### Table of Contents
* [Summary Statistics](#summary-statistics)

In [None]:
# Standard library
import os
import re

# Data handling
import numpy as np
import pandas as pd

# NLTK: tokenisers, corpora readers and text utilities
import nltk
from nltk.corpus import PlaintextCorpusReader, stopwords, wordnet
from nltk.draw.dispersion import dispersion_plot as displt
from nltk.probability import FreqDist
from nltk.stem.porter import PorterStemmer
from nltk.tag import pos_tag
from nltk.text import Text
from nltk.tokenize import sent_tokenize, word_tokenize

# Fetch the NLTK data packages used by the imports above, in the same order
# as before so the download log reads identically.
for nltk_package in (
    'punkt',
    'punkt_tab',
    'wordnet',
    'stopwords',
    'averaged_perceptron_tagger',
    'tagsets',  # part of speech tags
):
    nltk.download(nltk_package)

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/lucyhavens/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/lucyhavens/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/lucyhavens/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/lucyhavens/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/lucyhavens/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package tagsets to
[nltk_data]     /Users/lucyhavens/nltk_data...
[nltk_data]   Package tagsets is already up-to-date!


In [21]:
# dir = "data/extracted/GB_transcriptions/"  # TXT files of transcriptions - one per item
# Relative path to the CSV of archive items that this notebook loads into a
# DataFrame with pd.read_csv.
f = "data/extracted/bell_transcription_data.csv"

## Summary Statistics

In [22]:
# Load the item table; the first CSV column is used as the row index.
df = pd.read_csv(f, index_col=0)
# Show the last five rows as a quick sanity check of the load.
df.tail(n=5)

Unnamed: 0,node_id,type,iiif-manifest,transcription
11985,117613,photographs,https://cdm21051.contentdm.oclc.org/iiif/info/...,
11986,117614,photographs,https://cdm21051.contentdm.oclc.org/iiif/info/...,
11987,117615,photographs,https://cdm21051.contentdm.oclc.org/iiif/info/...,
11988,117616,photographs,https://cdm21051.contentdm.oclc.org/iiif/info/...,
11989,117617,photographs,,


In [23]:
# Number of items of each type in the full table.
df['type'].value_counts()

type
photographs    8145
diary          2151
letters        1694
Name: count, dtype: int64

Exclude items for which there are no transcriptions (i.e. the value in the `transcription` column is `NaN`).

In [24]:
# Keep only the rows whose `transcription` value is present, and report the
# table shape on either side of the filter.
shape_before = df.shape
subdf = df.dropna(subset=['transcription'])
shape_after = subdf.shape
print("Before:", shape_before)
print("After:", shape_after)

Before: (11990, 4)
After: (3833, 4)


In [25]:
# Number of transcribed items of each type.
subdf['type'].value_counts()

type
diary      2140
letters    1693
Name: count, dtype: int64

Nearly all of the text-based material has been transcribed! 99.5% of diaries (2,140 of 2,151) and 99.9% of letters (1,693 of 1,694) have transcriptions. Diaries and letters together account for 32.1% of the Gertrude Bell Archive's items (3,845 of 11,990); the rest are photographs.

In [29]:
# One transcription string per item, in the row order of subdf.
transcriptions = subdf['transcription'].tolist()
# Split every transcription into its sentences (a list of sentences per item).
sentences = list(map(sent_tokenize, transcriptions))
# Show the first item's sentences as a spot check.
print(sentences[0])

['<p>[25 September 1974] Sept 25 1874 Red Barns, Coatham, Redcar.', 'My dear Florence Mopsa has been very naughty this morning.', 'She has been scampering all over the dining room Tilla[?]', 'says.', 'I had a great chase all over the hall and dining room to catch her and bring her to Papa, she bit and made one little red mark on my hand.', 'During breakfast she hissed at Kitty Scott.', 'Auntie Ada had her on her knee and Kitty was at one side.', 'As Auntie Ada let Mopsa go down she hissed at Kitty and hunted her round my side of the table.', 'Please Papa says will you ask Auntie Florence if she will order us some honey like her own.', 'I gave Mopsa your message and she sends her love.', 'I forgot to say Kitty was very frightened.', 'I send you my love and to Grandmamma and Auntie Florence.', 'Your affectionate little friend Gertrude Bell.</p>']


In [None]:
# Word-tokenise every sentence: tokens[i][j] is the token list for sentence j of item i.
# NOTE(review): markup left in the transcriptions (e.g. "<p>") is tokenised as
# '<', 'p', '>' and therefore counted as tokens -- consider stripping tags first.
tokens = [list(map(word_tokenize, item)) for item in sentences]
# Show the tokens of the first sentence of the first item as a spot check.
print(tokens[0][0])

['<', 'p', '>', '[', '25', 'September', '1974', ']', 'Sept', '25', '1874', 'Red', 'Barns', ',', 'Coatham', ',', 'Redcar', '.']


In [None]:
# Number of sentences in each transcribed item, in the row order of subdf.
sentence_counts = [len(item_sentences) for item_sentences in sentences]
# assign() returns a new frame and replaces an existing 'sentence_count'
# column, so this cell can be re-run safely; DataFrame.insert raises
# ValueError on a re-run because the column already exists.
subdf = subdf.assign(sentence_count=sentence_counts)
print("Total sentences:", sum(sentence_counts))

Total sentences: 109201


In [52]:
# Number of tokens in each transcribed item: the token-list lengths of all of
# the item's sentences added together.
token_counts = [sum(len(sentence_tokens) for sentence_tokens in item_tokens) for item_tokens in tokens]
# assign() returns a new frame and replaces an existing 'token_count' column,
# so this cell can be re-run safely; DataFrame.insert raises ValueError on a
# re-run because the column already exists.
subdf = subdf.assign(token_count=token_counts)
print("Total tokens:", sum(token_counts))

Total tokens: 2151944


In [58]:
# Mean, minimum, maximum and standard deviation of sentences per item.
# np.std defaults to ddof=0 (population), so it comes out slightly smaller
# than the sample std that DataFrame.describe() reports.
for summarise in (np.mean, np.min, np.max, np.std):
    print(summarise(sentence_counts))

28.489694756065745
1
702
41.560205443172464


In [59]:
# Mean, minimum, maximum and standard deviation of tokens per item.
# np.std defaults to ddof=0 (population), so it comes out slightly smaller
# than the sample std that DataFrame.describe() reports.
for summarise in (np.mean, np.min, np.max, np.std):
    print(summarise(token_counts))

561.4255152621968
1
16765
957.9106305655201


In [None]:
# First five transcribed items, including the sentence and token count columns.
subdf.head(n=5)

Unnamed: 0,node_id,type,iiif-manifest,transcription,sentence_count,token_count
0,69571,letters,https://cdm21051.contentdm.oclc.org/iiif/info/...,"<p>[25 September 1974] Sept 25 1874 Red Barns,...",13,180
1,69572,letters,https://cdm21051.contentdm.oclc.org/iiif/info/...,Dear dear Mamy The Johnson's are here because ...,8,125
2,69573,letters,https://cdm21051.contentdm.oclc.org/iiif/info/...,Dear dear Mamy I am so sorry I did not write t...,2,55
3,69574,letters,https://cdm21051.contentdm.oclc.org/iiif/info/...,Dear dear Mamy I am very very sorry you cannot...,6,94
4,69575,letters,https://cdm21051.contentdm.oclc.org/iiif/info/...,Dear Mamy I am so very very very very very ver...,7,76


Save a new version of the CSV file with the sentence and token count columns.

In [55]:
# Write the counts to a sibling file instead of over `f`. `f` is the input
# this notebook reads with pd.read_csv; replacing it with the filtered table
# (plus two extra columns) would make a fresh Restart & Run All start from
# different data than this run did.
csv_root, csv_ext = os.path.splitext(f)
counts_csv = f"{csv_root}_with_counts{csv_ext}"
subdf.to_csv(counts_csv)

In [62]:
# Summary statistics for the numeric columns of all transcribed items
# (quartiles spelled out; these are the describe() defaults).
subdf.describe(percentiles=[0.25, 0.5, 0.75])

Unnamed: 0,node_id,sentence_count,token_count
count,3833.0,3833.0,3833.0
mean,82569.252022,28.489695,561.425515
std,10733.689654,41.565628,958.035611
min,69571.0,1.0,1.0
25%,70521.0,7.0,87.0
50%,91250.0,17.0,282.0
75%,92211.0,34.0,637.0
max,93178.0,702.0,16765.0


In [64]:
# Summary statistics restricted to the transcribed letters.
is_letter = subdf['type'] == "letters"
subdf[is_letter].describe()

Unnamed: 0,node_id,sentence_count,token_count
count,1693.0,1693.0,1693.0
mean,70523.678086,41.164206,888.453042
std,637.587727,56.784756,1322.083158
min,69571.0,1.0,8.0
25%,69986.0,12.0,184.0
50%,70409.0,24.0,456.0
75%,71231.0,49.0,1070.0
max,71649.0,702.0,16765.0


In [66]:
# Summary statistics restricted to the transcribed diary entries.
is_diary = subdf['type'] == "diary"
subdf[is_diary].describe()

Unnamed: 0,node_id,sentence_count,token_count
count,2140.0,2140.0,2140.0
mean,92098.764486,18.462617,302.707009
std,619.458821,17.79781,331.723797
min,91027.0,1.0,1.0
25%,91562.75,5.0,67.0
50%,92098.5,13.0,192.0
75%,92635.25,26.0,429.25
max,93178.0,219.0,4369.0
