<h1> Quantitative Analysis </h1>

---

<h2> Preprocessing Steps </h2>

<h4>Step 1</h4>

Read the data into pandas DataFrames and examine it.

In [1]:
import numpy as np
import pandas as pd
import json
from pathlib import Path

def load_json_to_df(path: Path) -> pd.DataFrame:
    """
    Load a JSON object of letters into a single-column dataframe.

    The file at ``path`` must contain one JSON object. Each key becomes an
    index label and each value becomes the matching entry of the
    ``"letter"`` column.

    Parameters
    ----------
    path : Path
        Location of the .json file to read.

    Returns
    -------
    pd.DataFrame
        A dataframe with a single ``"letter"`` column, indexed by the JSON
        keys in the order they appear in the file.
    """
    # JSON is UTF-8 by specification; decoding explicitly keeps accented
    # characters intact on platforms whose default encoding is not UTF-8.
    with open(path, 'r', encoding="utf-8") as f:
        json_data = json.load(f)

    # Materialise the dict views so pandas receives plain, ordered lists.
    return pd.DataFrame(
        {"letter": list(json_data.values())},
        index=list(json_data.keys()),
    )

# Input locations, resolved relative to the current working directory.
DATA_DIR = Path.cwd().joinpath("data")
KAGGLE_FILE_PATH = DATA_DIR.joinpath("ww1letters.json")
KAGGLE_METADATA_PATH = DATA_DIR.joinpath("index.csv")

kaggle_df = load_json_to_df(KAGGLE_FILE_PATH)

with open(KAGGLE_METADATA_PATH, 'r') as f:
    metadata_df = pd.read_csv(f)
# Index the metadata by letter key (the column itself is kept) so that it
# aligns with the index of the letters frame in the join below.
metadata_df = metadata_df.set_index("letter_key", drop=False)

# Attach year and language, discard letters with no year, and store the
# year as an integer. Letters with no recorded language are labelled
# "english".
kaggle_df = (
    kaggle_df
    .join(metadata_df.loc[:, ["year", "language"]])
    .dropna(subset=["year"])
    .astype({"year": "int64"})
    .assign(language=lambda frame: frame["language"].fillna("english"))
)

print(kaggle_df.head(), "\n")
print(kaggle_df.info())

                                                   letter  year language
arc71   Mercredi 29 septembre 1915\nMa chère Louisette...  1915   french
hl_02   Correspondance militaire adressée à monsieur J...  1914   french
hl_03a  Chère femme, mes deux gosses ainsi que toute m...  1914   french
hl_03b  Chers Mère et frère,\n\nTout ce que je vous re...  1914   french
hl_04   Aux armées le 27 mai 1916\n\n \nChers parents,...  1916   french 

<class 'pandas.core.frame.DataFrame'>
Index: 55 entries, arc71 to na_uk_40
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   letter    55 non-null     object
 1   year      55 non-null     int64 
 2   language  55 non-null     object
dtypes: int64(1), object(2)
memory usage: 1.7+ KB
None


In [2]:
DEAREST_PATH = DATA_DIR.joinpath("400_letters.json")
dearest_df = load_json_to_df(DEAREST_PATH)

# Hand-assigned year labels. Each list covers one consecutive run of rows,
# and the lists are concatenated below in the same order as the rows of
# dearest_df, so the counts must add up to the number of letters.
london_years = 3 * [1915] + 6 * [1916]
france_years = 77 * [1917]
missing_years = 13 * [1917]
# NOTE(review): 1918 is listed before 1917 here — confirm this matches the
# row order of the source file.
pow_years = 174 * [1918] + 79 * [1917]
sheerness_years = 15 * [1919]

dearest_df["year"] = [
    *london_years,
    *france_years,
    *missing_years,
    *pow_years,
    *sheerness_years,
]

print(dearest_df.head(), "\n")
print(dearest_df.info())

                                                  letter  year
lo001  Dear Ginger, I have just returned from a holid...  1915
lo002  Zeppelins over London 4th Nov 1915 David to Gi...  1915
lo003  David Joins Army 28 Dec 1915 David to Ginger D...  1915
lo004  How Britain Prepared Ginger (Ethel)  to David ...  1916
lo005  1916 OTC Gidea Park David to Ginger (sister Et...  1916 

<class 'pandas.core.frame.DataFrame'>
Index: 367 entries, lo001 to sh015
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   letter  367 non-null    object
 1   year    367 non-null    int64 
dtypes: int64(1), object(1)
memory usage: 8.6+ KB
None


<h4>Step 2</h4>

Select the English letters, then tokenise, POS-tag, and lemmatise them, dropping stop words along the way. **TODO:** decide whether any other preprocessing steps are needed.

**Open question:** how should the French letters be handled? For now, only the English letters are preprocessed.

We'll also explain more steps here.

**TODO:** add credits to the notebook this approach draws on.

In [3]:
import nltk
from nltk.corpus import wordnet, stopwords

# Despite its name, this is a WordNet lemmatiser rather than a stemmer.
STEMMER = nltk.stem.WordNetLemmatizer()

# Universal POS tags that have a WordNet part-of-speech counterpart.
un_to_wn_map = dict(
    VERB=wordnet.VERB,
    NOUN=wordnet.NOUN,
    ADJ=wordnet.ADJ,
    ADV=wordnet.ADV,
)

# NLTK's English stop words plus a few typographic symbols.
STOP = set(stopwords.words("english")) | {'’', '“', '”', '[', ']', '…'}

# Text of the letters whose language is recorded as English.
english_kaggle = kaggle_df.loc[kaggle_df["language"] == "english", "letter"]

def preprocess(df: pd.Series) -> list[str]:
    """
    Tokenise, POS-tag, and lemmatise a collection of letters.

    Each letter is lower-cased and split into word tokens, which are then
    tagged with universal POS tags. Tokens found in ``STOP`` are dropped.
    A remaining token whose tag is a key of ``un_to_wn_map`` is lemmatised
    with the mapped WordNet part of speech. Any other token is kept only
    if it is alphanumeric, and is lemmatised with the lemmatiser's default
    part of speech. The surviving lemmas of a letter are joined with
    single spaces.

    Parameters
    ----------
    df : pd.Series
        The letter texts, one string per entry. Despite the parameter
        name, this must be a Series (or another iterable of strings), not
        a DataFrame: iterating a DataFrame yields its column labels.

    Returns
    -------
    list[str]
        One preprocessed string per input letter, in input order.
    """
    letters_preprocessed = []
    for letter in df:
        letter_tokens = nltk.tokenize.word_tokenize(letter.lower())
        letter_tagged = nltk.pos_tag(letter_tokens, tagset="universal")
        letter_lemmas = []

        for token, pos in letter_tagged:
            if token in STOP:
                continue
            if pos in un_to_wn_map:
                # Tagged as verb/noun/adjective/adverb: lemmatise with the
                # matching WordNet part of speech.
                letter_lemmas.append(
                    STEMMER.lemmatize(token, pos=un_to_wn_map[pos])
                )
            elif token.isalnum():
                # Any other tag: keep alphanumeric tokens only, which
                # discards punctuation.
                letter_lemmas.append(STEMMER.lemmatize(token))

        letters_preprocessed.append(' '.join(letter_lemmas))

    return letters_preprocessed

# Preprocess the English Kaggle letters and every letter in dearest_df.
# Each result is a list of strings in the same order as its input series.
english_kaggle_pp = preprocess(english_kaggle)
dearest_pp = preprocess(dearest_df["letter"])

<h4>Step 3</h4>

Replace the raw letters in both dataframes with their preprocessed versions, combine the two collections, and save the result to CSV.

In [4]:
# Keep only the English Kaggle letters, drop the language column (constant
# after the filter), and swap in the preprocessed text.
english_data_df = (
    kaggle_df.loc[kaggle_df["language"] == "english"]
    .drop(columns=["language"])
    .assign(letter=english_kaggle_pp)
)

# Overwrite the raw text of dearest_df in place with its preprocessed form.
dearest_df["letter"] = dearest_pp

# Stack both collections, keeping each letter's original index label.
data_df = pd.concat([dearest_df, english_data_df])

print(data_df.head(), "\n")
print(data_df.info())

data_df.to_csv(DATA_DIR.joinpath("preprocessed_letters.csv"))

                                                  letter  year
lo001  dear ginger return holiday brighton glorious t...  1915
lo002  zeppelin london 4th nov 1915 david ginger 56 r...  1915
lo003  david join army 28 dec 1915 david ginger dec 2...  1915
lo004  britain prepared ginger ethel david somewhere ...  1916
lo005  1916 otc gidea park david ginger sister ethel ...  1916 

<class 'pandas.core.frame.DataFrame'>
Index: 403 entries, lo001 to na_uk_40
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   letter  403 non-null    object
 1   year    403 non-null    int64 
dtypes: int64(1), object(1)
memory usage: 9.4+ KB
None


---

<h2>General Statistics</h2>

<h4>Lexical diversity (type-token ratio)</h4>

In [None]:
from collections import Counter

# Reload the preprocessed letters from disk and keep only the text column.
# Each entry is a single space-joined string of lemmas (as produced by the
# preprocessing step), not a list of tokens.
english_letters = pd.read_csv(DATA_DIR / "preprocessed_letters.csv")["letter"]

def letter_stats(letter: list[str]) -> float:
    """
    Compute the type-token ratio (TTR) of a single letter.

    The TTR is the number of distinct tokens divided by the total number
    of tokens.

    Parameters
    ----------
    letter : list[str]
        The tokens of one letter. A plain string is also accepted and is
        split on whitespace first, so that a space-joined letter is
        measured word by word rather than character by character.

    Returns
    -------
    float
        The type-token ratio, between 0.0 and 1.0. An empty letter yields
        0.0, since the ratio is undefined when there are no tokens.
    """
    # A preprocessed letter is stored as one space-joined string; without
    # this split, len() and set() would operate on characters.
    tokens = letter.split() if isinstance(letter, str) else letter

    text_size = len(tokens)
    if text_size == 0:
        # Avoid dividing by zero on an empty letter.
        return 0.0

    vocab_size = len(set(tokens))
    return vocab_size / text_size

def show_letter_stats(letters: list[list[str]]) -> (list[float]):
    """
    Show stuff
    """
    for letter in letters:
        ttr = letter_stats(letter)
        break