<h1> Quantitative Analysis </h1>

---

<h2> Preprocessing Steps </h2>

<h4>Step 1</h4>

Read the letters and their metadata into a pandas DataFrame and examine the result.

In [1]:
import numpy as np
import pandas as pd
import json
from pathlib import Path

RAW_FILE_PATH = Path.cwd() / "ww1letters.json"
RAW_METADATA_PATH = Path.cwd() / "index.csv"

# Load the letters: the JSON file maps each letter key to the letter text.
# The encoding is given explicitly because the corpus contains accented
# (French) text and the platform default encoding is not UTF-8 everywhere.
# TODO: we should include more metadata (year and language are joined below,
# author is not yet).
with open(RAW_FILE_PATH, 'r', encoding="utf-8") as f:
    json_data = json.load(f)

# Materialise the dict views as lists so the frame is built from plain sequences.
indices = list(json_data.keys())
letters = list(json_data.values())
data_df = pd.DataFrame({
    "letter": letters
}, index = indices)

# Load the metadata and index it by letter key so it can be joined on the
# index of data_df (the "letter_key" column itself is kept).
metadata_df = pd.read_csv(RAW_METADATA_PATH, encoding="utf-8")
metadata_df.index = metadata_df["letter_key"]

# Left join: every letter is kept, even when it has no metadata row.
data_df = data_df.join(
    metadata_df[["year", "language"]],
)
# A missing year becomes 0 so that the column can be stored as an integer.
data_df["year"] = data_df["year"].fillna(0)
data_df = data_df.astype({"year": "int64"})
# Letters without a language entry are treated as English.
data_df["language"] = data_df["language"].fillna("english")

print(data_df.head(), "\n")
print(data_df.info())

                                                   letter  year language
arc71   Mercredi 29 septembre 1915\nMa chère Louisette...  1915   french
hl_01   Magnac Laval\n\nChère épouse et parents,\n\nAu...     0   french
hl_02   Correspondance militaire adressée à monsieur J...  1914   french
hl_03a  Chère femme, mes deux gosses ainsi que toute m...  1914   french
hl_03b  Chers Mère et frère,\n\nTout ce que je vous re...  1914   french 

<class 'pandas.core.frame.DataFrame'>
Index: 78 entries, arc71 to new3
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   letter    78 non-null     object
 1   year      78 non-null     int64 
 2   language  78 non-null     object
dtypes: int64(1), object(2)
memory usage: 4.5+ KB
None


<h4>Step 2</h4>

Retrieve the English letters, then lowercase, tokenise, POS-tag, remove stop words and lemmatise them. **TODO:** decide whether other preprocessing steps are needed.

**TODO:** decide how to deal with the French letters (only the English letters are processed below).

We'll also explain more steps here.

**TODO:** add credits to the source notebook (with a link).

In [2]:
import nltk
from nltk.corpus import wordnet, stopwords

# Despite its name, STEMMER is a WordNet lemmatiser, not a stemmer.
STEMMER = nltk.stem.WordNetLemmatizer()

# Universal POS tags that have a WordNet counterpart.
un_to_wn_map = dict(
    VERB=wordnet.VERB,
    NOUN=wordnet.NOUN,
    ADJ=wordnet.ADJ,
    ADV=wordnet.ADV,
)

# English stop words plus a few typographic symbols that should be dropped.
STOP = set(stopwords.words("english")) | {'’', '“', '”', '[', ']', '…'}

english_letters = data_df.loc[data_df["language"] == "english", "letter"]


def _lemmatise_letter(text):
    """
    Lowercase, tokenise and POS-tag one letter, drop the stop words and
    return the remaining lemmas joined by single spaces.
    """
    tagged = nltk.pos_tag(nltk.tokenize.word_tokenize(text.lower()),
                          tagset = "universal")
    lemmas = []
    for word, tag in tagged:
        if word in STOP:
            continue
        wn_pos = un_to_wn_map.get(tag)
        if wn_pos is not None:
            # Tag known to WordNet: lemmatise with the matching part of speech.
            lemmas.append(STEMMER.lemmatize(word, pos = wn_pos))
        elif word.isalnum():
            # Any other tag: keep alphanumeric tokens only, default lemmatisation.
            lemmas.append(STEMMER.lemmatize(word))
    return ' '.join(lemmas)


letters_preprocessed = [_lemmatise_letter(text) for text in english_letters]

<h4>Step 3</h4>

Replace the raw English letters in the DataFrame with their preprocessed versions and save the result to `preprocessed_letters.csv`.

In [3]:
# Keep the English rows only, drop the now-constant language column and
# swap the raw text for the preprocessed text (same row order as the mask).
is_english = data_df["language"] == "english"
english_data_df = (
    data_df.loc[is_english]
    .drop(columns = ["language"])
    .assign(letter = letters_preprocessed)
)

print(english_data_df.head(), "\n")
print(english_data_df.info())

# Persist the preprocessed corpus next to the notebook.
output_path = Path.cwd() / "preprocessed_letters.csv"
english_data_df.to_csv(output_path)

                                                     letter  year
na_uk_01  dear mr welsh matter hour go trench eight day ...  1915
na_uk_02  dear nic boyce thanks much interesting letter…...  1915
na_uk_03  dear lack many thanks photograph receive think...  1915
na_uk_04  dear bert line let know alright hop same… pres...  1915
na_uk_05  dear mr hunt please accept convey gentleman be...  1915 

<class 'pandas.core.frame.DataFrame'>
Index: 58 entries, na_uk_01 to new3
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   letter  58 non-null     object
 1   year    58 non-null     int64 
dtypes: int64(1), object(1)
memory usage: 1.4+ KB
None


---

<h2>General Statistics</h2>

<h4>Type-token ratio</h4>

In [4]:
from collections import Counter

# Reload the preprocessed corpus from disk and keep only the text column.
preprocessed_df = pd.read_csv("preprocessed_letters.csv")
english_letters = preprocessed_df["letter"]

def letter_stats(letter: list[str]) -> float:
    """
    Takes in a tokenised letter and calculates its type-token ratio (TTR):
    the number of distinct tokens divided by the total number of tokens.

    Returns 0.0 for an empty letter instead of dividing by zero.

    Note: a plain string is measured per character, not per word, because
    both len() and set() iterate over its characters.
    """
    text_size = len(letter)
    if text_size == 0:
        # No tokens at all: the ratio is undefined, report it as 0.0.
        return 0.0

    vocab_size = len(set(letter))
    return vocab_size / text_size

def show_letter_stats(letters: list[list[str]]) -> (list[float]):
    """
    Show stuff
    """
    for letter in letters:
        ttr = letter_stats(letter)
        break