<h1> Quantitative Analysis </h1>

---

<h2> Preprocessing Steps </h2>

__Step 1:__

Read the raw letters and their metadata into a pandas DataFrame, then inspect the result.

In [1]:
import numpy as np
import pandas as pd
import json
from pathlib import Path

RAW_FILE_PATH = Path.cwd() / "ww1letters.json"
RAW_METADATA_PATH = Path.cwd() / "index.csv"

# Load the raw letters: the JSON object maps a letter key (e.g. "hl_01") to the
# full text of that letter.
# The encoding is passed explicitly because the corpus contains accented
# French text; without it, open() falls back to the platform locale encoding,
# which is not UTF-8 everywhere.
with open(RAW_FILE_PATH, "r", encoding="utf-8") as f:
    json_data = json.load(f)

indices = json_data.keys()
letters = json_data.values()

# Load the per-letter metadata and index it by letter key so that it can be
# joined onto the letters. The "letter_key" column itself is kept.
# NOTE(review): assumes index.csv is UTF-8 like the JSON file -- confirm.
# TODO: the metadata may hold more useful fields (e.g. author); only "year"
# and "language" are used for now.
with open(RAW_METADATA_PATH, "r", encoding="utf-8") as f:
    metadata_df = pd.read_csv(f)
metadata_df = metadata_df.set_index("letter_key", drop=False)

# One row per letter, keyed by letter key. DataFrame.join is a left join on
# the index, so letters without a metadata row are kept with NaN metadata.
data_df = pd.DataFrame(
    {"letter": list(letters)},
    index=list(indices),
).join(metadata_df[["year", "language"]])

# Letters with no language recorded in the metadata are treated as English.
data_df["language"] = data_df["language"].fillna("english")

print(data_df.head(), "\n")
# DataFrame.info() prints its report itself and returns None, so it must not
# be wrapped in print() (that would emit a stray "None" line).
data_df.info()

                                                   letter    year language
arc71   Mercredi 29 septembre 1915\nMa chère Louisette...  1915.0   french
hl_01   Magnac Laval\n\nChère épouse et parents,\n\nAu...     NaN   french
hl_02   Correspondance militaire adressée à monsieur J...  1914.0   french
hl_03a  Chère femme, mes deux gosses ainsi que toute m...  1914.0   french
hl_03b  Chers Mère et frère,\n\nTout ce que je vous re...  1914.0   french 

<class 'pandas.core.frame.DataFrame'>
Index: 78 entries, arc71 to new3
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   letter    78 non-null     object 
 1   year      55 non-null     float64
 2   language  78 non-null     object 
dtypes: float64(1), object(2)
memory usage: 4.5+ KB
None


__Step 2:__

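Retrieve the English letters, then lower-case, tokenise, part-of-speech tag, and lemmatise each one, dropping punctuation tokens. **TODO:** decide whether any further preprocessing steps are needed.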

**TODO:** decide how to deal with the French letters; they are excluded from this step.

We'll also explain more steps here.

**TODO:** add the credit (title and link) for the notebook this step is based on.

In [2]:
import nltk
from nltk.corpus import wordnet

# NOTE: despite its name, STEMMER is a WordNet *lemmatiser*, not a stemmer.
STEMMER = nltk.stem.WordNetLemmatizer()

# Maps universal-tagset POS tags to the WordNet POS constants that the
# lemmatiser accepts. Tags missing from this map get the lemmatiser's default.
un_to_wn_map = {
    "VERB": wordnet.VERB,
    "NOUN": wordnet.NOUN,
    "ADJ": wordnet.ADJ,
    "ADV": wordnet.ADV,
}

# Tokens that are always discarded when they carry one of the mapped tags.
# NOTE(review): presumably these are symbols the tagger labels as content
# words instead of punctuation -- confirm.
PUNCTUATIONS = {'’', '“', '”', '[', ']'}

# Raw text of every letter whose language is English, indexed by letter key.
english_letters = data_df.loc[data_df["language"] == "english", "letter"]


def _lemmatise_letter(text):
    """Return the list of lemmas for one letter.

    The text is lower-cased, tokenised and POS-tagged with the universal
    tagset. A token whose tag is in ``un_to_wn_map`` is lemmatised with the
    matching WordNet POS unless it is listed in ``PUNCTUATIONS``. Every other
    token is kept only if it is purely alphanumeric, and is then lemmatised
    with the lemmatiser's default POS.
    """
    tagged_tokens = nltk.pos_tag(
        nltk.tokenize.word_tokenize(text.lower()),
        tagset="universal",
    )

    lemmas = []
    for word, tag in tagged_tokens:
        wordnet_pos = un_to_wn_map.get(tag)
        if wordnet_pos is not None and word not in PUNCTUATIONS:
            lemmas.append(STEMMER.lemmatize(word, pos=wordnet_pos))
        elif word.isalnum():
            lemmas.append(STEMMER.lemmatize(word))
    return lemmas


# One list of lemmas per English letter, in the same order as english_letters.
letters_preprocessed = [_lemmatise_letter(text) for text in english_letters]

__Step 3:__

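Build an English-only DataFrame in which the raw letter text is replaced by the preprocessed lemma lists, and save it to `preprocessed_letters.csv`.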

In [3]:
# Keep only the English rows. Every remaining row is English, so the
# "language" column carries no information and is dropped.
english_data_df = (
    data_df[data_df["language"] == "english"]
    .copy()
    .drop(columns=["language"])
)

# Replace the raw text with the lemma lists. The assignment is positional:
# letters_preprocessed must hold one entry per English letter, in row order.
english_data_df["letter"] = letters_preprocessed

print(english_data_df.head(), "\n")
# DataFrame.info() prints its report itself and returns None, so it must not
# be wrapped in print() (that would emit a stray "None" line).
english_data_df.info()

# NOTE: CSV has no list type, so each lemma list is written as its Python
# string representation and has to be parsed back (e.g. ast.literal_eval)
# when the file is read again.
english_data_df.to_csv(Path.cwd() / "preprocessed_letters.csv")

                                                     letter    year
na_uk_01  [dear, mr, welsh, we, be, only, out, here, a, ...  1915.0
na_uk_02  [dear, nic, boyce, thanks, very, much, for, yo...  1915.0
na_uk_03  [dear, lack, many, thanks, for, the, photograp...  1915.0
na_uk_04  [dear, bert, just, a, few, line, to, let, you,...  1915.0
na_uk_05  [dear, mr, hunt, please, accept, yourself, and...  1915.0 

<class 'pandas.core.frame.DataFrame'>
Index: 58 entries, na_uk_01 to new3
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   letter  58 non-null     object 
 1   year    36 non-null     float64
dtypes: float64(1), object(1)
memory usage: 1.4+ KB
None
