# 01 Corpus compilation 

This notebook shows how the two corpora used in the experiment were preprocessed and merged for model building. Because the poetic subcorpus of the Russian National Corpus is under license, the data in the repository contains texts that are already lemmatised and shuffled. 

In [8]:
library(tidyverse)
library(tidytext)

# function for sampling
source("00_fn_sampling.R")

### Preparing RNC (Russian National Corpus) data (raw data not available)

In [10]:
# Raw data preparation.
# Source file: nkrja_19th_lem.Rda (19th century poetic corpus, lemmatised);
# loading it is expected to restore the data frame `c19` used below.
load(file = "../data/nkrja_19th_lem.Rda")

# quick structural overview of the loaded corpus
c19 %>% glimpse()


Rows: 15,569
Columns: 41
$ Unnamed..0          [3m[90m<int>[39m[23m 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,…
$ path                [3m[90m<chr>[39m[23m "xix/1790-1810_poets/poets-001", "xix/1790-1810_po…
$ author              [3m[90m<chr>[39m[23m "С. С. Бобров", "С. С. Бобров", "С. С. Бобров", "С…
$ dub                 [3m[90m<lgl>[39m[23m NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
$ original            [3m[90m<chr>[39m[23m "", "", "Гораций", "", "", "", "", "", "", "", "",…
$ language            [3m[90m<chr>[39m[23m "", "", "латинский", "", "", "", "", "", "", "", "…
$ header              [3m[90m<chr>[39m[23m "Царство всеобщей любви : «Еще вкруг солнцев не вр…
$ cyclus              [3m[90m<chr>[39m[23m "", "", "", "", "", "", "", "", "", "", "", "", ""…
$ liber               [3m[90m<chr>[39m[23m "", "", "", "", "", "", "", "", "", "", "", "", ""…
$ created             [3m[90m<chr>[39m[23m "1785", "1785", "1787", "1789", "1789

Subsetting, cleaning & sampling

In [16]:
#### Subsetting ####
dat <- c19 %>%
  # alternative to 'date_reliable' - remove texts with wide date ranges
  filter(diff < 10) %>%
  # keep texts dated 1775-1850 (roughly the 1st half of the 19th century)
  filter(year > 1774 & year < 1851)


#### Cleaning ####
# change column names for script
names(dat)[names(dat) == "Unnamed..0"] <- "id"
names(dat)[names(dat) == "lemma"] <- "text_lemm"

## meters and feet cleaning
# main meters: number of texts per meter label, most frequent first
meters_count <- dat %>%
  count(meter, sort = TRUE)
head(meters_count, 10)

# the five most frequent meter labels are kept as separate groups;
# head() (unlike `[1:5]`) does not pad with NA if fewer labels exist
meters <- head(meters_count$meter, 5)

#### cleaning ####
dat <- dat %>%
  mutate(
    meter_gr = case_when(
      # most of dactyl formulas are detailed like "Д, тонический : Гек + Пен";
      # summarise them into plain "Д". This has to be decided from the original
      # `meter` label BEFORE rare meters are collapsed: previously the check ran
      # on `meter_gr`, where these formulas had already been turned into
      # "other", so they were never merged into the dactyl group.
      str_detect(meter, "^Д,?\\s*тонический") ~ "Д",
      # the main meters keep their own label
      meter %in% meters ~ meter,
      # everything else (including missing meters) goes to "other"
      TRUE ~ "other"
    )
  ) %>%
  # label all combinations of free iamb as "В"
  mutate(feet_gr = ifelse(str_detect(feet, "вольная"), "В", feet)) %>%
  # ALL feet written as 6(5) to 6 (remove everything in brackets)
  mutate(feet_gr = str_replace_all(feet_gr, "(\\d)\\(\\d.*?\\)", "\\1")) %>%
  # remove spaces for everything else for easier notation
  mutate(feet_gr = str_remove_all(feet_gr, "[[:space:]]"))

# test
unique(dat$meter_gr)
head(unique(dat$feet_gr))

meter,n
<chr>,<int>
Я,7654
Х,1549
Аф,449
Д,221
Ан,155
"Д, тонический : Гек + Пен",112
"Д, тонический : Гек",78
"Х, Я",32
"Ан, Аф",18
Я # Х,18


In [18]:
# Sampling
# Passes the cleaned corpus `dat` to sample_long() (presumably defined in the
# sourced 00_fn_sampling.R) and stores the result in `ru19_fin`, which is then
# inspected with glimpse().
# NOTE(review): no set.seed() call is visible before this step - confirm whether
# sample_long() seeds the RNG itself; otherwise the drawn samples may differ
# between runs.

ru19_fin <- sample_long(dat,
                        starting_n = 1,
                        sample_size = "corpus median",
                        over9000 = 6) # max number of samples for poems longer than 800 lines
# NOTE(review): the "800 lines" threshold above is not visible in this call -
# verify it against the definition of sample_long().

glimpse(ru19_fin)


Redetermining length of poems...

Preparing to sample...

Sampling long poems...

“[1m[22m`cols` is now required when using `unnest()`.
[36mℹ[39m Please use `cols = c(text_lemm)`.”
[1m[22m`summarise()` has grouped output by 'id'. You can override using the `.groups`
argument.
“[1m[22m`cols` is now required when using `unnest()`.
[36mℹ[39m Please use `cols = c(text_lemm)`.”
[1m[22m`summarise()` has grouped output by 'id'. You can override using the `.groups`
argument.
“[1m[22m`cols` is now required when using `unnest()`.
[36mℹ[39m Please use `cols = c(text_lemm)`.”
[1m[22m`summarise()` has grouped output by 'id'. You can override using the `.groups`
argument.
“[1m[22m`cols` is now required when using `unnest()`.
[36mℹ[39m Please use `cols = c(text_lemm)`.”
[1m[22m`summarise()` has grouped output by 'id'. You can override using the `.groups`
argument.
Sampling poems over 9000!!!...

“[1m[22m`cols` is now required when using `unnest()`.
[36mℹ[39m Please use `co

Rows: 13,717
Columns: 43
$ id                  [3m[90m<chr>[39m[23m "8", "25", "26", "28", "30", "36", "38", "39", "40…
$ path                [3m[90m<chr>[39m[23m "xix/1790-1810_poets/poets-008", "xix/1790-1810_po…
$ author              [3m[90m<chr>[39m[23m "С. С. Бобров", "С. А. Тучков", "С. А. Тучков", "С…
$ dub                 [3m[90m<lgl>[39m[23m NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
$ original            [3m[90m<chr>[39m[23m "", "", "", "", "", "", "", "", "", "", "", "", ""…
$ language            [3m[90m<chr>[39m[23m "", "", "", "", "", "", "", "", "", "", "", "", ""…
$ header              [3m[90m<chr>[39m[23m "К новостолетию XIX : «Страшна отрасль дней небесн…
$ cyclus              [3m[90m<chr>[39m[23m "", "[Сонеты], 1", "[Сонеты], 2", "", "", "", "", …
$ liber               [3m[90m<chr>[39m[23m "", "", "", "", "", "", "", "", "", "", "", "", ""…
$ created             [3m[90m<chr>[39m[23m "1800", "1789", "1789", "1789", "1789

In [21]:
# Final relabelling and column selection for the NKRJA part of the corpus.

nkrja19 <- ru19_fin %>%
  # step 1: translate the Cyrillic meter / feet abbreviations into English labels
  mutate(
    meter_gr = recode(
      meter_gr,
      "Я" = "iamb",
      "Х" = "trochee",
      "Д" = "dactyl",
      "Аф" = "amphibrach",
      "Ан" = "anapaest"
    ),
    feet_gr = recode(feet_gr, "В" = "free")
  ) %>%
  # step 2: derived columns built from the relabelled values
  mutate(
    # metrical formula, e.g. "iamb_4"
    formula = paste0(meter_gr, "_", feet_gr),
    # prefix the id so that it refers to nkrja
    id = paste0("N_", id),
    # raw texts are not distributed because of licencing: keep an empty column
    text_raw = ""
  ) %>%
  # step 3: keep only the needed columns, renaming the last three on the way
  select(
    id, author, text_raw, text_lemm, year, formula,
    meter = meter_gr,
    feet = feet_gr,
    n_lines = verses
  )

# TODO: WHY SO MANY 2 LINES POEMS???
glimpse(nkrja19)

Rows: 13,717
Columns: 9
$ id        [3m[90m<chr>[39m[23m "N_8", "N_25", "N_26", "N_28", "N_30", "N_36", "N_38", "N_39…
$ author    [3m[90m<chr>[39m[23m "С. С. Бобров", "С. А. Тучков", "С. А. Тучков", "С. А. Тучко…
$ text_raw  [3m[90m<chr>[39m[23m "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", …
$ text_lemm [3m[90m<chr>[39m[23m "страшный отрасль день небесный, \n вестник таинство неизвес…
$ year      [3m[90m<int>[39m[23m 1800, 1789, 1789, 1789, 1789, 1802, 1790, 1790, 1790, 1790, …
$ formula   [3m[90m<chr>[39m[23m "trochee_4", "iamb_6", "other_4", "iamb_6", "iamb_6", "iamb_…
$ meter     [3m[90m<chr>[39m[23m "trochee", "iamb", "other", "iamb", "iamb", "iamb", "iamb", …
$ feet      [3m[90m<chr>[39m[23m "4", "6", "4", "6", "6", "free", "free", "6", "6", "6", "fre…
$ n_lines   [3m[90m<dbl>[39m[23m 8, 14, 14, 6, 12, 8, 8, 3, 2, 4, 2, 4, 2, 6, 3, 2, 2, 2, 8, …


In [7]:
# load full data & sample for periodicals corpus
# TODO(review): the path passed to read.csv() is an empty string, so no data
# file is referenced here - fill in the location of the periodicals CSV before
# running this cell.
periodicals <- read.csv("")

── [1mAttaching packages[22m ─────────────────────────────────────── tidyverse 1.3.2 ──
[32m✔[39m [34mggplot2[39m 3.4.1     [32m✔[39m [34mpurrr  [39m 1.0.1
[32m✔[39m [34mtibble [39m 3.1.8     [32m✔[39m [34mdplyr  [39m 1.1.0
[32m✔[39m [34mtidyr  [39m 1.3.0     [32m✔[39m [34mstringr[39m 1.5.0
[32m✔[39m [34mreadr  [39m 2.1.4     [32m✔[39m [34mforcats[39m 1.0.0
── [1mConflicts[22m ────────────────────────────────────────── tidyverse_conflicts() ──
[31m✖[39m [34mdplyr[39m::[32mfilter()[39m masks [34mstats[39m::filter()
[31m✖[39m [34mdplyr[39m::[32mlag()[39m    masks [34mstats[39m::lag()
