## Chapter 2.2. Poems in periodicals: statistics

Load packages

In [None]:
library(tidyverse)

library(treemap)
library(ggplotify)
theme_set(theme_minimal())
library(wesanderson)

Load data

In [None]:
# Periodicals corpus: read the lemmatised CSV, drop the `X` column,
# harmonise column names and exclude everything dated 1834.
per <- read.csv("../../data/periodicals_lem.csv")

per <- per %>% 
    select(-X) %>% 
    rename(
        author = Author_Initials,
        year = Year,
        title = Text_title,
        first_line = First_line
    )

# rows with a missing year are dropped here as well (NA != 1834 is NA)
per <- filter(per, year != 1834)

glimpse(per)

In [None]:
# RNC (НКРЯ) data: loading the .Rda file brings `c19` into the workspace
load("../../data/nkrja_19th_lem.Rda")

# Unique author names with all whitespace and punctuation stripped,
# so that they can be compared with the periodicals' author strings.
rnc_authors <- unique(
    str_remove_all(c19[["author"]], "[[:space:]]|[[:punct:]]")
)

head(rnc_authors)

# `c19` is intentionally kept in memory: later cells reuse it
#rm(c19)

In [None]:
# --- How many distinct authors appear in the periodicals? ---
print("Total number of authors in periodicals:")
per %>% 
    # keep only poems with both an author id and an author name
    # (the original note reports 1095 such poems)
    filter(!is.na(A_ID), !is.na(author)) %>% 
    distinct(author) %>% 
    nrow()

# --- How many distinct RP biography references are there? ---
print("Number of authors with biography in RP:")
per %>% 
    filter(!is.na(RP_biography)) %>% 
    distinct(RP_biography) %>% 
    nrow()

# --- How many periodicals authors are also present in the RNC list? ---
print("Number of authors included in RNC")
per %>% 
    filter(!is.na(author)) %>% 
    transmute(
        # move the trailing initials in front of the preceding part of the name
        author = str_replace_all(author, "(.*?[[:space:]])(\\w\\.\\w.)", "\\2\\1"),
        # then strip whitespace and punctuation, as was done for `rnc_authors`
        author = str_remove_all(author, "[[:space:]]|[[:punct:]]")
    ) %>% 
    distinct() %>% 
    filter(author %in% rnc_authors) %>% 
    nrow()

## Textual intersections between RNC & periodicals
- Create for each poem an id column built from its first three lines (punctuation and whitespace removed)
- Mark the intersection with a special tag

In [None]:
# Build a comparison key (`doublesCheck`) for every periodicals poem longer
# than three lines: the first three lines of the lemmatised text with all
# punctuation and whitespace removed.
periodicals <- per %>% 
    filter(n_lines > 3) %>%
    mutate(
        doublesCheck = str_remove_all(
            str_extract(text_lemm, "^.*?\n.*?\n.*?\n"),
            "[[:punct:]]|[[:space:]]"
        )
    )

glimpse(periodicals)

In [None]:
# Same comparison key for the RNC poems: first three lines of the lemmatised
# text, stripped of punctuation and whitespace. Only the columns needed for
# the comparison are kept.
nkrja19 <- c19 %>% 
    rename(
        index = `Unnamed..0`,
        text_lemm = lemma
    ) %>% 
    select(index, author, text_lemm) %>%
    mutate(
        doublesCheck = str_remove_all(
            str_extract(text_lemm, "^.*?\n.*?\n.*?\n"),
            "[[:punct:]]|[[:space:]]"
        )
    )

glimpse(nkrja19)

In [None]:
# Poems present in both corpora: join periodicals and RNC on the
# three-line comparison key.
doubles_id <- periodicals %>% 
    # Guard against spurious matches: dplyr joins treat NA keys as equal to
    # each other by default, and an empty key (first lines consisting only of
    # punctuation/whitespace) would match every other empty key. Such rows
    # carry no usable key, so they are excluded before joining.
    filter(!is.na(doublesCheck), doublesCheck != "") %>% 
    inner_join(nkrja19, by = "doublesCheck") %>% 
    # author.x comes from the periodicals, author.y from the RNC
    select(index, id, year, author.x, author.y, doublesCheck, text_raw)

# show the matches without the full raw text
doubles_id %>% select(-text_raw)

In [None]:
# Size of the intersection in absolute and relative terms
print("Total number of intersected texts:")
doubles_id %>% nrow()

print("Total number of texts in periodicals:")
periodicals %>% nrow()

# share of matched rows among the periodicals poems, in percent
print("Total % of intersected texts out for all periodicals texts:")
round(nrow(doubles_id) / nrow(periodicals) * 100, digits = 1)

In [None]:
glimpse(doubles_id)

# Number of intersected texts per year, labelled for the combined plot
doubles_counts <- mutate(
    count(doubles_id, year),
    group = "Периодика и НКРЯ"
)

# wide view: one column per year
pivot_wider(doubles_counts, names_from = year, values_from = n)

### Plot 2.2

In [None]:
# RNC poems per year for the years strictly between 1834 and 1841
rnc_1835 <- c19 %>% 
    rename(index = `Unnamed..0`) %>% 
    # optional step (disabled): exclude the poems that also occur in periodicals
    #anti_join(doubles_id %>% select(index), by = "index") %>%

    # restrict to the period and count rows per year
    filter(year > 1834, year < 1841) %>% 
    count(year) %>% 
    mutate(group = "НКРЯ")

# wide view: one column per year
pivot_wider(rnc_1835, names_from = year, values_from = n)

In [None]:
# Stack the yearly counts of the three groups (periodicals, RNC, intersection)
# into one long table used for plotting.
counter_1835 <- rbind(
    per %>% 
        # optional step (disabled): exclude the poems that also occur in the RNC
        #anti_join(doubles_id %>% select(id), by = "id") %>%
        count(year) %>% 
        mutate(group = "Периодика"),
    rnc_1835,
    doubles_counts
)

# wide view: one column per year
pivot_wider(counter_1835, names_from = year, values_from = n)

In [None]:
# Figure 2.2.1: dodged bar chart with the number of texts per year and corpus
p2_2_1 <- ggplot(counter_1835, aes(x = year, y = n, fill = group)) + 
    geom_col(position = "dodge", width = 0.6) + 
    # one colour per group, taken from the Wes Anderson palettes
    scale_fill_manual(
        values = c(
            wes_palette("Zissou1")[4],
            wes_palette("Royal1")[1:2]
        )
    ) + 
    # one tick for every year of the period
    scale_x_continuous(breaks = 1835:1840) +
    labs(
        x = "Год",
        y = "Количество текстов",
        fill = "Корпус"
    ) + 
    theme(
        axis.text = element_text(size = 14),
        axis.title = element_text(size = 16),
        legend.title = element_text(size = 16, face = "bold"),
        legend.text = element_text(size = 14)
    )

p2_2_1

In [None]:
# Save figure 2.2.1 as a 8 x 6 in PNG at 300 dpi on a white background.
# Make sure the output directory exists first so that saving does not fail
# on a fresh checkout.
dir.create("plots", showWarnings = FALSE, recursive = TRUE)

# `filename` is spelled out in full instead of relying on partial matching
# of the abbreviated `file` argument.
ggsave(filename = "plots/fig2_2_1.png", plot = p2_2_1, dpi = 300,
      width = 8, height = 6, bg = "white")

## Sources statistics
Analysis of poems published in different sources (journals & newspapers) and by different authors

In [None]:
# Quick look at the columns and types of the periodicals table
glimpse(per)

In [None]:
# Current state of digitisation: number of poems per year and source
count(group_by(per, year, PER_ID))

In [None]:
# Stacked bars: poems per year, coloured by source
ggplot(
    count(group_by(per, year, PER_ID)),
    aes(x = year, y = n, fill = PER_ID)
) + 
    geom_col()

In [None]:
# Number of poems per year, source and author, with infrequent authors
# (fewer than 3 poems in a given year and source) merged into "Другие".
authors_sources <- per %>% 
    # every source id containing "СОиСА" is folded into "СО"
    mutate(PER_ID = ifelse(str_detect(PER_ID, "СОиСА"), "СО", PER_ID)) %>% 
    filter(!is.na(author)) %>% 
    
    # poems per author within each year and source
    # (count() returns an ungrouped table, so no explicit ungroup is needed)
    count(year, PER_ID, author) %>% 
    
    # non-frequent authors grouped as "others"
    mutate(author = ifelse(n < 3, "Другие", author)) %>% 
    
    # sum the poems of the merged "others" rows
    group_by(year, PER_ID, author) %>% 
    summarise(n = sum(n)) %>% 
    # drop the remaining (year, PER_ID) grouping so that downstream code
    # receives a plain, ungrouped table
    ungroup() %>% 
    
    # label shown in the treemaps: author name with the number of poems
    mutate(author_label = paste0(author, " (", n, ")"))

head(authors_sources)

In [None]:
# Six light colours for the treemaps: the first three colours of
# "Chevalier1" followed by the first three colours of "Darjeeling2"
light_palette <- c(
    wes_palette("Chevalier1")[1:3],
    wes_palette("Darjeeling2")[1:3]
)

In [None]:
# treemap: https://cran.r-project.org/web/packages/treemap/treemap.pdf
#
# Write one treemap per year (sources -> authors, sized by number of poems)
# to plots/treemaps/plot_<year>.png.

# make sure the output directory exists, otherwise png() cannot open the file
dir.create("plots/treemaps", showWarnings = FALSE, recursive = TRUE)

# iterate over the year values directly instead of indexing with 1:length()
for (x in unique(per$year)) {
    
    filename <- paste0("plots/treemaps/plot_", x, ".png")

    png(filename, width = 600, height = 600)

    # the device is closed even if treemap() fails, so that a failing year
    # does not leave an open png device behind
    tryCatch(
        authors_sources %>% 
            filter(year == x) %>% 
            treemap(
                index = c("PER_ID", "author_label"),
                vSize = "n",
                type = "index",
                palette = light_palette,
                fontsize.labels = c(20, 16),
                overlap.labels = 1,
                title = x,
                fontsize.title = 22
            ),
        finally = dev.off()
    )
}

In [None]:
# Treemap for a single year (1835), drawn inline:
# sources on the first level, authors on the second, sized by number of poems
treemap(
    filter(authors_sources, year == 1835),
    index = c("PER_ID", "author_label"),
    vSize = "n",
    type = "index",
    palette = light_palette,
    fontsize.labels = c(20, 16),
    overlap.labels = 1,
    title = "1835",
    fontsize.title = 22
)