# Step 7: spaCy Parse and Network Extract

**Purpose:** This script takes the text files we cleaned in part 1 and runs spaCy on them to identify named entities and token dependencies for network generation. It then groups the spaCy outputs by EIS number and runs `textnet_extract()` to generate the network files for each plan. Those extracts are saved in `salinasbox/intermediate_data`.

**Setup:** Requires textNet, spaCy, Python, reticulate, and (recommended) findpython. To overwrite existing output files, set `overwrite` to `TRUE` (this should only be needed if the network generation process changes).

In [None]:
# Set to TRUE to regenerate outputs even when the output files already exist.
# The flag is passed to textNet::parse_text() and gates the textnet_extract()
# loop further down.
overwrite <- FALSE
library(textNet)
library(findpython)

## Load and prepare text files

`textNet::parse_text()` takes a named list in which each element is the text of a different file.

In [None]:
# Locate the cleaned text files. `pattern` is a regular expression, so the dot
# is escaped and the match is anchored to the end of the name; a bare ".RDS"
# would match any character followed by "RDS" anywhere in a file name.
files <- list.files(
  path = "salinasbox/clean_data/pdf_to_text_clean",
  pattern = "\\.RDS$",
  full.names = TRUE
)

# Read the `text` element of each saved object: one list element per file,
# named by file name (the names are reused when grouping by EIS number).
texts <- lapply(files, function(path) readRDS(path)$text)
names(texts) <- basename(files)

## Setup Python environment

Activate the conda environment with reticulate first; this resolves issues with `find_python_cmd()` finding the Python binary.

In [None]:
library(reticulate)
# Take the `python` column of the conda environment listing (presumably the
# path to each environment's python binary — confirm against reticulate docs).
myenv <- conda_list(conda = "auto")$python
# NOTE(review): the environment is chosen by position (4th entry), which is
# specific to the machine this was written on — confirm the index before
# running elsewhere, or select the environment by name.
use_condaenv(myenv[4])

# Find a python command that has spaCy and both English models available; the
# result is handed to textNet::parse_text() as its first argument.
ret_path <- find_python_cmd(required_modules = c('spacy', 'en_core_web_lg','en_core_web_trf'))

## Define custom entities

In [None]:
# Generic role words supplied to parse_text() as the custom "PARTIES" entity
# type. One singular/plural pair per line; the nested c() calls flatten into a
# single character vector in the order written.
parties <- c(
  c("Project", "Projects"),
  c("Applicant", "Applicants"),
  c("Permittee", "Permittees"),
  c("Proponent", "Proponents"),
  c("Band", "Bands"),
  c("tribe", "tribes"),
  c("Tribe", "Tribes"),
  c("we", "We")
)

## Parse text with spaCy

In [None]:
# One output path per input file, reusing the input file names.
parse_fileloc <- paste0("salinasbox/intermediate_data/parsed_files/", basename(files))

# Parse every document. This call previously passed only `texts[1:2]` (a
# debugging leftover), which left `parsed` shorter than `parse_fileloc` and
# `names(texts)`: the names assignment below then fails, and the grouping step
# indexes past the end of `parsed`, whenever there are more than two files.
parsed <- textNet::parse_text(ret_path,
                              text_list = texts,
                              parsed_filenames = parse_fileloc,
                              overwrite = overwrite,
                              # Model switched to en_core_web_trf.
                              model = "en_core_web_trf",
                              # Tag the generic role words as "PARTIES".
                              custom_entities = list(PARTIES = parties))

# Carry the file names over so the EIS number can be read from them later, and
# keep a combined copy of all parsed output.
names(parsed) <- names(texts)
saveRDS(object = parsed, file = "salinasbox/intermediate_data/all_parsed.RDS")

## Group by EIS number

Put all parts with the same EIS number together. The EIS number is the first 8 characters of the file name.

In [None]:
# The EIS number is the first 8 characters of each file name.
eis_ids <- substr(basename(files), 1, 8)

# `parsed` is grouped by position, so it must line up one-to-one with `files`.
stopifnot(length(parsed) == length(files))

# Gather the parsed parts belonging to each EIS number. Factor levels follow
# first appearance, so `projects` keeps the order of `files` and is named by
# EIS number. Unlike a running index, this works on empty input and does not
# require the parts of one EIS to be adjacent.
parts_by_eis <- split(parsed, factor(eis_ids, levels = unique(eis_ids)))

# Stack the parts of each EIS with a single rbind() call. unname() keeps
# rbind() from prefixing row names with the file names.
projects <- lapply(parts_by_eis, function(parts) {
  do.call(rbind, unname(parts))
})

## Extract networks

Better to be inclusive with entity types and remove later. See [OntoNotes documentation, page 21](https://catalog.ldc.upenn.edu/docs/LDC2013T19/OntoNotes-Release-5.0.pdf) for definitions.

Notes:
- "EVENT" doesn't have much in it, but we will preserve it just in case
- "LANGUAGE" doesn't have much in it, but it sometimes captures "Latino" (??)
- Did not keep "MONEY" because it appeared unreliable (sometimes was kJ, etc.)

In [None]:
# One slot per EIS project, filled in the same order as `projects`.
extracts <- vector(mode = "list", length = length(projects))

# Entity types kept in the networks (deliberately inclusive; filter later).
keptentities <- c("PERSON",
                  "NORP",
                  "FAC",
                  "ORG", "GPE",
                  "LOC", "PRODUCT",
                  "EVENT", "WORK_OF_ART",
                  "LAW", "LANGUAGE",
                  "PARTIES")

# seq_along() keeps the loop from running when `projects` is empty.
for (m in seq_along(projects)) {
  # Per-project output path, built once and reused below.
  extract_file <- paste0("salinasbox/intermediate_data/raw_extracted_networks/extract_", names(projects)[m], ".RDS")

  if (isTRUE(overwrite) || !file.exists(extract_file)) {
    extracts[[m]] <- textnet_extract(projects[[m]],
                                     cl = 4,
                                     keep_entities = keptentities,
                                     return_to_memory = TRUE,
                                     keep_incomplete_edges = TRUE,
                                     file = extract_file)
  } else {
    print(paste0("file ", extract_file, " already exists."))
    # Reload the existing extract so this slot is not left NULL; otherwise the
    # combined file saved below would be overwritten with missing entries.
    # Assumes textnet_extract() wrote its result to `file` as an RDS object —
    # TODO confirm against the textNet documentation.
    extracts[[m]] <- readRDS(extract_file)
  }
}

# Combined copy of every project's extract.
saveRDS(object = extracts, file = "salinasbox/intermediate_data/raw_extracts.RDS")