In [1]:
library(quanteda)
library(seededlda)
library(quanteda.corpora)
library(gridExtra)
library(cowplot)
library(numbers)
library(ggplot2)
library(lingmatch)
library(SentimentAnalysis)
library(tidyverse)
library(tibble)
library("quanteda.dictionaries")
require(quanteda.textmodels)
require(quanteda.textplots)
library(quanteda.textstats)
library(dplyr)
library(stringr)

Package version: 3.0.0
Unicode version: 13.0
ICU version: 69.1

Parallel computing: 4 of 4 threads used.

See https://quanteda.io for tutorials and examples.


Attaching package: ‘seededlda’


The following object is masked from ‘package:stats’:

    terms


Loading required package: Matrix


Attaching package: ‘SentimentAnalysis’


The following object is masked from ‘package:base’:

    write


── [1mAttaching packages[22m ───────────────────────────────────────────────────────────────────────────────────────────────── tidyverse 1.3.1 ──

[32m✔[39m [34mtibble [39m 3.1.2     [32m✔[39m [34mdplyr  [39m 1.0.5
[32m✔[39m [34mtidyr  [39m 1.1.3     [32m✔[39m [34mstringr[39m 1.4.0
[32m✔[39m [34mreadr  [39m 1.4.0     [32m✔[39m [34mforcats[39m 0.5.1
[32m✔[39m [34mpurrr  [39m 0.3.4     

── [1mConflicts[22m ──────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
[31m✖[39m [34mdplyr[39m::[32

In [2]:
# New version

# Count dictionary-key matches per document: looks the tokens up in
# `dictionary` (first level only) and returns the resulting dfm converted to
# a data frame.
count_dictionary_hits <- function(toks, dictionary) {
    toks_lookup <- tokens_lookup(toks, dictionary = dictionary, levels = 1)
    convert(dfm(toks_lookup), to = "data.frame")
}

# For every row of `counts`, return the name of the column (among `cols`)
# holding the largest value. Ties are resolved in favour of the left-most
# column, so the result depends on the column order of `counts`.
dominant_key <- function(counts, cols) {
    key_counts <- counts[, cols]
    colnames(key_counts)[max.col(key_counts, ties.method = "first")]
}

# Append the polarity columns for one sentiment dictionary to `out`:
#   <prefix>_negative, <prefix>_positive  raw match counts
#   <prefix>                              dominant key among columns 2:3
#   <prefix>_net                          positive - negative
#   <prefix>_ratio                        positive / negative, forced to 0
#                                         when either count is zero
# Column 1 of `counts` is presumably the document id, hence the 2:3 offset.
add_polarity_scores <- function(out, counts, prefix) {
    neg_col <- paste0(prefix, "_negative")
    pos_col <- paste0(prefix, "_positive")

    out[[neg_col]] <- counts$negative
    out[[pos_col]] <- counts$positive
    out[[prefix]] <- dominant_key(counts, 2:3)
    out[[paste0(prefix, "_net")]] <- out[[pos_col]] - out[[neg_col]]
    out[[paste0(prefix, "_ratio")]] <- ifelse(
        !out[[neg_col]] | !out[[pos_col]],
        0,
        out[[pos_col]] / out[[neg_col]]
    )
    out
}

# Append one count column per entry of `keys` (named <prefix>_<key>, with
# spaces in the key replaced by underscores) followed by a <prefix> column
# holding the dominant key among the columns `cols` of `counts`.
add_category_counts <- function(out, counts, prefix, keys, cols) {
    for (key in keys) {
        out[[paste0(prefix, "_", gsub(" ", "_", key, fixed = TRUE))]] <- counts[[key]]
    }
    out[[prefix]] <- dominant_key(counts, cols)
    out
}

# Score every non-empty transcript in `df` against several sentiment, topic
# and emotion dictionaries, add readability statistics, and write the result
# to a timestamped CSV under "processed/".
#
# Args:
#   df: data frame with a character column `transcript` and a column `video`
#       (used as the document name for the readability step).
#
# Returns:
#   The classified data frame (document variables plus one set of columns per
#   dictionary and the readability measures), invisibly.
#
# Side effects:
#   Prints the number of rows kept and writes
#   "processed/training_videos_classified<timestamp>.csv".
categorizeTranscripts <- function(df) {
    # Drop rows without text. The explicit NA guard matters: comparing NA with
    # "" yields NA, and an NA in a logical row index produces all-NA rows.
    has_text <- !is.na(df$transcript) & df$transcript != ""
    data_subs <- df[has_text, ]
    data_subs$length <- str_count(data_subs$transcript, "\\w+")
    print(paste("No. of rows", nrow(data_subs)))

    corp_subs <- corpus(data_subs, text_field = "transcript")
    toks_subs <- tokens(
        corp_subs,
        remove_punct = TRUE,
        remove_numbers = TRUE,
        remove_symbols = TRUE
    )
    toks_subs <- tokens_remove(toks_subs, pattern = c(stopwords("en")))

    # Start from the document variables (video, length, ...); every block
    # below appends its columns to this frame.
    out <- docvars(dfm(toks_subs))

    # Positive/Negative 2015 Lexicoder Sentiment Dictionary
    # (only the first two keys of the dictionary are used).
    out <- add_polarity_scores(
        out,
        count_dictionary_hits(toks_subs, data_dictionary_LSD2015[1:2]),
        "lsd"
    )

    # AFINN Sentiment Dictionary
    # NOTE(review): this step expects `negative`/`positive` count columns;
    # confirm data_dictionary_AFINN actually exposes those keys.
    out <- add_polarity_scores(
        out,
        count_dictionary_hits(toks_subs, data_dictionary_AFINN),
        "afinn"
    )

    # Augmented General Inquirer Positive and Negative Dictionary
    out <- add_polarity_scores(
        out,
        count_dictionary_hits(toks_subs, data_dictionary_geninqposneg),
        "geninqposneg"
    )

    # Positive and negative words from Hu and Liu (2004) Dictionary
    out <- add_polarity_scores(
        out,
        count_dictionary_hits(toks_subs, data_dictionary_HuLiu),
        "huliu"
    )

    # Loughran and McDonald Sentiment Word Lists
    out <- add_category_counts(
        out,
        count_dictionary_hits(toks_subs, data_dictionary_LoughranMcDonald),
        "lm",
        keys = c(
            "negative", "positive", "uncertainty", "litigious",
            "constraining", "superfluous", "interesting", "modal words strong"
        ),
        cols = 2:9
    )

    # Laver & Garry Dictionary of Policy Position
    out <- add_category_counts(
        out,
        count_dictionary_hits(toks_subs, data_dictionary_LaverGarry),
        "lgpp",
        keys = c(
            "culture", "economy", "environment", "groups", "institutions",
            "law_and_order", "rural", "urban", "values"
        ),
        cols = 2:10
    )

    # NRC Emotion Lexicon (version 0.92)
    out <- add_category_counts(
        out,
        count_dictionary_hits(toks_subs, data_dictionary_NRC),
        "nrc_el",
        keys = c(
            "anger", "anticipation", "disgust", "fear", "joy",
            "negative", "positive", "sadness", "surprise", "trust"
        ),
        cols = 2:11
    )

    ## Readability

    # Name the documents after their video id before computing the statistics.
    docid <- corp_subs$video
    docnames(corp_subs) <- docid

    ts <- textstat_readability(corp_subs, measure = c("Flesch", "ARI", "Flesch.Kincaid"))
    df_ts <- data.frame(ts)

    out$Flesch <- df_ts$Flesch
    out$ARI <- df_ts$ARI
    out$Flesch.Kincaid <- df_ts$Flesch.Kincaid

    # The timestamp has one-second resolution; two calls finishing within the
    # same second would write to the same file name.
    date <- format(Sys.time(), "%Y-%m-%d_%H_%M_%S")
    file_name <- paste0("processed/training_videos_classified", date, ".csv")
    write.csv(out, file_name, row.names = FALSE)

    invisible(out)
}

In [7]:
# Classify the transcript CSV in chunks of `nrecords` rows; each chunk is
# passed to categorizeTranscripts(), which writes its own output file.
file_name <- "processed/trainnig_videos_transcripts2022-01-10_22_40_40.csv"
nrecords <- 5000
# Two chunks are hard-coded, so at most 2 * nrecords data rows are processed.
for (i in 0:1) {
    # Values are not auto-printed inside a loop body, so the progress message
    # has to be printed explicitly to show up.
    print(paste("Loop: ", i, " up to: ", (i + 1) * nrecords))
    # `skip` counts physical lines. With the default header = TRUE the first
    # line after the skipped ones is consumed as a header row (its names are
    # then overridden by `col.names`), so the chunk boundaries assume the file
    # starts with a header line and has one record per line.
    myData <- read.csv(
        file_name,
        nrows = nrecords,
        skip = i * nrecords,
        col.names = c("video", "transcript")
    )
    print(dim(myData))
    categorizeTranscripts(myData)
}

[1] 5000    2
[1] "No. of rows 5000"
[1] 4118    2
[1] "No. of rows 4118"
