# Video Classification based on trasncripts

A set of *.csv* files conataining the captions from videos is processed to clasify the content using different dictionaries as reference.

Each video is tagged with the original search term that recommended it and the depth at which it was recommended to the user. This information is used to present the results graphically. 

The following dictionaries are used:

- **2015 Lexicoder Sentiment Dictionary:** classifies sentiment into positive/negative
- **Laver & Garry Dictionary of Policy Position:** classifies topic into CULTURE, ECONOMY, ENVIRONMENT, GROUPS, INSTITUTIONS, LAW_AND_ORDER, RURAL, URBAN, VALUES
- **NRC Emotion Lexicon (version 0.92):** classifies emotion into anger, anticipation, disgust, fear, joy, sadness, surprise, and trust
- **NRC Emotion Intensity Lexicon (version 1):** classifies emotion into anger, anticipation, disgust, fear, joy, sadness, surprise, and trust and indicates its intensity in a scale of 0 to 1
- **NRC Word-Emotion Association Lexicon (NRC Emotion Lexicon):** classifies emotion into anger, fear, anticipation, trust, surprise, sadness, joy, and disgust
- **NRC Valence, Arousal, and Dominance (VAD) Lexicon:** indicates the scale from 0 to 1 of valence, arousal and dominance
- **AFINN:** measures sentiment from negative to positive in a scale from -5 to 5

Young, L. & Soroka, S. (2012). Affective News: The Automated Coding of Sentiment in Political Texts]. doi: 10.1080/10584609.2012.671234. Political Communication, 29(2), 205–231.

Laver. M. & Garry, J. (2000). Estimating Policy Positions from Political Texts. American Journal of Political Science, 44 (3), 619-634.

Word Affect Intensities. Saif M. Mohammad. In Proceedings of the 11th Edition of the
Language Resources and Evaluation Conference (LREC-2018), May 2018, Miyazaki, Japan.

Saif Mohammad and Peter Turney. Crowdsourcing a Word-Emotion Association Lexicon. Computational Intelligence, 29(3): 436-465, 2013. Wiley Blackwell Publishing Ltd.

Saif Mohammad and Peter Turney. Emotions Evoked by Common Words and Phrases: Using Mechanical Turk to Create an Emotion Lexicon. In Proceedings of the NAACL-HLT 2010 Workshop on Computational Approaches to Analysis and Generation of Emotion in Text, June 2010, LA, California.

Obtaining Reliable Human Ratings of Valence, Arousal, and Dominance for 20,000 English Words.
Saif M. Mohammad. In Proceedings of the 56th Annual Meeting of the Association for Computational Linguistics, Melbourne, Australia, July 2018.

In [4]:
library(quanteda)
library(seededlda)
library(quanteda.corpora)
library(gridExtra)
library(cowplot)
library(numbers)
library(ggplot2)
library(lingmatch)
library(SentimentAnalysis)
library(tidyverse)
library(tibble)
library("quanteda.dictionaries")
require(quanteda.textmodels)
require(quanteda.textplots)
library(quanteda.textstats)
library(dplyr)

In [5]:
categorizeSubtitles <- function(file) {
    data_subs <- read.csv(file)
    data_subs<-data_subs[data_subs$subtitles != '',]
    data_subs<-data_subs[data_subs$current_depth < 6,]
    print(file)
    print(paste("No. of rows", nrow(data_subs)))
    print(unique(data_subs[c("keyword")]))
    corp_subs <- corpus(data_subs, text_field = "subtitles")
    toks_subs <- tokens(corp_subs, remove_punct = TRUE, remove_numbers = TRUE, remove_symbol = TRUE)
    toks_subs <- tokens_remove(toks_subs, pattern = c(stopwords("en")))
    
    # Positive/Negative 2015 Lexicoder Sentiment Dictionary
    
    data_dictionary_LSD2015_pos_neg <- data_dictionary_LSD2015[1:2]
    toks_subs_lsd <- tokens_lookup(toks_subs, dictionary = data_dictionary_LSD2015_pos_neg)
    dfmat_subs_lsd <- dfm(toks_subs_lsd) %>% 
        dfm_group(groups = current_depth)
    dfm_subs <- dfm(toks_subs)
    dfm_subs_lsd <- dfm_lookup(dfm_subs, dictionary = data_dictionary_LSD2015_pos_neg)
    df_dfm_subs_lsd <- convert(dfm_subs_lsd, to="data.frame")
    df_dfm_subs <- docvars(dfm_subs)
    
    df_pos <- convert(dfmat_subs_lsd, to="data.frame")
    df_pos$sentiment <- "positive"
    df_pos$current_depth <- as.integer(df_pos$doc_id)
    df_pos$count <- df_pos$positive
    df_pos <- subset(df_pos, select = -c(doc_id, negative, positive))
    df_neg <- convert(dfmat_subs_lsd, to="data.frame")
    df_neg$sentiment <- "negative"
    df_neg$current_depth <- as.integer(df_neg$doc_id)
    df_neg$count <- df_neg$negative
    df_neg <- subset(df_neg, select = -c(doc_id, negative, positive))
    df_pos_neg <- rbind(df_pos, df_neg)
    
    df_net<-convert(dfmat_subs_lsd, to="data.frame")
    df_net$current_depth<-as.integer(df_net$doc_id)
    df_net$net<-df_net$positive-df_net$negative
    df_net<-subset(df_net, select = -c(doc_id, negative, positive))
    
    df_dfm_subs$positive<-df_dfm_subs_lsd$positive
    df_dfm_subs$negative<-df_dfm_subs_lsd$negative
    df_dfm_subs$sentiment<-"positive"
    df_dfm_subs$sentiment[df_dfm_subs$negative>df_dfm_subs$positive]<-"negative"
    df_dfm_subs$sentiment_ratio<-ifelse(!df_dfm_subs$negative | !df_dfm_subs$positive, 0, df_dfm_subs$positive/df_dfm_subs$negative)
    
    # Laver & Garry Dictionary of Policy Position
    
    dfm_subs_lg <- dfm(toks_subs) %>% 
              dfm_trim(min_termfreq = 0.8, termfreq_type = "quantile",
                       max_docfreq = 0.1, docfreq_type = "prop")
    
    dict_topic_lg <- dictionary(file = "Dictionaries/LaverGarry.cat")
    
    tmod_slda_lg <- textmodel_seededlda(dfm_subs_lg, dictionary = dict_topic_lg)
    
    dfm_subs_lg$lgpp <- topics(tmod_slda_lg)
    
    lg_table<-aggregate(rep(1, nrow(dfm_subs_lg)), by = list(lg_topic = dfm_subs_lg$lgpp, current_depth = dfm_subs_lg$current_depth), sum)
    
    lg_table$order<-order(lg_table$current_depth, lg_table$x)
    
    lg_table_0<-lg_table[lg_table$current_depth==0,]
    lg_table_1<-lg_table[lg_table$current_depth==1,]
    lg_table_2<-lg_table[lg_table$current_depth==2,]
    lg_table_3<-lg_table[lg_table$current_depth==3,]
    lg_table_4<-lg_table[lg_table$current_depth==4,]
    lg_table_5<-lg_table[lg_table$current_depth==5,]
    
    lg_table_0<-arrange(lg_table_0, lg_table_0$x)
    lg_table_1<-arrange(lg_table_1, lg_table_1$x)
    lg_table_2<-arrange(lg_table_2, lg_table_2$x)
    lg_table_3<-arrange(lg_table_3, lg_table_3$x)
    lg_table_4<-arrange(lg_table_4, lg_table_4$x)
    lg_table_5<-arrange(lg_table_5, lg_table_5$x)
    
    lg_table_0$order<-order(lg_table_0$x)
    lg_table_1$order<-order(lg_table_1$x)
    lg_table_2$order<-order(lg_table_2$x)
    lg_table_3$order<-order(lg_table_3$x)
    lg_table_4$order<-order(lg_table_4$x)
    lg_table_5$order<-order(lg_table_5$x)
    
    lg_table<- rbind(lg_table_0, lg_table_1, lg_table_2, lg_table_3, lg_table_4, lg_table_5)
    
    df_dfm_subs$lgpp<-dfm_subs_lg$lgpp
    
    # NRC Emotion Lexicon (version 0.92)
    
    dfm_subs_nrc_el <- dfm(toks_subs) %>% 
              dfm_trim(min_termfreq = 0.8, termfreq_type = "quantile",
                       max_docfreq = 0.1, docfreq_type = "prop")
    
    tmod_slda_nrc_el <- textmodel_seededlda(dfm_subs_nrc_el, dictionary = data_dictionary_NRC)
    
    dfm_subs_nrc_el$emotion <- topics(tmod_slda_nrc_el)
    
    nrc_el_table<-aggregate(rep(1, nrow(dfm_subs_nrc_el)), by = list(emotion = dfm_subs_nrc_el$emotion, current_depth = dfm_subs_nrc_el$current_depth), sum)

    nrc_el_table$order<-order(nrc_el_table$current_depth, nrc_el_table$x)
    
    nrc_el_table_0<-nrc_el_table[nrc_el_table$current_depth==0,]
    nrc_el_table_1<-nrc_el_table[nrc_el_table$current_depth==1,]
    nrc_el_table_2<-nrc_el_table[nrc_el_table$current_depth==2,]
    nrc_el_table_3<-nrc_el_table[nrc_el_table$current_depth==3,]
    nrc_el_table_4<-nrc_el_table[nrc_el_table$current_depth==4,]
    nrc_el_table_5<-nrc_el_table[nrc_el_table$current_depth==5,]
    
    nrc_el_table_0<-arrange(nrc_el_table_0, nrc_el_table_0$x)
    nrc_el_table_1<-arrange(nrc_el_table_1, nrc_el_table_1$x)
    nrc_el_table_2<-arrange(nrc_el_table_2, nrc_el_table_2$x)
    nrc_el_table_3<-arrange(nrc_el_table_3, nrc_el_table_3$x)
    nrc_el_table_4<-arrange(nrc_el_table_4, nrc_el_table_4$x)
    nrc_el_table_5<-arrange(nrc_el_table_5, nrc_el_table_5$x)
    
    nrc_el_table_0$order<-order(nrc_el_table_0$x)
    nrc_el_table_1$order<-order(nrc_el_table_1$x)
    nrc_el_table_2$order<-order(nrc_el_table_2$x)
    nrc_el_table_3$order<-order(nrc_el_table_3$x)
    nrc_el_table_4$order<-order(nrc_el_table_4$x)
    nrc_el_table_5$order<-order(nrc_el_table_5$x)
    
    nrc_el_table<- rbind(nrc_el_table_0, nrc_el_table_1, nrc_el_table_2, nrc_el_table_3, nrc_el_table_4, nrc_el_table_5)

    df_dfm_subs$nrc_el<-dfm_subs_nrc_el$emotion
    
    # topics.yml
    
    dict_topic <- dictionary(file = "Dictionaries/topics.yml")
    
    dfm_subs_topic <- dfm(toks_subs) %>% 
              dfm_trim(min_termfreq = 0.8, termfreq_type = "quantile",
                       max_docfreq = 0.1, docfreq_type = "prop")
    
    tmod_slda_topic <- textmodel_seededlda(dfm_subs_topic, dictionary = dict_topic)
    
    dfm_subs_topic$topic <- topics(tmod_slda_topic)

    topic_table<-aggregate(rep(1, nrow(dfm_subs_topic)), by = list(topic = dfm_subs_topic$topic, current_depth = dfm_subs_topic$current_depth), sum)
    
    topic_table<-topic_table[order(topic_table$current_depth, topic_table$x),]
    
    topic_table_0<-topic_table[topic_table$current_depth==0,]
    topic_table_1<-topic_table[topic_table$current_depth==1,]
    topic_table_2<-topic_table[topic_table$current_depth==2,]
    topic_table_3<-topic_table[topic_table$current_depth==3,]
    topic_table_4<-topic_table[topic_table$current_depth==4,]
    topic_table_5<-topic_table[topic_table$current_depth==5,]
    
    topic_table_0<-arrange(topic_table_0, topic_table_0$x)
    topic_table_1<-arrange(topic_table_1, topic_table_1$x)
    topic_table_2<-arrange(topic_table_2, topic_table_2$x)
    topic_table_3<-arrange(topic_table_3, topic_table_3$x)
    topic_table_4<-arrange(topic_table_4, topic_table_4$x)
    topic_table_5<-arrange(topic_table_5, topic_table_5$x)

    topic_table_0$order<-order(topic_table_0$x)
    topic_table_1$order<-order(topic_table_1$x)
    topic_table_2$order<-order(topic_table_2$x)
    topic_table_3$order<-order(topic_table_3$x)
    topic_table_4$order<-order(topic_table_4$x)
    topic_table_5$order<-order(topic_table_5$x)
    
    topic_table <- rbind(topic_table_0, topic_table_1, topic_table_2, topic_table_3, topic_table_4, topic_table_5)

    df_dfm_subs$topic<-dfm_subs_topic$topic
    
    ## Readability
    
    docid <- corp_subs$video
    docnames(corp_subs) <- docid

    ts <- textstat_readability(corp_subs, measure = c('Flesch', 'ARI', 'Flesch.Kincaid'))
    
    df_ts <-data.frame(ts)
    
    df_dfm_subs$Flesch<-df_ts$Flesch
    df_dfm_subs$ARI<-df_ts$ARI
    df_dfm_subs$Flesch.Kincaid<-df_ts$Flesch.Kincaid
    
    date = format(Sys.time(), "%Y-%m-%d_%H_%M_%S")
    file_name = paste('Subtitles/Classified/classified_subtitles_',date, '.csv', sep='')
    write.csv(df_dfm_subs, file_name)
}

In [8]:
#file_names <- dir("Subtitles/", pattern = "subtitles_")
file_names <- dir("Subtitles/", pattern = ".csv")
file_names<-gsub(" ", "", paste("Subtitles/",file_names))

for (file in file_names) {
    print(file)
    categorizeSubtitles(file)
}

[1] "Subtitles/GMCLnb4J-B4.csv"
[1] "Subtitles/GMCLnb4J-B4.csv"
[1] "No. of rows 0"


ERROR: Error in h(simpleError(msg, call)): error in evaluating the argument 'x' in selecting a method for function 'print': undefined columns selected
