
# Installing external libs

In [0]:
# Headless OpenJDK 8; its install path is what JAVA_HOME is set to in the next cell.
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
# Download Spark 2.4.5 from the Apache archive: the regular download mirrors only
# carry current releases, so the old mirror URL no longer serves this version.
!wget -q https://archive.apache.org/dist/spark/spark-2.4.5/spark-2.4.5-bin-hadoop2.7.tgz
!tar xf spark-2.4.5-bin-hadoop2.7.tgz
!pip install -q findspark

In [0]:
import os

# Environment variables naming the JDK and the Spark directory unpacked by the install cell.
os.environ.update(
    JAVA_HOME="/usr/lib/jvm/java-8-openjdk-amd64",
    SPARK_HOME="/content/spark-2.4.5-bin-hadoop2.7",
)

# Importing dataset

In [0]:
# Mount Google Drive at /gdrive; the dataset and lexicon paths used below live under it.
from google.colab import drive
drive.mount('/gdrive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /gdrive


In [0]:
import findspark
findspark.init()
from pyspark.sql import SparkSession

# Local-mode Spark session ("local[*]") with the driver memory set to 10g.
spark = (
    SparkSession.builder
    .master("local[*]")
    .config("spark.driver.memory", "10g")
    .getOrCreate()
)

In [0]:
# Load the book corpus; the first line of the CSV supplies the column names.
booksDF = spark.read.csv('/gdrive/My Drive/Datasets/ef.csv', header='true')
booksDF.show()

+---+--------------------+------------+-------+--------------------+
|_c0|               title|       genre|  label|             content|
+---+--------------------+------------+-------+--------------------+
|  0|4097_alice+of+old...|Love_stories|failure|produced by charl...|
|  1|17053_kate+bonnet...|Love_stories|failure|etext prepared by...|
|  2|22002_a+simple+st...|Love_stories|failure|produced by david...|
|  3|28862_the+time+of...|Love_stories|failure|etext prepared by...|
|  4|25727_vagabondia+...|Love_stories|failure|produced by david...|
|  5|13731_romance+isl...|Love_stories|failure|produced by janet...|
|  6|16692_beyond+the+...|Love_stories|failure|produced by suzan...|
|  7|21098_the+indepen...|Love_stories|failure|produced by nick ...|
|  8|25001_an+old+man'...|Love_stories|failure|etext prepared by...|
|  9|  23810_at+fault.txt|Love_stories|failure|produced by berna...|
| 10|8897_nina+balatka...|Love_stories|failure|etext prepared by...|
| 11|20358_jerry+junio...|Love_sto

# Data preprocessing

In [0]:
# Discard '_c0', the unnamed running-index column that came with the CSV.
booksDF = booksDF.drop('_c0')
booksDF.show()

+--------------------+------------+-------+--------------------+
|               title|       genre|  label|             content|
+--------------------+------------+-------+--------------------+
|4097_alice+of+old...|Love_stories|failure|produced by charl...|
|17053_kate+bonnet...|Love_stories|failure|etext prepared by...|
|22002_a+simple+st...|Love_stories|failure|produced by david...|
|28862_the+time+of...|Love_stories|failure|etext prepared by...|
|25727_vagabondia+...|Love_stories|failure|produced by david...|
|13731_romance+isl...|Love_stories|failure|produced by janet...|
|16692_beyond+the+...|Love_stories|failure|produced by suzan...|
|21098_the+indepen...|Love_stories|failure|produced by nick ...|
|25001_an+old+man'...|Love_stories|failure|etext prepared by...|
|  23810_at+fault.txt|Love_stories|failure|produced by berna...|
|8897_nina+balatka...|Love_stories|failure|etext prepared by...|
|20358_jerry+junio...|Love_stories|failure|produced by bruce...|
|  17821_red+hair.txt|Lov

In [0]:
import pyspark.sql.functions as F
from pyspark.sql.functions import udf
from pyspark.sql.types import *

## Generating emotion swings

In [0]:
# Tab-separated NRC lexicon files on the mounted Drive.
emotion_path = "/gdrive/My Drive/Datasets/NRC-Emotion-Lexicon-Wordlevel.txt"
emotion_intensity_path = "/gdrive/My Drive/Datasets/NRC-Emotion-Intensity-Lexicon.txt"

# Word-level lexicon: read without a header row, so Spark names the columns _c0, _c1, _c2.
emotion_df = (
    spark.read.format('csv')
    .options(header='False', inferSchema='true', delimiter='\t')
    .load(emotion_path)
)

# Intensity lexicon: read with its header row (word, emotion, emotion-intensity-score).
emotion_intensity_df = (
    spark.read.format('csv')
    .options(header='True', inferSchema='true', delimiter='\t')
    .load(emotion_intensity_path)
)

In [0]:
# Preview the first rows of both lexicon tables.
emotion_df.show()
emotion_intensity_df.show()

+------+------------+---+
|   _c0|         _c1|_c2|
+------+------------+---+
| aback|       anger|  0|
| aback|anticipation|  0|
| aback|     disgust|  0|
| aback|        fear|  0|
| aback|         joy|  0|
| aback|    negative|  0|
| aback|    positive|  0|
| aback|     sadness|  0|
| aback|    surprise|  0|
| aback|       trust|  0|
|abacus|       anger|  0|
|abacus|anticipation|  0|
|abacus|     disgust|  0|
|abacus|        fear|  0|
|abacus|         joy|  0|
|abacus|    negative|  0|
|abacus|    positive|  0|
|abacus|     sadness|  0|
|abacus|    surprise|  0|
|abacus|       trust|  1|
+------+------------+---+
only showing top 20 rows

+------------+-------+-----------------------+
|        word|emotion|emotion-intensity-score|
+------------+-------+-----------------------+
|    outraged|  anger|                  0.964|
|   brutality|  anger|                  0.959|
|      hatred|  anger|                  0.953|
|     hateful|  anger|                   0.94|
|   terrorize|  anger

In [0]:
from collections import defaultdict
import json

# Build the word -> [emotion, ...] lexicon from the rows whose flag (_c2) is 1,
# leaving out the two polarity labels 'positive' and 'negative'.
emotion_list = (row.asDict() for row in emotion_df.collect())
emotion_lex = defaultdict(list)
for entry in emotion_list:
    if entry['_c2'] != 1 or entry['_c1'] in ('positive', 'negative'):
        continue
    emotion_lex[entry['_c0']].append(entry['_c1'])

# with open('emotion_lex.json', 'w') as json_file:
#     json.dump(emotion_lex, json_file)

In [0]:
# Build the word -> [{emotion: score}, ...] intensity lookup, one single-pair dict per lexicon row.
emotion_intensity_list = (row.asDict() for row in emotion_intensity_df.collect())
emotion_inten_dict = defaultdict(list)
for record in emotion_intensity_list:
    scored = {record['emotion']: record['emotion-intensity-score']}
    emotion_inten_dict[record['word']].append(scored)

# with open('emotion_inten_dict.json', 'w') as json_file:
#     json.dump(emotion_inten_dict, json_file)

In [0]:
import math

def calc_percentage(vals):
    """Normalise ``vals`` so that each entry becomes its share of the total.

    Shares are rounded to four decimal places. When the total is zero a list
    of ``0.0`` (one per input value) is returned instead of dividing.
    """
    total = sum(vals)
    if total == 0:
        return [0.0 for _ in vals]
    return [round(value / total, 4) for value in vals]

def calc_emo_vec(words):
    """Return the share of each of the eight emotions among ``words``.

    Each word is looked up in the module-level ``emotion_lex`` mapping and every
    emotion listed for it adds one to that emotion's tally. The tallies are
    normalised with ``calc_percentage`` and returned in the order anger,
    anticipation, disgust, fear, joy, sadness, surprise, trust.
    """
    emo_vec = {'anger': 0, 'anticipation': 0, 'disgust': 0, 'fear': 0, 'joy': 0, 'sadness': 0, 'surprise': 0, 'trust': 0}
    for word in words:
        # .get() rather than emotion_lex[word]: indexing a defaultdict stores an
        # empty list for every unknown word, so the lexicon would grow by one
        # entry per distinct out-of-lexicon token.
        for emo in emotion_lex.get(word, ()):
            emo_vec[emo] += 1
    return calc_percentage(list(emo_vec.values()))

def calc_emo_inten(words):
    """Return the share of summed intensity per emotion among ``words``.

    Each word is looked up in the module-level ``emotion_inten_dict`` mapping,
    whose values are lists of ``{emotion: score}`` dicts; the score of the first
    pair of every dict is added to that emotion's total. The totals are
    normalised with ``calc_percentage`` and returned in the order anger,
    anticipation, disgust, fear, joy, sadness, surprise, trust.
    """
    emo_vec = {'anger': 0.0, 'anticipation': 0.0, 'disgust': 0.0, 'fear': 0.0, 'joy': 0.0, 'sadness': 0.0, 'surprise': 0.0, 'trust': 0.0}
    for word in words:
        # .get() rather than emotion_inten_dict[word]: indexing a defaultdict
        # stores an empty list for every unknown word, so the lookup table
        # would grow by one entry per distinct out-of-lexicon token.
        for emo in emotion_inten_dict.get(word, ()):
            name, score = next(iter(emo.items()))
            emo_vec[name] += score
    return calc_percentage(list(emo_vec.values()))

def chunks(lst, n):
    """Yield consecutive slices of ``lst``, each ``ceil(len(lst) / n)`` items long.

    The final slice may be shorter. Because the slice length is rounded up,
    fewer than ``n`` slices can be produced (11 items with ``n=10`` give 6
    slices of at most 2 items). An empty ``lst`` yields nothing.
    """
    if len(lst) == 0:
        # ceil(0 / n) is 0 and range() rejects a zero step, so stop before it.
        return
    num = math.ceil(len(lst) / n)
    for i in range(0, len(lst), num):
        yield lst[i:i + num]

def gen_emo_vecs_chunks(text, num):
    """Split ``text`` into word chunks and return one emotion vector per chunk.

    The text is split on whitespace, partitioned with ``chunks(words, num)``
    and each partition is scored with ``calc_emo_vec``.

    Returns an empty list when ``text`` is None or contains no words, so a
    missing or blank value does not raise.
    """
    words_list = (text or '').split()
    if not words_list:
        # Nothing to partition; also keeps chunks() away from a zero-length list.
        return []
    return [calc_emo_vec(words) for words in chunks(words_list, num)]

def gen_emo_inten_chunks(text, num):
    """Split ``text`` into word chunks and return one intensity vector per chunk.

    The text is split on whitespace, partitioned with ``chunks(words, num)``
    and each partition is scored with ``calc_emo_inten``.

    Returns an empty list when ``text`` is None or contains no words, so a
    missing or blank value does not raise.
    """
    words_list = (text or '').split()
    if not words_list:
        # Nothing to partition; also keeps chunks() away from a zero-length list.
        return []
    return [calc_emo_inten(words) for words in chunks(words_list, num)]

def gen_emo_vecs_overall(text):
    """Return the emotion vector of the whole ``text``.

    The text is split on whitespace and scored with ``calc_emo_vec``. A ``None``
    text is treated as empty text instead of raising on ``.split()``.
    """
    return calc_emo_vec((text or '').split())

def gen_emo_inten_overall(text):
    """Return the emotion-intensity vector of the whole ``text``.

    The text is split on whitespace and scored with ``calc_emo_inten``. A
    ``None`` text is treated as empty text instead of raising on ``.split()``.
    """
    return calc_emo_inten((text or '').split())

In [0]:
# Overall emotion vector: one array of emotion shares per book, computed from 'content'.
gen_emo_vecs_overall_udf = udf(f=gen_emo_vecs_overall, returnType=ArrayType(FloatType()))
booksDF = booksDF.withColumn('emo_vecs_overall', gen_emo_vecs_overall_udf(F.col('content')))

In [0]:
# Emotion vectors per chunk, for the chunk settings 10, 20, 30 and 50.
# Each setting is stored as a literal column so it can be handed to the UDF
# as a column argument next to 'content'.
for count_col, count in (('c10', 10), ('c20', 20), ('c30', 30), ('c50', 50)):
    booksDF = booksDF.withColumn(count_col, F.lit(count))

gen_emo_vecs_chunks_udf = udf(gen_emo_vecs_chunks, ArrayType(ArrayType(FloatType())))
for out_col, count_col in (
    ('emo_vecs_chunks_10', 'c10'),
    ('emo_vecs_chunks_20', 'c20'),
    ('emo_vecs_chunks_30', 'c30'),
    ('emo_vecs_chunks_50', 'c50'),
):
    booksDF = booksDF.withColumn(out_col, gen_emo_vecs_chunks_udf('content', count_col))

In [0]:
# Preview the frame with the emotion-vector columns added above.
# NOTE(review): the saved output under this cell lists columns (rating, rating_count,
# content_std, word_count) that no visible cell creates — it looks stale; re-run to confirm.
booksDF.show()

+------------------+------+------------+--------------------+--------------------+----------+--------------------+---+---+---+---+--------------------+--------------------+--------------------+--------------------+
|             genre|rating|rating_count|               title|         content_std|word_count|    emo_vecs_overall|c10|c20|c30|c50|  emo_vecs_chunks_10|  emo_vecs_chunks_20|  emo_vecs_chunks_30|  emo_vecs_chunks_50|
+------------------+------+------------+--------------------+--------------------+----------+--------------------+---+---+---+---+--------------------+--------------------+--------------------+--------------------+
|    School Stories|  3.96|         125|   The White Feather|produced by suzan...|     44679|[0.1147, 0.1775, ...| 10| 20| 30| 50|[[0.0771, 0.1849,...|[[0.0852, 0.1738,...|[[0.0794, 0.1869,...|[[0.0826, 0.1983,...|
|   Science Fiction|  4.02|         615|   The Sensitive Man|produced by sanka...|     20847|[0.1225, 0.1534, ...| 10| 20| 30| 50|[[0.1135, 

In [0]:
# Overall emotion-intensity vector per book, computed from 'content'.
gen_emo_inten_overall_udf = udf(f=gen_emo_inten_overall, returnType=ArrayType(FloatType()))
booksDF = booksDF.withColumn('emo_inten_overall', gen_emo_inten_overall_udf(F.col('content')))

# Emotion-intensity vectors per chunk, for the chunk settings 10, 20, 30 and 50
# (read from the literal columns c10..c50 added earlier).
gen_emo_inten_chunks_udf = udf(f=gen_emo_inten_chunks, returnType=ArrayType(ArrayType(FloatType())))
for out_col, count_col in (
    ('emo_inten_chunks_10', 'c10'),
    ('emo_inten_chunks_20', 'c20'),
    ('emo_inten_chunks_30', 'c30'),
    ('emo_inten_chunks_50', 'c50'),
):
    booksDF = booksDF.withColumn(out_col, gen_emo_inten_chunks_udf('content', count_col))

In [0]:
# Preview the frame with both the emotion-vector and emotion-intensity columns.
booksDF.show()

+--------------------+------------+-------+--------------------+--------------------+---+---+---+---+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|               title|       genre|  label|             content|    emo_vecs_overall|c10|c20|c30|c50|  emo_vecs_chunks_10|  emo_vecs_chunks_20|  emo_vecs_chunks_30|  emo_vecs_chunks_50|   emo_inten_overall| emo_inten_chunks_10| emo_inten_chunks_20| emo_inten_chunks_30| emo_inten_chunks_50|
+--------------------+------------+-------+--------------------+--------------------+---+---+---+---+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|4097_alice+of+old...|Love_stories|failure|produced by charl...|[0.1173, 0.1441, ...| 10| 20| 30| 50|[[0.0889, 0.1812,...|[[0.0

## Exporting emotion vectors

In [0]:
import pandas as pd

def std_emo_overall(emo_overall_list, column_str):
    """Flatten per-book emotion vectors into one flat record per book.

    Args:
        emo_overall_list: iterable of mappings that each provide 'title',
            'genre', 'label' and, under ``column_str``, a sequence of at least
            eight values.
        column_str: key under which each mapping stores its vector.

    Returns:
        A list of plain dicts holding title, genre, label and one entry per
        emotion (anger, anticipation, disgust, fear, joy, sadness, surprise,
        trust), taken from the vector by position.
    """
    emotions = ('anger', 'anticipation', 'disgust', 'fear', 'joy', 'sadness', 'surprise', 'trust')
    records = []
    for line in emo_overall_list:
        # A plain dict: the former factory-less defaultdict() behaved like one.
        record = {'title': line['title'], 'genre': line['genre'], 'label': line['label']}
        vector = line[column_str]
        for position, emotion in enumerate(emotions):
            record[emotion] = vector[position]
        records.append(record)
    return records

def std_emo_chunks(emo_chunks_list, column_str):
    """Flatten per-chunk emotion vectors into one wide record per book.

    Args:
        emo_chunks_list: iterable of mappings that each provide 'title',
            'genre', 'label' and, under ``column_str``, a sequence of vectors
            with at least eight values each.
        column_str: key under which each mapping stores its list of vectors.

    Returns:
        A list of plain dicts holding title, genre, label and, for the chunk at
        index ``i``, the keys ``anger_i`` ... ``trust_i`` taken from that
        chunk's vector by position.
    """
    emotions = ('anger', 'anticipation', 'disgust', 'fear', 'joy', 'sadness', 'surprise', 'trust')
    records = []
    for line in emo_chunks_list:
        # A plain dict: the former factory-less defaultdict() behaved like one.
        record = {'title': line['title'], 'genre': line['genre'], 'label': line['label']}
        for chunk_idx, emos in enumerate(line[column_str]):
            for position, emotion in enumerate(emotions):
                record[emotion + '_' + str(chunk_idx)] = emos[position]
        records.append(record)
    return records

In [0]:
# Collect the overall emotion vectors to the driver and flatten them into a
# pandas frame with one column per emotion.
emo_vecs_overall_df = booksDF.select('title', 'genre', 'label', 'emo_vecs_overall')
emo_vecs_overall_list = [row.asDict() for row in emo_vecs_overall_df.collect()]
emo_vecs_overall_csv = std_emo_overall(emo_vecs_overall_list, 'emo_vecs_overall')
emo_overall_df = pd.DataFrame(emo_vecs_overall_csv)

# Same for the overall emotion-intensity vectors.
emo_inten_overall_df = booksDF.select('title', 'genre', 'label', 'emo_inten_overall')
emo_inten_overall_list = [row.asDict() for row in emo_inten_overall_df.collect()]
emo_inten_overall_csv = std_emo_overall(emo_inten_overall_list, 'emo_inten_overall')
inten_overall_df = pd.DataFrame(emo_inten_overall_csv)

In [0]:
# Collect the per-chunk emotion vectors and intensities (chunk settings 10, 20,
# 30, 50) to the driver and flatten each into a wide pandas frame.

# --- emotion vectors ---
emo_vecs_chunks_10_df = booksDF.select('title', 'genre', 'label', 'emo_vecs_chunks_10')
emo_vecs_chunks_10_list = [row.asDict() for row in emo_vecs_chunks_10_df.collect()]
emo_vecs_chunks_10_csv = std_emo_chunks(emo_vecs_chunks_10_list, 'emo_vecs_chunks_10')
emo_10_df = pd.DataFrame(emo_vecs_chunks_10_csv)

emo_vecs_chunks_20_df = booksDF.select('title', 'genre', 'label', 'emo_vecs_chunks_20')
emo_vecs_chunks_20_list = [row.asDict() for row in emo_vecs_chunks_20_df.collect()]
emo_vecs_chunks_20_csv = std_emo_chunks(emo_vecs_chunks_20_list, 'emo_vecs_chunks_20')
emo_20_df = pd.DataFrame(emo_vecs_chunks_20_csv)

emo_vecs_chunks_30_df = booksDF.select('title', 'genre', 'label', 'emo_vecs_chunks_30')
emo_vecs_chunks_30_list = [row.asDict() for row in emo_vecs_chunks_30_df.collect()]
emo_vecs_chunks_30_csv = std_emo_chunks(emo_vecs_chunks_30_list, 'emo_vecs_chunks_30')
emo_30_df = pd.DataFrame(emo_vecs_chunks_30_csv)

emo_vecs_chunks_50_df = booksDF.select('title', 'genre', 'label', 'emo_vecs_chunks_50')
emo_vecs_chunks_50_list = [row.asDict() for row in emo_vecs_chunks_50_df.collect()]
emo_vecs_chunks_50_csv = std_emo_chunks(emo_vecs_chunks_50_list, 'emo_vecs_chunks_50')
emo_50_df = pd.DataFrame(emo_vecs_chunks_50_csv)

# --- emotion intensities ---
emo_inten_chunks_10_df = booksDF.select('title', 'genre', 'label', 'emo_inten_chunks_10')
emo_inten_chunks_10_list = [row.asDict() for row in emo_inten_chunks_10_df.collect()]
emo_inten_chunks_10_csv = std_emo_chunks(emo_inten_chunks_10_list, 'emo_inten_chunks_10')
inten_10_df = pd.DataFrame(emo_inten_chunks_10_csv)

emo_inten_chunks_20_df = booksDF.select('title', 'genre', 'label', 'emo_inten_chunks_20')
emo_inten_chunks_20_list = [row.asDict() for row in emo_inten_chunks_20_df.collect()]
emo_inten_chunks_20_csv = std_emo_chunks(emo_inten_chunks_20_list, 'emo_inten_chunks_20')
inten_20_df = pd.DataFrame(emo_inten_chunks_20_csv)

emo_inten_chunks_30_df = booksDF.select('title', 'genre', 'label', 'emo_inten_chunks_30')
emo_inten_chunks_30_list = [row.asDict() for row in emo_inten_chunks_30_df.collect()]
emo_inten_chunks_30_csv = std_emo_chunks(emo_inten_chunks_30_list, 'emo_inten_chunks_30')
inten_30_df = pd.DataFrame(emo_inten_chunks_30_csv)

emo_inten_chunks_50_df = booksDF.select('title', 'genre', 'label', 'emo_inten_chunks_50')
emo_inten_chunks_50_list = [row.asDict() for row in emo_inten_chunks_50_df.collect()]
emo_inten_chunks_50_csv = std_emo_chunks(emo_inten_chunks_50_list, 'emo_inten_chunks_50')
inten_50_df = pd.DataFrame(emo_inten_chunks_50_csv)

In [0]:
# Write every pandas frame built above to its CSV file on the mounted Drive.
for csv_path, frame in (
    ('/gdrive/My Drive/Datasets/ef_vec_overall.csv', emo_overall_df),
    ('/gdrive/My Drive/Datasets/ef_inten_overall.csv', inten_overall_df),
    ('/gdrive/My Drive/Datasets/ef_vec_10.csv', emo_10_df),
    ('/gdrive/My Drive/Datasets/ef_vec_20.csv', emo_20_df),
    ('/gdrive/My Drive/Datasets/ef_vec_30.csv', emo_30_df),
    ('/gdrive/My Drive/Datasets/ef_vec_50.csv', emo_50_df),
    ('/gdrive/My Drive/Datasets/ef_inten_10.csv', inten_10_df),
    ('/gdrive/My Drive/Datasets/ef_inten_20.csv', inten_20_df),
    ('/gdrive/My Drive/Datasets/ef_inten_30.csv', inten_30_df),
    ('/gdrive/My Drive/Datasets/ef_inten_50.csv', inten_50_df),
):
    with open(csv_path, 'w') as f:
        frame.to_csv(f)

In [0]:
# Pull the metadata and text columns into pandas for export.
# NOTE(review): 'rating', 'rating_count', 'content_std' and 'word_count' are not columns of
# the frame loaded from ef.csv above (title, genre, label, content plus the emo_* columns),
# so this select presumably belongs to a run on a different dataset — confirm before re-running.
genre_content_df = booksDF.select(*['title', 'genre', 'rating', 'rating_count', 'content_std', 'word_count']).toPandas()
genre_content_df.head()

Unnamed: 0,title,genre,rating,rating_count,content_std,word_count
0,The White Feather,School Stories,3.96,125,produced by suzanne l shell charles franks and...,44679
1,The Sensitive Man,Science Fiction,4.02,615,produced by sankar viswanathan greg weeks and ...,20847
2,463,Historical Fiction,4.67,39,produced by judith boss html version by al hai...,46221
3,Sea Legs,Science Fiction,3.65,246,produced by greg weeks mary meehan and the onl...,13902
4,The Young Buglers,Children's Fiction,4.12,91,produced by ted garvin suzanne shell william f...,102661


In [0]:
# Write the pandas frame to genre_content.csv on the mounted Drive (index column included).
with open('/gdrive/My Drive/Datasets/genre_content.csv', 'w') as f:
    genre_content_df.to_csv(f)