In [23]:
import numpy as np
import pandas as pd 
from tqdm import tqdm
from transformers import pipeline

In [9]:
books = pd.read_csv('../data/books_cleaned_with_categories.csv')
classifier = pipeline('text-classification', 
    model='j-hartmann/emotion-english-distilroberta-base', 
    top_k=None)
classifier('I love this :(')

Device set to use cpu


[[{'label': 'joy', 'score': 0.8908524513244629},
  {'label': 'neutral', 'score': 0.05040666088461876},
  {'label': 'sadness', 'score': 0.03542739152908325},
  {'label': 'surprise', 'score': 0.009478951804339886},
  {'label': 'disgust', 'score': 0.007098732981830835},
  {'label': 'anger', 'score': 0.005757557228207588},
  {'label': 'fear', 'score': 0.0009783386485651135}]]

In [25]:
def calculate_max_emotion_scores(predictions, emotion_labels):
    per_emotion_scores = {label: [] for label in emotion_labels}
    for prediction in predictions:
        for each in prediction:
            label, score = each['label'], each['score']
            per_emotion_scores[label].append(score)
    return {label: np.max(scores) for label, scores in per_emotion_scores.items()}

In [26]:
isbn = []
emotion_labels = ['anger', 'fear', 'joy', 'disgust', 'sadness', 'surprise', 'neutral']
emotion_scores = {label: [] for label in emotion_labels}
for i in tqdm(range(books.shape[0])):
    isbn.append(books.loc[i, 'isbn13'])
    sentences = books.loc[i, 'description'].split('.')
    predictions = classifier(sentences)
    max_scores = calculate_max_emotion_scores(predictions, emotion_labels)
    for label in emotion_labels:
        emotion_scores[label].append(max_scores[label])

100%|██████████| 5197/5197 [24:03<00:00,  3.60it/s]  


In [27]:
emotions_df = pd.DataFrame(emotion_scores)
emotions_df['isbn13'] = isbn
emotions_df.head()

Unnamed: 0,anger,fear,joy,disgust,sadness,surprise,neutral,isbn13
0,0.064133,0.928168,0.932798,0.273592,0.967158,0.729602,0.646216,9780002005883
1,0.612619,0.942528,0.704422,0.348285,0.11169,0.252546,0.88794,9780002261982
2,0.064133,0.972321,0.767238,0.104007,0.11169,0.078765,0.549478,9780006178736
3,0.351484,0.360706,0.251881,0.150722,0.11169,0.078765,0.732685,9780006280897
4,0.081412,0.095043,0.040564,0.184495,0.475881,0.078765,0.88439,9780006280934


In [28]:
books = pd.merge(books, emotions_df, on='isbn13')
books.head()

Unnamed: 0,isbn13,isbn10,title,authors,categories,thumbnail,description,published_year,average_rating,num_pages,...,tagged_description,sample_categories,simple_categories,anger,fear,joy,disgust,sadness,surprise,neutral
0,9780002005883,2005883,Gilead,Marilynne Robinson,Fiction,http://books.google.com/books/content?id=KQZCP...,A NOVEL THAT READERS and critics have been eag...,2004.0,3.85,247.0,...,9780002005883 A NOVEL THAT READERS and critics...,Fiction,Fiction,0.064133,0.928168,0.932798,0.273592,0.967158,0.729602,0.646216
1,9780002261982,2261987,Spider's Web,Charles Osborne;Agatha Christie,Detective and mystery stories,http://books.google.com/books/content?id=gA5GP...,A new 'Christie for Christmas' -- a full-lengt...,2000.0,3.83,241.0,...,9780002261982 A new 'Christie for Christmas' -...,,Fiction,0.612619,0.942528,0.704422,0.348285,0.11169,0.252546,0.88794
2,9780006178736,6178731,Rage of angels,Sidney Sheldon,Fiction,http://books.google.com/books/content?id=FKo2T...,"A memorable, mesmerizing heroine Jennifer -- b...",1993.0,3.93,512.0,...,"9780006178736 A memorable, mesmerizing heroine...",Fiction,Fiction,0.064133,0.972321,0.767238,0.104007,0.11169,0.078765,0.549478
3,9780006280897,6280897,The Four Loves,Clive Staples Lewis,Christian life,http://books.google.com/books/content?id=XhQ5X...,Lewis' work on the nature of love divides love...,2002.0,4.15,170.0,...,9780006280897 Lewis' work on the nature of lov...,,Nonfiction,0.351484,0.360706,0.251881,0.150722,0.11169,0.078765,0.732685
4,9780006280934,6280935,The Problem of Pain,Clive Staples Lewis,Christian life,http://books.google.com/books/content?id=Kk-uV...,"""In The Problem of Pain, C.S. Lewis, one of th...",2002.0,4.09,176.0,...,"9780006280934 ""In The Problem of Pain, C.S. Le...",,Nonfiction,0.081412,0.095043,0.040564,0.184495,0.475881,0.078765,0.88439


In [29]:
books.to_csv('../data/books_cleaned_with_emotions.csv', index=False)