In [64]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import os
from nltk.tokenize import sent_tokenize
import pandas as pd
from wordcloud import WordCloud
from PIL import Image
import matplotlib.pyplot as plt
import numpy as np
import random

In [2]:
import config
import tfidf
os.getcwd()

'/Users/Tristan/books/src'

In [None]:
os.listdir(config.dataset_dir)

In [None]:
def make_lex_dict(lexicon_file):
        """
        Convert lexicon file to a dictionary
        """
        lex_dict = {}
        for line in lexicon_file.split('\n'):
            (word, measure) = line.strip().split('\t')[0:2]
            lex_dict[word] = float(measure)
        return lex_dict
    
sent_dict = make_lex_dict(open('/Users/Tristan/books/src/' +'vader_lexicon.txt', 'r').read())


Sentiment analysis. Analysis is performed for each sentence and the sentiment scores kept in lists. Sentiment scores are calculated by averaging the sentiment scores for all sentences.

In [None]:
def return_sentiment_scores(sentence):
    # return just the sentiment scores
    snt = analyser.polarity_scores(sentence)
    return snt

def sentiment_analysis(directory):
    analyser = SentimentIntensityAnalyzer()                    
    # returns the sentiment of every book in the directory
    data = pd.read_csv(config.dataset_dir + 'output/final_data.csv', index_col=0)
    print(len(data.index))
#     max_amt = len(data.index) + 2
#     print(data.index, len(os.listdir(directory)))
    pos_list = []
    neg_list = []
    neu_list = []
    comp_list = []
    
    # for every book
    for filename in data['filename']:#[:max_amt]:
        
        sub_pos_list = []
        sub_neg_list = []
        sub_neu_list = []
        sub_comp_list = []
        
        # if file is a textfile
        if filename.endswith(".txt"):
            text = open(os.path.join(directory, filename), 'r', errors='replace')
            # for every line in the text
            for line in text.readlines():
                scores = return_sentiment_scores(line)
                # save sentiment scores 
                sub_neg_list.append(scores['neg'])
                sub_neu_list.append(scores['neu'])
                sub_pos_list.append(scores['pos'])
                sub_comp_list.append(scores['compound'])
            
            # then save average sentiment scores for each book
            neg_list.append((sum(sub_neg_list) / float(len(sub_neg_list))))
            pos_list.append((sum(sub_pos_list) / float(len(sub_pos_list))))
            neu_list.append((sum(sub_neu_list) / float(len(sub_neu_list))))
            comp_list.append((sum(sub_comp_list) / float(len(sub_comp_list))))
            
    # convert scores to pandas compatible list
    neg = pd.Series(neg_list)
    pos = pd.Series(pos_list)
    neu = pd.Series(neu_list)
    com = pd.Series(comp_list)

    print(len(neg), len(pos), len(neu), len(com))
    # fill the right columns with the right data
    print(type(data),'type')
    print(neg)
    data['neg score'] = neg.values
    data['pos score'] = pos.values
    data['neu score'] = neu.values
    data['comp score'] = com.values
    data.to_csv(config.dataset_dir + 'output/final_data.csv')
    return data

In [None]:
analyser = SentimentIntensityAnalyzer()                    
sentiment_analysis(config.dataset_dir + 'bookdatabase/books/')

We also want to count the amount of positive and negative words as features. We also create a new file for each book with just the sentiment words. As a result, we will be able to do tfidf on these files later and create wordclouds per genre.

In [None]:
def count_sentiment_words(directory):
    sent_words_list =[]
    pos_list = []
    neg_list = []
    
    data = pd.read_csv(config.dataset_dir + 'output/final_data.csv', index_col=0)

    for filename in data['filename']:
        sent_words_list =[]
        pos_count = 0
        neg_count = 0
        
        if filename.endswith(".txt"):
            text = open(os.path.join(directory, filename), 'r', errors='replace')
            sentiment_file = open(config.dataset_dir +'output/sentiment_word_texts/' + filename , 'w')

            for line in text.readlines():
                for word in line.split(" "):
                    if word in sent_dict:
                        if sent_dict[word] >= 0:
                            pos_count += 1
                            sent_words_list.append(word)
                            sentiment_file.write("%s" % word)
                            sentiment_file.write(" ")
                        else:
                            neg_count += 1
                            sentiment_file.write("%s" % word)
                            sentiment_file.write(" ")

            pos_list.append(pos_count)
            neg_list.append(neg_count)
            
    data['amt pos'] = pos_list 
    data['amt neg'] = neg_list
    
    data.to_csv(config.dataset_dir + 'output/final_data.csv')
    return data

count_sentiment_words(config.dataset_dir + 'bookdatabase/books/')


In [None]:
import pandas as pd

In [None]:
def read_unique_genres():
    genres_file = open(config.dataset_dir + 'unique_genres.txt', 'r')
    return[genre.strip('\n') for genre in genres_file.readlines()]

In [25]:
def create_wordcloud(scores, genre):
    
    font_path = config.dataset_dir + 'Open_Sans_Condensed/OpenSansCondensed-Light.ttf'

    try:
        w = WordCloud(background_color='white', min_font_size=14, font_path=font_path, width = 1000, height = 500,relative_scaling=1,normalize_plurals=False)
        wordcloud = w.generate_from_frequencies(scores)
        wordcloud.recolor(color_func=grey_color_func)

        
    except ZeroDivisionError:
        return
    
    plt.figure(figsize=(15,8))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis("off")
    plt.savefig(config.dataset_dir + 'output/wordclouds/' + genre + '.png')
    plt.close()
    
def grey_color_func(word, font_size, position, orientation, random_state=None, **kwargs):
    return "hsl(0, 0%%, %d%%)" % random.randint(10, 50)


def tfidf_per_genre(plot_wc=False):
    data = pd.read_csv(config.dataset_dir + 'final_data.csv')
    genres_file = open(config.dataset_dir + 'unique_genres.txt', 'r')
    genre_list = [genre.strip('\n') for genre in genres_file.readlines()]
    directory = config.dataset_dir + 'output/sentiment_word_texts/'
    book_list = []
    

    index = tfidf.create_index(directory)

    for genre in genre_list:
        genre = genre.replace()
        score_dict = {}
        book_list = []

        books_of_genre = data.loc[data['genre'] == genre]
        
        for book in books_of_genre['filename']:
            book_list.append(book)
            
        try:
            tf_matrix, genre_tokens = tfidf.create_tf_matrix(directory, book_list, genre)
            
            for term in genre_tokens:
        
                score = tfidf.tfidf(term, genre, directory, index, tf_matrix)
                score_dict[term] = score
             
            scores_file = open(config.dataset_dir +'output/top200_per_genre/' + genre + '.txt', 'w')
            
            for w in sorted(score_dict, key=score_dict.get, reverse=True):
                
                scores_file.write('%s/n' % w)
            
            scores_file.close()
            
            print('success')
            
            if plot_wc:
                font_path = config.dataset_dir + 'Open_Sans_Condensed/OpenSansCondensed-Light.ttf'
                create_wordcloud(score_dict, genre)

        except ZeroDivisionError:
            continue
        except ValueError:
            continue
        
#     return tfidf_dict_per_genre

tfidf_dict_per_genre = tfidf_per_genre(plot_wc=True)

success




success
success
success
success
success
success
success
success
success
success
success
success
success
success
success
success
success
success
success
success
success
success
success
success
success
success
success
success
success
success
success
success
success
success
success
success
success
success
success
success
success
success
success
success
success
success
success
success
success
success
success
success
success
success
success
success
success
success
success
success
success
success
success
success
success
success
success
success
success
success
success
success
success
success
success
success
success
success
success
success
success
success
success
success
success
success
success
success
success
success
success
success
success
success
success
success


FileNotFoundError: [Errno 2] No such file or directory: '../datasets/output/top200_per_genre/Fantasy/Ethnic novel.txt'

In [None]:
list(tfidf_dict_per_genre.keys())[:4]

In [None]:
len(list(tfidf_dict_per_genre.keys()))

In [None]:
len(tfidf_dict_per_genre['Diary and Novel']) # may differ per genre

In [None]:
n_words_per_genre = 100

In [None]:
sample = tfidf_dict_per_genre['War']
i = list(sample.keys())[-1]
sample[i]

In [None]:
max(list(sample.values()))

In [None]:
sample

## Generate labels file

In [None]:
import pandas, os
import data, config
from utils import io

In [None]:
info = pandas.read_csv(config.dataset_dir + 'final_data.csv')
book_list = os.listdir(config.dataset_dir + 'output/sentiment_word_texts')
labels = data.extract_genres(info, book_list)

In [None]:
labels

In [None]:
io.save_dict_to_csv(config.dataset_dir, 'labels', labels)

# (oud)
Choose to most important to be kept in the feature-vector

In [None]:
from wordcloud import WordCloud
import matplotlib.pyplot as plt

In [None]:
import data, config, tfidf

In [None]:
directory = config.dataset_dir + 'output/sentiment_word_texts'
book_list = os.listdir(directory)
book_list = book_list[:20]

In [None]:
index = tfidf.create_index(directory, book_list)
tf_matrix = tfidf.create_tf_matrix(directory, book_list)

In [None]:
tfidf_dict = tfidf.perform_tfidf(directory, book_list, index, tf_matrix)

In [None]:
# (optional) show the result
w = WordCloud(background_color='white', width=900, height=500, 
                      max_words=1628,relative_scaling=1,normalize_plurals=False)
wordcloud = w.generate_from_frequencies(tfidf_dict)

In [None]:
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
# plt.savefig(config.dataset_dir + 'output/wordclouds/' + genre + '.png')

In [None]:
# tfidf_dict_per_genre = wordcloud_per_genre()