# Compare genres of input and output books

In [1]:
import gensim
from gensim.models import Doc2Vec, KeyedVectors
import logging
import time
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
import requests
import matplotlib.pyplot as plt
import gensim.downloader as api # Downloading the Google pretrained Word2Vec Model

In [2]:
import numpy as np
import requests
import random
from tqdm.notebook import tqdm
tqdm.pandas()

In [3]:
import os
import json
import glob
import re
import sys
import collections
from nltk import flatten
import dask
from dask import delayed,compute
import dask.dataframe as dd
from dask.multiprocessing import get
import logging
from sklearn.metrics.pairwise import cosine_similarity

In [78]:
from sklearn.metrics import multilabel_confusion_matrix
import ast
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import precision_score, recall_score, f1_score, classification_report
from itertools import repeat

In [7]:
#read dataset
books_df = pd.read_csv('medium_books_missing_obs23675.csv')
#drop na (subset-genre)
books_df = books_df.dropna(subset=['genres'])
#remove quotes from around genre list
books_df['genres'] = books_df.loc[books_df['genres']!=np.nan, 'genres'].progress_apply(lambda x: ast.literal_eval(x))

  0%|          | 0/31730 [00:00<?, ?it/s]

In [8]:
#create list of genres
genres_list = []
for i in books_df['genres']:
    genres_list.extend(i)
#create unique list of genres
unique_genres_list = list(set(genres_list))
unique_genres_df = pd.DataFrame.from_dict({'genres': sorted(unique_genres_list)})
num_genres = len(unique_genres_list)
num_genres

829

In [55]:
unique_genres_list.index('Fiction')

790

Fiction has index 790

In [60]:
unique_genres_list.index('Nonfiction')

186

Nonfiction has index 186

## Step 1: 
Original books genres -> matrix

In [15]:
#Read in recommendations as df
avg_word_recs_df = pd.read_csv('avg_word_vec_recs_medium_dataset.csv')

In [16]:
orig_book_indices = list(books_df.index)

In [17]:
num_orig_books = len(orig_book_indices)
num_orig_books

31730

In [18]:
recommended_books_df = avg_word_recs_df
recommended_books_df.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,Unnamed: 0.1.1,book_authors,book_desc,book_title,genres,image_url,book_desc_tok,similarity_scores,orig_book_idx
0,1,1,4,['Stephenie Meyer'],About three things I was absolutely positive.F...,Twilight,"['Young Adult', 'Fantasy', 'Romance', 'Paranor...",https://images.gr-assets.com/books/1361039443l...,about three things i was abso...,0.001497,0
1,2,2,6,"['C.S. Lewis', 'Pauline Baynes']","Journeys to the end of the world, fantastic cr...",The Chronicles of Narnia,"['Fantasy', 'Classics', 'Fiction', 'Young Adul...",https://images.gr-assets.com/books/1449868701l...,journeys to the end of the ...,0.001022,0
2,3,3,8,['Margaret Mitchell'],Gone with the Wind is a novel written by Marga...,Gone with the Wind,"['Classics', 'Historical', 'Historical Fiction...",https://images.gr-assets.com/books/1328025229l...,gone with the wind is a no...,0.000723,0
3,5,5,11,['Douglas Adams'],Seconds before the Earth is demolished to make...,The Hitchhiker's Guide to the Galaxy,"['Science Fiction', 'Fiction', 'Humor', 'Fanta...",https://images.gr-assets.com/books/1388282444l...,seconds before the earth is d...,0.000394,0
4,7,7,13,"['Emily Brontë', 'Richard J. Dunn', 'David Tim...",You can find the redesigned cover of this edit...,Wuthering Heights,"['Classics', 'Fiction', 'Romance', 'Literature']",https://images.gr-assets.com/books/1388212715l...,you can find the redesigned c...,0.000358,0


In [19]:
recommended_books_df.shape

(158650, 11)

In [20]:
missing_orig_books_indices_list = []
for idx in list(set(recommended_books_df['orig_book_idx'])):
    if idx not in orig_book_indices:
        missing_orig_books_indices_list.append(idx)

In [21]:
missing_orig_books_indices_list

[]

In [22]:
recommended_books_df_rows_to_remove = []
for row_num in range(recommended_books_df.shape[0]):
    if recommended_books_df['orig_book_idx'][row_num] in (missing_orig_books_indices_list):
        recommended_books_df_rows_to_remove.append(row_num)

In [23]:
recommended_books_df_rows_to_remove

[]

In [24]:
#remove books from recommended books that have orig books with those indices
recommended_books_df = recommended_books_df.drop(recommended_books_df_rows_to_remove, axis=0)

In [26]:
#binarizer to transform between labels and a multilabel format
mlb = MultiLabelBinarizer()
#fit the binarizer to the list of genres
mlb.fit([unique_genres_list])

MultiLabelBinarizer()

In [27]:
orig_book_indices[-10:]

[31720, 31721, 31722, 31723, 31724, 31725, 31726, 31727, 31728, 31729]

In [28]:
books_df.shape

(31730, 8)

In [29]:
num_preds_per_book = 5
orig_book_genres_matrix = []
for orig_book_index in orig_book_indices:
    orig_book = books_df.iloc[books_df.index==orig_book_index]
    orig_book_genres = orig_book['genres']
    orig_book_genres_matrix.extend(np.repeat(mlb.transform(orig_book_genres), repeats = num_preds_per_book, axis=0))

In [30]:
y_expected = np.array(orig_book_genres_matrix)

In [31]:
np.array(y_expected).shape

(158650, 829)

## Step 2
Predictions -> matrix with size = number of genres

In [32]:
#Cleaning: if genre = NaN, replace with '[]'
recommended_books_df.loc[recommended_books_df['genres'].isna(), 'genres']='[]'

In [33]:
#Cleaning
#recommended_books = recommended_books.dropna(subset=['genres'])
recommended_books_df['genres'] = recommended_books_df['genres'].progress_apply(lambda x: ast.literal_eval(x))

  0%|          | 0/158650 [00:00<?, ?it/s]

In [34]:
recommended_books_genres = list(recommended_books_df['genres'])


In [35]:
recommended_books_genres

[['Young Adult',
  'Fantasy',
  'Romance',
  'Paranormal',
  'Vampires',
  'Fiction',
  'Fantasy',
  'Paranormal'],
 ['Fantasy', 'Classics', 'Fiction', 'Young Adult', 'Childrens'],
 ['Classics',
  'Historical',
  'Historical Fiction',
  'Fiction',
  'Romance',
  'Historical'],
 ['Science Fiction', 'Fiction', 'Humor', 'Fantasy', 'Classics'],
 ['Classics', 'Fiction', 'Romance', 'Literature'],
 ['Classics',
  'Historical',
  'Historical Fiction',
  'Fiction',
  'Romance',
  'Historical'],
 ['Fantasy', 'Classics', 'Fiction', 'Young Adult', 'Childrens'],
 ['Childrens', 'Childrens', 'Picture Books', 'Classics', 'Fiction'],
 ['Science Fiction', 'Fiction', 'Humor', 'Fantasy', 'Classics'],
 ['Classics', 'Fiction', 'Romance', 'Literature'],
 ['Classics',
  'Historical',
  'Historical Fiction',
  'Fiction',
  'Romance',
  'Historical'],
 ['Classics', 'Fiction', 'Romance', 'Literature'],
 ['Science Fiction', 'Fiction', 'Humor', 'Fantasy', 'Classics'],
 ['Childrens', 'Childrens', 'Picture Books', '

In [36]:
rec_books_matrix = mlb.transform(recommended_books_genres)

In [37]:
rec_books_matrix.shape

(158650, 829)

In [38]:
rec_books_matrix[0]

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [69]:
eval_metrics.iloc[unique_genres_list.index('Historical Fiction'),:]

precision    0.999433
recall       0.999918
f1score      0.999675
Name: 654, dtype: float64

In [70]:
eval_metrics.iloc[unique_genres_list.index('Science Fiction'),:]

precision    0.998120
recall       0.998952
f1score      0.998536
Name: 451, dtype: float64

In [72]:
eval_metrics.iloc[unique_genres_list.index('Romance'),:]

precision    0.995092
recall       0.992798
f1score      0.993944
Name: 703, dtype: float64

In [73]:
eval_metrics.iloc[unique_genres_list.index('Classics'),:]

precision    0.999937
recall       0.999994
f1score      0.999965
Name: 527, dtype: float64

In [74]:
precision_micro_average = precision_score(y_expected, y_pred, average='micro', zero_division='warn')
precision_micro_average

0.18124524598645558

In [76]:
recall_macro_average = recall_score(y_expected, y_pred, average='macro', zero_division='warn')
recall_macro_average

0.021230298328750768

In [80]:
recall_micro_average = recall_score(y_expected, y_pred, average='micro', zero_division='warn')
recall_micro_average

0.1864124000732288

In [79]:
print(classification_report(
    y_expected,
    y_pred, output_dict=False))

  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0       0.00      0.00      0.00        10
           1       1.00      0.20      0.33         5
           2       0.25      0.10      0.14        10
           3       0.00      0.00      0.00        10
           4       0.00      0.00      0.00        10
           5       0.06      0.02      0.03        45
           6       0.00      0.00      0.00        50
           7       0.05      0.02      0.03       215
           8       0.00      0.00      0.00         5
           9       0.02      0.01      0.01       865
          10       0.00      0.00      0.00        25
          11       0.04      0.01      0.01       495
          12       0.00      0.00      0.00       125
          13       0.00      0.00      0.00        35
          14       0.00      0.00      0.00        65
          15       0.01      0.01      0.01       780
          16       0.04      0.04      0.04      1035
          17       0.00    