In [1]:
# import all necessary libraries

import pandas as pd
import re
import numpy as np
import nltk
import string
import html
from collections import Counter
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF
from nltk.corpus import wordnet
from typing import List

from keras.utils import to_categorical
from sklearn.preprocessing import LabelEncoder
import spacy
from keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences
from sklearn.model_selection import train_test_split
from random import randint
from numpy import array, argmax, asarray, zeros
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Embedding
from keras.layers import SimpleRNN, LSTM
from keras.layers import Flatten, Masking
from keras.utils.vis_utils import plot_model

## Text Preprocessing - Lemmatization

In [2]:
sample = pd.read_csv('podcast_sample.csv', lineterminator='\n', index_col = 0)
sample.head(2)

Unnamed: 0,podcast_id,title,content,rating,author_id,created_at,itunes_id,slug,itunes_url,podcast_title,category
0,b313ef8ef0d5b64290d3036ff1bbf2d2,감성 라디오 음악도시,미국 서부에 있는 유학생이에요. 성시경씨 제대 후 라디오 복귀만 기다려오다가 6 월...,5,664CCA7142E9AE8,2011-09-14T13:25:46-07:00,442838670,fm-%EC%9D%8C%EC%95%85%EB%8F%84%EC%8B%9C-%EC%A2...,https://podcasts.apple.com/us/podcast/fm-%EC%9...,FM 음악도시(종영),music
1,abfb842993be20d21bfae7103addc5e9,They’ve really cut back on the content this se...,Last season there was a new pod every 3-4 days...,1,AD790CE113DCBC1,2018-04-11T13:46:47-07:00,1015394113,the-good-phight-for-philadelphia-phillies-fans,https://podcasts.apple.com/us/podcast/the-good...,The Good Phight: for Philadelphia Phillies fans,sports


In [3]:
## Perform lemmatization
## Reference: https://gist.github.com/gaurav5430/9fce93759eb2f6b1697883c3782f30de#file-nltk-lemmatize-sentences-py
lemmatizer = WordNetLemmatizer()

# function to convert nltk tag to wordnet tag
def nltk_tag_to_wordnet_tag(nltk_tag):
    if nltk_tag.startswith('J'):
        return wordnet.ADJ
    elif nltk_tag.startswith('V'):
        return wordnet.VERB
    elif nltk_tag.startswith('N'):
        return wordnet.NOUN
    elif nltk_tag.startswith('R'):
        return wordnet.ADV
    else:          
        return None

def lemmatize_sentence(sentence):
    #tokenize the sentence and find the POS tag for each token
    nltk_tagged = nltk.pos_tag(nltk.word_tokenize(sentence))  
    #tuple of (token, wordnet_tag)
    wordnet_tagged = map(lambda x: (x[0], nltk_tag_to_wordnet_tag(x[1])), nltk_tagged)
    lemmatized_sentence = []
    for word, tag in wordnet_tagged:
        if tag is None:
            #if there is no available tag, append the token as is
            lemmatized_sentence.append(word)
        else:        
            #else use the tag to lemmatize the token
            lemmatized_sentence.append(lemmatizer.lemmatize(word, tag))
    return " ".join(lemmatized_sentence)

def lem(line):
    word_tokens = nltk.word_tokenize(line)
    word_tokens = [lemmatize_sentence(t) for t in word_tokens]
    cleaned_review = " ".join(word_tokens)
        
    return cleaned_review

In [4]:
# Concat review title and review content to get more information later
sample['reviews_title'] = sample['title'] + ' ' + sample['content']
sample['reviews_title'] = sample['reviews_title'].astype(str)
sample['reviews_title'] = sample['reviews_title'].apply(lambda x: x.lower())
sample.head(2)

Unnamed: 0,podcast_id,title,content,rating,author_id,created_at,itunes_id,slug,itunes_url,podcast_title,category,reviews_title
0,b313ef8ef0d5b64290d3036ff1bbf2d2,감성 라디오 음악도시,미국 서부에 있는 유학생이에요. 성시경씨 제대 후 라디오 복귀만 기다려오다가 6 월...,5,664CCA7142E9AE8,2011-09-14T13:25:46-07:00,442838670,fm-%EC%9D%8C%EC%95%85%EB%8F%84%EC%8B%9C-%EC%A2...,https://podcasts.apple.com/us/podcast/fm-%EC%9...,FM 음악도시(종영),music,감성 라디오 음악도시 미국 서부에 있는 유학생이에요. 성시경씨 제대 후 라디오 복귀...
1,abfb842993be20d21bfae7103addc5e9,They’ve really cut back on the content this se...,Last season there was a new pod every 3-4 days...,1,AD790CE113DCBC1,2018-04-11T13:46:47-07:00,1015394113,the-good-phight-for-philadelphia-phillies-fans,https://podcasts.apple.com/us/podcast/the-good...,The Good Phight: for Philadelphia Phillies fans,sports,they’ve really cut back on the content this se...


In [5]:
sample['reviews_title'] = sample['reviews_title'].apply(lambda x: lem(x))
sample.head()

Unnamed: 0,podcast_id,title,content,rating,author_id,created_at,itunes_id,slug,itunes_url,podcast_title,category,reviews_title
0,b313ef8ef0d5b64290d3036ff1bbf2d2,감성 라디오 음악도시,미국 서부에 있는 유학생이에요. 성시경씨 제대 후 라디오 복귀만 기다려오다가 6 월...,5,664CCA7142E9AE8,2011-09-14T13:25:46-07:00,442838670,fm-%EC%9D%8C%EC%95%85%EB%8F%84%EC%8B%9C-%EC%A2...,https://podcasts.apple.com/us/podcast/fm-%EC%9...,FM 음악도시(종영),music,감성 라디오 음악도시 미국 서부에 있는 유학생이에요 . 성시경씨 제대 후 라디오 복...
1,abfb842993be20d21bfae7103addc5e9,They’ve really cut back on the content this se...,Last season there was a new pod every 3-4 days...,1,AD790CE113DCBC1,2018-04-11T13:46:47-07:00,1015394113,the-good-phight-for-philadelphia-phillies-fans,https://podcasts.apple.com/us/podcast/the-good...,The Good Phight: for Philadelphia Phillies fans,sports,they ’ ve really cut back on the content this ...
2,ebdf879a424547d01862a9bbba18a0f3,Good info. source...,Bob brings a lot of knowledge to any firearm d...,4,E223A4B2642C970,2010-01-19T08:11:43-07:00,333180229,handgun-world-podcast,https://podcasts.apple.com/us/podcast/handgun-...,Handgun World Podcast,news,good info . source ... bob brings a lot of kno...
3,ab2fdb7db023b223d870487165d11ff3,Mixed,They have lost much of thier credibility by de...,3,E1E7DBE750D119E,2021-01-28T12:21:49-07:00,971901464,wsj-opinion-potomac-watch,https://podcasts.apple.com/us/podcast/wsj-opin...,WSJ Opinion: Potomac Watch,news,mixed they have lose much of thier credibility...
4,ca601bd1524322d0527b16adf2738ff3,Try it now!,Even better than I expected. I was interested ...,5,D7CA4858AFA2CFC,2017-08-24T10:55:20-07:00,1257821731,conversations-with-people-who-hate-me,https://podcasts.apple.com/us/podcast/conversa...,Conversations with People Who Hate Me,society,try it now ! even well than i expect . i be in...


In [6]:
sample.to_csv('df_lem.csv')

## Read data after lemmatization

In [33]:
df = pd.read_csv('df_lem.csv', lineterminator='\n', index_col = 0)
df.head(2)

Unnamed: 0,podcast_id,title,content,rating,author_id,created_at,itunes_id,slug,itunes_url,podcast_title,category,reviews_title
0,b313ef8ef0d5b64290d3036ff1bbf2d2,감성 라디오 음악도시,미국 서부에 있는 유학생이에요. 성시경씨 제대 후 라디오 복귀만 기다려오다가 6 월...,5,664CCA7142E9AE8,2011-09-14T13:25:46-07:00,442838670,fm-%EC%9D%8C%EC%95%85%EB%8F%84%EC%8B%9C-%EC%A2...,https://podcasts.apple.com/us/podcast/fm-%EC%9...,FM 음악도시(종영),music,감성 라디오 음악도시 미국 서부에 있는 유학생이에요 . 성시경씨 제대 후 라디오 복...
1,abfb842993be20d21bfae7103addc5e9,They’ve really cut back on the content this se...,Last season there was a new pod every 3-4 days...,1,AD790CE113DCBC1,2018-04-11T13:46:47-07:00,1015394113,the-good-phight-for-philadelphia-phillies-fans,https://podcasts.apple.com/us/podcast/the-good...,The Good Phight: for Philadelphia Phillies fans,sports,they ’ ve really cut back on the content this ...


## Category Combination
Merge similar categories into a new or existing category: 
- society/ religion/ government/ history/ education/ kids as “society”
- tv/ leisure/ sports/ music/ fiction/ arts as “entertainment”
- science/ technology/ health/ crime as “others”. 

In [34]:
df["category"].value_counts(normalize=True)

comedy        0.16038
society       0.12890
news          0.10412
business      0.07566
sports        0.07178
arts          0.06362
education     0.05976
crime         0.05042
health        0.04706
tv            0.04354
religion      0.04186
leisure       0.03452
history       0.02834
kids          0.02448
music         0.01782
science       0.01640
fiction       0.01552
government    0.00826
technology    0.00756
Name: category, dtype: float64

In [35]:
import re
def replace_cat(line):
    line = re.sub(r'\b(society|religion|government|history|education|kids)\b', 'society', line)
    line = re.sub(r'\b(tv|leisure|sports|music|fiction|arts)\b', 'entertainment', line)
    line = re.sub(r'\b(science|technology|health|crime)\b', 'others', line)
    return line

In [36]:
df['category'] = df['category'].apply(lambda x: replace_cat(x))

In [37]:
df["category"].value_counts(normalize=True)

society          0.29160
entertainment    0.24680
comedy           0.16038
others           0.12144
news             0.10412
business         0.07566
Name: category, dtype: float64

In [38]:
df['reviews_title_pod'] = df['reviews_title'] + ' ' + df['podcast_title']
df['reviews_title_pod'] = df['reviews_title_pod'].astype(str)
df['reviews_title_pod'] = df['reviews_title_pod'].apply(lambda x: x.lower())
df.head(2)

Unnamed: 0,podcast_id,title,content,rating,author_id,created_at,itunes_id,slug,itunes_url,podcast_title,category,reviews_title,reviews_title_pod
0,b313ef8ef0d5b64290d3036ff1bbf2d2,감성 라디오 음악도시,미국 서부에 있는 유학생이에요. 성시경씨 제대 후 라디오 복귀만 기다려오다가 6 월...,5,664CCA7142E9AE8,2011-09-14T13:25:46-07:00,442838670,fm-%EC%9D%8C%EC%95%85%EB%8F%84%EC%8B%9C-%EC%A2...,https://podcasts.apple.com/us/podcast/fm-%EC%9...,FM 음악도시(종영),entertainment,감성 라디오 음악도시 미국 서부에 있는 유학생이에요 . 성시경씨 제대 후 라디오 복...,감성 라디오 음악도시 미국 서부에 있는 유학생이에요 . 성시경씨 제대 후 라디오 복...
1,abfb842993be20d21bfae7103addc5e9,They’ve really cut back on the content this se...,Last season there was a new pod every 3-4 days...,1,AD790CE113DCBC1,2018-04-11T13:46:47-07:00,1015394113,the-good-phight-for-philadelphia-phillies-fans,https://podcasts.apple.com/us/podcast/the-good...,The Good Phight: for Philadelphia Phillies fans,entertainment,they ’ ve really cut back on the content this ...,they ’ ve really cut back on the content this ...


## Word Frequency
Use count vectorization to find high frequency words to draw quick insights of regex target

In [39]:
def Countvec(words, Ngram=(1,1), token_pattern=None, min_df=1, max_df=1.0):
    '''Create count vectorizer'''
    vectorizer = CountVectorizer(stop_words="english", ngram_range=Ngram, lowercase=True, 
                                 token_pattern=token_pattern, min_df=min_df, max_df=max_df)
    X = vectorizer.fit_transform(words) 
    X = X.toarray()
    print(X.shape)    
    feature = vectorizer.get_feature_names()
    corpus_df = pd.DataFrame(X, columns=feature)
    return corpus_df

In [40]:
sen = df['reviews_title_pod'].tolist()
sen_vec = Countvec(words=sen, Ngram=(1,1), token_pattern=r'[a-zA-Z]{3,}')
sen_vec.sum().sort_values(ascending=False)[:50]

(50000, 45539)




podcast     41705
listen      21148
love        17756
like        14220
great       13912
episode     13436
just        11623
good         8906
make         8830
time         8276
really       7798
story        7398
talk         6899
people       5900
host         5790
guy          5635
say          5579
don          5369
know         5288
way          5131
work         4868
want         4813
podcasts     4802
need         4614
best         4584
new          4491
life         4444
guest        4428
thing        4405
think        4367
crime        4228
feel         4138
look         3950
content      3935
come         3925
enjoy        3816
try          3652
use          3593
start        3578
hear         3516
year         3319
bad          3318
topic        3193
real         3104
lot          3090
sound        2992
fun          2981
favorite     2956
thank        2896
learn        2854
dtype: int64

## Regex Cleaning
Use Regex to remove high-frequency words based on Count Vectorization result 

In [41]:
def word_replace(line):
    line = re.sub(r'\b(pod(s?|casts?)|listen|love|great|episodes?|just|good|make|time|really)\b', '', line)
    line = re.sub(r'\b(story|talk|people|host|guy|say|don|know|way|work|want|need|best|new|life)\b', '', line)
    line = re.sub(r'\b(guest|thing|think|feel|look|come|use|year|minutes?|lot|thank|favorite)\b', '', line)
    return line

In [42]:
df['reviews_title_pod'] = df['reviews_title_pod'].apply(lambda x: word_replace(x))

## Get Feature Space and Target Labels

In [43]:
labels = df["category"]
docs = df["reviews_title_pod"]

## Perform Label Categorical Encoding of Categories

In [44]:
encoder = LabelEncoder()
labels = to_categorical(encoder.fit_transform(labels))

## Remove Stopwords Using SpaCy

In [45]:
nlp = spacy.load('en_core_web_md')
stopwords_removed_docs = list(
    map(lambda doc: " ".join([token.text for token in nlp(doc) if not token.is_stop]), docs))

## Tokenize the Text

In [46]:
tokenizer = Tokenizer(num_words=10000, oov_token="UNKNOWN_TOKEN")
tokenizer.fit_on_texts(stopwords_removed_docs)

## Integer Encode Tokens

In [47]:
def integer_encode_documents(docs, tokenizer):
    return tokenizer.texts_to_sequences(docs)

## Get Max Length Per Token

In [48]:
def get_max_token_length_per_doc(docs: List[List[str]])-> int:
    return max(list(map(lambda x: len(x.split()), docs)))

# get the max length in terms of token length
max_length = get_max_token_length_per_doc(docs)

In [49]:
MAX_SEQUENCE_LENGTH = 500
# integer encode the documents
encoded_docs = integer_encode_documents(stopwords_removed_docs, tokenizer)
# this is a list of lists, the numbers represent the index position of that word.
# for instance, 33 means the 33rd word in the vocabulary
padded_docs = pad_sequences(encoded_docs, maxlen=MAX_SEQUENCE_LENGTH, padding='post')

In [50]:
padded_docs.shape

(50000, 500)

## Split into Train/Test Set

In [51]:
X_train, X_test, y_train, y_test = train_test_split(padded_docs, labels, test_size=0.2)

## Keras RNN/LSTM Architecture

In [52]:
# Toolkit

VOCAB_SIZE = int(len(tokenizer.word_index) * 1.1)

## Load in Glove Vectors

In [53]:
def load_glove_vectors():
    embeddings_index = {}
    with open('../datasets/glove.6B.100d.txt') as f:
        for line in f:
            values = line.split()
            word = values[0]
            coefs = asarray(values[1:], dtype='float32')
            embeddings_index[word] = coefs
    print('Loaded %s word vectors.' % len(embeddings_index))
    return embeddings_index


embeddings_index = load_glove_vectors()

Loaded 400000 word vectors.


## Load in the Embeddings

In [54]:
embedding_matrix = zeros((VOCAB_SIZE, 100))
for word, i in tokenizer.word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None: # check that it is an actual word that we have embeddings for
        embedding_matrix[i] = embedding_vector

## Define Model

In [55]:
def make_classification_rnn_model(plot=False):
    model = Sequential() # keras model
    model.add(Embedding(VOCAB_SIZE, 100, weights=[embedding_matrix], input_length=MAX_SEQUENCE_LENGTH, trainable=False))
    model.add(Masking(mask_value=0.0)) # masking layer, masks any words that don't have an embedding as 0s.
    model.add(SimpleRNN(units=64, input_shape=(1, MAX_SEQUENCE_LENGTH)))
    model.add(Dense(16))
    model.add(Dense(6, activation='softmax')) # we changed the number of categories from 19 to 6
    
    # Compile the model
    model.compile(
    optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    # summarize the model
    model.summary()
    
    if plot:
        plot_model(model, to_file='model.png', show_shapes=True)
    return model

def make_lstm_classification_model(plot=False):
    model = Sequential()
    model.add(Embedding(VOCAB_SIZE, 100, weights=[embedding_matrix], input_length=MAX_SEQUENCE_LENGTH, trainable=False))
    model.add(Masking(mask_value=0.0)) # masking layer, masks any words that don't have an embedding as 0s.
    model.add(LSTM(units=32, input_shape=(1, MAX_SEQUENCE_LENGTH)))
    model.add(Dense(16))
    model.add(Dense(6, activation='softmax')) # we changed the number of categories from 19 to 6
    
    # Compile the model
    model.compile(
    optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    # summarize the model
    model.summary()
    
    if plot:
        plot_model(model, to_file='model.png', show_shapes=True)
    return model

## Compile Model - RNN

In [56]:
# Here we try RNN model

rnn = make_classification_rnn_model()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, 500, 100)          5695400   
                                                                 
 masking_2 (Masking)         (None, 500, 100)          0         
                                                                 
 simple_rnn_1 (SimpleRNN)    (None, 64)                10560     
                                                                 
 dense_4 (Dense)             (None, 16)                1040      
                                                                 
 dense_5 (Dense)             (None, 6)                 102       
                                                                 
Total params: 5,707,102
Trainable params: 11,702
Non-trainable params: 5,695,400
_________________________________________________________________


## Fit the Model - RNN

In [57]:
rnn.fit(X_train, y_train, epochs=20, verbose=1)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x7fa1601f7810>

## Evaluate the Model - RNN

In [58]:
loss, accuracy = rnn.evaluate(X_test, y_test, verbose=1)
print('Accuracy: %f' % (accuracy*100))

Accuracy: 69.790000


In [59]:
predictions_rnn = rnn.predict(X_test)



In [60]:
# calculate roc-auc score

from sklearn.metrics import roc_auc_score
roc_auc_score(y_test, predictions_rnn, multi_class='ovo')

0.9078637554790334

In [62]:
pred_rnn = encoder.inverse_transform(predictions_rnn.argmax(axis=1))
true_rnn = encoder.inverse_transform(y_test.argmax(axis=1))

In [63]:
# get the confusion matrix

from sklearn.metrics import confusion_matrix
labels = ['society', 'entertainment', 'comedy', 'news', 'business', 'others']
confusion_matrix_rnn = confusion_matrix(true_rnn, pred_rnn, labels=labels)
confusion_matrix_rnn

array([[2154,  384,  149,   91,   46,  126],
       [ 273, 1823,  188,   98,   41,   53],
       [ 152,  273, 1088,   51,    8,   34],
       [  60,   95,   45,  790,   14,   14],
       [ 137,  110,   49,   43,  381,   34],
       [ 206,  137,   54,   37,   19,  743]])

In [64]:
cmtx = pd.DataFrame(
    confusion_matrix_rnn, 
    index=['society', 'entertainment', 'comedy', 'news', 'business', 'others'], 
    columns=['society', 'entertainment', 'comedy', 'news', 'business', 'others']
)
cmtx

Unnamed: 0,society,entertainment,comedy,news,business,others
society,2154,384,149,91,46,126
entertainment,273,1823,188,98,41,53
comedy,152,273,1088,51,8,34
news,60,95,45,790,14,14
business,137,110,49,43,381,34
others,206,137,54,37,19,743


## Compile Model - LSTM

In [65]:
lstm = make_lstm_classification_model()

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_3 (Embedding)     (None, 500, 100)          5695400   
                                                                 
 masking_3 (Masking)         (None, 500, 100)          0         
                                                                 
 lstm_1 (LSTM)               (None, 32)                17024     
                                                                 
 dense_6 (Dense)             (None, 16)                528       
                                                                 
 dense_7 (Dense)             (None, 6)                 102       
                                                                 
Total params: 5,713,054
Trainable params: 17,654
Non-trainable params: 5,695,400
_________________________________________________________________


## Fit the Model - LSTM

In [66]:
lstm.fit(X_train, y_train, epochs=20, verbose=1)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x7fa11fa19190>

## Evaluate the Model - LSTM

In [67]:
loss, accuracy = lstm.evaluate(X_test, y_test, verbose=1)
print('Accuracy: %f' % (accuracy*100))

Accuracy: 74.629998


In [68]:
predictions_lstm = lstm.predict(X_test)



In [69]:
# calculate roc-auc score

from sklearn.metrics import roc_auc_score
roc_auc_score(y_test, predictions_lstm, multi_class='ovo')

0.9318895742414628

In [70]:
pred_lstm = encoder.inverse_transform(predictions_lstm.argmax(axis=1))
true_lstm = encoder.inverse_transform(y_test.argmax(axis=1))

In [71]:
# get the confusion matrix

from sklearn.metrics import confusion_matrix
labels = ['society', 'entertainment', 'comedy', 'news', 'business', 'others']
confusion_matrix_lstm = confusion_matrix(true_lstm, pred_lstm, labels=labels)
confusion_matrix

<function sklearn.metrics._classification.confusion_matrix(y_true, y_pred, *, labels=None, sample_weight=None, normalize=None)>

In [72]:
cmtx = pd.DataFrame(
    confusion_matrix_lstm, 
    index=['society', 'entertainment', 'comedy', 'news', 'business', 'others'], 
    columns=['society', 'entertainment', 'comedy', 'news', 'business', 'others']
)
cmtx

Unnamed: 0,society,entertainment,comedy,news,business,others
society,2333,234,154,47,60,122
entertainment,295,1799,216,39,54,73
comedy,188,156,1206,10,11,35
news,76,72,28,801,19,22
business,125,54,36,12,490,37
others,175,83,55,17,32,834


In [73]:
# Time consumed to scored 50,000 reviews 
lstm.predict(padded_docs, verbose=1)



array([[9.5982669e-04, 1.0512244e-02, 9.5710284e-01, 3.1097555e-03,
        1.5565929e-02, 1.2749541e-02],
       [3.9443714e-04, 3.1592820e-03, 8.6127573e-01, 1.1532735e-01,
        5.3606858e-03, 1.4482460e-02],
       [6.2588160e-03, 8.1369625e-03, 2.2225364e-01, 7.3155355e-01,
        6.3836025e-03, 2.5413450e-02],
       ...,
       [4.9092807e-02, 7.1147867e-02, 5.8065748e-01, 5.9593464e-03,
        5.0274100e-02, 2.4286832e-01],
       [1.0365872e-02, 1.5421054e-01, 4.9356303e-01, 6.8957236e-04,
        4.8770517e-02, 2.9240042e-01],
       [4.0157106e-02, 4.7633151e-04, 6.7028826e-01, 2.4843028e-02,
        2.6207998e-01, 2.1553049e-03]], dtype=float32)

In [76]:
# get the recall for each categories
for i in ['society', 'entertainment', 'comedy', 'news', 'business', 'others']:
    print(f'Recall of {i}: {cmtx.loc[i, i]/cmtx.loc[i, :].sum()}')

Recall of society: 0.7908474576271186
Recall of entertainment: 0.7265751211631664
Recall of comedy: 0.75093399750934
Recall of news: 0.7868369351669942
Recall of business: 0.649867374005305
Recall of others: 0.697324414715719


In [77]:
# get the precision for each categories
for i in ['society', 'entertainment', 'comedy', 'news', 'business', 'others']:
    print(f'Precision of {i}: {cmtx.loc[i, i]/cmtx.loc[:, i].sum()}')

Precision of society: 0.7308897243107769
Precision of entertainment: 0.750208507089241
Precision of comedy: 0.7115044247787611
Precision of news: 0.8650107991360692
Precision of business: 0.7357357357357357
Precision of others: 0.742653606411398
