In [1]:
# NOTE: we no longer need to bucket the rating into 0.5 increments

In [21]:
from collections import Counter, defaultdict
import tensorflow_hub as hub
import pandas as pd
import json
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
import nltk
import numpy as np
import string
import seaborn as sns


data_df = pd.read_csv('processed.csv',header = 0)

# save for later NLP processing
cmu_summaries = data_df[['title_cmu','summary_cmu']]
cmu_summaries = cmu_summaries.drop_duplicates()

In [22]:
# NLP SECTION
cmu_summaries['cleaned'] = cmu_summaries['summary_cmu'].apply(lambda x: nltk.tokenize.word_tokenize(x))

def remove_punc(text):
    text = text.translate(str.maketrans('', '', string.punctuation))
    return text

cmu_summaries['cleaned'] = cmu_summaries.cleaned.apply(lambda summary: [remove_punc(w) for w in summary])

from nltk.corpus import stopwords
nltk.download('stopwords')
stop_words = stopwords.words('english')

cmu_summaries['cleaned'] = cmu_summaries.cleaned.apply(lambda tweet: [w for w in tweet if w.lower() not in stop_words])
cmu_summaries['cleaned'] = cmu_summaries.cleaned.apply(lambda tweet: [w.lower() for w in tweet])

from nltk.stem import WordNetLemmatizer 
nltk.download('wordnet')
lemmatizer = WordNetLemmatizer() 

from nltk.stem import PorterStemmer 
ps = PorterStemmer()

nltk.download('omw-1.4')

cmu_summaries['cleaned'] = cmu_summaries.cleaned.apply(lambda tweet: [lemmatizer.lemmatize(w) for w in tweet ])
cmu_summaries['cleaned'] = cmu_summaries.cleaned.apply(lambda tweet: [ps.stem(w) for w in tweet ])
cmu_summaries['cleaned'] = cmu_summaries.cleaned.apply(lambda tweet: ' '.join(tweet))

cmu_summaries.head()

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\kjaga\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\kjaga\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\kjaga\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


Unnamed: 0,title_cmu,summary_cmu,cleaned
0,Animal Farm,"Old Major, the old boar on the Manor Farm, ca...",old major old boar manor farm call anim farm...
7,A Clockwork Orange,"Alex, a teenager living in near-future Englan...",alex teenag live nearfutur england lead gang...
12,The Plague,The text of The Plague is divided into five p...,text plagu divid five part town oran thousan...
16,An Enquiry Concerning Human Understanding,The argument of the Enquiry proceeds by a ser...,argument enquiri proce seri increment step se...
17,A Fire Upon the Deep,The novel posits that space around the Milky ...,novel posit space around milki way divid conce...


In [23]:
# ENCODING
module_url = "https://tfhub.dev/google/universal-sentence-encoder/4" #@param ["https://tfhub.dev/google/universal-sentence-encoder/4", "https://tfhub.dev/google/universal-sentence-encoder-large/5"]
model = hub.load(module_url)
df_embed = cmu_summaries[["title_cmu", "cleaned"]].drop_duplicates()
df_embed["encoding"] = df_embed.cleaned.apply(lambda x : model([x]).numpy())
df_embed["encoding"] = df_embed.encoding.apply(lambda x : x.reshape(-1))
df_embed = df_embed.drop(columns=["cleaned"])
df_embed

Unnamed: 0,title_cmu,encoding
0,Animal Farm,"[0.031182911, -0.045368824, -0.0033902207, -0...."
7,A Clockwork Orange,"[0.045271024, -0.045923527, -0.045832865, -0.0..."
12,The Plague,"[0.04395455, -0.046985425, -0.046915844, -0.04..."
16,An Enquiry Concerning Human Understanding,"[0.046199907, -0.046197694, -0.04619985, -0.04..."
17,A Fire Upon the Deep,"[0.048338868, -0.048177764, -0.048294846, -0.0..."
...,...,...
46261,Suspicion,"[-0.03547619, -0.057844806, 0.023499582, 0.055..."
46266,Who Killed Zebedee?,"[-0.04873011, -0.05490303, 0.021128226, 0.0320..."
46268,Charon's Claw,"[0.045435745, -0.045492705, -0.044836313, -0.0..."
46277,The Last Girl,"[-0.041102313, -0.05113369, 0.027463512, 0.001..."


In [52]:
df_embed = df_embed.sort_values("title_cmu")
df_embed

Unnamed: 0,title_cmu,encoding
7071,'Art',"[-0.06923865, -0.06796477, 0.013774357, 0.0477..."
12947,'Tis Pity She's a Whore,"[0.04556423, -0.046983995, -0.04700185, -0.046..."
6366,'Tis: A Memoir,"[-0.05231972, -0.040128753, 0.051210675, 0.037..."
43968,...And Call Me Conrad,"[0.030534877, -0.052636765, -0.042618882, -0.0..."
18199,...And Now Miguel,"[-0.05811629, -0.050462883, -0.0016676751, 0.0..."
...,...,...
45120,Zulu Hart,"[-0.02220043, -0.064509906, 0.019458866, -0.03..."
19504,Zyword,"[-0.031300556, -0.056047563, 0.015485473, 0.04..."
40772,"\\""U\\"" Is for Undertow","[0.04656952, -0.04657597, -0.04643509, -0.0465..."
44492,"\\""V\\"" Is for Vengeance","[0.04552539, -0.046659097, -0.04664292, -0.046..."


In [53]:


genre_count = Counter()
book_to_genre = defaultdict(set)

publisher_count = Counter()
book_to_publishers = defaultdict(list)

for row in data_df.to_dict('records'):
    entry = row['genre']
    if not isinstance(entry, str):
        continue
    temp_one = entry.replace("'s", "").replace("'", '"').replace('/','')
    temp = json.loads(temp_one)
    book_to_genre[row['title_cmu']].update(list(temp.values()))
    for val in temp.values():
        genre_count[val]+=1
    
    book_to_publishers[row['title_cmu']].append(row['publisher'])
    entry = row['publisher']
    publisher_count[entry]+=1

genre_freq = list(genre_count.items())
genre_freq.sort(key = lambda x: x[1], reverse=True)
top_genres = [pair[0] for pair in genre_freq[:15]]

publisher_freq = list(publisher_count.items())
publisher_freq.sort(key = lambda x: x[1], reverse=True)
top_publishers = [pair[0] for pair in publisher_freq[:15]]


# aim is to one-hot the publisher and genre
# and aggregate the rating and num_rating
# train a linear regression model on this for starters, then use the encoded goog / cmu summaries to improve


In [54]:
# data_df.columns
new_df = data_df[["title_cmu", "rating", "num_ratings"]]
new_df["sum_ratings"] = new_df["rating"] * new_df["num_ratings"]
new_df = new_df.drop(["rating"], axis=1)
new_df = new_df.groupby('title_cmu').agg({'sum_ratings':'sum', 'num_ratings':'sum'})
new_df = new_df.reset_index()
new_df = new_df.rename(columns={"title_cmu": "title"})
new_df["rating"] = new_df["sum_ratings"] / new_df["num_ratings"]
new_df = new_df.drop(["sum_ratings"], axis=1)
new_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df["sum_ratings"] = new_df["rating"] * new_df["num_ratings"]


Unnamed: 0,title,num_ratings,rating
0,'Art',74,3.648649
1,'Tis Pity She's a Whore,2,2.500000
2,'Tis: A Memoir,471,3.299363
3,...And Call Me Conrad,18,3.388889
4,...And Now Miguel,24,3.500000
...,...,...,...
11307,Zulu Hart,9,2.555556
11308,Zyword,2,5.000000
11309,"\\""U\\"" Is for Undertow",366,3.912568
11310,"\\""V\\"" Is for Vengeance",70,3.542857


In [56]:

for genre in top_genres:
    new_df[genre] = 0

new_df['other_genre'] = 0

for pub in top_publishers:
    new_df[pub] = 0

new_df['other_publisher'] = 0


i = -1
for row in new_df.to_dict('records'):
    i+=1
    title = row['title']
    for genre in book_to_genre[title]:
        if genre in top_genres:
            new_df.at[i, genre] = 1
        else:
            new_df.at[i, 'other_genre'] = 1
    
    for pub in book_to_publishers[title]:
        if pub in top_publishers:
            new_df.at[i,pub] = 1
        else:
            new_df.at[i,'other_publisher'] = 1
            
new_df

Unnamed: 0,title,num_ratings,rating,Fiction,Speculative fiction,Science Fiction,Fantasy,Novel,Children literature,Mystery,...,Open Road Media,Vintage,HarperCollins,Houghton Mifflin Harcourt,Hachette UK,Tor Books,Scholastic Inc.,HarperCollins UK,Bantam,other_publisher
0,'Art',74,3.648649,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,'Tis Pity She's a Whore,2,2.500000,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,'Tis: A Memoir,471,3.299363,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,1
3,...And Call Me Conrad,18,3.388889,0,0,1,0,1,0,0,...,0,0,1,0,0,0,0,0,0,1
4,...And Now Miguel,24,3.500000,1,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11307,Zulu Hart,9,2.555556,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,1
11308,Zyword,2,5.000000,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
11309,"\\""U\\"" Is for Undertow",366,3.912568,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,1
11310,"\\""V\\"" Is for Vengeance",70,3.542857,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,1


In [35]:
# COMBINE WITH DATASET
from sklearn import decomposition

prev_X = new_df.drop(columns=['title', 'rating']).to_numpy()
y = new_df['rating'].to_numpy()

encoding_X = np.stack(df_embed["encoding"])

# compress dimensions with pca
pca = decomposition.PCA(n_components=20)
pca.fit(encoding_X)
encoding_X = pca.transform(encoding_X)
print(prev_X.shape)
print(encoding_X.shape)
X = np.concatenate((prev_X, encoding_X), axis=1)
print(X.shape)
print(y.shape)

(11312, 33)
(11312, 20)
(11312, 53)
(11312,)


In [58]:


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

model = tf.keras.models.Sequential([
#   tf.keras.layers.Dense(83, activation='relu'),
#   tf.keras.layers.Dropout(.4),
  tf.keras.layers.Dense(53, activation='relu'),
  tf.keras.layers.Dropout(.2),
  tf.keras.layers.Dense(50, activation='relu'),
  tf.keras.layers.Dropout(.2),
  tf.keras.layers.Dense(40, activation='relu'),
  tf.keras.layers.Dropout(.2),
  tf.keras.layers.Dense(30, activation='relu'),
  tf.keras.layers.Dropout(.2),
  tf.keras.layers.Dense(20, activation='relu'),
  tf.keras.layers.Dropout(.2),
  tf.keras.layers.Dense(10, activation='relu'),
  tf.keras.layers.Dense(5, activation='relu'),
  tf.keras.layers.Dense(1, activation='relu')
])


mae = tf.keras.losses.MeanAbsoluteError()

model.compile(optimizer='adam',
              loss=mae,
              metrics=['mae', 'mean_absolute_percentage_error'])

model.fit(X_train, y_train, epochs=50, verbose=2)

model.evaluate(X_test,  y_test)

Epoch 1/50
283/283 - 2s - loss: 3.5778 - mae: 3.5778 - mean_absolute_percentage_error: 95.1573 - 2s/epoch - 7ms/step
Epoch 2/50
283/283 - 0s - loss: 1.7220 - mae: 1.7220 - mean_absolute_percentage_error: 46.4531 - 331ms/epoch - 1ms/step
Epoch 3/50
283/283 - 0s - loss: 0.7640 - mae: 0.7640 - mean_absolute_percentage_error: 21.2963 - 338ms/epoch - 1ms/step
Epoch 4/50
283/283 - 0s - loss: 0.4936 - mae: 0.4936 - mean_absolute_percentage_error: 14.3169 - 339ms/epoch - 1ms/step
Epoch 5/50
283/283 - 0s - loss: 0.4358 - mae: 0.4358 - mean_absolute_percentage_error: 12.8317 - 339ms/epoch - 1ms/step
Epoch 6/50
283/283 - 0s - loss: 0.4129 - mae: 0.4129 - mean_absolute_percentage_error: 12.2515 - 334ms/epoch - 1ms/step
Epoch 7/50
283/283 - 0s - loss: 0.4011 - mae: 0.4011 - mean_absolute_percentage_error: 11.9402 - 334ms/epoch - 1ms/step
Epoch 8/50
283/283 - 0s - loss: 0.3919 - mae: 0.3919 - mean_absolute_percentage_error: 11.7302 - 348ms/epoch - 1ms/step
Epoch 9/50
283/283 - 0s - loss: 0.3874 - ma

[0.3761027753353119, 0.3761027753353119, 10.85794448852539]

TODO:
* make average rating weighted by number of ratings
* remove the bucketing by 0.5
* throw out the outliers
* play with model structure / architecture, pca, etc.
* plotting genre / publisher frequencies, some books and their predicted vs actual ratings

In [59]:
model(X_test[0:20])

<tf.Tensor: shape=(20, 1), dtype=float32, numpy=
array([[3.73329  ],
       [3.7220101],
       [3.858255 ],
       [4.718364 ],
       [3.8125608],
       [3.7220101],
       [3.7220101],
       [3.7220101],
       [3.7798734],
       [3.8232741],
       [3.9594948],
       [3.7220101],
       [4.0716453],
       [3.7220101],
       [3.7009168],
       [3.6795857],
       [3.7220101],
       [3.7009313],
       [3.7989728],
       [3.7220101]], dtype=float32)>