<h3>Load and preprocess data</h3>

In [1]:
import pandas as pd
import seaborn as sns
from src.recsys_utils import anaylze_item, build_profile, train_user_model, preprocess_items
from string import punctuation
import matplotlib.pyplot as plt
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# Read the raw book metadata CSV and display the resulting frame.
books_csv = 'data/books_data.csv'
book_df = pd.read_csv(books_csv)
book_df

Unnamed: 0,Title,description,authors,image,previewLink,publisher,publishedDate,infoLink,categories,ratingsCount
0,Its Only Art If Its Well Hung!,,['Julie Strain'],http://books.google.com/books/content?id=DykPA...,http://books.google.nl/books?id=DykPAAAACAAJ&d...,,1996,http://books.google.nl/books?id=DykPAAAACAAJ&d...,['Comics & Graphic Novels'],
1,Dr. Seuss: American Icon,Philip Nel takes a fascinating look into the k...,['Philip Nel'],http://books.google.com/books/content?id=IjvHQ...,http://books.google.nl/books?id=IjvHQsCn_pgC&p...,A&C Black,2005-01-01,http://books.google.nl/books?id=IjvHQsCn_pgC&d...,['Biography & Autobiography'],
2,Wonderful Worship in Smaller Churches,This resource includes twelve principles in un...,['David R. Ray'],http://books.google.com/books/content?id=2tsDA...,http://books.google.nl/books?id=2tsDAAAACAAJ&d...,,2000,http://books.google.nl/books?id=2tsDAAAACAAJ&d...,['Religion'],
3,Whispers of the Wicked Saints,Julia Thomas finds her life spinning out of co...,['Veronica Haddon'],http://books.google.com/books/content?id=aRSIg...,http://books.google.nl/books?id=aRSIgJlq6JwC&d...,iUniverse,2005-02,http://books.google.nl/books?id=aRSIgJlq6JwC&d...,['Fiction'],
4,"Nation Dance: Religion, Identity and Cultural ...",,['Edward Long'],,http://books.google.nl/books?id=399SPgAACAAJ&d...,,2003-03-01,http://books.google.nl/books?id=399SPgAACAAJ&d...,,
...,...,...,...,...,...,...,...,...,...,...
212399,The Orphan Of Ellis Island (Time Travel Advent...,"During a school trip to Ellis Island, Dominick...",['Elvira Woodruff'],http://books.google.com/books/content?id=J7M-N...,http://books.google.com/books?id=J7M-NwAACAAJ&...,Scholastic Paperbacks,2000-06-01,http://books.google.com/books?id=J7M-NwAACAAJ&...,['Juvenile Fiction'],2.0
212400,Red Boots for Christmas,Everyone in the village of Friedensdorf is hap...,,http://books.google.com/books/content?id=3n8k6...,http://books.google.com/books?id=3n8k6wl4BbYC&...,,1995,http://books.google.com/books?id=3n8k6wl4BbYC&...,['Juvenile Fiction'],
212401,Mamaw,"Give your Mamaw a useful, beautiful and though...",['Wild Wild Cabbage'],,http://books.google.com/books?id=zytVswEACAAJ&...,,2018-01-17,http://books.google.com/books?id=zytVswEACAAJ&...,,
212402,The Autograph Man,Alex-Li Tandem sells autographs. His business ...,['Zadie Smith'],http://books.google.com/books/content?id=JM6YV...,http://books.google.com/books?id=JM6YVPx_clMC&...,Vintage,2003-08-12,https://play.google.com/store/books/details?id...,['Fiction'],19.0


In [3]:
# Discard the link, publisher and rating-count columns; only Title, description,
# authors and categories remain afterwards.
unused_book_columns = ['image', 'previewLink', 'ratingsCount', 'infoLink', 'publisher', 'publishedDate']
book_df = book_df.drop(columns=unused_book_columns)

In [4]:
# Remove every row that has a missing value in any remaining column,
# then report the resulting (rows, columns).
book_df = book_df.dropna(axis=0, how='any')
book_df.shape

(136138, 4)

In [5]:
# Inspect the frame after dropping the unused columns and the incomplete rows.
book_df

Unnamed: 0,Title,description,authors,categories
1,Dr. Seuss: American Icon,Philip Nel takes a fascinating look into the k...,['Philip Nel'],['Biography & Autobiography']
2,Wonderful Worship in Smaller Churches,This resource includes twelve principles in un...,['David R. Ray'],['Religion']
3,Whispers of the Wicked Saints,Julia Thomas finds her life spinning out of co...,['Veronica Haddon'],['Fiction']
5,The Church of Christ: A Biblical Ecclesiology ...,In The Church of Christ: A Biblical Ecclesiolo...,['Everett Ferguson'],['Religion']
8,Saint Hyacinth of Poland,The story for children 10 and up of St. Hyacin...,['Mary Fabyan Windeatt'],['Biography & Autobiography']
...,...,...,...,...
212394,Final things,Grace's father believes in science and builds ...,['Jenny Offill'],['Fiction']
212397,The Magic of the Soul: Applying Spiritual Powe...,"""The Magic of the Soul, Applying Spiritual Pow...",['Patrick J. Harbula'],"['Body, Mind & Spirit']"
212398,Autodesk Inventor 10 Essentials Plus,Autodesk Inventor 2017 Essentials Plus provide...,"['Daniel Banach', 'Travis Jones']",['Computers']
212399,The Orphan Of Ellis Island (Time Travel Advent...,"During a school trip to Ellis Island, Dominick...",['Elvira Woodruff'],['Juvenile Fiction']


In [6]:
# Translation table that deletes every character listed in string.punctuation.
_strip_punct = str.maketrans('', '', punctuation)

# authors/categories arrive as stringified lists (e.g. "['Philip Nel']"); removing
# punctuation leaves plain space-separated words.
for col in ('authors', 'categories'):
    book_df[col] = book_df[col].apply(lambda raw: raw.translate(_strip_punct))

# Join the four text fields with single spaces into one document per book.
book_df['combined_text'] = book_df['Title'].str.cat(
    [book_df['description'], book_df['authors'], book_df['categories']],
    sep=' ',
)
book_df

Unnamed: 0,Title,description,authors,categories,combined_text
1,Dr. Seuss: American Icon,Philip Nel takes a fascinating look into the k...,Philip Nel,Biography Autobiography,Dr. Seuss: American Icon Philip Nel takes a fa...
2,Wonderful Worship in Smaller Churches,This resource includes twelve principles in un...,David R Ray,Religion,Wonderful Worship in Smaller Churches This res...
3,Whispers of the Wicked Saints,Julia Thomas finds her life spinning out of co...,Veronica Haddon,Fiction,Whispers of the Wicked Saints Julia Thomas fin...
5,The Church of Christ: A Biblical Ecclesiology ...,In The Church of Christ: A Biblical Ecclesiolo...,Everett Ferguson,Religion,The Church of Christ: A Biblical Ecclesiology ...
8,Saint Hyacinth of Poland,The story for children 10 and up of St. Hyacin...,Mary Fabyan Windeatt,Biography Autobiography,Saint Hyacinth of Poland The story for childre...
...,...,...,...,...,...
212394,Final things,Grace's father believes in science and builds ...,Jenny Offill,Fiction,Final things Grace's father believes in scienc...
212397,The Magic of the Soul: Applying Spiritual Powe...,"""The Magic of the Soul, Applying Spiritual Pow...",Patrick J Harbula,Body Mind Spirit,The Magic of the Soul: Applying Spiritual Powe...
212398,Autodesk Inventor 10 Essentials Plus,Autodesk Inventor 2017 Essentials Plus provide...,Daniel Banach Travis Jones,Computers,Autodesk Inventor 10 Essentials Plus Autodesk ...
212399,The Orphan Of Ellis Island (Time Travel Advent...,"During a school trip to Ellis Island, Dominick...",Elvira Woodruff,Juvenile Fiction,The Orphan Of Ellis Island (Time Travel Advent...


In [7]:
# NOTE(review): both steps run over every row of book_df (136138 rows after dropna)
# and the saved run of this cell ended in a KeyboardInterrupt, so the cells that
# depend on 'vector_text' have no recorded output.

# Replace the raw combined string with the output of preprocess_items
# (the saved output suggests a list of lower-cased tokens - confirm in src.recsys_utils).
book_df['combined_text'] = book_df['combined_text'].apply(preprocess_items)


def _vectorize_item(text):
    """Run anaylze_item on one preprocessed item with the fixed settings (30, 256).

    The meaning of 30 and 256 is defined by anaylze_item in src.recsys_utils and is not
    visible here. The result is presumably array-like, since later cells call
    .shape and .flatten() on it.
    """
    return anaylze_item(text, 30, 256)


book_df['vector_text'] = book_df['combined_text'].apply(_vectorize_item)



['dr', 'seuss', 'american', 'icon', 'philip', 'nel', 'takes', 'fascinating', 'look', 'key', 'aspects', 'seusss', 'career', 'poetry', 'politics', 'art', 'marketing', 'place', 'popular', 'imagination', 'nel', 'argues', 'convincingly', 'dr', 'seuss', 'one', 'influential', 'poets', 'america', 'nonsense', 'verse', 'like', 'lewis', 'carroll', 'edward', 'lear', 'changed', 'language', 'giving', 'us', 'new', 'words', 'like', 'nerd', 'seusss', 'famously', 'loopy', 'artistic', 'style', 'nel', 'terms', 'energetic', 'cartoon', 'surrealism', 'equally', 'important', 'inspiring', 'artists', 'like', 'filmmaker', 'tim', 'burton', 'illustrator', 'lane', 'smith', 'back', 'cover', 'philip', 'nel', 'biography', 'autobiography']
['wonderful', 'worship', 'smaller', 'churches', 'resource', 'includes', 'twelve', 'principles', 'understanding', 'small', 'church', 'worship', 'fifteen', 'practices', 'planning', 'worship', 'fewer', '100', 'people', 'suggestions', 'congregational', 'study', 'david', 'r', 'ray', 'reli

KeyboardInterrupt: 

In [None]:
# Inspect book_df once the preprocessing cell has added the 'vector_text' column.
book_df

In [None]:
# Read the per-review ratings CSV.
ratings_csv = 'data/Books_rating.csv'
ratings_df = pd.read_csv(ratings_csv)

In [None]:
# List the available rating columns before deciding which ones to drop.
ratings_df.columns

In [None]:
# Drop the price and review-metadata columns that the following cells do not read.
unused_rating_columns = ['Price', 'review/summary', 'review/time', 'review/helpfulness']
ratings_df = ratings_df.drop(columns=unused_rating_columns)

In [None]:
# Distribution of review scores across all ratings. Keep the returned Axes so the
# figure gets a title and readable labels, and end with ';' so the cell does not
# echo the object repr.
ax = sns.countplot(data=ratings_df, x='review/score')
ax.set(title='Number of reviews per score', xlabel='Review score', ylabel='Number of reviews');

In [None]:
# Remove every rating row that has a missing value in any remaining column,
# then report the resulting (rows, columns).
ratings_df = ratings_df.dropna(axis=0, how='any')
ratings_df.shape

In [None]:
# Inspect the ratings after dropping the unused columns and the incomplete rows.
ratings_df

In [None]:
# Number of ratings per user; value_counts sorts descending, so the most active users come first.
ratings_df['User_id'].value_counts()

<p>Is there enough data for a machine learning model?</p>

In [None]:
# Score distribution for one specific reviewer.
is_target_user = ratings_df['User_id'] == 'A1D2C0WDCSHUWZ'
ratings_df.loc[is_target_user, 'review/score'].value_counts()

In [None]:
# Attach the book columns to each of this reviewer's ratings by matching on Title.
# The default inner join drops ratings whose title is not present in book_df.
target_user_ratings = ratings_df[ratings_df['User_id'] == "A1D2C0WDCSHUWZ"]
user_df = target_user_ratings.merge(book_df, on='Title')

In [None]:
# Shape of the item vector stored in the row labelled 1.
user_df.loc[1, 'vector_text'].shape

In [None]:
# Summarise how many rows have each vector shape. Mapping every vector to its shape
# tuple and counting gives one compact table, instead of printing a line per row and
# then displaying a Series of None values (the return value of print).
user_df['vector_text'].map(lambda vec: vec.shape).value_counts()

In [None]:
# Build a single profile for this reviewer from all of their item vectors.
item_vectors = user_df['vector_text'].tolist()
user_vector = build_profile(item_vectors)

# Store the same flattened profile on every row, and flatten each item vector,
# so both can later be expanded into one column per component.
flat_profile = user_vector.flatten()
user_df['user_vector'] = [flat_profile for _ in range(len(user_df))]
user_df['vector_text'] = user_df['vector_text'].apply(lambda item_vec: item_vec.flatten())

In [None]:
# Inspect the merged per-user frame after adding 'user_vector' and flattening 'vector_text'.
user_df

In [None]:
# Expand each flattened item vector into one column per component. If the vectors
# differ in length, the DataFrame constructor pads the shorter rows with NaN, which
# is replaced by 0 here. Reusing user_df's index keeps the rows aligned with
# user_df['review/score'] in the concat below, even if user_df is ever re-indexed.
vector_text_df = (
    pd.DataFrame(data=user_df["vector_text"].tolist(), index=user_df.index)
    .fillna(value=0)
    .add_prefix("item_")
)

# Same expansion for the user profile (identical on every row, since each row was
# given the same flattened profile).
user_vector_df = pd.DataFrame(
    data=user_df["user_vector"].tolist(), index=user_df.index
).add_prefix("user_")

# Both expanded frames would otherwise be labelled 0..k-1, giving model_df duplicate
# column names; the "item_"/"user_" prefixes keep every feature column unique.
model_df = pd.concat((vector_text_df, user_vector_df, user_df['review/score']), axis=1)

In [None]:
# Fit the per-user model in "regression" mode: every column except the score is a
# feature, and the raw review score is the target.
# NOTE(review): the meaning of the final True flag is defined in src.recsys_utils - confirm.
regression_features = model_df.drop(columns='review/score')
regression_target = model_df["review/score"]
predictions, true_scores = train_user_model(regression_features, regression_target, "regression", True)

<p>It doesn't seem to work that well :( I wonder what's wrong. Not enough data?</p>

In [None]:
# Overlay the true scores and the predictions returned by train_user_model,
# one marker per sample.
fig, ax = plt.subplots()
ax.plot(true_scores, 'o', label='true value')
ax.plot(predictions, 'x', color='red', label='predicted value')
ax.set_title("True user scores vs predicted scores")
ax.set_ylabel("Score")
ax.legend()
ax.set_xticks([])  # the x position is only the sample index, so hide the ticks
plt.show()

In [None]:
# Binary target for the classifier: 1 = liked (score >= 3), 0 = not liked.
# The labels were previously inverted (0 for score >= 3), so any metric that treats
# 1 as the positive class was measuring dislikes rather than the books worth
# recommending. The vectorised comparison replaces the row-wise lambda.
model_df['score_class'] = (model_df['review/score'] >= 3.0).astype(int)

In [None]:
# Fit the per-user model in "classification" mode: both the raw score and the derived
# label are removed from the features so the target cannot leak into the inputs.
# NOTE(review): the meaning of the final True flag is defined in src.recsys_utils - confirm.
classification_features = model_df.drop(columns=['review/score', 'score_class'])
classification_target = model_df["score_class"]
predictions, true_scores = train_user_model(classification_features, classification_target, "classification", True)

<p>Bad precision and recall scores. I suspect a data issue.</p>