<a href="https://colab.research.google.com/github/timhalter/PA-Addee/blob/main/PA_Addee.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Imports**

In [1]:
import pandas as pd
import numpy as np

import plotly.express as px

import re
from nltk.corpus import stopwords
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import TruncatedSVD
from keras import Sequential
from keras.layers import Dense,LSTM,Embedding
from plotly.subplots import make_subplots
import plotly.graph_objs as go

import tensorflow as tf

**Load dataset**

In [2]:
url = 'https://raw.githubusercontent.com/timhalter/PA-Addee/main/children_books.csv'
df = pd.read_csv(url,encoding='ISO-8859-1')
df.drop(['Author', 'Inerest_age'], axis=1, inplace=True)
df.rename(columns = {'Title':'title', 'Desc':'desc', 'Reading_age':'age'}, inplace = True)

df.head()

Unnamed: 0,title,desc,age
0,The Girl Who Drank the Moon,"Every year, the evil Protectorate offers a bab...",10-14
1,Time Between Us,Sixteen-year-old Anna is struggling to underst...,12+
2,Girl Out of Water,Lou Brown's swimming ambitions sank without tr...,10+
3,Captive,Robyn is scared. Ever since the attempted assa...,13
4,The School of Music,Welcome to the School of Music. In charge is M...,10+


In [3]:
df['age'].value_counts().sort_index()

10+                                     525
10-12                                    85
10-14                                    91
11+                                     448
12+                                     708
13                                       87
13+                                     533
14+                                     173
8+                                      267
9+                                      352
Name: age, dtype: int64

**Preprocessing**

In [4]:
def get_min_age(sir):
    if sir.find('-') != -1:
        return int(sir.split('-')[0])
    else:
        return int(sir.replace('+',' ').split(' ')[0])

def get_max_age(sir):
    if sir.find('-') != -1:
        return int(sir.split('-')[1])
    elif sir.find('+') != -1:
        sp = sir.replace(' ', '')
        sp = sir.replace('+', '')
        return int(sp)
    else:
        return 99

In [5]:
#Preprocessing And Feature Engineering
df.names = df.title.str.lower()
df.desc = df.desc.str.lower()
df['Min_Age'] = df.age.apply(get_min_age)
df['Max_Age'] = df.age.apply(get_max_age)
df['Title Nb Words'] = df.names.apply(lambda x: len(re.sub(' +', ' ', x).strip().split(' ')))
df['Title Avg Word Length'] = df.names.apply(lambda x: np.mean(np.array([len(word) for word in (re.sub(' +', ' ', x).strip().split(' '))])))
df['Range Length'] = df.Max_Age - df.Min_Age
df


Pandas doesn't allow columns to be created via a new attribute name - see https://pandas.pydata.org/pandas-docs/stable/indexing.html#attribute-access



Unnamed: 0,title,desc,age,Min_Age,Max_Age,Title Nb Words,Title Avg Word Length,Range Length
0,The Girl Who Drank the Moon,"every year, the evil protectorate offers a bab...",10-14,10,14,6,3.666667,4
1,Time Between Us,sixteen-year-old anna is struggling to underst...,12+,12,12,3,4.333333,0
2,Girl Out of Water,lou brown's swimming ambitions sank without tr...,10+,10,10,4,3.500000,0
3,Captive,robyn is scared. ever since the attempted assa...,13,13,99,1,7.000000,86
4,The School of Music,welcome to the school of music. in charge is m...,10+,10,10,4,4.000000,0
...,...,...,...,...,...,...,...,...
3264,Spy for the Queen of Scots,"beautiful young aristocrat ginette, known as j...",12+,12,12,6,3.500000,0
3265,Jimmy Corrigan:,this extraordinary graphic novel is so obvious...,14+,14,14,2,7.000000,0
3266,The Martian Girl,"following on from the brilliant lost on mars, ...",10+,10,10,3,4.666667,0
3267,Buffalo Soldier,plantation slave charley smith is eleven when ...,11+,11,11,2,7.000000,0


In [6]:
desc_number_of_components = 90
title_number_of_components = 97


desc_vectorizer = CountVectorizer()
desc_matrix = desc_vectorizer.fit_transform(df.desc)
desc_svd = TruncatedSVD(n_components=desc_number_of_components)
svd_desc_matrix = desc_svd.fit_transform(desc_matrix)

title_vectorizer = CountVectorizer()
title_matrix = title_vectorizer.fit_transform(df.title)
title_svd = TruncatedSVD(n_components=title_number_of_components)
svd_title_matrix = title_svd.fit_transform(title_matrix)

**Model**

In [7]:
model = Sequential(
    [Dense(input_dim =desc_number_of_components,activation='relu',units=100),
    Dense(activation='relu',units=50),
    Dense(activation='relu',units=20),
    Dense(activation='relu',units=2)]
)

model.compile(optimizer='adam',loss='mse',metrics=['mae'])



In [8]:
history = model.fit(svd_desc_matrix,df[['Min_Age','Max_Age']],epochs=550,verbose=False)

In [9]:
predictions = pd.DataFrame({'Min_Age':model.predict(svd_desc_matrix)[:,0],'Max_Age':model.predict(svd_desc_matrix)[:,1]})
predictions.Min_Age = np.round(predictions.Min_Age).astype(np.int)
predictions.Max_Age = (np.round(predictions.Max_Age)).astype(np.int)

In [10]:
fig = go.Figure()
H_diff = pd.DataFrame(history.history)

fig.add_trace(go.Scatter(
    x= np.arange(len(H_diff)),
    y=H_diff.loss,
    name='Loss'
))
fig.add_trace(go.Scatter(
    x= np.arange(len(H_diff)),
    y=H_diff.mae,
    name='Mean Abs Error'
))

fig.update_layout(title='Model Training Evaluation',xaxis_title='Iterration',yaxis_title='Value')
fig.show()

In [11]:
fig = make_subplots(rows=2,cols=1)

fig.add_trace(go.Scatter(
    x= np.arange(len(H_diff)),
    y=df['Min_Age'].sort_values(),
    name='Actual Min Age'
),row=1,col=1)
fig.add_trace(go.Scatter(
    x= np.arange(len(H_diff)),
    y=predictions['Min_Age'].sort_values(),
    name='Predicted Min Age'
),row=1,col=1)


fig.add_trace(go.Scatter(
    x= np.arange(len(H_diff)),
    y=df['Max_Age'].sort_values(),
    name='Actual Max Age'
),row=2,col=1)
fig.add_trace(go.Scatter(
    x= np.arange(len(H_diff)),
    y=predictions['Max_Age'].sort_values(),
    name='Predicted Max Age'
),row=2,col=1)


fig.update_layout(title='Model Prediction Evaluation',xaxis_title='Iterration',yaxis_title='Value')
fig.show()