# Hackathon: Predicting Book Price 

In [21]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

#Import test and train data 
test_data = pd.read_excel(r'Data_Test.xlsx')
train_data = pd.read_excel(r'Data_Train.xlsx')

In [22]:
test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1560 entries, 0 to 1559
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Title         1560 non-null   object
 1   Author        1560 non-null   object
 2   Edition       1560 non-null   object
 3   Reviews       1560 non-null   object
 4   Ratings       1560 non-null   object
 5   Synopsis      1560 non-null   object
 6   Genre         1560 non-null   object
 7   BookCategory  1560 non-null   object
dtypes: object(8)
memory usage: 97.6+ KB


In [23]:
#Loodking at top 5 rows 
train_data.head()

Unnamed: 0,Title,Author,Edition,Reviews,Ratings,Synopsis,Genre,BookCategory,Price
0,The Prisoner's Gold (The Hunters 3),Chris Kuzneski,"Paperback,– 10 Mar 2016",4.0 out of 5 stars,8 customer reviews,THE HUNTERS return in their third brilliant no...,Action & Adventure (Books),Action & Adventure,220.0
1,Guru Dutt: A Tragedy in Three Acts,Arun Khopkar,"Paperback,– 7 Nov 2012",3.9 out of 5 stars,14 customer reviews,A layered portrait of a troubled genius for wh...,Cinema & Broadcast (Books),"Biographies, Diaries & True Accounts",202.93
2,Leviathan (Penguin Classics),Thomas Hobbes,"Paperback,– 25 Feb 1982",4.8 out of 5 stars,6 customer reviews,"""During the time men live without a common Pow...",International Relations,Humour,299.0
3,A Pocket Full of Rye (Miss Marple),Agatha Christie,"Paperback,– 5 Oct 2017",4.1 out of 5 stars,13 customer reviews,A handful of grain is found in the pocket of a...,Contemporary Fiction (Books),"Crime, Thriller & Mystery",180.0
4,LIFE 70 Years of Extraordinary Photography,Editors of Life,"Hardcover,– 10 Oct 2006",5.0 out of 5 stars,1 customer review,"For seven decades, ""Life"" has been thrilling t...",Photography Textbooks,"Arts, Film & Photography",965.62


In [24]:
len(train_data)

6237

In [25]:
#LOOKING At all unique values for 
for i in list(test_data.columns.unique()):
    print(i, len(test_data[str(i)].unique()))

Title 1521
Author 1224
Edition 1259
Reviews 30
Ratings 163
Synopsis 1519
Genre 225
BookCategory 11


# Cleaning / Feature Engineering

In [26]:
#Looking to see if their is a larger range for 5 star reviews
train_data['Reviews'].str.split(" " , expand = True)[3].unique()

array(['5'], dtype=object)

In [27]:
import pandas as pd
from textblob import TextBlob
def cleaning(df):
    df['Reviews'] = df['Reviews'].str.split(" " , expand = True)[0].astype(float)
    df['Edition'] = df['Edition'].str.split("," , expand = True)[0]   #[3].unique()
    df['Ratings'] = pd.DataFrame(df['Ratings'].str.split(" ", expand=True)[0].str.replace(',', '')).astype(int)
    # Define a function to get the polarity of a text
    def get_polarity(text):
        return TextBlob(text).sentiment.polarity
    
    # Getting polarity score as a feature - method is to get a more granular look at the message itselgf 
    df['Synopsis_polarity'] = df['Synopsis'].apply(get_polarity).astype(float)
    df = df.drop('Synopsis', axis=1)
    df = df.drop('Genre', axis=1)  
    return df

In [28]:
train_data = cleaning(train_data)

# Identifying Feature Datatypes:

    - Title ( Qualitative )
    - Author ( Qualitative ) 
    - Edition (Qualitative )
    - Reviews (Quantitative continuous)
    - Rating ( Quantitative discrete)
    - Genre ( Qualitative Categorica) 
    - BookCategory ( Qualitative Categorical ) 
    - Price ( Quantitative Continous )
    - Synopsis_polarity ( Quantitative Continuous ? )

In [10]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6237 entries, 0 to 6236
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Title              6237 non-null   object 
 1   Author             6237 non-null   object 
 2   Edition            6237 non-null   object 
 3   Reviews            6237 non-null   float64
 4   Ratings            6237 non-null   int64  
 5   BookCategory       6237 non-null   object 
 6   Price              6237 non-null   float64
 7   Synopsis_polarity  6237 non-null   float64
dtypes: float64(3), int64(1), object(4)
memory usage: 389.9+ KB


In [11]:
train_data.head()

Unnamed: 0,Title,Author,Edition,Reviews,Ratings,BookCategory,Price,Synopsis_polarity
0,The Prisoner's Gold (The Hunters 3),Chris Kuzneski,Paperback,4.0,8,Action & Adventure,220.0,0.2088
1,Guru Dutt: A Tragedy in Three Acts,Arun Khopkar,Paperback,3.9,14,"Biographies, Diaries & True Accounts",202.93,0.183471
2,Leviathan (Penguin Classics),Thomas Hobbes,Paperback,4.8,6,Humour,299.0,0.191075
3,A Pocket Full of Rye (Miss Marple),Agatha Christie,Paperback,4.1,13,"Crime, Thriller & Mystery",180.0,0.1
4,LIFE 70 Years of Extraordinary Photography,Editors of Life,Hardcover,5.0,1,"Arts, Film & Photography",965.62,0.35625


# Data Preprocessing 

In [32]:
def preprocessing(df):    
    #book_cat = pd.get_dummies(df['BookCategory'], prefix='BookCategory')
    #model_df = pd.concat([df, book_cat], axis=1)
    #Edition = pd.get_dummies(df['Edition'], prefix='Edition')
    #model_df = pd.concat([df, Edition], axis=1)   
    df = df.drop(['Title','Author'], axis=1)
    #model_df['Edition'] = model_df.Edition.map({'Paperback': 0 , 'Hardcover' : 1})

    return df

In [30]:
train_data = preprocessing(train_data)

In [31]:
test_data

Unnamed: 0,Title,Author,Edition,Reviews,Ratings,Synopsis,Genre,BookCategory
0,The Complete Sherlock Holmes: 2 Boxes sets,Sir Arthur Conan Doyle,"Mass Market Paperback,– 1 Oct 1986",4.4 out of 5 stars,960 customer reviews,A collection of entire body of work of the She...,Short Stories (Books),"Crime, Thriller & Mystery"
1,Learn Docker - Fundamentals of Docker 18.x: Ev...,Gabriel N. Schenker,"Paperback,– Import, 26 Apr 2018",5.0 out of 5 stars,1 customer review,Enhance your software deployment workflow usin...,Operating Systems Textbooks,"Computing, Internet & Digital Media"
2,Big Girl,Danielle Steel,"Paperback,– 17 Mar 2011",5.0 out of 5 stars,4 customer reviews,"'Watch out, world. Here I come!'\nFor Victoria...",Romance (Books),Romance
3,Think Python: How to Think Like a Computer Sci...,Allen B. Downey,"Paperback,– 2016",4.1 out of 5 stars,11 customer reviews,"If you want to learn how to program, working w...",Programming & Software Development (Books),"Computing, Internet & Digital Media"
4,Oxford Word Skills: Advanced - Idioms & Phrasa...,Redman Gairns,"Paperback,– 26 Dec 2011",4.4 out of 5 stars,9 customer reviews,"Learn and practise the verbs, prepositions and...",Linguistics (Books),"Language, Linguistics & Writing"
...,...,...,...,...,...,...,...,...
1555,100 Things Every Designer Needs to Know About ...,Susan Weinschenk,"Paperback,– 14 Apr 2011",5.0 out of 5 stars,4 customer reviews,We design to elicit responses from people. We ...,Design,"Computing, Internet & Digital Media"
1556,"Modern Letter Writing Course: Personal, Busine...",ARUN SAGAR,"Paperback,– 8 May 2013",3.6 out of 5 stars,13 customer reviews,"A 30-day course to write simple, sharp and att...",Children's Reference (Books),"Biographies, Diaries & True Accounts"
1557,The Kite Runner Graphic Novel,Khaled Hosseini,"Paperback,– 6 Sep 2011",4.0 out of 5 stars,5 customer reviews,The perennial bestseller-now available as a se...,Humour (Books),Humour
1558,Panzer Leader (Penguin World War II Collection),Heinz Guderian,"Paperback,– 22 Sep 2009",3.5 out of 5 stars,3 customer reviews,Heinz Guderian - master of the Blitzkrieg and ...,United States History,"Biographies, Diaries & True Accounts"


# Cross Validation

In [17]:
def evaluate_regression_models(df, target_column, test_size=0.2, random_state=42):
    # Split data into features and target
    
    features = df.drop(target_column, axis=1)
    target = df[target_column]

    # Split data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=test_size, random_state=random_state)

    # Regression models being tested
    models = [LinearRegression(), Ridge(), Lasso(), ElasticNet(), 
              DecisionTreeRegressor(), RandomForestRegressor(), 
              GradientBoostingRegressor(), KNeighborsRegressor(), SVR()]

    # Loop through each model and evaluate its performance using multiple metrics
    metrics = {'MSE': mean_squared_error, 'RMSE': lambda y_true, y_pred: mean_squared_error(y_true, y_pred, squared=False), 
               'MAE': mean_absolute_error, 'R2': r2_score}
    model_metrics = []
    for model in models:
        mse_scores = -cross_val_score(model, X_train, y_train, scoring='neg_mean_squared_error', cv=5)
        model_metrics.append({'Model': str(model), 'MSE': mse_scores.mean(), 
                              'RMSE': metrics['RMSE'](y_train, model.fit(X_train, y_train).predict(X_train)),
                              'MAE': metrics['MAE'](y_train, model.fit(X_train, y_train).predict(X_train)),
                              'R2': metrics['R2'](y_train, model.fit(X_train, y_train).predict(X_train))})

    return model_metrics


In [18]:
train_data

Unnamed: 0,Reviews,Ratings,Price,Synopsis_polarity
0,4.0,8,220.00,0.208800
1,3.9,14,202.93,0.183471
2,4.8,6,299.00,0.191075
3,4.1,13,180.00,0.100000
4,5.0,1,965.62,0.356250
...,...,...,...,...
6232,5.0,2,322.00,0.058490
6233,3.3,9,421.00,0.065029
6234,3.8,3,399.00,0.367284
6235,3.5,4,319.00,0.185119


In [122]:
results = evaluate_regression_models(train_data, 'Price')


5 fits failed out of a total of 5.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
1 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/tylerbrown/opt/anaconda3/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/tylerbrown/opt/anaconda3/lib/python3.9/site-packages/sklearn/linear_model/_base.py", line 662, in fit
    X, y = self._validate_data(
  File "/Users/tylerbrown/opt/anaconda3/lib/python3.9/site-packages/sklearn/base.py", line 581, in _validate_data
    X, y = check_X_y(X, y, **check_params)
  File "/Users/tylerbrown/opt/anaconda3/lib/python3.9/site-packages/sklearn/utils/validation.py"

ValueError: could not convert string to float: 'How to be an Explorer of the World'

# Models that Will Be Considered: 

    - Random Forest Regressor ( decent mean squared error, good RMSE, 2nd best mean absolute error/ R2) - overall has best all around metrics 
    -  Decision tree has a terrible mean squared error, but is the best fit for the model 

In [28]:
pd.DataFrame(results)

Unnamed: 0,Model,MSE,RMSE,MAE,R2
0,LinearRegression(),454433.51758,643.040138,343.257296,0.152024
1,Ridge(),453829.365723,649.700072,345.406985,0.134369
2,Lasso(),453165.406105,647.287427,345.091254,0.140786
3,ElasticNet(),469144.079739,683.704598,356.288652,0.041385
4,DecisionTreeRegressor(),735654.22404,99.303073,10.071208,0.979778
5,RandomForestRegressor(),469223.870806,262.045002,129.618907,0.85243
6,GradientBoostingRegressor(),438097.121828,588.743746,312.099357,0.28918
7,KNeighborsRegressor(),476598.800116,565.631551,291.472692,0.343893
8,SVR(),518019.665737,719.222596,313.322187,-0.060801


# Creating model - Random Forest

In [29]:
# Model features and target
target = train_data['Price']
features = train_data.drop('Price', axis = 1)

In [30]:
#Creating random forest object to fit the model to 
rfr = RandomForestRegressor( n_estimators = 100 , random_state = 42)

In [31]:
#Fitting the model 
rfr.fit(features,target)

RandomForestRegressor(random_state=42)

In [32]:
rf_t_pred = rfr.predict(features)

# Creating Model - Decision Tree 

In [33]:
from sklearn import tree

# Create a decision tree classifier
clf = DecisionTreeRegressor()

# Fit the classifier to the training data
clf.fit(features,target)

DecisionTreeRegressor()

In [34]:
Decision_t_pred = clf.predict(features)

In [22]:
#- np.array(target[i]

# Ensembling Models  - Decision Tree and Random Forest (Regressors)

### Decision Tree seems to have the best metrics with the training data even after ensembling 
- Ensembling did improve the model we orginally had (rf regressor)

In [24]:
#Ensemble Predictions error 
print("MSE:", mean_squared_error(target, new_predictions))
print("RSME:", np.sqrt(mean_squared_error(target, new_predictions)))
print("MAE:", mean_absolute_error(target, new_predictions))
print("R2:", r2_score(target, new_predictions))

MSE: 24322.22186669215
RSME: 155.95583306401898
MAE: 68.28693048344712
R2: 0.9489218198175037


In [25]:
# Decsion Tree
print("MSE:", mean_squared_error(target, Decision_t_pred))
print("RSME:", np.sqrt(mean_squared_error(target, Decision_t_pred)))
print("MAE:", mean_absolute_error(target, Decision_t_pred))
print("R2:", r2_score(target, Decision_t_pred))

MSE: 9053.472833969323
RSME: 95.14973901156704
MAE: 10.323616321949656
R2: 0.9809871433939965


In [26]:
#Random Forest
print("MSE:", mean_squared_error(target, rf_t_pred))
print("RSME:", np.sqrt(mean_squared_error(target, rf_t_pred)))
print("MAE:", mean_absolute_error(target, rf_t_pred))
print("R2:", r2_score(target, rf_t_pred))

MSE: 70128.46896486063
RSME: 264.8178033381831
MAE: 126.68048538105347
R2: 0.8527258490880254


# Predicting the test

In [None]:
test_data

In [92]:
a = test_data

In [93]:
#train_data
a = cleaning(a)

In [94]:
a = preprocessing(test_df)

In [96]:
rfr.predict(test_df)

Feature names unseen at fit time:
- Author
- BookCategory
- Edition
- Title
Feature names seen at fit time, yet now missing:
- Edition_(French)
- Edition_(German)
- Edition_(Kannada)
- Edition_(Spanish)
- Edition_Board book
- ...



ValueError: could not convert string to float: 'The Complete Sherlock Holmes: 2 Boxes sets'

In [45]:
test_data['Price']

KeyError: 'Price'

In [None]:

# Create a decision tree classifier
clf = DecisionTreeRegressor()

# Fit the classifier to the training data
clf.fit(features,target)