# Hackathon: Predicting Book Price 

In [306]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

#Import test and train data 
test_data = pd.read_excel(r'Data_Test.xlsx')
train_data = pd.read_excel(r'Data_Train.xlsx')

In [307]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6237 entries, 0 to 6236
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Title         6237 non-null   object 
 1   Author        6237 non-null   object 
 2   Edition       6237 non-null   object 
 3   Reviews       6237 non-null   object 
 4   Ratings       6237 non-null   object 
 5   Synopsis      6237 non-null   object 
 6   Genre         6237 non-null   object 
 7   BookCategory  6237 non-null   object 
 8   Price         6237 non-null   float64
dtypes: float64(1), object(8)
memory usage: 438.7+ KB


In [308]:
#Loodking at top 5 rows 
train_data.head()

Unnamed: 0,Title,Author,Edition,Reviews,Ratings,Synopsis,Genre,BookCategory,Price
0,The Prisoner's Gold (The Hunters 3),Chris Kuzneski,"Paperback,– 10 Mar 2016",4.0 out of 5 stars,8 customer reviews,THE HUNTERS return in their third brilliant no...,Action & Adventure (Books),Action & Adventure,220.0
1,Guru Dutt: A Tragedy in Three Acts,Arun Khopkar,"Paperback,– 7 Nov 2012",3.9 out of 5 stars,14 customer reviews,A layered portrait of a troubled genius for wh...,Cinema & Broadcast (Books),"Biographies, Diaries & True Accounts",202.93
2,Leviathan (Penguin Classics),Thomas Hobbes,"Paperback,– 25 Feb 1982",4.8 out of 5 stars,6 customer reviews,"""During the time men live without a common Pow...",International Relations,Humour,299.0
3,A Pocket Full of Rye (Miss Marple),Agatha Christie,"Paperback,– 5 Oct 2017",4.1 out of 5 stars,13 customer reviews,A handful of grain is found in the pocket of a...,Contemporary Fiction (Books),"Crime, Thriller & Mystery",180.0
4,LIFE 70 Years of Extraordinary Photography,Editors of Life,"Hardcover,– 10 Oct 2006",5.0 out of 5 stars,1 customer review,"For seven decades, ""Life"" has been thrilling t...",Photography Textbooks,"Arts, Film & Photography",965.62


In [309]:
len(train_data)

6237

In [310]:
#LOOKING At all unique values for 
for i in list(test_data.columns.unique()):
    print(i, len(test_data[str(i)].unique()))

Title 1521
Author 1224
Edition 1259
Reviews 30
Ratings 163
Synopsis 1519
Genre 225
BookCategory 11


In [311]:
#LOOKING At all unique values for 
for i in list(train_data.columns.unique()):
    print(i, len(train_data[str(i)].unique()))

Title 5568
Author 3679
Edition 3370
Reviews 36
Ratings 342
Synopsis 5549
Genre 345
BookCategory 11
Price 1614


# Cleaning / Feature Engineering

In [312]:
#Looking to see if their is a larger range for 5 star reviews
train_data['Reviews'].str.split(" " , expand = True)[3].unique()

array(['5'], dtype=object)

In [313]:
import pandas as pd
from textblob import TextBlob
def cleaning(df):
    df['Reviews'] = df['Reviews'].str.split(" " , expand = True)[0].astype(float)
    df['Edition'] = df['Edition'].str.split("," , expand = True)[0]   #[3].unique()
    df['Ratings'] = pd.DataFrame(df['Ratings'].str.split(" ", expand=True)[0].str.replace(',', '')).astype(int)
    # Define a function to get the polarity of a text
    def get_polarity(text):
        return TextBlob(text).sentiment.polarity
    
    # Getting polarity score as a feature - method is to get a more granular look at the message itselgf 
    df['Synopsis_polarity'] = df['Synopsis'].apply(get_polarity).astype(float)
    df = df.drop('Synopsis', axis=1)
    df = df.drop('Genre', axis=1)  
    return df

In [314]:
train_data = cleaning(train_data)
test_data = cleaning(test_data)

# Sampling
- Not all observations line up when creating dummies so creating an sample of equal observations is essential for the test data to be fit into the model

In [315]:
# Get the unique values of 'Edition' in full_train_data
unique_editions = test_data['Edition'].unique()

# Filter the rows in train_data that are present in full_train_data
train_data_samp = train_data[train_data['Edition'].isin(unique_editions)]

# Filter the rows in test_data that are present in full_train_data
test_data_samp = test_data[test_data['Edition'].isin(unique_editions)]

In [316]:
#LOOKING At all unique values for 
for i in list(test_data_samp.columns.unique()):
    print(i, len(test_data_samp[str(i)].unique()))

Title 1521
Author 1224
Edition 9
Reviews 30
Ratings 163
BookCategory 11
Synopsis_polarity 1421


In [317]:
#LOOKING At all unique values for 
for i in list(train_data_samp.columns.unique()):
    print(i, len(train_data_samp[str(i)].unique()))

Title 5555
Author 3669
Edition 8
Reviews 36
Ratings 341
BookCategory 11
Price 1611
Synopsis_polarity 4804


In [318]:
train_data_samp.Edition.unique()

array(['Paperback', 'Hardcover', 'Mass Market Paperback', 'Sheet music',
       'Flexibound', 'Loose Leaf', 'Cards', 'Spiral-bound'], dtype=object)

In [319]:
test_data_samp.Edition.unique()

array(['Mass Market Paperback', 'Paperback', 'Hardcover', 'Cards',
       'Sheet music', 'Flexibound', 'Spiral-bound', '(Chinese)',
       'Loose Leaf'], dtype=object)

In [320]:
# reset indices of train_data_samp and test_data_samp
train_data_samp = train_data_samp.reset_index(drop=True)
test_data_samp = test_data_samp.reset_index(drop=True)

# filter out rows in test_data_samp where Edition is '(Chinese)'
test_data_samp = test_data_samp.drop(test_data_samp[test_data_samp['Edition'] == '(Chinese)'].index)

In [321]:
train_data = train_data_samp
test_data = test_data_samp

# Identifying Feature Datatypes:

    - Title ( Qualitative )
    - Author ( Qualitative ) 
    - Edition (Qualitative )
    - Reviews (Quantitative continuous)
    - Rating ( Quantitative discrete)
    - Genre ( Qualitative Categorica) 
    - BookCategory ( Qualitative Categorical ) 
    - Price ( Quantitative Continous )
    - Synopsis_polarity ( Quantitative Continuous ? )

In [322]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6224 entries, 0 to 6223
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Title              6224 non-null   object 
 1   Author             6224 non-null   object 
 2   Edition            6224 non-null   object 
 3   Reviews            6224 non-null   float64
 4   Ratings            6224 non-null   int64  
 5   BookCategory       6224 non-null   object 
 6   Price              6224 non-null   float64
 7   Synopsis_polarity  6224 non-null   float64
dtypes: float64(3), int64(1), object(4)
memory usage: 389.1+ KB


In [323]:
train_data.head()

Unnamed: 0,Title,Author,Edition,Reviews,Ratings,BookCategory,Price,Synopsis_polarity
0,The Prisoner's Gold (The Hunters 3),Chris Kuzneski,Paperback,4.0,8,Action & Adventure,220.0,0.2088
1,Guru Dutt: A Tragedy in Three Acts,Arun Khopkar,Paperback,3.9,14,"Biographies, Diaries & True Accounts",202.93,0.183471
2,Leviathan (Penguin Classics),Thomas Hobbes,Paperback,4.8,6,Humour,299.0,0.191075
3,A Pocket Full of Rye (Miss Marple),Agatha Christie,Paperback,4.1,13,"Crime, Thriller & Mystery",180.0,0.1
4,LIFE 70 Years of Extraordinary Photography,Editors of Life,Hardcover,5.0,1,"Arts, Film & Photography",965.62,0.35625


# Data Preprocessing 

In [324]:
def preprocessing(df):    
    book_cat = pd.get_dummies(df['BookCategory'], prefix='BookCategory')
    #model_df = pd.concat([df, book_cat], axis=1)
    Edition = pd.get_dummies(df['Edition'], prefix='Edition')
    df = df.drop(['Title','Author','Edition','BookCategory'], axis=1)
    df = pd.concat([df, Edition,book_cat], axis=1)
    return df

In [325]:
train_data = preprocessing(train_data)
test_data = preprocessing(test_data)

# Cross Validation

In [327]:
def evaluate_regression_models(df, target_column, test_size=0.2, random_state=42):
    # Split data into features and target
    
    features = df.drop(target_column, axis=1)
    target = df[target_column]

    # Split data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=test_size, random_state=random_state)

    # Regression models being tested
    models = [LinearRegression(), Ridge(), Lasso(), ElasticNet(), 
              DecisionTreeRegressor(), RandomForestRegressor(), 
              GradientBoostingRegressor(), KNeighborsRegressor(), SVR()]

    # Loop through each model and evaluate its performance using multiple metrics
    metrics = {'MSE': mean_squared_error, 'RMSE': lambda y_true, y_pred: mean_squared_error(y_true, y_pred, squared=False), 
               'MAE': mean_absolute_error, 'R2': r2_score}
    model_metrics = []
    for model in models:
        mse_scores = -cross_val_score(model, X_train, y_train, scoring='neg_mean_squared_error', cv=5)
        model_metrics.append({'Model': str(model), 'MSE': mse_scores.mean(), 
                              'RMSE': metrics['RMSE'](y_train, model.fit(X_train, y_train).predict(X_train)),
                              'MAE': metrics['MAE'](y_train, model.fit(X_train, y_train).predict(X_train)),
                              'R2': metrics['R2'](y_train, model.fit(X_train, y_train).predict(X_train))})

    return model_metrics


In [328]:
results = evaluate_regression_models(train_data, 'Price')


# Models that Will Be Considered: 

    - Random Forest Regressor ( decent mean squared error, good RMSE, 2nd best mean absolute error/ R2) - overall has best all around metrics 
    -  Decision tree has a terrible mean squared error, but is the best fit for the model 

In [330]:
pd.DataFrame(results)

Unnamed: 0,Model,MSE,RMSE,MAE,R2
0,LinearRegression(),388976.652827,621.681352,318.557099,0.155985
1,Ridge(),388928.085155,621.691839,318.578249,0.155957
2,Lasso(),388852.782344,622.018203,318.338283,0.15507
3,ElasticNet(),428101.099971,653.707387,345.500017,0.066786
4,DecisionTreeRegressor(),752075.669534,72.423852,3.577191,0.988545
5,RandomForestRegressor(),432660.692337,251.681909,122.98007,0.868665
6,GradientBoostingRegressor(),379509.935646,548.494722,286.568069,0.343009
7,KNeighborsRegressor(),429471.101536,534.463035,269.351213,0.376194
8,SVR(),487083.016624,697.26159,312.198627,-0.06171


# Creating model - Random Forest

In [331]:
# Model features and target
target = train_data['Price']
features = train_data.drop('Price', axis = 1)

In [332]:
#Creating random forest object to fit the model to 
rfr = RandomForestRegressor( n_estimators = 100 , random_state = 42)

In [333]:
#Fitting the model 
rfr.fit(features,target)

RandomForestRegressor(random_state=42)

In [334]:
rf_t_pred = rfr.predict(features)

# Creating Model - Decision Tree 

In [338]:
from sklearn import tree

# Create a decision tree classifier
clf = DecisionTreeRegressor()

# Fit the classifier to the training data
clf.fit(features,target)

DecisionTreeRegressor()

In [339]:
Decision_t_pred = clf.predict(features)

In [340]:
#- np.array(target[i]

# Ensembling Models  - Decision Tree and Random Forest (Regressors)

In [341]:
import numpy as np
new_predictions = []
for i in range(len(list(rfr.predict(features)))):
    #print('Ensemble:',  (list(rfr.predict(features))[i] + list(clf.predict(features))[i]) / 2)
    new_predictions.append((list(rfr.predict(features))[i] + list(clf.predict(features))[i]) / 2)

### Decision Tree seems to have the best metrics with the training data even after ensembling 
- Ensembling did improve the model we orginally had (rf regressor)

In [342]:
#Ensemble Predictions error 
print("MSE:", mean_squared_error(target, new_predictions))
print("RSME:", np.sqrt(mean_squared_error(target, new_predictions)))
print("MAE:", mean_absolute_error(target, new_predictions))
print("R2:", r2_score(target, new_predictions))

MSE: 19484.483670640664
RSME: 139.58683201018877
MAE: 63.26047183663086
R2: 0.9564343133876724


In [343]:
# Decsion Tree
print("MSE:", mean_squared_error(target, Decision_t_pred))
print("RSME:", np.sqrt(mean_squared_error(target, Decision_t_pred)))
print("MAE:", mean_absolute_error(target, Decision_t_pred))
print("R2:", r2_score(target, Decision_t_pred))

MSE: 4834.477919071337
RSME: 69.530410030945
MAE: 3.7965038560411313
R2: 0.9891905090472662


In [344]:
#Random Forest
print("MSE:", mean_squared_error(target, rf_t_pred))
print("RSME:", np.sqrt(mean_squared_error(target, rf_t_pred)))
print("MAE:", mean_absolute_error(target, rf_t_pred))
print("R2:", r2_score(target, rf_t_pred))

MSE: 63434.500925348635
RSME: 251.8620672617229
MAE: 122.85840409842086
R2: 0.8581657264088909


# Predicting the test

In [365]:
clf_test = clf.predict(test_data)

In [366]:
rf_test = rfr.predict(test_data)

In [372]:
pd.DataFrame(rf_test)

Unnamed: 0,0
0,386.088400
1,1076.510000
2,351.295000
3,1067.233000
4,457.288667
...,...
1554,933.160000
1555,324.286600
1556,549.829400
1557,433.244400


In [364]:
pd.DataFrame(clf_test).to_csv("results.csv", index = False)

In [369]:
ensemble = []
for i in range(len(list(rfr.predict(test_data)))):
    #print('Ensemble:',  (list(rfr.predict(features))[i] + list(clf.predict(features))[i]) / 2)
    ensemble.append((list(clf_test)[i] + list(rf_test)[i]) / 2)

In [371]:
pd.DataFrame(ensemble)

Unnamed: 0,0
0,437.044200
1,1180.755000
2,267.147500
3,1373.116500
4,458.644333
...,...
1554,634.080000
1555,254.643300
1556,538.414700
1557,271.622200
