In [1]:
%load_ext lab_black

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import random
import numpy as np
import re
import tensorflow as tf
from sklearn.feature_extraction.text import TfidfTransformer
from nltk.corpus import stopwords
from textblob import Word
from gensim.models import Word2Vec
from gensim.scripts.glove2word2vec import glove2word2vec
from gensim.models import KeyedVectors
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPRegressor
from livelossplot import PlotLossesKeras
from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasRegressor
from sklearn.model_selection import cross_validate, KFold
from sklearn.metrics import make_scorer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import FunctionTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_extraction.text import (
    CountVectorizer,
    TfidfVectorizer,
    HashingVectorizer,
)



In [2]:
# prepare to generate word embbeding vectors
glove_input_file = "/Users/kdang/Documents/glove.6B/glove.6B.100d.txt"
word2vec_output_file = "glove.6B.100d.txt.word2vec"
# glove2word2vec(glove_input_file, word2vec_output_file)
word_model = KeyedVectors.load_word2vec_format(word2vec_output_file)

In [3]:
train = pd.read_excel("Participants_Data/Data_Train.xlsx")
test = pd.read_excel("Participants_Data/Data_Test.xlsx")

In [4]:
train["type"] = "train"
test["type"] = "test"

In [5]:
df = pd.concat([train, test], axis=0).reset_index()

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7797 entries, 0 to 7796
Data columns (total 11 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   index         7797 non-null   int64  
 1   Title         7797 non-null   object 
 2   Author        7797 non-null   object 
 3   Edition       7797 non-null   object 
 4   Reviews       7797 non-null   object 
 5   Ratings       7797 non-null   object 
 6   Synopsis      7797 non-null   object 
 7   Genre         7797 non-null   object 
 8   BookCategory  7797 non-null   object 
 9   Price         6237 non-null   float64
 10  type          7797 non-null   object 
dtypes: float64(1), int64(1), object(9)
memory usage: 670.2+ KB


In [7]:
df["RatingValue"] = df["Reviews"].apply(lambda x: float(x.split()[0]) / 5)
df["NumReview"] = df["Ratings"].apply(lambda x: int(x.split()[0].replace(",", "")))
months = [
    "Jan",
    "Feb",
    "Mar",
    "Apr",
    "May",
    "Jun",
    "Jul",
    "Aug",
    "Sep",
    "Oct",
    "Nov",
    "Dec",
]
monthsMaxDays = {
    "Jan": 31,
    "Feb": 28,
    "Mar": 31,
    "Apr": 30,
    "May": 31,
    "Jun": 30,
    "Jul": 31,
    "Aug": 31,
    "Sep": 30,
    "Oct": 31,
    "Nov": 30,
    "Dec": 31,
}
months_to_num = {}
for i, m in enumerate(months):
    months_to_num[m] = i + 1
df["Edition"] = df.Edition.apply(lambda x: x.replace("–", "-"))
df.loc[df.Edition.str.match(".*\(\w+\).*"), "Edition"] = df.loc[
    df.Edition.str.match(".*\(\w+\).*"), "Edition"
].apply(lambda x: re.sub("\(\w+\),", "", x))
df.loc[
    df.Edition.str.match(r".*[^\d]+$"), "EditionDate"
] = np.nan  # pattern 1, year and month
df.loc[df.Edition.str.match(r".*\d+$"), "EditionDate"] = df.Edition.apply(
    lambda x: x.split(",")[-1].split("-")[-1].strip()
)
df["Year"] = df.EditionDate.apply(
    lambda x: int(x[-4:].strip()) if pd.notnull(x) else np.nan
)
df["Month"] = df.EditionDate.apply(
    lambda x: x[-8:-4] if (pd.notnull(x) and len(x) > 8) else np.nan
)
df["Day"] = df.EditionDate.apply(
    lambda x: int(x[-11:-9].strip()) if (pd.notnull(x) and len(x) >= 10) else np.nan
)
df["PrintEdition"] = df.Edition.apply(lambda x: x.split(",")[0])
df["PrintEdition"] = df["PrintEdition"].map(df["PrintEdition"].value_counts())
df["IsImported"] = df.Edition.str.contains("Import")
df["IsBook"] = df.Genre.str.contains("(Books)")
df["IsMultipleAuthor"] = df.Author.str.match(".*[,&-].*")
df["IsSpecialAuthor"] = df.Author.apply(
    lambda x: any(
        i
        for i in x.lower().split()
        if i
        in [
            "phd.",
            "phd",
            "dr",
            "dr.",
            "prof",
            "prof.",
            "sir",
            "sir.",
            "m.d.",
            "m.d.",
            "mr",
            "mr.",
            "mrs",
            "mrs.",
            "m.a",
            "m.a.",
        ]
    )
)
df["IsFamousAuthor"] = df.NumReview > 100
df["BookCategory"] = df["BookCategory"].map(df["BookCategory"].value_counts())
df["Genre"] = df["Genre"].map(df["Genre"].value_counts())
df.loc[pd.isnull(df.Year), "Year"] = df.loc[pd.isnull(df.Year), "Year"].apply(
    lambda x: random.randrange(df.Year.min(), df.Year.max())
)
df.loc[pd.isnull(df.Month), "Month"] = df.loc[pd.isnull(df.Month), "Month"].apply(
    lambda x: random.choice(months)
)
df.loc[pd.isnull(df.Day), "Day"] = df.loc[
    df.loc[pd.isnull(df.Day)].index, "Month"
].apply(lambda x: random.randint(1, monthsMaxDays[x] + 1))
df["Month"] = df["Month"].map(df["Month"].value_counts())
df["Day"] = df["Day"].map(df["Day"].value_counts())

  return func(self, *args, **kwargs)


In [8]:
df.columns

Index(['index', 'Title', 'Author', 'Edition', 'Reviews', 'Ratings', 'Synopsis',
       'Genre', 'BookCategory', 'Price', 'type', 'RatingValue', 'NumReview',
       'EditionDate', 'Year', 'Month', 'Day', 'PrintEdition', 'IsImported',
       'IsBook', 'IsMultipleAuthor', 'IsSpecialAuthor', 'IsFamousAuthor'],
      dtype='object')

In [9]:
stop = stopwords.words("english")

categorical_features = ["BookCategory", "Month", "PrintEdition", "Day"]
int_features = ["Year"]
bool_features = [
    "IsBook",
    "IsImported",
    "IsMultipleAuthor",
    "IsSpecialAuthor",
    "IsFamousAuthor",
]
text_features = ["Synopsis", "Title", "Author"]


def lemmatize(s):
    dff = []
    for n, c in s.iteritems():
        dff.append(
            c.str.lower().apply(
                lambda t: " ".join(
                    [Word(i).lemmatize() for i in t.split() if i not in stop]
                )
            )
        )
    dff = pd.concat(dff, axis=1)
    return dff


def tovector(s):
    dff = []
    for n, c in s.iteritems():
        dff.append(
            c.apply(
                lambda x: np.mean(
                    [(word_model[i] if i in word_model else np.zeros(100)) for i in x],
                    axis=0,
                )
                if x
                else np.zeros(100)
            )
        )
    dff = pd.concat(dff, axis=1)
    return dff


def text_vector_to_column(s):
    dff = []
    for n, c in s.iteritems():
        for i in range(100):
            dff.append(c.apply(lambda x: x[i]))
    dff = pd.concat(dff, axis=1)
    return dff


def as_int(s):
    dff = pd.concat([c.astype(int) for n, c in s.iteritems()], axis=1)
    return dff


def text_pro(s):
    dff = []
    for n, c in s.iteritems():
        count_vectorizer = CountVectorizer()
        tfidf_vectorizer = TfidfVectorizer()
        count = count_vectorizer.fit_transform(c)
        dff.append(pd.DataFrame.sparse.from_spmatrix(tfidf_vectorizer.fit_transform(c)))
    return pd.concat(dff, axis=1)


text_transformer = Pipeline(
    [
        ("tokenize_lemmatize", FunctionTransformer(lemmatize)),
        ("count_vectorizer", FunctionTransformer(text_pro)),
        #     ('tfidf', TfidfTransformer())
        #     ('vector_to_column', FunctionTransformer(text_vector_to_column))
        #     ('tfidf', TfidfTransformer())
    ]
)

bool_transformer = Pipeline([("to_int", FunctionTransformer(as_int))])

categorical_transformer = Pipeline(
    [
        ("onehot", OneHotEncoder())
        #     ('numeric', FunctionTransformer(lambda x: x, validate = False))
    ]
)

int_transformer = Pipeline([("minmaxscaler", MinMaxScaler())])

preprocessor = ColumnTransformer(
    transformers=[
        ("cat", categorical_transformer, categorical_features),
        ("text", text_transformer, text_features),
        ("bool", bool_transformer, bool_features),
        ("int", int_transformer, int_features),
    ],
    verbose=True,
)
train_df = df[df.type == "train"]
# train_df.head()
y = np.array(train_df.Price).reshape(-1, 1)
x = preprocessor.fit_transform(train_df)
x
# vectorizer = TfidfVectorizer()
# vectorizer.fit_transform(train_df['Synopsis'])
# vectorizer.get_feature_names()
# train_df.drop(['Price', 'type', 'index'], axis=1)
# print(x.shape)
# model = MLPRegressor(hidden_layer_sizes=(20, 20, 10), max_iter=1000)
# model = RandomForestRegressor(n_estimators=1000)
import math


def metric_np(y_pred, y_true):
    y_pred = math.e ** y_pred
    y_true = math.e ** y_true
    return 1 - np.sqrt(np.square(np.log10(y_pred + 1) - np.log10(y_true + 1)).mean())

[ColumnTransformer] ........... (1 of 4) Processing cat, total=   0.0s
[ColumnTransformer] .......... (2 of 4) Processing text, total=  28.2s
[ColumnTransformer] .......... (3 of 4) Processing bool, total=   0.0s
[ColumnTransformer] ........... (4 of 4) Processing int, total=   0.0s


In [14]:
x.shape

(6237, 51403)

In [19]:
import scipy

sparse_matrix = scipy.sparse.csr_matrix(x)

In [20]:
sparse_matrix

<6237x51403 sparse matrix of type '<class 'numpy.float64'>'
	with 587967 stored elements in Compressed Sparse Row format>

In [228]:
300+12+5+1+11+15

344

In [201]:
import math
def metric_np(y_pred, y_true):
    y_pred = math.e**y_pred
    y_true = math.e**y_true
    return 1 - np.sqrt(np.square(np.log10(y_pred +1) - np.log10(y_true +1)).mean())

In [None]:
xtrain, xtest, ytrain, ytest = train_test_split(x, y, test_size=0.2)

# model = MLPRegressor(hidden_layer_sizes=(20, 20, 10), max_iter=1000)
model = LinearRegression()
cross_validate(
    model,
    xtrain,
    np.log(ytrain.ravel()),
    scoring={
        "loss": "neg_mean_squared_error",
        "metric": make_scorer(metric_np, greater_is_better=True),
    },
    verbose=True,
    n_jobs=-1,
)

In [241]:
def metric_tf(ypred, yval):
    print('fuck')
    ypred = math.e **ypred
    yval = math.e**yval
    return 1 - tf.math.reduce_mean(tf.sqrt(tf.square(log10(ypred + 1) - log10(yval + 1))))

def create_model():
    model = Sequential()
    model.add(Dense(40, input_dim=x.shape[-1], activation="relu"))
#     model.add(Dense(20, activation="relu"))
#     model.add(Dense(10, activation="relu"))
    model.add(Dense(1))
    optim = tf.keras.optimizers.Adam(learning_rate=1e-4)
    model.compile(
        loss='mean_squared_error',
        optimizer='adam',
        metrics=[tf.keras.metrics.MeanSquaredError()],
    )
    return model

def log10(x):
    numerator = tf.math.log(x)
    denominator = tf.math.log(tf.constant(10, dtype=numerator.dtype))
    return numerator / denominator

early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor="mean_squared_error",
    min_delta=1e-4,
    patience=10,
    verbose=1,
    mode="auto"
)
xtrain, xtest, ytrain, ytest = train_test_split(x, y, test_size=0.2)
model = KerasRegressor(build_fn=create_model, epochs=1000, batch_size=10, verbose=1, callbacks=[early_stopping])
cross_validate(model, xtrain, np.log(ytrain), scoring={'loss': 'neg_mean_squared_error', 'metric': make_scorer(metric_tf, greater_is_better=True)}, 
               verbose=True, n_jobs=-1)
# model.fit(xtrain, np.log(ytrain), callbacks=[PlotLossesKeras(), early_stopping], validation_data=(xtest, np.log(ytest)))

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.


KeyboardInterrupt: 