<a href="https://colab.research.google.com/github/tozdo/ML-hse-2022/blob/main/HSE_AML_HW_2_Zdorova.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Loading the dataset from Kaggle

In [None]:
 ! pip install -q kaggle

In [None]:
 from google.colab import files

In [None]:
files.upload()

Saving kaggle.json to kaggle.json


{'kaggle.json': b'{"username":"<REDACTED>","key":"<REDACTED>"}'}

In [None]:
# -p keeps this cell re-runnable: no error if ~/.kaggle already exists.
! mkdir -p ~/.kaggle

In [None]:
! cp kaggle.json ~/.kaggle/

In [None]:
! chmod 600 ~/.kaggle/kaggle.json

In [None]:
! kaggle competitions download -c hse-aml-2022

Downloading hse-aml-2022.zip to /content
  0% 0.00/698k [00:00<?, ?B/s]
100% 698k/698k [00:00<00:00, 27.3MB/s]


In [None]:
# -o overwrites files from an earlier extraction instead of prompting,
# so re-running the cell does not block waiting for input.
! unzip -o hse-aml-2022.zip

Archive:  hse-aml-2022.zip
  inflating: books_sample_submission.csv  
  inflating: books_test.csv          
  inflating: books_train.csv         


## First look at the data

In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import scipy.optimize as opt
import sklearn.linear_model
import sklearn.model_selection
from sklearn import metrics

In [2]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
import datetime

In [4]:
from sklearn.feature_extraction.text import HashingVectorizer

In [5]:
train = pd.read_csv('books_train.csv')

In [5]:
train.head(2)

Unnamed: 0,bookID,title,authors,average_rating,isbn,isbn13,language_code,num_pages,ratings_count,text_reviews_count,publication_date,publisher
0,26237,Revolutionary Girl Utena Vol. 3: To Sprout,Chiho Saito/Be-Pas,4.05,1591162076,9781591162070,eng,200,1153,16,2/4/2004,VIZ Media LLC
1,33448,Positioning: The Battle for Your Mind,Al Ries/Jack Trout,4.04,71359168,9780071359160,en-US,246,126,9,1/18/2001,McGraw-Hill Education


In [6]:
test = pd.read_csv('books_test.csv')

In [7]:
test.head(2)

Unnamed: 0,bookID,title,authors,isbn,isbn13,language_code,num_pages,ratings_count,text_reviews_count,publication_date,publisher
0,2538,El hombre duplicado,José Saramago/Pilar del Río,8466312803,9788466312806,spa,380,1295,106,9/1/2004,Punto de Lectura
1,31912,Buffy the Vampire Slayer and Philosophy: Fear ...,James B. South/William Irwin,812695313,9780812695311,eng,335,2519,85,3/13/2003,Open Court


In [7]:
train.shape, test.shape

((8342, 12), (2781, 11))

In [9]:
train.columns

Index(['bookID', 'title', 'authors', 'average_rating', 'isbn', 'isbn13',
       'language_code', '  num_pages', 'ratings_count', 'text_reviews_count',
       'publication_date', 'publisher'],
      dtype='object')

In [10]:
train.describe()

Unnamed: 0,bookID,average_rating,isbn13,num_pages,ratings_count,text_reviews_count
count,8342.0,8342.0,8342.0,8342.0,8342.0,8342.0
mean,21368.066291,3.932359,9755123000000.0,336.865979,16611.27,512.318029
std,13104.17081,0.348794,490333400000.0,235.805259,98542.13,2249.333746
min,2.0,0.0,20049130000.0,0.0,0.0,0.0
25%,10387.25,3.77,9780345000000.0,192.0,102.25,9.0
50%,20424.5,3.96,9780590000000.0,300.0,753.0,47.0
75%,32188.25,4.13,9780875000000.0,416.0,4929.75,235.0
max,45641.0,5.0,9790008000000.0,4736.0,2457092.0,56604.0


Character (string) columns: title, authors, language_code, publication_date, publisher.

I suspect that isbn and isbn13 carry no information about a book's rating — they are just identifier numbers — so I will probably not use them as features.

In [8]:
train.authors.nunique()

5299

In [9]:
train.publisher.nunique()

1950

In [10]:
train.language_code.nunique()

23

## Feature engineering

In [11]:
from sklearn.feature_extraction.text import HashingVectorizer

vectorizer = HashingVectorizer()


In [12]:
from sklearn import preprocessing

In [13]:
train['label'] = 'train'
test['label'] = 'test'

In [14]:
# Stack train and test so every feature-engineering step is applied to both.
# Both frames were read with the default 0..n-1 index, so a plain concat would
# leave duplicate row labels; ignore_index=True gives the combined frame a
# fresh, unique index. The 'label' column still tells the two parts apart.
data = pd.concat([train, test], ignore_index=True)

In [15]:
data.shape

(11123, 13)

In [16]:
data.head(3)

Unnamed: 0,bookID,title,authors,average_rating,isbn,isbn13,language_code,num_pages,ratings_count,text_reviews_count,publication_date,publisher,label
0,26237,Revolutionary Girl Utena Vol. 3: To Sprout,Chiho Saito/Be-Pas,4.05,1591162076,9781591162070,eng,200,1153,16,2/4/2004,VIZ Media LLC,train
1,33448,Positioning: The Battle for Your Mind,Al Ries/Jack Trout,4.04,71359168,9780071359160,en-US,246,126,9,1/18/2001,McGraw-Hill Education,train
2,13739,Twelve Fair Kingdoms,Suzette Haden Elgin,3.99,425058506,9780425058503,eng,195,141,10,3/1/1983,Berkley,train


In [17]:
# The page-count header in the CSV starts with two spaces; give it a clean name.
data = data.rename(columns={'  num_pages': 'num_pages'})

In [18]:
data['language_code'].value_counts()[:10]

eng      8908
en-US    1408
spa       218
en-GB     214
fre       144
ger        99
jpn        46
mul        19
zho        14
grc        11
Name: language_code, dtype: int64

In [19]:
# Treat the regional English codes (en-US, en-GB, en-CA) as plain 'eng'.
english_variants = {'en-US': 'eng', 'en-GB': 'eng', 'en-CA': 'eng'}
data['language_code'] = data['language_code'].replace(english_variants)

In [20]:
data['publication_date'] = pd.to_datetime(data['publication_date'], format='%m/%d/%Y', errors='coerce')

In [22]:
data[data['publication_date'].isnull()]

Unnamed: 0,bookID,title,authors,average_rating,isbn,isbn13,language_code,num_pages,ratings_count,text_reviews_count,publication_date,publisher,label


In [21]:
# Rows whose publication_date could not be parsed are left empty by the
# errors='coerce' conversion, and missing values would make the model raise later.
# Rather than drop the two affected books (the dataset is small), their dates
# are entered by hand, keyed by bookID.
manual_dates = {
    31373: '1999-10-01 00:00:00',
    45531: '1975-10-01 00:00:00',
}
for book_id, fixed_date in manual_dates.items():
    data.loc[data.bookID == book_id, 'publication_date'] = fixed_date

In [23]:
# Publication year as a numeric feature: the era a book comes from may relate
# to how it is rated.
publication_dates = pd.to_datetime(data['publication_date'])
data['year'] = publication_dates.dt.year

In [24]:
# Publication month as a numeric feature, in case ratings show a seasonal pattern.
publication_dates = pd.to_datetime(data['publication_date'])
data['month'] = publication_dates.dt.month

In [25]:
# Number of people credited on each book. Names in 'authors' are separated by
# '/', so the count is (number of slashes + 1). The column sometimes lists
# illustrators too (e.g. Mary GrandPre on the Harry Potter books); well-illustrated
# books might be rated higher.
# Missing values (None or NaN) count as 0 credited people. The previous
# `x != None` test let NaN through, and calling .count on it raised.
data['num_authors'] = (
    data['authors']
    .str.count('/')
    .add(1)
    .fillna(0)
    .astype('int64')
)

In [26]:
# I will use LabelEncoder for some discrete columns, like authors or language code

le = preprocessing.LabelEncoder()

In [27]:
# Keep only the first credited name: later entries in 'authors' may be
# illustrators or other contributors rather than the writer.
data['main_author'] = data['authors'].map(lambda names: names.partition("/")[0])

In [28]:
# How many rows (train and test together) share this main author,
# counting only rows that have a title.
titles_by_author = data.groupby('main_author')['title']
data['author_books'] = titles_by_author.transform('count')

In [29]:
# Number of appearances of each title in the combined list (e.g. several editions).
titles_grouped = data.groupby('title')['title']
data['book_app'] = titles_grouped.transform('count')

In [30]:
# Replace each main-author name with an integer code so the column can be fed
# to the model. The encoder is fitted on the combined train+test frame, so both
# parts share one name-to-code mapping.
# NOTE(review): the codes are arbitrary labels; their numeric order carries no meaning.
data['main_author'] = le.fit_transform(data['main_author'])

In [31]:
# One-hot encode the language code: one indicator column per language value,
# appended to the right of the existing columns.
enc_lang = pd.get_dummies(data['language_code'])
data = pd.concat([data, enc_lang], axis = 1)

In [35]:
data.columns

Index(['bookID', 'title', 'authors', 'average_rating', 'isbn', 'isbn13',
       'language_code', 'num_pages', 'ratings_count', 'text_reviews_count',
       'publication_date', 'publisher', 'label', 'year', 'month',
       'num_authors', 'main_author', 'author_books', 'book_app', 'ale', 'ara',
       'eng', 'enm', 'fre', 'ger', 'gla', 'glg', 'grc', 'ita', 'jpn', 'lat',
       'msa', 'mul', 'nl', 'nor', 'por', 'rus', 'spa', 'srp', 'swe', 'tur',
       'wel', 'zho'],
      dtype='object')

In [32]:
# Rows that came from books_train.csv (tagged 'train' before the concat).
train_data = data[data['label'].eq('train')]

In [33]:
# Rows that came from books_test.csv (tagged 'test' before the concat).
test_data = data[data['label'].eq('test')]

In [34]:
train_data.shape, test_data.shape

((8342, 43), (2781, 43))

## Train a model

In [36]:
# Model inputs, listed once so train and test are guaranteed to use the same
# columns in the same order: engineered numeric features first, then the
# language indicator columns.
feature_columns = [
    'main_author', 'num_pages', 'num_authors', 'ratings_count',
    'text_reviews_count', 'year', 'month',
    'author_books', 'book_app',
    'ale', 'ara', 'eng', 'enm', 'fre', 'ger',
    'gla', 'grc', 'ita', 'jpn', 'lat', 'mul', 'nor', 'por', 'rus', 'spa',
    'swe', 'tur', 'wel', 'zho',
]

# .copy() detaches the matrices from `data`, so later edits don't touch it.
X_train = train_data[feature_columns].copy()
X_test = test_data[feature_columns].copy()

# Target: kept as a one-column DataFrame.
y_train = train_data[['average_rating']].copy()

In [37]:
X_train.shape, y_train.shape, X_test.shape

((8342, 29), (8342, 1), (2781, 29))

In [43]:
from sklearn.ensemble import RandomForestRegressor

# Random forest with hand-picked hyperparameters. random_state is fixed so that
# re-running the notebook rebuilds the same forest and the same submission.
model = RandomForestRegressor(
    max_depth=15,
    min_samples_leaf=5,
    min_samples_split=5,
    n_estimators=150,
    random_state=42,
)

In [44]:
# y_train is a one-column DataFrame of shape (n, 1); the regressor expects a
# 1-D target, so flatten it instead of relying on an implicit conversion.
model.fit(X_train, y_train.values.ravel())

RandomForestRegressor(max_depth=13)

In [45]:
def save_submission(model, features=None, book_ids=None):
  """Predict ratings with `model` and write them to a timestamped CSV.

  The file is written to the current working directory as
  ``submission_<dd-Mon-YYYY_HH:MM>.csv`` with two columns, ``bookID`` and
  ``average_rating``.

  Args:
    model: fitted estimator exposing ``predict``.
    features: feature matrix to predict on. Defaults to the notebook-level
      ``X_test``.
    book_ids: identifiers for the ``bookID`` column, in the same row order as
      ``features``. Defaults to ``test['bookID']``.

  Returns:
    None. The name of the written file is printed.

  Raises:
    ValueError: if the number of predictions differs from the number of ids.
  """
  # Fall back to the notebook globals so existing calls keep working.
  if features is None:
    features = X_test
  if book_ids is None:
    book_ids = test['bookID']

  predictions = model.predict(features)
  ids = list(book_ids)
  if len(predictions) != len(ids):
    raise ValueError(
        f"got {len(predictions)} predictions for {len(ids)} book ids")

  submission = pd.DataFrame()
  submission['bookID'] = ids
  submission['average_rating'] = predictions

  # Timestamp in the name keeps earlier submissions from being overwritten.
  dttm = datetime.datetime.now().strftime("%d-%b-%Y_%H:%M")
  filename = f'submission_{dttm}.csv'
  submission.to_csv(filename, encoding='utf-8', index=False)
  print(f"saved {filename}")

In [46]:
save_submission(model=model)

saved submission_28-Mar-2022_20:51.csv


In [47]:
# Hyperparameter grid for the random-forest grid search below.
from sklearn.model_selection import GridSearchCV


# 2 x 4 x 2 x 2 = 32 parameter combinations; each one is fitted once per
# cross-validation fold, so the search cost grows quickly with the fold count.
parameters = {
    'n_estimators': [100, 150],
    'max_depth': [7, 10, 12, 15],
    'min_samples_split': [5, 10],
    'min_samples_leaf': [5, 10]
}

In [48]:
# Base estimator for the grid search; the grid supplies the tuned
# hyperparameters. A fixed seed keeps the comparison between parameter
# combinations reproducible.
model_grid = RandomForestRegressor(random_state=42)

In [46]:
# 10-fold cross-validated search over `parameters`.
# - Uses `model_grid`, the estimator created for the search, rather than the
#   already-fitted `model`.
# - n_jobs=-1 runs the fits on all available cores; the serial run was slow
#   enough to be interrupted by hand.
# - The target is flattened because y_train is a one-column DataFrame.
# refit=True retrains the best combination on the full training set.
grad_rf = GridSearchCV(model_grid, parameters, refit=True, cv=10, n_jobs=-1)
grad_rf.fit(X_train, y_train.values.ravel())

# best_score_ is the mean cross-validated score of the best combination,
# printed here multiplied by 100.
print('Best Score: ', grad_rf.best_score_*100, '\nBest Parameters: ', grad_rf.best_params_)

KeyboardInterrupt: ignored