In [20]:
import pymongo
import pandas as pd
import numpy as npp
import dotenv
import os
from data_manager import DataImporter
import lightgbm as lgb
from sklearn.preprocessing import MultiLabelBinarizer
import ast
import re

In [21]:
# Import cleaned data from csv
df = pd.read_csv('processed_books.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5040 entries, 0 to 5039
Data columns (total 17 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   book_id          5040 non-null   object 
 1   title            5040 non-null   object 
 2   author           5040 non-null   object 
 3   price            5040 non-null   float64
 4   genres           5040 non-null   object 
 5   language         5040 non-null   object 
 6   series           5040 non-null   int64  
 7   publisher        4733 non-null   object 
 8   year_published   4987 non-null   object 
 9   description      4966 non-null   object 
 10  current_readers  5040 non-null   float64
 11  wanted_to_read   5040 non-null   float64
 12  num_reviews      5040 non-null   object 
 13  num_ratings      5040 non-null   object 
 14  rating           5040 non-null   float64
 15  awards           5040 non-null   object 
 16  primary_lists    5040 non-null   object 
dtypes: float64(4),

In [22]:
# Feature Selection
# Drop price, auther, title, description, series, publisher, year_published, language
df_used = df.drop(['price', 'author', 'title', 'description', 'series', 'publisher', 'year_published', 'language', 'primary_lists'], axis = 1)
# Remove commas from num_reviews and num_ratings
df_used['num_reviews'] = df_used['num_reviews'].str.replace(',', '')
df_used['num_ratings'] = df_used['num_ratings'].str.replace(',', '')
# Change num_reviews to float64 and num_ratings to float64
df_used['num_reviews'] = df_used['num_reviews'].astype('float64')
df_used['num_ratings'] = df_used['num_ratings'].astype('float64')
df_used.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5040 entries, 0 to 5039
Data columns (total 8 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   book_id          5040 non-null   object 
 1   genres           5040 non-null   object 
 2   current_readers  5040 non-null   float64
 3   wanted_to_read   5040 non-null   float64
 4   num_reviews      5040 non-null   float64
 5   num_ratings      5040 non-null   float64
 6   rating           5040 non-null   float64
 7   awards           5040 non-null   object 
dtypes: float64(5), object(3)
memory usage: 315.1+ KB


In [23]:
mlb = MultiLabelBinarizer()
for col in df_used.columns:
    if df_used[col].dtype == 'O':
        try:
            df_used[col] = df_used[col].apply(ast.literal_eval)
        except:
            continue

    if isinstance(df_used[col][0], list):
        mlb.fit(df_used[col])
        one_hot_col = mlb.transform(df_used[col])
        one_hot_df = pd.DataFrame(one_hot_col, columns=mlb.classes_)
        df_used = pd.concat([df_used, one_hot_df], axis=1)
        df_used.drop(columns=[col], axis=1)
        
# Drop columns genres, awards, primary_lists
df_used = df_used.drop(['genres', 'awards'], axis = 1)

In [24]:
df_used.head()

Unnamed: 0,book_id,current_readers,wanted_to_read,num_reviews,num_ratings,rating,12th Century,15th Century,16th Century,17th Century,...,このミステリーがすごい！ for Best Translated Mystery Novel of the Year in Japan (2009),このミステリーがすごい！ for Best Translated Mystery Novel of the Year in Japan (2014),亞洲週刊中文十大好書 (2005),亞洲週刊中文十大好書 for 小說類 (2006),本屋大賞 Nominee for Translated Fiction (2012),本屋大賞 Nominee for Translated Fiction (2013),本屋大賞 Nominee for Translated Fiction (2014),本屋大賞 Nominee for Translated Fiction (2015),本屋大賞 Nominee for Translated Fiction (2017),本屋大賞 Nominee for Translated Fiction (2020)
0,77203.The_Kite_Runner,42900.0,1000000.0,90234.0,2935385.0,4.33,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,929.Memoirs_of_a_Geisha,12300.0,793000.0,34102.0,1922540.0,4.14,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,128029.A_Thousand_Splendid_Suns,32700.0,760000.0,69431.0,1417260.0,4.42,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,19063.The_Book_Thief,86000.0,2000000.0,134883.0,2345385.0,4.39,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,4214.Life_of_Pi,24900.0,726000.0,51257.0,1544622.0,3.93,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [25]:

renamed_feature_names = []
for name in df_used.columns:
    renamed_feature_names.append(re.sub('[^0-9a-zA-Z_\-\.]+','_',name))

df_used.columns = renamed_feature_names
for column in df_used.columns:
    if column == '_':
        df_used.drop(columns=[column], axis=1)
print(len(df_used.columns))

4502


In [26]:
duplicated_columns = df_used.columns[df_used.columns.duplicated()]
# instances = df_used[duplicated_columns].value_counts()

print(duplicated_columns)
# sum the duplicated columns
for column in duplicated_columns:
    # Select the duplicated columns with the same name
    same_name_cols = df_used.filter(like=column, axis=1)
    
    # Combine the duplicated columns by adding them up
    combined_col = same_name_cols.sum(axis=1)
    
    # Clip the combined values to 1
    combined_col = combined_col.clip(0, 1)
    
    # Create a new column with the combined values
    df_used[column] = combined_col
    
    # Drop the duplicated columns
    df_used = df_used.drop(columns=same_name_cols.columns)

print(len(df_used.columns))

Index(['Romantic_Times_Reviewers_Choice_Award_RT_Award_Nominee_for_Best_Young_Adult_Paranormal_Fantasy_Novel_2009_', 'The_Kitschies_Nominee_for_Golden_Tentacle_Debut_2020_'], dtype='object')
4498


In [27]:
df_used.shape

(5040, 4498)

In [28]:
df_used.head()

Unnamed: 0,book_id,current_readers,wanted_to_read,num_reviews,num_ratings,rating,12th_Century,15th_Century,16th_Century,17th_Century,...,_for_Best_Translated_Mystery_Novel_of_the_Year_in_Japan_2009_,_for_Best_Translated_Mystery_Novel_of_the_Year_in_Japan_2014_,_2005_,_for_2006_,_Nominee_for_Translated_Fiction_2012_,_Nominee_for_Translated_Fiction_2013_,_Nominee_for_Translated_Fiction_2014_,_Nominee_for_Translated_Fiction_2015_,_Nominee_for_Translated_Fiction_2017_,_Nominee_for_Translated_Fiction_2020_
0,77203.The_Kite_Runner,42900.0,1000000.0,90234.0,2935385.0,4.33,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,929.Memoirs_of_a_Geisha,12300.0,793000.0,34102.0,1922540.0,4.14,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,128029.A_Thousand_Splendid_Suns,32700.0,760000.0,69431.0,1417260.0,4.42,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,19063.The_Book_Thief,86000.0,2000000.0,134883.0,2345385.0,4.39,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,4214.Life_of_Pi,24900.0,726000.0,51257.0,1544622.0,3.93,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [29]:
train_data = df_used.sample(frac=0.8, random_state=42)
test_data = df_used.drop(train_data.index)

train_set = lgb.Dataset(train_data.drop(columns=['rating', 'book_id']), label=train_data['rating'])
test_set = lgb.Dataset(test_data.drop(columns=['rating', 'book_id']), label=test_data['rating'])

In [30]:
params = {
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': 'rmse',
    'num_leaves': 31,
    'learning_rate': 0.05,
    'feature_fraction': 0.9,
}

In [31]:
lightgbm_model = lgb.train(params, train_set, num_boost_round=1000, valid_sets=[train_set,test_set], early_stopping_rounds=10)



You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1187
[LightGBM] [Info] Number of data points in the train set: 4032, number of used features: 168
[LightGBM] [Info] Start training from score 3.995667
[1]	training's rmse: 0.342073	valid_1's rmse: 0.34069
Training until validation scores don't improve for 10 rounds
[2]	training's rmse: 0.339793	valid_1's rmse: 0.338789
[3]	training's rmse: 0.338136	valid_1's rmse: 0.337577
[4]	training's rmse: 0.335879	valid_1's rmse: 0.335797
[5]	training's rmse: 0.333797	valid_1's rmse: 0.334142
[6]	training's rmse: 0.331891	valid_1's rmse: 0.33258
[7]	training's rmse: 0.330268	valid_1's rmse: 0.331368
[8]	training's rmse: 0.328583	valid_1's rmse: 0.330236
[9]	training's rmse: 0.327254	valid_1's rmse: 0.329291
[10]	training's rmse: 0.325882	valid_1's rmse: 0.328083
[11]	training's rmse: 0.324451	valid_1's rmse: 0.326946
[12]	training's rmse: 0.323073	v