In [176]:
import pymongo
import pandas as pd
import numpy as npp
import dotenv
import os
from data_importer import DataImporter
import lightgbm as lgb
from sklearn.preprocessing import MultiLabelBinarizer
import ast
import re

In [177]:
# Import cleaned data from csv
df = pd.read_csv('processed_books.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5040 entries, 0 to 5039
Data columns (total 17 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   book_id          5040 non-null   object 
 1   title            5040 non-null   object 
 2   author           5040 non-null   object 
 3   price            5040 non-null   float64
 4   genres           5040 non-null   object 
 5   language         5040 non-null   object 
 6   series           5040 non-null   int64  
 7   publisher        4733 non-null   object 
 8   year_published   4987 non-null   object 
 9   description      4966 non-null   object 
 10  current_readers  5040 non-null   float64
 11  wanted_to_read   5040 non-null   float64
 12  num_reviews      5040 non-null   object 
 13  num_ratings      5040 non-null   object 
 14  rating           5040 non-null   float64
 15  awards           5040 non-null   object 
 16  primary_lists    5040 non-null   object 
dtypes: float64(4),

In [178]:
# Feature Selection
# Drop price, auther, title, description, series, publisher, year_published, language
df_used = df.drop(['price', 'author', 'title', 'description', 'series', 'publisher', 'year_published', 'language'], axis = 1)
# Remove commas from num_reviews and num_ratings
df_used['num_reviews'] = df_used['num_reviews'].str.replace(',', '')
df_used['num_ratings'] = df_used['num_ratings'].str.replace(',', '')
# Change num_reviews to float64 and num_ratings to float64
df_used['num_reviews'] = df_used['num_reviews'].astype('float64')
df_used['num_ratings'] = df_used['num_ratings'].astype('float64')
df_used.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5040 entries, 0 to 5039
Data columns (total 9 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   book_id          5040 non-null   object 
 1   genres           5040 non-null   object 
 2   current_readers  5040 non-null   float64
 3   wanted_to_read   5040 non-null   float64
 4   num_reviews      5040 non-null   float64
 5   num_ratings      5040 non-null   float64
 6   rating           5040 non-null   float64
 7   awards           5040 non-null   object 
 8   primary_lists    5040 non-null   object 
dtypes: float64(5), object(4)
memory usage: 354.5+ KB


In [179]:
mlb = MultiLabelBinarizer()
for col in df_used.columns:
    if df_used[col].dtype == 'O':
        try:
            df_used[col] = df_used[col].apply(ast.literal_eval)
        except:
            continue

    if isinstance(df_used[col][0], list):
        mlb.fit(df_used[col])
        one_hot_col = mlb.transform(df_used[col])
        one_hot_df = pd.DataFrame(one_hot_col, columns=mlb.classes_)
        df_used = pd.concat([df_used, one_hot_df], axis=1)
        df_used.drop(columns=[col], axis=1)
        
# Drop columns genres, awards, primary_lists
df_used = df_used.drop(['genres', 'awards', 'primary_lists'], axis = 1)

In [180]:

renamed_feature_names = []
for name in df_used.columns:
    renamed_feature_names.append(re.sub('[^0-9a-zA-Z_\-\.]+','_',name))

df_used.columns = renamed_feature_names
for column in df_used.columns:
    if column == '_':
        df_used.drop(columns=[column], axis=1)
print(len(df_used.columns))

10277


In [181]:
duplicated_columns = df_used.columns[df_used.columns.duplicated()]
# instances = df_used[duplicated_columns].value_counts()

print(duplicated_columns)
# sum the duplicated columns
for column in duplicated_columns:
    # Select the duplicated columns with the same name
    same_name_cols = df_used.filter(like=column, axis=1)
    
    # Combine the duplicated columns by adding them up
    combined_col = same_name_cols.sum(axis=1)
    
    # Clip the combined values to 1
    combined_col = combined_col.clip(0, 1)
    
    # Create a new column with the combined values
    df_used[column] = combined_col
    
    # Drop the duplicated columns
    df_used = df_used.drop(columns=same_name_cols.columns)

print(len(df_used.columns))

Index(['Romantic_Times_Reviewers_Choice_Award_RT_Award_Nominee_for_Best_Young_Adult_Paranormal_Fantasy_Novel_2009_',
       'The_Kitschies_Nominee_for_Golden_Tentacle_Debut_2020_',
       'Alternate_History', 'American', 'Anthologies', 'Art_History', 'Canada',
       'Chinese_Literature', 'Cities', 'Dragons', 'Epic_Fantasy', 'Espionage',
       'European_History', 'Fantasy_Romance', 'Ghost_Stories',
       'Greek_Mythology', 'India', 'Italy', 'Japanese_Literature',
       'Military_Science_Fiction', 'Mystery', 'Nordic_Noir', 'Nursing',
       'Occult', 'Physics', 'Poetry', 'Rabbits', 'Race', 'Romantic_Suspense',
       'Space_Opera', 'Sword_and_Sorcery', 'Tasmania', 'Thelema', 'Theosophy',
       'True_Crime', 'Young_Adult_Romance', '_'],
      dtype='object')
421


  combined_col = same_name_cols.sum(axis=1)


In [182]:
df_used.shape

(5040, 421)

In [183]:
train_data = df_used.sample(frac=0.8, random_state=42)
test_data = df_used.drop(train_data.index)

train_set = lgb.Dataset(train_data.drop(columns=['rating', 'book_id']), label=train_data['rating'])
test_set = lgb.Dataset(test_data.drop(columns=['rating', 'book_id']), label=test_data['rating'])

KeyError: "['book_id'] not found in axis"

In [None]:
params = {
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': 'rmse',
    'num_leaves': 31,
    'learning_rate': 0.05,
    'feature_fraction': 0.9,
}

In [None]:
lightgbm_model = lgb.train(params, train_set, num_boost_round=1000, valid_sets=[train_set,test_set], early_stopping_rounds=10)



LightGBMError: Feature (Romantic_Times_Reviewers_Choice_Award_RT_Award_Nominee_for_Best_Young_Adult_Paranormal_Fantasy_Novel_2009_) appears more than one time.

In [None]:
book_list = 