In [93]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.cluster import KMeans
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.decomposition import TruncatedSVD
from sklearn.linear_model import LogisticRegression
import xgboost as xgb
from sklearn.metrics.pairwise import cosine_similarity
import time
import gc  
import pprint
from rich import print
import plotly.express as px
from langdetect import detect, DetectorFactory
from sklearn.neighbors import KNeighborsClassifier
from tabulate import tabulate
import joblib
import warnings
warnings.filterwarnings("ignore")


## ===== PHASE 1: DATA LOADING & PREPROCESSING =====


In [94]:
start_time = time.time()

### Loading Data

In [95]:
# Local import keeps this cell self-contained (later cells in this notebook
# re-import their dependencies the same way).
import os

# Location of the raw Goodreads CSV. Set the GOODREADS_CSV environment variable
# to run the notebook on another machine; the default is the original local path.
DATA_PATH = os.environ.get(
    "GOODREADS_CSV",
    "/Users/whoseunassailable/Documents/coding_projects/college_projects/data/raw/good_read_books_100k.csv",
)
df = pd.read_csv(DATA_PATH, encoding='utf-8')


In [96]:
df.head(5)

Unnamed: 0,author,bookformat,desc,genre,img,isbn,isbn13,link,pages,rating,reviews,title,totalratings
0,Laurence M. Hauptman,Hardcover,Reveals that several hundred thousand Indians ...,"History,Military History,Civil War,American Hi...",https://i.gr-assets.com/images/S/compressed.ph...,002914180X,9780000000000.0,https://goodreads.com/book/show/1001053.Betwee...,0,3.52,5,Between Two Fires: American Indians in the Civ...,33
1,"Charlotte Fiell,Emmanuelle Dirix",Paperback,Fashion Sourcebook - 1920s is the first book i...,"Couture,Fashion,Historical,Art,Nonfiction",https://i.gr-assets.com/images/S/compressed.ph...,1906863482,9780000000000.0,https://goodreads.com/book/show/10010552-fashi...,576,4.51,6,Fashion Sourcebook 1920s,41
2,Andy Anderson,Paperback,The seminal history and analysis of the Hungar...,"Politics,History",https://i.gr-assets.com/images/S/compressed.ph...,948984147,9780000000000.0,https://goodreads.com/book/show/1001077.Hungar...,124,4.15,2,Hungary 56,26
3,Carlotta R. Anderson,Hardcover,"""All-American Anarchist"" chronicles the life a...","Labor,History",https://i.gr-assets.com/images/S/compressed.ph...,814327079,9780000000000.0,https://goodreads.com/book/show/1001079.All_Am...,324,3.83,1,All-American Anarchist: Joseph A. Labadie and ...,6
4,Jean Leveille,,"Aujourdâ€™hui, lâ€™oiseau nous invite Ã sa ta...",,https://i.gr-assets.com/images/S/compressed.ph...,2761920813,,https://goodreads.com/book/show/10010880-les-o...,177,4.0,1,Les oiseaux gourmands,1


In [97]:
df.columns

Index(['author', 'bookformat', 'desc', 'genre', 'img', 'isbn', 'isbn13',
       'link', 'pages', 'rating', 'reviews', 'title', 'totalratings'],
      dtype='object')

In [98]:
df.describe()

Unnamed: 0,pages,rating,reviews,totalratings
count,100000.0,100000.0,100000.0,100000.0
mean,255.01024,3.833055,181.52845,2990.764
std,367.913582,0.621237,1449.451229,36353.38
min,0.0,0.0,0.0,0.0
25%,135.0,3.66,3.0,31.0
50%,240.0,3.91,15.0,146.0
75%,336.0,4.14,67.0,744.0
max,70000.0,5.0,158776.0,3819326.0


In [99]:
# Columns the modelling cells below rely on.
# NOTE(review): cols_to_use is only defined here; none of the visible cells
# apply it to df or pass it to read_csv, so every column is still kept in memory.
cols_to_use = ['author', 'bookformat', 'genre', 'pages', 'rating', 'reviews', 'title', 'totalratings']
print(f"Original shape: {df.shape}")

In [100]:
df = df.dropna(subset=['rating', 'genre', 'author'])
print(f"Shape after dropping NAs: {df.shape}")

In [101]:
df.columns

Index(['author', 'bookformat', 'desc', 'genre', 'img', 'isbn', 'isbn13',
       'link', 'pages', 'rating', 'reviews', 'title', 'totalratings'],
      dtype='object')

In [102]:
df.describe()

Unnamed: 0,pages,rating,reviews,totalratings
count,89533.0,89533.0,89533.0,89533.0
mean,259.271174,3.890213,202.626897,3339.266
std,338.807842,0.385779,1530.447003,38404.55
min,0.0,0.0,0.0,0.0
25%,144.0,3.68,5.0,52.0
50%,242.0,3.91,20.0,200.0
75%,338.0,4.13,81.0,918.0
max,70000.0,5.0,158776.0,3819326.0


In [103]:
# Initialize tracking dictionaries
model_metrics = {}
model_training_time = {}

In [104]:
# Shrink the frame: count-like columns become 32-bit integers (missing
# values are filled with 0 first) and the rating becomes a 32-bit float.
for count_col in ('pages', 'reviews', 'totalratings'):
    df[count_col] = df[count_col].fillna(0).astype('int32')
df['rating'] = df['rating'].astype('float32')

In [105]:
df.head()

Unnamed: 0,author,bookformat,desc,genre,img,isbn,isbn13,link,pages,rating,reviews,title,totalratings
0,Laurence M. Hauptman,Hardcover,Reveals that several hundred thousand Indians ...,"History,Military History,Civil War,American Hi...",https://i.gr-assets.com/images/S/compressed.ph...,002914180X,9780000000000.0,https://goodreads.com/book/show/1001053.Betwee...,0,3.52,5,Between Two Fires: American Indians in the Civ...,33
1,"Charlotte Fiell,Emmanuelle Dirix",Paperback,Fashion Sourcebook - 1920s is the first book i...,"Couture,Fashion,Historical,Art,Nonfiction",https://i.gr-assets.com/images/S/compressed.ph...,1906863482,9780000000000.0,https://goodreads.com/book/show/10010552-fashi...,576,4.51,6,Fashion Sourcebook 1920s,41
2,Andy Anderson,Paperback,The seminal history and analysis of the Hungar...,"Politics,History",https://i.gr-assets.com/images/S/compressed.ph...,948984147,9780000000000.0,https://goodreads.com/book/show/1001077.Hungar...,124,4.15,2,Hungary 56,26
3,Carlotta R. Anderson,Hardcover,"""All-American Anarchist"" chronicles the life a...","Labor,History",https://i.gr-assets.com/images/S/compressed.ph...,814327079,9780000000000.0,https://goodreads.com/book/show/1001079.All_Am...,324,3.83,1,All-American Anarchist: Joseph A. Labadie and ...,6
5,Jeffrey Pfeffer,Hardcover,Why is common sense so uncommon when it comes ...,"Business,Leadership,Romance,Historical Romance...",https://i.gr-assets.com/images/S/compressed.ph...,875848419,9780000000000.0,https://goodreads.com/book/show/1001090.The_Hu...,368,3.73,7,The Human Equation: Building Profits by Puttin...,119


In [106]:
# Create more informative features
# log1p keeps zero counts valid (log1p(0) == 0) while compressing the very
# large page / review / rating counts seen in df.describe().
df['log_pages'] = np.log1p(df['pages']).astype('float32')
df['log_reviews'] = np.log1p(df['reviews']).astype('float32')
df['log_totalratings'] = np.log1p(df['totalratings']).astype('float32')
# Rating weighted by the log of the number of ratings.
df['popularity_score'] = df['rating'] * np.log1p(df['totalratings']).astype('float32')
# Reviews per rating; the +1 keeps the denominator non-zero when totalratings is 0.
df['review_ratio'] = (df['reviews'] / (df['totalratings'] + 1)).astype('float32')

In [107]:
len(df.columns)

18

In [108]:
# # One-hot encode genre (limited to top genres to save memory)
# genre_counts = df['genre'].value_counts().head(20)

In [109]:
# top_genres = genre_counts.index.tolist()
# df['top_genre'] = df['genre'].apply(lambda x: x if x in top_genres else 'Other')
# genre_ohe = pd.get_dummies(df['top_genre'], prefix='genre', sparse=True)
# df = pd.concat([df, genre_ohe], axis=1)

In [110]:
# Target variable
target = (df['rating'] >= 4.0).astype('int8')
print(f"Target distribution: {np.bincount(target)}")

In [111]:
df.isna().sum()

author                  0
bookformat           2324
desc                 4048
genre                   0
img                  1224
isbn                12624
isbn13               9789
link                    0
pages                   0
rating                  0
reviews                 0
title                   1
totalratings            0
log_pages               0
log_reviews             0
log_totalratings        0
popularity_score        0
review_ratio            0
dtype: int64

In [112]:
df['title']

0        Between Two Fires: American Indians in the Civ...
1                                 Fashion Sourcebook 1920s
2                                               Hungary 56
3        All-American Anarchist: Joseph A. Labadie and ...
5        The Human Equation: Building Profits by Puttin...
                               ...                        
99993                                       The Sea Inside
99994                                    A Horse for Angel
99997    A Faith Worth Sharing: A Lifetime of Conversat...
99998    A Volcano Beneath the Snow: John Brown's War A...
99999    Paranormal Nation: Why America Needs Ghosts, U...
Name: title, Length: 89533, dtype: object

In [113]:
# Sample: apply language detection
def detect_language(text):
    """Return the language code that ``detect`` assigns to ``text``.

    Parameters
    ----------
    text : str
        Text to classify (in this notebook: a book title).

    Returns
    -------
    str
        The code returned by ``detect``, or ``'unknown'`` when ``text`` is not
        a non-blank string or when detection raises.
    """
    # Missing titles arrive as NaN (a float); answer directly instead of
    # depending on the detector to raise for them.
    if not isinstance(text, str) or not text.strip():
        return 'unknown'
    try:
        return detect(text)
    except Exception:
        # `except Exception` (not a bare except) so KeyboardInterrupt and
        # SystemExit still stop a long-running .apply().
        return 'unknown'


In [114]:
# Detect languages
# langdetect is probabilistic: without a fixed seed the same title can be
# labelled differently between runs, which changes which rows get dropped
# by the language filter further down.
DetectorFactory.seed = 0
df['language'] = df['title'].apply(detect_language)
df['language'].value_counts()


language
en         67549
de          3694
nl          1429
da          1428
pt          1363
fr          1344
it          1140
id          1118
ro           984
tl           940
af           923
no           832
es           720
ca           681
tr           517
et           508
fi           501
sv           479
cy           444
pl           370
sw           365
so           326
vi           278
hu           260
sl           235
lt           230
hr           212
unknown      144
sk           138
sq           130
cs           124
zh-cn         68
lv            42
ko            17
Name: count, dtype: int64

In [115]:
# Treat language codes detected on fewer than 100 titles as suspicious
suspicious_langs = df['language'].value_counts()
suspicious_langs = suspicious_langs[suspicious_langs < 100].index.tolist()

# Filter rows with suspicious language codes
suspicious_titles = df[df['language'].isin(suspicious_langs)]

# Show the first 5 of them for a quick sanity check
print(suspicious_titles[['title', 'language']].head(5))


In [116]:

# Step 2: Drop rows where the language is in the suspicious list
df = df[~df['language'].isin(suspicious_langs)].reset_index(drop=True)

# ===== PHASE 2: ENHANCED FEATURE ENGINEERING =====

#### Columns before One-hot encoding

In [117]:
df.columns

Index(['author', 'bookformat', 'desc', 'genre', 'img', 'isbn', 'isbn13',
       'link', 'pages', 'rating', 'reviews', 'title', 'totalratings',
       'log_pages', 'log_reviews', 'log_totalratings', 'popularity_score',
       'review_ratio', 'language'],
      dtype='object')

In [118]:
# ===== PHASE 2: ENHANCED FEATURE ENGINEERING =====
print("Performing feature engineering...")
# Encode categorical features as integer ids (one encoder per column so each
# mapping can be inverted later).
genre_encoder = LabelEncoder()
author_encoder = LabelEncoder()
format_encoder = LabelEncoder()

df['genre_encoded'] = genre_encoder.fit_transform(df['genre'])
df['author_encoded'] = author_encoder.fit_transform(df['author'])
# bookformat still contains missing values at this point; give them a label.
df['bookformat'] = df['bookformat'].fillna('Unknown')
df['format_encoded'] = format_encoder.fit_transform(df['bookformat'])

# Create more informative features
df['log_pages'] = np.log1p(df['pages']).astype('float32')
df['log_reviews'] = np.log1p(df['reviews']).astype('float32')
df['log_totalratings'] = np.log1p(df['totalratings']).astype('float32')
df['popularity_score'] = df['rating'] * np.log1p(df['totalratings']).astype('float32')
df['review_ratio'] = (df['reviews'] / (df['totalratings'] + 1)).astype('float32')

# One-hot encode genre (limited to top genres to save memory)
genre_counts = df['genre'].value_counts().head(20)
top_genres = genre_counts.index.tolist()
# Keep the 20 most frequent genre strings and bucket everything else as 'Other'.
df['top_genre'] = df['genre'].where(df['genre'].isin(top_genres), 'Other')
genre_ohe = pd.get_dummies(df['top_genre'], prefix='genre', sparse=True)
# Re-running this cell must not append a second copy of every dummy column
# (duplicate column names would then leak into the feature matrix).
df = df.drop(columns=genre_ohe.columns, errors='ignore')
df = pd.concat([df, genre_ohe], axis=1)

# Target variable: 1 when the average rating is at least 4.0, else 0
target = (df['rating'] >= 4.0).astype('int8')
print(f"Target distribution: {np.bincount(target)}")


#### Columns after One-hot encoding

In [119]:
len(df.columns)

44

In [120]:
# ===== PHASE 3: PREPARE TRAINING DATA =====
print("Preparing training data...")
# Select features with better predictive power
# NOTE(review): popularity_score is rating * log1p(totalratings) and
# log_totalratings is also in this list, so the target (rating >= 4.0) can be
# reconstructed from these two columns. Confirm whether this leakage is
# intended before trusting the reported accuracies.
numerical_features = [
    'log_pages', 'log_reviews', 'log_totalratings', 
    'popularity_score', 'review_ratio'
]

# Integer ids produced by the LabelEncoders in the feature-engineering cell.
categorical_features = ['author_encoded', 'format_encoded']

In [121]:
# Combine all features, including one-hot encoded genres
feature_cols = numerical_features + categorical_features + list(genre_ohe.columns)
features = df[feature_cols].copy()

In [122]:
# Split the data
X_train, X_test, y_train, y_test = train_test_split(
    features, target, test_size=0.2, random_state=42, stratify=target
)

In [123]:
# Scale numerical features only
# The scaler is fitted on the training split and then applied unchanged to the
# test split, so test-set statistics do not influence the scaling.
scaler = StandardScaler()
X_train[numerical_features] = scaler.fit_transform(X_train[numerical_features])
X_test[numerical_features] = scaler.transform(X_test[numerical_features])

In [124]:
# Memory cleanup
del genre_ohe
gc.collect()

print(f"Training data shape: {X_train.shape}")
print(f"Test data shape: {X_test.shape}")

In [125]:
# ===== MODEL 1: XGBOOST WITH EPOCHS =====
print("\n=== Training XGBoost Model ===")
start_time = time.time()

#### Function to print metrics (reusable)

In [126]:
def print_metrics(y_true, y_pred, model_name):
    """Score one set of predictions, print a one-line summary and return the scores.

    Parameters
    ----------
    y_true : true labels.
    y_pred : predicted labels.
    model_name : str
        Label placed at the start of the printed line.

    Returns
    -------
    dict
        Keys ``"accuracy"``, ``"precision"``, ``"recall"`` and ``"f1"``.
        Precision, recall and F1 are computed with ``zero_division=0``.
    """
    scores = {
        "accuracy": accuracy_score(y_true, y_pred),
        "precision": precision_score(y_true, y_pred, zero_division=0),
        "recall": recall_score(y_true, y_pred, zero_division=0),
        "f1": f1_score(y_true, y_pred, zero_division=0),
    }
    print(
        f"{model_name} - Accuracy: {scores['accuracy']:.4f}, "
        f"Precision: {scores['precision']:.4f}, "
        f"Recall: {scores['recall']:.4f}, "
        f"F1: {scores['f1']:.4f}"
    )
    return scores

In [127]:
import xgboost as xgb
from tabulate import tabulate

def train_xgboost_with_epochs(X_train, y_train, X_test, y_test, epochs_list=[3, 10, 50, 100]):
    """Train one XGBoost classifier per tree count and keep the most accurate.

    Parameters
    ----------
    X_train, y_train : training features and binary labels.
    X_test, y_test : held-out features and labels used to score every model.
    epochs_list : iterable of int
        Values passed as ``n_estimators``; one model is trained per value.
        The default list is only read, never mutated.

    Returns
    -------
    tuple
        ``(best_model, results)``. ``results`` maps each ``n_estimators``
        value to the dict returned by ``print_metrics``; ``best_model`` is the
        fitted classifier with the highest test accuracy (``None`` only when
        ``epochs_list`` is empty).
    """
    results = {}
    best_accuracy = 0
    best_model = None
    best_epoch = 0

    print("\nTraining and Evaluating XGBoost Models...\n")

    for epochs in epochs_list:
        print(f"Training XGBoost with {epochs} estimators...")
        model = xgb.XGBClassifier(
            n_estimators=epochs,
            learning_rate=0.1,
            max_depth=4,
            subsample=0.8,
            colsample_bytree=0.8,
            eval_metric='logloss',
            use_label_encoder=False,
            random_state=42
        )

        # The eval_set only records log-loss per boosting round; no early
        # stopping is configured, so it does not change the fitted model.
        eval_set = [(X_test, y_test)]
        model.fit(X_train, y_train, eval_set=eval_set, verbose=False)

        # Evaluate
        y_pred = model.predict(X_test)
        metrics = print_metrics(y_test, y_pred, f"XGBoost ({epochs} epochs)")
        results[epochs] = metrics

        # Track best; the `is None` check guarantees a model is returned even
        # if every candidate scores an accuracy of 0.
        if best_model is None or metrics["accuracy"] > best_accuracy:
            best_accuracy = metrics["accuracy"]
            best_model = model
            best_epoch = epochs

    # Tabulate the results. The metrics dicts store F1 under "f1"; the earlier
    # "f1_score" lookup silently fell back to 0 for every row.
    metric_keys = ["accuracy", "precision", "recall", "f1"]
    table = [
        [epochs] + [round(metrics[key], 4) for key in metric_keys]
        for epochs, metrics in results.items()
    ]

    headers = ["Epochs", "Accuracy", "Precision", "Recall", "F1 Score"]
    print(tabulate(table, headers=headers, tablefmt="grid"))

    print(f"\n✅ Best XGBoost model: {best_epoch} epochs with accuracy {best_accuracy:.4f}")
    return best_model, results


In [128]:
# Train XGBoost with different epochs
xgb_model, xgb_results = train_xgboost_with_epochs(
    X_train, y_train, X_test, y_test, 
    epochs_list=[3, 10, 50, 100, 200]
)

In [129]:
# Store metrics
model_metrics["XGBoost"] = xgb_results
model_training_time["XGBoost"] = time.time() - start_time

In [130]:

# Memory cleanup
gc.collect()

35

In [131]:
# ===== MODEL 2: KMEANS CLUSTERING =====
print("\n=== Training KMeans Model ===")
start_time = time.time()

In [132]:

def train_kmeans_with_different_clusters(X_train, y_train, X_test, y_test, clusters_list=[3, 5, 8, 10]):
    """Use KMeans as a classifier by labelling each cluster with its majority class.

    For every value in ``clusters_list`` a KMeans model is fitted on the first
    (at most) 10,000 training rows, each cluster is mapped to the majority
    label of its training members, and test rows are classified by the label
    of the cluster they fall into.

    Parameters
    ----------
    X_train : pandas.DataFrame
        Training features (sliced positionally with ``.iloc``).
    y_train : array-like
        Binary (0/1) training labels aligned row-by-row with ``X_train``.
    X_test, y_test : held-out features and labels used for scoring.
    clusters_list : iterable of int
        Cluster counts to try. The default list is only read, never mutated.

    Returns
    -------
    tuple
        ``(best_model, results)``. ``results`` maps each cluster count to the
        dict returned by ``print_metrics``; ``best_model`` is the fitted
        KMeans with the highest test accuracy. The cluster-to-label mapping
        is not part of the return value.

    Raises
    ------
    ValueError
        If ``clusters_list`` is empty (raised by ``max`` on empty results).
    """
    results = {}
    fitted_models = {}
    sample_size = min(10000, len(X_train))
    X_train_sample = X_train.iloc[:sample_size].copy()
    # Take the labels by position so they line up with the .iloc feature slice
    # regardless of the (shuffled) index that train_test_split leaves behind.
    y_train_sample = np.asarray(y_train)[:sample_size]

    print("\nTraining and Evaluating KMeans Models...\n")
    for n_clusters in clusters_list:
        kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
        kmeans.fit(X_train_sample)
        fitted_models[n_clusters] = kmeans

        # Assign cluster to each point
        train_clusters = kmeans.predict(X_train_sample)

        # For each non-empty cluster, determine the majority class
        cluster_to_label = {}
        for cluster in range(n_clusters):
            labels = y_train_sample[train_clusters == cluster]
            if len(labels) > 0:
                cluster_to_label[cluster] = 1 if labels.sum() > len(labels) / 2 else 0

        # Predict using cluster assignments (clusters never seen in training map to 0)
        test_clusters = kmeans.predict(X_test)
        y_pred = np.array([cluster_to_label.get(cluster, 0) for cluster in test_clusters])

        # Evaluate
        metrics = print_metrics(y_test, y_pred, f"KMeans ({n_clusters} clusters)")
        results[n_clusters] = metrics

    # Tabulate the results. The metrics dicts store F1 under "f1"; the earlier
    # "f1_score" lookup silently fell back to 0 for every row.
    metric_keys = ["accuracy", "precision", "recall", "f1"]
    table = [
        [n_clusters] + [round(metrics[key], 4) for key in metric_keys]
        for n_clusters, metrics in results.items()
    ]

    headers = ["Clusters", "Accuracy", "Precision", "Recall", "F1 Score"]
    print(tabulate(table, headers=headers, tablefmt="grid"))

    # Find best number of clusters
    best_n_clusters = max(results.items(), key=lambda x: x[1]["accuracy"])[0]
    print(f"\nBest KMeans model: {best_n_clusters} clusters with accuracy {results[best_n_clusters]['accuracy']:.4f}")

    # Reuse the model fitted in the loop: refitting with the same data and
    # random_state would only repeat the same work.
    best_model = fitted_models[best_n_clusters]
    return best_model, results

In [133]:
# Train KMeans with different numbers of clusters
kmeans_model, kmeans_results = train_kmeans_with_different_clusters(
    X_train, y_train, X_test, y_test,
    clusters_list=[3, 5, 8, 10, 15]
)

In [134]:
# Store metrics
model_metrics["KMeans"] = kmeans_results
model_training_time["KMeans"] = time.time() - start_time

In [135]:
# Memory cleanup
gc.collect()

0

In [136]:
# ===== MODEL 3: KNN WITH DIFFERENT K VALUES =====
print("\n=== Training KNN Model ===")
start_time = time.time()

In [137]:

# Function to train KNN with different K values and display results using tabulate
def train_knn_with_different_k(X_train, y_train, X_test, y_test, k_values=[1, 3, 5, 7, 10]):
    """Fit a KNN classifier per ``k`` on a training sample and keep the most accurate.

    Parameters
    ----------
    X_train : pandas.DataFrame
        Training features (sliced positionally with ``.iloc``).
    y_train : array-like
        Training labels aligned row-by-row with ``X_train``.
    X_test, y_test : held-out features and labels used for scoring.
    k_values : iterable of int
        Neighbour counts to try. The default list is only read, never mutated.

    Returns
    -------
    tuple
        ``(best_model, results)``. ``results`` maps each ``k`` to the dict
        returned by ``print_metrics``; ``best_model`` is the fitted classifier
        with the highest test accuracy (``None`` only when ``k_values`` is empty).
    """
    results = {}
    best_accuracy = 0
    best_model = None
    best_k = 0

    # Use a smaller sample for memory efficiency. Labels are taken by position
    # so they line up with the .iloc feature slice whatever the index is.
    sample_size = min(10000, len(X_train))
    X_train_sample = X_train.iloc[:sample_size].copy()
    y_train_sample = np.asarray(y_train)[:sample_size]

    print("\nTraining and Evaluating KNN Models...\n")
    for k in k_values:
        model = KNeighborsClassifier(n_neighbors=k)
        model.fit(X_train_sample, y_train_sample)

        # Evaluate on test data
        y_pred = model.predict(X_test)
        metrics = print_metrics(y_test, y_pred, f"KNN (k={k})")
        results[k] = metrics

        # Track best model; the `is None` check guarantees a model is returned
        # even if every candidate scores an accuracy of 0.
        if best_model is None or metrics["accuracy"] > best_accuracy:
            best_accuracy = metrics["accuracy"]
            best_model = model
            best_k = k

    # Tabulate the results. The metrics dicts store F1 under "f1"; the earlier
    # "f1_score" lookup silently fell back to 0 for every row.
    metric_keys = ["accuracy", "precision", "recall", "f1"]
    table = [
        [k] + [round(metrics[key], 4) for key in metric_keys]
        for k, metrics in results.items()
    ]

    headers = ["k", "Accuracy", "Precision", "Recall", "F1 Score"]
    print(tabulate(table, headers=headers, tablefmt="grid"))

    print(f"\nBest KNN model: k={best_k} with accuracy {best_accuracy:.4f}")
    return best_model, results


In [138]:
# Train KNN with different K values
knn_model, knn_results = train_knn_with_different_k(
    X_train, y_train, X_test, y_test,
    k_values=[1, 3, 5, 7, 11, 15]
)

In [139]:
# Store metrics
model_metrics["KNN"] = knn_results
model_training_time["KNN"] = time.time() - start_time

In [140]:
# Memory cleanup
gc.collect()

0

In [141]:
# ===== MODEL 4: SVD DIMENSIONALITY REDUCTION =====
print("\n=== Training SVD Model ===")
start_time = time.time()

In [142]:
from sklearn.decomposition import TruncatedSVD
from sklearn.linear_model import LogisticRegression
from tabulate import tabulate
import numpy as np

def train_svd_with_different_components(X_train, y_train, X_test, y_test, n_components_list=None):
    """Reduce features with TruncatedSVD, classify with logistic regression, keep the best.

    Parameters
    ----------
    X_train, y_train : training features and labels.
    X_test, y_test : held-out features and labels used for scoring.
    n_components_list : iterable of int, optional
        Component counts to try. Values that are not strictly below
        ``min(n_rows, n_columns) - 1`` are dropped. When omitted, a default
        list is derived from that maximum.

    Returns
    -------
    tuple
        ``(best_model, results)``. ``results`` maps each component count to
        the dict returned by ``print_metrics``; ``best_model`` is the
        ``(svd, classifier)`` pair with the highest test accuracy, or ``None``
        when no component count was left to try.
    """
    # Get the maximum number of components possible
    max_components = min(X_train.shape[0], X_train.shape[1]) - 1

    if n_components_list is None:
        if max_components <= 5:
            n_components_list = [2, max_components]
        elif max_components <= 10:
            n_components_list = [2, 5, max_components]
        elif max_components <= 20:
            n_components_list = [2, 5, 10, max_components]
        else:
            n_components_list = [2, 5, 10, max(15, max_components // 4), max(20, max_components // 2)]
    else:
        n_components_list = [n for n in n_components_list if n < max_components]

    print(f"Testing SVD with components: {n_components_list} (max possible: {max_components})")

    results = {}
    best_accuracy = 0
    best_model = None
    best_n_components = 0

    for n_components in n_components_list:
        print(f"\nTraining SVD with {n_components} components...")

        # Dimensionality reduction (fitted on train, applied to test)
        svd = TruncatedSVD(n_components=n_components, random_state=42)
        X_train_svd = svd.fit_transform(X_train)
        X_test_svd = svd.transform(X_test)

        # Train classifier on the reduced representation
        clf = LogisticRegression(max_iter=1000, random_state=42)
        clf.fit(X_train_svd, y_train)

        # Evaluate
        y_pred = clf.predict(X_test_svd)
        metrics = print_metrics(y_test, y_pred, f"SVD ({n_components} components) + LR")
        results[n_components] = metrics

        # Track best; the `is None` check guarantees a pair is returned even
        # if every candidate scores an accuracy of 0.
        if best_model is None or metrics["accuracy"] > best_accuracy:
            best_accuracy = metrics["accuracy"]
            best_model = (svd, clf)
            best_n_components = n_components

    # Tabulate the results. The metrics dicts store F1 under "f1"; the earlier
    # "f1_score" lookup silently fell back to 0 for every row.
    metric_keys = ["accuracy", "precision", "recall", "f1"]
    table = [
        [n_components] + [round(metrics[key], 4) for key in metric_keys]
        for n_components, metrics in results.items()
    ]

    headers = ["Components", "Accuracy", "Precision", "Recall", "F1 Score"]
    print(tabulate(table, headers=headers, tablefmt="grid"))

    print(f"\nBest SVD model: {best_n_components} components with accuracy {best_accuracy:.4f}")
    return best_model, results


In [143]:
# Train SVD with different components
svd_models, svd_results = train_svd_with_different_components(
    X_train, y_train, X_test, y_test,
    n_components_list=[2, 5, 10, 15, 20, 25]  # Reduced max components to 25
)


In [144]:
# Store metrics
model_metrics["SVD"] = svd_results
model_training_time["SVD"] = time.time() - start_time

In [145]:
# Memory cleanup
gc.collect()

0

In [146]:
# ===== MODEL 5: LOGISTIC REGRESSION WITH DIFFERENT REGULARIZATION =====
print("\n=== Training Logistic Regression Model ===")
start_time = time.time()

In [147]:
from sklearn.linear_model import LogisticRegression
from tabulate import tabulate

def train_logreg_with_different_params(X_train, y_train, X_test, y_test):
    """Fit logistic regression for a fixed grid of ``C`` values and keep the most accurate.

    Parameters
    ----------
    X_train, y_train : training features and labels.
    X_test, y_test : held-out features and labels used for scoring.

    Returns
    -------
    tuple
        ``(best_model, results)``. ``results`` maps each ``C`` value to the
        dict returned by ``print_metrics``; ``best_model`` is the fitted model
        with the highest test accuracy.
    """
    results = {}
    best_accuracy = 0
    best_model = None
    best_params = None

    # Inverse regularisation strengths to compare, all with the liblinear solver.
    param_grid = [
        {'C': 0.01, 'solver': 'liblinear'},
        {'C': 0.1, 'solver': 'liblinear'},
        {'C': 1.0, 'solver': 'liblinear'},
        {'C': 10.0, 'solver': 'liblinear'},
        {'C': 100.0, 'solver': 'liblinear'}
    ]

    print("\nTraining and Evaluating Logistic Regression Models...\n")
    for params in param_grid:
        c_val = params['C']
        solver = params['solver']
        model = LogisticRegression(C=c_val, solver=solver, max_iter=1000, random_state=42)
        model.fit(X_train, y_train)

        # Evaluate
        y_pred = model.predict(X_test)
        metrics = print_metrics(y_test, y_pred, f"LogReg (C={c_val})")
        results[c_val] = metrics

        # Track best model. Without the `is None` check, a run where every
        # accuracy is 0 left best_params as None and the summary line below
        # failed on best_params['C'].
        if best_model is None or metrics["accuracy"] > best_accuracy:
            best_accuracy = metrics["accuracy"]
            best_model = model
            best_params = params

    # Tabulate the results. The metrics dicts store F1 under "f1"; the earlier
    # "f1_score" lookup silently fell back to 0 for every row.
    metric_keys = ["accuracy", "precision", "recall", "f1"]
    table = [
        [c_val] + [round(metrics[key], 4) for key in metric_keys]
        for c_val, metrics in results.items()
    ]

    headers = ["C", "Accuracy", "Precision", "Recall", "F1 Score"]
    print(tabulate(table, headers=headers, tablefmt="grid"))

    print(f"\nBest Logistic Regression model: C={best_params['C']} with accuracy {best_accuracy:.4f}")
    return best_model, results


In [148]:
# Train Logistic Regression with different parameters
logreg_model, logreg_results = train_logreg_with_different_params(
    X_train, y_train, X_test, y_test
)

In [149]:
# Store metrics
model_metrics["LogisticRegression"] = logreg_results
model_training_time["LogisticRegression"] = time.time() - start_time

In [150]:
# Memory cleanup
gc.collect()

0

In [151]:
# ===== MODEL COMPARISON AND VISUALIZATION =====
print("\n=== Model Comparison ===")

In [152]:
def visualize_model_comparison():
    """Print a comparison table of the best configuration of every tracked model.

    Reads the module-level ``model_metrics`` and ``model_training_time``
    dictionaries. An entry of ``model_metrics`` may be either a flat metrics
    dict (with ``"accuracy"`` / ``"f1"`` keys) or a dict mapping a
    hyper-parameter value to such a metrics dict; in the latter case the
    configuration with the highest accuracy represents the model.

    Prints the table, the most accurate model, and the fastest model whose
    accuracy is at least 80% of the best. Returns ``None``.
    """
    models = []
    accuracy_scores = []
    f1_scores = []
    training_times = []

    # Extract best scores for each model
    for model_name, results in model_metrics.items():
        if not results:
            # Nothing recorded for this model; there is no row to show.
            continue
        if "accuracy" in results:
            # Flat metrics dict. (An isinstance(..., dict) test cannot tell
            # this apart from the nested form, since both are dicts.)
            label, best_metrics = model_name, results
        else:
            best_param, best_metrics = max(results.items(), key=lambda item: item[1]["accuracy"])
            label = f"{model_name} ({best_param})"

        models.append(label)
        accuracy_scores.append(best_metrics["accuracy"])
        f1_scores.append(best_metrics["f1"])
        training_times.append(model_training_time.get(model_name, 0))

    if not models:
        # np.argmax / max below would raise on empty lists.
        print("No model metrics available to compare.")
        return

    # Build tabular data
    table_data = [
        [name, round(acc, 4), round(f1, 4), round(seconds, 2)]
        for name, acc, f1, seconds in zip(models, accuracy_scores, f1_scores, training_times)
    ]

    # Define headers
    headers = ["Model", "Accuracy", "F1 Score", "Training Time (s)"]

    # Print the comparison table
    print("\nModel Performance Comparison:")
    print(tabulate(table_data, headers=headers, tablefmt="grid"))

    # Best model by accuracy
    best_idx = np.argmax(accuracy_scores)
    print(f"\n✅ Best Model: {models[best_idx]} "
          f"with accuracy {accuracy_scores[best_idx]:.4f} "
          f"and F1 {f1_scores[best_idx]:.4f}")

    # Time-efficient model (>= 80% of best accuracy)
    threshold = 0.8 * max(accuracy_scores)
    valid_indices = [i for i, acc in enumerate(accuracy_scores) if acc >= threshold]
    if valid_indices:
        most_efficient_idx = min(valid_indices, key=lambda i: training_times[i])
        print(f"⚡ Most Time-Efficient Model: {models[most_efficient_idx]} "
              f"with accuracy {accuracy_scores[most_efficient_idx]:.4f} "
              f"and training time {training_times[most_efficient_idx]:.2f}s")


In [153]:
# Visualize model comparison
visualize_model_comparison()

In [154]:
# ===== MODEL PARAMETER TUNING VISUALIZATION =====
# Function to visualize parameter tuning results
def visualize_parameter_tuning(model_name, param_results, param_name="Parameter"):
    """Print a fixed-width table of the metrics recorded for each parameter value.

    Parameters
    ----------
    model_name : str
        Name shown in the table title.
    param_results : dict
        Maps a parameter value to a metrics dict with the keys
        ``"accuracy"``, ``"precision"``, ``"recall"`` and ``"f1"``.
    param_name : str
        Heading of the first column.

    Rows are printed in ascending parameter order, followed by the parameter
    value with the highest accuracy. When ``param_results`` is empty only a
    notice is printed. Returns ``None``.
    """
    if len(param_results or ()) == 0:
        print(f"No parameter tuning results available for {model_name}")
        return

    heavy_rule = "=" * 50
    light_rule = "-" * 50

    print(f"\n{model_name} Parameter Tuning Results:")
    print(heavy_rule)
    print(f"{param_name:<15} {'Accuracy':<10} {'Precision':<10} {'Recall':<10} {'F1 Score':<10}")
    print(light_rule)

    for value in sorted(param_results):
        row = param_results[value]
        print(f"{value:<15} {row['accuracy']:<10.4f} {row['precision']:<10.4f} "
              f"{row['recall']:<10.4f} {row['f1']:<10.4f}")

    # First parameter value (in insertion order) that reaches the top accuracy.
    winner = max(param_results, key=lambda value: param_results[value]["accuracy"])
    print(heavy_rule)
    print(f"Best {param_name}: {winner} with accuracy {param_results[winner]['accuracy']:.4f}")

# Print the tuning table of every model that has recorded metrics, in a fixed order
print("\n=== Parameter Tuning Results ===")
for tuned_model, tuned_param_label in (
    ("XGBoost", "Epochs"),
    ("KMeans", "Clusters"),
    ("KNN", "K Value"),
    ("SVD", "Components"),
    ("LogisticRegression", "C Value"),
):
    if tuned_model in model_metrics:
        visualize_parameter_tuning(tuned_model, model_metrics[tuned_model], tuned_param_label)

In [155]:
# ===== RECOMMENDATION FUNCTIONS =====
print("\n=== Generating Recommendations ===")

def content_based_recommendation(book_id, df, feature_matrix, n=5):
    """Recommend the ``n`` books whose feature vectors are most similar to one book.

    Parameters
    ----------
    book_id : int
        Positional row of the query book in ``feature_matrix``. Values outside
        ``[0, len(feature_matrix))`` fall back to row 0.
    df : pandas.DataFrame
        Book table with ``title``, ``author`` and ``rating`` columns, aligned
        row-by-row (by position) with ``feature_matrix``.
    feature_matrix : pandas.DataFrame
        Numeric features, one row per book.
    n : int
        Number of recommendations to return.

    Returns
    -------
    pandas.DataFrame
        ``title``, ``author`` and ``rating`` of the most similar books by
        cosine similarity, most similar first, excluding the query book.
        Empty when ``feature_matrix`` has no rows.
    """
    n_books = len(feature_matrix)
    if n_books == 0:
        return df.iloc[[]][['title', 'author', 'rating']]

    # Ensure book_id is valid (a negative id would otherwise select an empty row slice)
    if not 0 <= book_id < n_books:
        book_id = 0

    # Get the book vector
    book_features = feature_matrix.iloc[book_id:book_id+1]

    # Calculate similarity with all books in batches to save memory. Stepping
    # with range() never yields an empty trailing batch, which the previous
    # `len // batch_size + 1` batch count did whenever the row count was an
    # exact multiple of batch_size.
    batch_size = 1000
    batch_similarities = [
        cosine_similarity(book_features, feature_matrix.iloc[start_idx:start_idx + batch_size]).flatten()
        for start_idx in range(0, n_books, batch_size)
    ]
    similarities = np.concatenate(batch_similarities)

    # Sort by similarity, most similar first; the stable sort keeps tied books
    # in their original row order. Then drop the query book itself.
    ranked = np.argsort(-similarities, kind="stable")
    similar_books = [int(idx) for idx in ranked if idx != book_id][:n]

    # Return recommendations
    return df.iloc[similar_books][['title', 'author', 'rating']]

def collaborative_filtering(df, n=5):
    """Return ``n`` books ranked by a simulated collaborative-filtering score.

    The user-item matrix is random (global NumPy seed reset to 42 on each
    call): 100 simulated users over the first ``min(1000, len(df))`` rows of
    ``df``. The matrix is factorised with a 10-component TruncatedSVD and the
    items are ranked for simulated user 0.

    Parameters
    ----------
    df : pandas.DataFrame
        Book table with ``title``, ``author`` and ``rating`` columns.
    n : int
        Number of books to return.

    Returns
    -------
    pandas.DataFrame
        ``title``, ``author`` and ``rating`` of the top-scoring books,
        highest score first.
    """
    # Dimensions of the simulated user-item matrix
    n_users = 100
    n_items = min(1000, len(df))

    # Books that correspond to the item columns
    sampled_books = df.iloc[:n_items].copy()

    # Random preferences stand in for real interaction data
    np.random.seed(42)
    user_preferences = np.random.rand(n_users, n_items).astype('float32')

    # Low-rank factorisation of the preference matrix
    svd = TruncatedSVD(n_components=10, random_state=42)
    user_factors = svd.fit_transform(user_preferences)
    item_factors = svd.components_.T

    # Score every item for the first simulated user
    user_id = 0
    scores = np.dot(user_factors[user_id], item_factors.T)

    # Highest scores first, keep n of them
    ranked_items = np.argsort(scores)[::-1]
    top_items = ranked_items[:n]
    return sampled_books.iloc[top_items][['title', 'author', 'rating']]


In [158]:
# ===== FINAL EVALUATION =====
print("\n=== Final Evaluation and Insights ===")
print("1. Best Classification Model:")
# Best accuracy reached by each of the three compared classifiers
# (0 when a model has no recorded metrics); the highest one wins.
candidate_scores = [
    (
        candidate,
        max(run["accuracy"] for run in model_metrics[candidate].values())
        if candidate in model_metrics else 0,
    )
    for candidate in ("XGBoost", "KNN", "LogisticRegression")
]
best_model_name = max(candidate_scores, key=lambda pair: pair[1])[0]

print(f"The best classification model for this dataset is: {best_model_name}")
print("This model can be used for predicting whether a book will have a high rating (≥4.0).")

print("\n2. Recommendation Systems:")
print("- Content-based system: Best for recommending books similar to ones the user already likes")
print("- Collaborative filtering: Better for discovering new books based on user behavior patterns")
print("- Hybrid system: Combines strengths of both approaches for better recommendations")

print("\n3. Key Features Importance:")
if hasattr(xgb_model, 'feature_importances_'):
    feature_importance = xgb_model.feature_importances_
    feature_names = X_train.columns

    # Indices of the features, most important first; keep the first ten
    sorted_idx = np.argsort(feature_importance)[::-1]
    top_features = [(feature_names[pos], feature_importance[pos]) for pos in sorted_idx[:10]]

    print("Top 10 important features for predicting book ratings:")
    for rank, (feature, importance) in enumerate(top_features, start=1):
        print(f"{rank}. {feature}: {importance:.4f}")

print("\n4. Model Training Efficiency:")
# Fastest model first
for model, time_taken in sorted(model_training_time.items(), key=lambda entry: entry[1]):
    print(f"- {model}: {time_taken:.2f} seconds")

print("\n=== Evaluation complete ===")
print("Recommendation system is ready for deployment.")

In [166]:

# Persist the best fitted estimator of each experiment to the working directory.
joblib.dump(xgb_model,           "xgb_model.pkl")
joblib.dump(kmeans_model,        "kmeans_model.pkl")
joblib.dump(knn_model,           "knn_model.pkl")
# svd_models is the (TruncatedSVD, LogisticRegression) pair returned by
# train_svd_with_different_components; the two parts are saved separately.
svd_transformer, logreg_on_svd = svd_models
joblib.dump(svd_transformer,     "svd_transformer.pkl")
joblib.dump(logreg_on_svd,       "svd_logistic_model.pkl")
# NOTE(review): the fitted `scaler`, the label encoders and `logreg_model` are
# not saved here — confirm whether the inference code needs them.

print("✅ Saved: xgb_model.pkl, kmeans_model.pkl, knn_model.pkl, svd_transformer.pkl, svd_logistic_model.pkl")

In [167]:
import pandas as pd
import numpy as np
import joblib
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics.pairwise import cosine_similarity

def prepare_numeric_features(df):
    """Build the five derived numeric feature columns for each row of ``df``.

    Reads the ``pages``, ``reviews``, ``totalratings`` and ``rating`` columns
    and returns a new DataFrame (the input is not modified) with:

    * ``log_pages``, ``log_reviews``, ``log_totalratings`` - ``log1p`` of the
      corresponding raw column;
    * ``popularity_score`` - ``rating * log_totalratings``;
    * ``review_ratio`` - ``reviews / totalratings``, with a zero denominator
      treated as missing.

    Any missing value in the result is replaced by 0.
    """
    feature_cols = [
        "log_pages",
        "log_reviews",
        "log_totalratings",
        "popularity_score",
        "review_ratio",
    ]

    # A zero rating count becomes NaN so the division yields NaN (zero-filled
    # below) rather than inf.
    nonzero_totals = df["totalratings"].replace(0, np.nan)

    enriched = df.assign(
        log_pages=np.log1p(df["pages"]),
        log_reviews=np.log1p(df["reviews"]),
        log_totalratings=np.log1p(df["totalratings"]),
        review_ratio=df["reviews"] / nonzero_totals,
    )
    enriched["popularity_score"] = enriched["rating"] * enriched["log_totalratings"]

    return enriched[feature_cols].fillna(0)

def recommend_books(df, genres, top_n=10):
    """Rank books tagged with any of ``genres`` using the saved models.

    Candidates are the rows of ``df`` whose comma-separated ``genre`` string
    contains at least one of the requested genres (one row per title). Each
    candidate is scored by the pickled XGBoost model and by the pickled
    SVD + logistic-regression pair; ``final_score`` is the equal-weight
    average of the two probabilities.

    Parameters
    ----------
    df : pandas.DataFrame
        Book table with at least ``title``, ``author``, ``genre``,
        ``bookformat``, ``rating`` and the columns read by
        ``prepare_numeric_features``.
    genres : list of str
        Genre names to match against the individual genre tokens.
    top_n : int, default 10
        Maximum number of rows to return.

    Returns
    -------
    pandas.DataFrame
        Columns ``title, author, genre, rating, xgb_proba, svd_proba,
        sim_score, final_score``, ordered by ``final_score`` descending.
        When no book matches, the frame is empty but still has these columns.

    Notes
    -----
    Loads ``xgb_model.pkl``, ``svd_transformer.pkl`` and
    ``svd_logistic_model.pkl`` from the working directory. These are pickle
    files: only load artifacts you produced or otherwise trust.
    """
    result_cols = [
        "title", "author", "genre", "rating",
        "xgb_proba", "svd_proba", "sim_score", "final_score",
    ]

    # Renumber once and never again: candidate labels taken from this frame
    # must address the very same rows in the feature matrix built below.
    books = df.reset_index(drop=True)

    # -- filter for candidates (rows without a genre can never match) --
    with_genre = books.dropna(subset=["genre"]).copy()
    with_genre["genre_list"] = with_genre["genre"].str.split(",").apply(
        lambda parts: [g.strip() for g in parts]
    )
    exploded = with_genre.explode("genre_list")
    cands = exploded[exploded["genre_list"].isin(genres)].drop_duplicates("title")
    if cands.empty:
        print("⚠️  No books found for those genres.")
        # Keep the result schema so callers can still select these columns.
        return pd.DataFrame(columns=result_cols)
    idx = cands.index

    # -- build the feature matrix on the entire table, missing genre -> "Other" --
    df_feats = books.assign(genre=books["genre"].fillna("Other"))

    feats_num = prepare_numeric_features(df_feats)
    # label-encode author & format (encoders are refit on this table)
    feats_num["author_encoded"] = LabelEncoder().fit_transform(df_feats["author"])
    feats_num["format_encoded"] = LabelEncoder().fit_transform(
        df_feats["bookformat"].fillna("Unknown")
    )

    # one-hot the entire genre string, one column per distinct string
    genre_dummies = pd.get_dummies(df_feats["genre"], prefix="genre")
    feats_full = pd.concat([feats_num, genre_dummies], axis=1)

    # -- load the saved artifacts (named xgb_clf so the xgboost module alias
    #    `xgb` is not shadowed inside this function) --
    xgb_clf = joblib.load("xgb_model.pkl")
    svd     = joblib.load("svd_transformer.pkl")
    logreg  = joblib.load("svd_logistic_model.pkl")

    # -- reindex to exactly the booster's feature columns (zeros for any missing) --
    feat_names = xgb_clf.get_booster().feature_names
    feats_full = feats_full.reindex(columns=feat_names, fill_value=0)

    # Only candidate rows are scored, so slice them out once.
    cand_feats = feats_full.loc[idx]

    # -- score with XGBoost --
    cands = cands.copy()
    cands["xgb_proba"] = xgb_clf.predict_proba(cand_feats)[:, 1]

    # -- project + score with SVD + logistic regression --
    latent_cand = svd.transform(cand_feats)
    cands["svd_proba"] = logreg.predict_proba(latent_cand)[:, 1]

    # -- similarity of each candidate to the candidates' mean latent vector
    #    (reported in the output, not part of final_score) --
    user_vec = latent_cand.mean(axis=0).reshape(1, -1)
    cands["sim_score"] = cosine_similarity(latent_cand, user_vec).flatten()

    # -- blend & pick top-N --
    cands["final_score"] = 0.5 * cands["xgb_proba"] + 0.5 * cands["svd_proba"]
    top = cands.sort_values("final_score", ascending=False).head(top_n)

    print(f"✅ Found {len(top)} recommendations.")
    return top[result_cols].reset_index(drop=True)



In [168]:
# Demo call: rank up to 10 books tagged "Romance" or "Action" and print the result.
# recommend_books reloads xgb_model.pkl, svd_transformer.pkl and
# svd_logistic_model.pkl via relative paths, so those files must already exist
# in the working directory.
recs = recommend_books(df, genres=["Romance", "Action"], top_n=10)
print(recs)

In [169]:
def suggest_library_books(
    df,
    user_df,
    top_m_genres: int = 5,
    top_n_books: int = 5
):
    """Suggest books to stock for the genres users ask for most often.

    Parameters
    ----------
    df : pandas.DataFrame
        Book table, passed unchanged to ``recommend_books``.
    user_df : pandas.DataFrame
        One row per user with a ``genres`` column holding either a
        comma-separated string or an iterable of genre names. Missing values
        (None / NaN) are treated as "no preference".
    top_m_genres : int, default 5
        Number of most-requested genres to generate suggestions for.
    top_n_books : int, default 5
        Number of books requested from ``recommend_books`` per genre.

    Returns
    -------
    (dict, pandas.Series)
        ``suggestions`` maps each selected genre to the DataFrame returned by
        ``recommend_books``; ``genre_counts`` is the ``value_counts`` of all
        genre mentions across users.
    """
    def norm(g):
        # Comma-separated string -> stripped tokens; blanks left by stray
        # commas are dropped so "" is never counted as a genre.
        if isinstance(g, str):
            return [tok.strip() for tok in g.split(",") if tok.strip()]
        # A missing preference contributes nothing instead of failing in list().
        if pd.api.types.is_scalar(g) and pd.isna(g):
            return []
        return list(g)

    # 1) one row per (user, genre)
    uf = user_df.copy()
    uf["genre_list"] = uf["genres"].apply(norm)
    uf_exp = uf.explode("genre_list")

    # 2) count preferences (empty lists explode to NaN, which value_counts skips)
    genre_counts = uf_exp["genre_list"].value_counts()

    # 3) pick the top M genres
    top_genres = genre_counts.head(top_m_genres).index.tolist()

    # 4) top-N recommendations for each selected genre
    suggestions = {
        genre: recommend_books(df, genres=[genre], top_n=top_n_books)
        for genre in top_genres
    }

    return suggestions, genre_counts

# --- Example usage ---

# Sample preferences for 20 users; each entry is a comma-separated genre string.
user_data = pd.DataFrame({
    "user_id": [f"user_{i}" for i in range(1,21)],
    "genres": [
        "Fiction,Fantasy", "Romance,Nonfiction", "History", "Science Fiction,Mystery",
        "Fantasy,Young Adult", "Mystery,Thriller", "Business,Leadership",
        "Poetry,Art", "Science,Mathematics", "Biography,Memoir",
        "Self Help,Health", "Travel,Photography", "Cooking,Food",
        "Comics,Graphic Novels", "Religion,Spirituality", "Music",
        "Children’s", "Horror", "Classics,Philosophy", "Sports"
    ]
})

# Get suggestions
suggestions, genre_counts = suggest_library_books(
    df,
    user_data,
    top_m_genres=5,    # e.g. pick the 5 most-popular genres among users
    top_n_books=5      # and get top-5 books per genre
)

print("📊 User‐genre counts:\n", genre_counts.head(10), "\n")

for genre, rec_df in suggestions.items():
    print(f"── Top books to stock for genre: {genre} ──")
    if rec_df.empty:
        # An empty result may carry no columns at all, so selecting the
        # display columns below would raise KeyError; report it instead.
        print("(no matching books found)\n")
        continue
    print(rec_df[["title","author","rating","final_score"]], "\n")
