## 🧩 Imports and Setup


In [17]:
import pandas as pd
import numpy as np
import os
import ast
import re

## 📁 Directory Setup

In [18]:
# Root of the data tree, relative to the notebook's working directory.
data_dir = '../data'
# Sub-folders for the untouched input CSVs and for the merged output.
raw_dir, processed_dir = (
    os.path.join(data_dir, subfolder) for subfolder in ('raw', 'processed')
)


## 📚 Load Raw Data


In [None]:
def load_raw_data(source_dir=None):
    """Load the raw Books, Ratings and Users CSV files.

    The files are looked up by name inside a single directory instead of
    through machine-specific absolute paths. (Joining a directory with an
    absolute path makes ``os.path.join`` drop the directory entirely, so the
    configured raw-data folder was never actually used.)

    Args:
        source_dir: Directory containing ``Books.csv``, ``Ratings.csv`` and
            ``Users.csv``. When omitted, the module-level ``raw_dir`` is used.

    Returns:
        A ``(books_df, ratings_df, users_df)`` tuple of DataFrames.

    Raises:
        FileNotFoundError: Propagated from ``pd.read_csv`` when one of the
            three files does not exist in the chosen directory.
    """
    # Resolve lazily so an explicit directory never touches the global.
    base_dir = raw_dir if source_dir is None else source_dir

    print("Loading raw datasets...")

    books_path = os.path.join(base_dir, 'Books.csv')
    ratings_path = os.path.join(base_dir, 'Ratings.csv')
    users_path = os.path.join(base_dir, 'Users.csv')

    # The files are read as latin-1, matching the original loader.
    books_df = pd.read_csv(books_path, encoding='latin-1')
    ratings_df = pd.read_csv(ratings_path, encoding='latin-1')
    users_df = pd.read_csv(users_path, encoding='latin-1')

    print(f"Loaded {len(books_df)} books")
    print(f"Loaded {len(ratings_df)} ratings")
    print(f"Loaded {len(users_df)} users")

    return books_df, ratings_df, users_df


In [20]:
# Sentinel distinguishing "second argument omitted" from an explicit None,
# because None is itself a legitimate (null) genre value.
_GENRE_ARG_MISSING = object()


def _process_genre_string(self, genre_str=_GENRE_ARG_MISSING):
    """Convert a stringified genre list into a list of genre names.

    This function was lifted out of a class and still carries ``self``. It is
    called here with a single argument, so both call shapes are accepted:

    * ``_process_genre_string(genre_str)`` -- the only positional argument is
      treated as the genre string.
    * ``_process_genre_string(obj, genre_str)`` -- method-style call; the
      first argument is ignored.

    Args:
        self: Ignored in the two-argument form; the genre string itself in
            the one-argument form.
        genre_str: Text such as ``"['Fiction', 'Drama']"``, or a null value.

    Returns:
        A list of non-empty genre strings. Null input, ``'[]'`` and text that
        evaluates to something other than a list all yield ``[]``.
    """
    if genre_str is _GENRE_ARG_MISSING:
        genre_str = self

    if pd.isnull(genre_str) or genre_str == '[]':
        return []

    try:
        # literal_eval only accepts Python literals, never arbitrary code.
        genres = ast.literal_eval(genre_str)
    except (ValueError, SyntaxError):
        # Malformed literal: salvage whatever single-quoted tokens exist.
        matches = re.findall(r"'([^']*)'", genre_str)
        return [m for m in matches if m]

    if not isinstance(genres, list):
        return []

    # Skip non-string items (e.g. numbers) rather than failing on .strip().
    return [
        g.strip("'")
        for g in genres
        if isinstance(g, str) and g.strip("'")
    ]
    

def load_processed_data(filename='combined_books.csv'):
    """Read a processed CSV from ``processed_dir`` into a DataFrame.

    When the file has a ``matched_genres`` column, a ``genres_list`` column
    is added by passing each non-null cell to ``_process_genre_string``;
    null cells become an empty list.

    Args:
        filename: Name of the CSV file inside ``processed_dir``.

    Returns:
        The loaded DataFrame, or ``None`` (after printing an error) when the
        file does not exist.
    """
    target = os.path.join(processed_dir, filename)

    if os.path.exists(target):
        frame = pd.read_csv(target)
    else:
        print(f"Error: {target} not found")
        return None

    def _genres_or_empty(cell):
        # Null cells never reach the parser; they map straight to [].
        if pd.notnull(cell):
            return _process_genre_string(cell)
        return []

    # The genre column is optional, so only derive the list when present.
    if 'matched_genres' in frame.columns:
        frame['genres_list'] = frame['matched_genres'].apply(_genres_or_empty)

    print(f"Loaded {len(frame)} processed records")
    return frame



In [21]:
def get_stats(df):
    """Summarise a combined ratings DataFrame.

    Args:
        df: DataFrame with ``user_id``, ``isbn`` and ``rating`` columns and,
            optionally, a ``genres_list`` column of per-row genre lists.

    Returns:
        A dict with ``total_entries``, ``unique_users``, ``unique_books``,
        ``rating_distribution``, ``avg_rating`` and ``missing_values``; when
        ``genres_list`` is present it also has ``top_genres``, the 20 most
        frequent genres mapped to their counts in descending order.
    """
    stats = dict(
        total_entries=len(df),
        unique_users=df['user_id'].nunique(),
        unique_books=df['isbn'].nunique(),
        rating_distribution=df['rating'].value_counts().to_dict(),
        avg_rating=df['rating'].mean(),
        missing_values=df.isnull().sum().to_dict(),
    )

    # Genre statistics are optional; without the column we are done.
    if 'genres_list' not in df.columns:
        return stats

    # Flatten the per-row genre lists into one long list of names.
    flattened = [
        name
        for row_genres in df['genres_list'].dropna()
        for name in row_genres
    ]

    tally = pd.Series(flattened).value_counts().to_dict()
    ranked = sorted(tally.items(), key=lambda pair: pair[1], reverse=True)
    stats['top_genres'] = dict(ranked[:20])

    return stats


In [22]:
def merge_datasets(self, ratings_df=None, users_df=None):
    """Join ratings, books and users into one DataFrame with tidy column names.

    This function was lifted out of a class and still carries ``self``. Two
    call shapes are accepted:

    * ``merge_datasets(books_df, ratings_df, users_df)`` -- function style;
      the first positional argument is the books DataFrame.
    * ``merge_datasets(loader)`` -- method style; ``loader`` exposes
      ``books_df``, ``ratings_df``, ``users_df`` and ``load_raw_data()``.
      The result is also stored on ``loader.combined_df``.

    Args:
        self: The books DataFrame, or a loader-like object (see above).
        ratings_df: Ratings DataFrame with ``User-ID``, ``ISBN`` and
            ``Book-Rating`` columns (function style only).
        users_df: Users DataFrame with a ``User-ID`` column (function style
            only).

    Returns:
        The merged DataFrame. Both joins are inner joins, so ratings without
        a matching book or user are dropped.
    """
    function_style = (
        isinstance(self, pd.DataFrame)
        or ratings_df is not None
        or users_df is not None
    )

    if function_style:
        loader = None
        books_df = self
    else:
        loader = self
        # Load lazily if the loader has not read its raw frames yet.
        if (
            loader.books_df is None
            or loader.ratings_df is None
            or loader.users_df is None
        ):
            loader.load_raw_data()
        books_df = loader.books_df
        ratings_df = loader.ratings_df
        users_df = loader.users_df

    print("Merging datasets...")

    # Ratings are the fact table: attach book details, then user details.
    rated_books = pd.merge(ratings_df, books_df, on='ISBN', how='inner')
    combined = pd.merge(rated_books, users_df, on='User-ID', how='inner')

    # Rename columns for clarity and consistency.
    combined = combined.rename(columns={
        'User-ID': 'user_id',
        'ISBN': 'isbn',
        'Book-Rating': 'rating',
        'Book-Title': 'title',
        'Book-Author': 'author',
        'Year-Of-Publication': 'year',
        'Publisher': 'publisher',
        'Age': 'age',
        'Location': 'location',
    })

    if loader is not None:
        loader.combined_df = combined

    print(f"Created combined dataset with {len(combined)} entries")
    return combined

In [23]:
def save_combined_data(self, filename='combined_books.csv', output_dir=None):
    """Write the combined dataset to a CSV file.

    This function was lifted out of a class and still carries ``self``. Two
    call shapes are accepted:

    * ``save_combined_data(combined_df)`` -- function style; the DataFrame
      is written to ``output_dir`` or, by default, the module-level
      ``processed_dir``.
    * ``save_combined_data(loader)`` -- method style; ``loader`` exposes
      ``combined_df`` and ``processed_dir``.

    Args:
        self: The DataFrame to save, a loader-like object, or ``None``.
        filename: Name of the CSV file to create.
        output_dir: Optional directory overriding the default location.

    Returns:
        ``True`` when the file was written; ``False`` (after printing an
        error) when there is nothing to save or writing failed.
    """
    if self is None:
        frame = None
        target_dir = output_dir
    elif isinstance(self, pd.DataFrame):
        frame = self
        # Resolve lazily so an explicit directory never touches the global.
        target_dir = processed_dir if output_dir is None else output_dir
    else:
        frame = self.combined_df
        target_dir = self.processed_dir if output_dir is None else output_dir

    if frame is None:
        print("Error: No combined data to save. Run merge_datasets() first.")
        return False

    save_path = os.path.join(target_dir, filename)
    try:
        # A missing output folder should not make the save fail.
        if target_dir:
            os.makedirs(target_dir, exist_ok=True)
        frame.to_csv(save_path, index=False)
    except Exception as e:
        # Deliberately broad: failures are reported via the return value.
        print(f"Error saving combined dataset: {e}")
        return False

    print(f"Combined dataset saved to {save_path}")
    return True

In [24]:
# Full pipeline: read the raw CSVs, merge them, and persist the result.
books_df, ratings_df, users_df = load_raw_data()
combined_df = merge_datasets(books_df, ratings_df, users_df)
save_combined_data(combined_df)
# Alternative: reuse a previously saved merge instead of rebuilding it.
# combined_df = load_processed_data()

# Headline numbers for the combined dataset.
stats = get_stats(combined_df)
print(
    f"Total entries: {stats['total_entries']}\n"
    f"Unique users: {stats['unique_users']}\n"
    f"Unique books: {stats['unique_books']}\n"
    f"Average rating: {stats['avg_rating']:.2f}"
)

# Genre ranking is only available when the data carries genre lists.
if 'top_genres' in stats:
    print("\nTop 10 genres:")
    leading_genres = list(stats['top_genres'].items())[:10]
    for genre, count in leading_genres:
        print(f"  {genre}: {count}")


Loading raw datasets...


FileNotFoundError: [Errno 2] No such file or directory: '../data/raw/Books.csv'

In [None]:

# class DataLoader:
#     def __init__(self, data_dir='../data'):
#         self.data_dir = data_dir
#         self.raw_dir = os.path.join(data_dir, 'raw')
#         self.processed_dir = os.path.join(data_dir, 'processed')
        
#         # Initialize dataframes
#         self.books_df = None
#         self.ratings_df = None
#         self.users_df = None
#         self.combined_df = None

#     def load_raw_data(self):
#         print("Loading raw datasets...")
        
#         # Load the books dataset
#         books_path = os.path.join(self.raw_dir, 'Books.csv')
#         self.books_df = pd.read_csv(books_path, error_bad_lines=False, encoding='latin-1')
#         print(f"Loaded {len(self.books_df)} books")
        
#         # Load the ratings dataset
#         ratings_path = os.path.join(self.raw_dir, 'Ratings.csv')
#         self.ratings_df = pd.read_csv(ratings_path, error_bad_lines=False, encoding='latin-1')
#         print(f"Loaded {len(self.ratings_df)} ratings")
        
#         # Load the users dataset
#         users_path = os.path.join(self.raw_dir, 'Users.csv')
#         self.users_df = pd.read_csv(users_path, error_bad_lines=False, encoding='latin-1')
#         print(f"Loaded {len(self.users_df)} users")
        
#         return self.books_df, self.ratings_df, self.users_df

#     def load_processed_data(self, filename='combined_books.csv'):
#         combined_path = os.path.join(self.processed_dir, filename)
        
#         if not os.path.exists(combined_path):
#             print(f"Error: Processed file {combined_path} does not exist.")
#             return None
        
#         print(f"Loading processed dataset: {filename}")
#         self.combined_df = pd.read_csv(combined_path)
        
#         # Process genre strings into lists
#         if 'matched_genres' in self.combined_df.columns:
#             self.combined_df['genres_list'] = self.combined_df['matched_genres'].apply(
#                 lambda x: self._process_genre_string(x) if pd.notnull(x) else []
#             )
        
#         print(f"Loaded {len(self.combined_df)} entries from processed data")
#         return self.combined_df
    


        
#         print(f"Created combined dataset with {len(self.combined_df)} entries")
#         return self.combined_df
    


#     def get_stats(self):
#         if self.combined_df is None:
#             print("No data loaded. Please load data first.")
#             return None
        
#         stats = {
#             'total_entries': len(self.combined_df),
#             'unique_users': self.combined_df['user_id'].nunique(),
#             'unique_books': self.combined_df['isbn'].nunique(),
#             'rating_distribution': self.combined_df['rating'].value_counts().to_dict(),
#             'avg_rating': self.combined_df['rating'].mean(),
#             'missing_values': self.combined_df.isnull().sum().to_dict()
#         }
        
#         # Get genre statistics if available
#         if 'genres_list' in self.combined_df.columns:
#             all_genres = []
#             for genres in self.combined_df['genres_list'].dropna():
#                 all_genres.extend(genres)
            
#             genre_counts = pd.Series(all_genres).value_counts().to_dict()
#             stats['top_genres'] = dict(sorted(genre_counts.items(), 
#                                       key=lambda item: item[1], 
#                                       reverse=True)[:20])
        
#         return stats


# if __name__ == "__main__":
#     # Example usage
#     loader = DataLoader()
    
#     # Either load raw data and merge
#     # books_df, ratings_df, users_df = loader.load_raw_data()
#     # combined_df = loader.merge_datasets()
#     # loader.save_combined_data()
    
#     # Or load existing processed data
#     combined_df = loader.load_processed_data()
    
#     # Print dataset statistics
#     stats = loader.get_stats()
#     if stats:
#         print("\nDataset Statistics:")
#         print(f"Total entries: {stats['total_entries']}")
#         print(f"Unique users: {stats['unique_users']}")
#         print(f"Unique books: {stats['unique_books']}")
#         print(f"Average rating: {stats['avg_rating']:.2f}")
        
#         if 'top_genres' in stats:
#             print("\nTop 10 genres:")
#             for genre, count in list(stats['top_genres'].items())[:10]:
#                 print(f"  {genre}: {count}")