# Cleaning
Clean the messy Goodreads library exports (CSV files) provided by messy people, and write tidy copies to `data/goodreads/clean/`.

In [7]:
import csv
from io import StringIO
from pathlib import Path

import numpy as np
import polars as pl

## Robert

In [8]:
# Open Robert's ridiculous file
path_robert = Path("data/goodreads/messy/Thomas is een worstje.csv")

# Read the whole file as UTF-8 text (path_robert is already a Path, no need to re-wrap it)
with path_robert.open(encoding="utf-8") as f:
    data = f.read()

# Split data into records using the semicolon as the separator, then strip
# whitespace and drop the pieces that are empty (e.g. between consecutive semicolons)
records = [piece.strip() for piece in data.strip().split(";") if piece.strip()]
if not records:
    # Fail with a readable message instead of an IndexError on records[0]
    msg = f"No records found in {path_robert}"
    raise ValueError(msg)

# Store first record as the header and parse it as one comma-separated CSV row
header = records[0]
columns = next(csv.reader(StringIO(header), delimiter=",", quotechar='"'))

# Parse the remaining records
parsed_records = []
for record in records[1:]:
    # Remove the leading and trailing quotes, then collapse doubled quotes ("")
    # back into single quotes (") so the record reads as an ordinary CSV row.
    # NOTE(review): str.strip('"') removes *every* leading/trailing quote character,
    # not just one wrapping pair — presumably fine for this file, confirm if the
    # first column can ever be a quoted value.
    record_cleaned = record.strip('"').replace('""', '"')
    # A record consisting solely of quote characters is empty after stripping and
    # yields no CSV row; next(..., None) skips it instead of raising StopIteration.
    parsed_record = next(csv.reader(StringIO(record_cleaned), delimiter=",", quotechar='"'), None)
    if parsed_record is not None:
        parsed_records.append(parsed_record)

# Delete all elements that do not have exactly one value per header column
# (because, for some reason Robert, sometimes they do not)
expected_width = len(columns)
parsed_records = [record for record in parsed_records if len(record) == expected_width]

# Create a Polars DataFrame straight from the rows, naming the columns after the
# header and keeping every column as a string. This avoids the detour through a
# fixed-width NumPy unicode array, whose cells are all as wide as the longest value.
df_clean = pl.DataFrame(parsed_records, schema=dict.fromkeys(columns, pl.Utf8), orient="row")

# Write the data to a new CSV file
df_clean.write_csv("data/goodreads/clean/Thomas is een worstje_clean.csv")

print(df_clean)

shape: (663, 24)
┌───────────┬───────────────┬──────────────┬──────────────┬───┬─────────┬─────────┬───────┬────────┐
│ Book Id   ┆ Title         ┆ Author       ┆ Author l-f   ┆ … ┆ Spoiler ┆ Private ┆ Read  ┆ Owned  │
│ ---       ┆ ---           ┆ ---          ┆ ---          ┆   ┆ ---     ┆ Notes   ┆ Count ┆ Copies │
│ str       ┆ str           ┆ str          ┆ str          ┆   ┆ str     ┆ ---     ┆ ---   ┆ ---    │
│           ┆               ┆              ┆              ┆   ┆         ┆ str     ┆ str   ┆ str    │
╞═══════════╪═══════════════╪══════════════╪══════════════╪═══╪═════════╪═════════╪═══════╪════════╡
│ 128427762 ┆ Galapagos by  ┆ Kurt         ┆ Jr., Kurt    ┆ … ┆         ┆         ┆ 1     ┆ 0      │
│           ┆ Kurt Vonnegut ┆ Vonnegut Jr. ┆ Vonnegut     ┆   ┆         ┆         ┆       ┆        │
│           ┆ (19…          ┆              ┆              ┆   ┆         ┆         ┆       ┆        │
│ 19161852  ┆ The Fifth     ┆ N.K. Jemisin ┆ Jemisin,     ┆ … ┆         ┆ 

## Peter

In [9]:
# Peter's export only needs the right delimiter: it is read here as a
# semicolon-separated CSV and polars infers the column types.
path_peter = Path("data/goodreads/messy/goodreads_library_export-PHT.csv")
df_clean = pl.read_csv(source=path_peter, separator=";")

# Persist the parsed frame as a new CSV file
df_clean.write_csv(file="data/goodreads/clean/goodreads_library_export-PHT_clean.csv")

print(df_clean)

shape: (172, 24)
┌──────────┬───────────────┬───────────────┬──────────────┬───┬─────────┬─────────┬───────┬────────┐
│ Book Id  ┆ Title         ┆ Author        ┆ Author l-f   ┆ … ┆ Spoiler ┆ Private ┆ Read  ┆ Owned  │
│ ---      ┆ ---           ┆ ---           ┆ ---          ┆   ┆ ---     ┆ Notes   ┆ Count ┆ Copies │
│ i64      ┆ str           ┆ str           ┆ str          ┆   ┆ str     ┆ ---     ┆ ---   ┆ ---    │
│          ┆               ┆               ┆              ┆   ┆         ┆ str     ┆ i64   ┆ i64    │
╞══════════╪═══════════════╪═══════════════╪══════════════╪═══╪═════════╪═════════╪═══════╪════════╡
│ 16234584 ┆ The Drowned   ┆ J.G. Ballard  ┆ Ballard,     ┆ … ┆ null    ┆ null    ┆ 1     ┆ 0      │
│          ┆ World         ┆               ┆ J.G.         ┆   ┆         ┆         ┆       ┆        │
│ 4883025  ┆ Vochtige      ┆ Charlotte     ┆ Roche,       ┆ … ┆ null    ┆ null    ┆ 1     ┆ 0      │
│          ┆ streken       ┆ Roche         ┆ Charlotte    ┆   ┆         ┆ 