#
<h1 style="color: #FFC0CB; font-size: 24px;">
CSV_DIVIDER CONCEPT
</h1>

I want to split a CSV file containing tabular data into smaller chunks of at most about 100 MB each.

Input data set comes from: https://data.world/promptcloud/amazon-mobile-phone-reviews

Unfortunately, you have to register to download the file. Alternatively, the original data set can be rebuilt by concatenating the output chunk files into a single CSV file.


## Full code

In [15]:
import csv
import os
import math

class _RowFormatter:
    """Minimal file-like sink that returns the text ``csv.writer`` hands to it.

    ``csv.writer.writerow`` returns whatever the underlying ``write`` call
    returns, so wrapping this object yields the exact formatted CSV line
    (quoting and line terminator included) without touching the disk.
    """

    def write(self, text):
        return text


def _open_chunk(output_dir, output_file_name, index, header_line, encoding):
    """Create (or truncate) chunk file number ``index`` and write the header line."""
    path = os.path.join(output_dir, f"{output_file_name}_{index}.csv")
    # 'w' truncates leftovers from a previous run; newline='' keeps the
    # csv-module line terminator untouched.
    handle = open(path, 'w', newline='', encoding=encoding)
    handle.write(header_line)
    return handle


def split_csv(input_path, output_dir, max_file_size_mb, delimiter=';', output_file_name = 'chunk', encoding=None):
    """Split a CSV file into several smaller CSV files of limited size.

    The first row of the input is treated as the header and repeated at the
    top of every chunk. Chunks are written to
    ``<output_dir>/<output_file_name>_<n>.csv`` with ``n`` starting at 1.
    Rows are streamed, so the input is never held in memory as a whole, and
    each chunk file is overwritten rather than appended to.

    Args:
        input_path: Path of the CSV file to split.
        output_dir: Directory for the chunk files; created if missing.
        max_file_size_mb: Size limit per chunk in decimal megabytes
            (1 MB = 1000 * 1000 bytes). Must be greater than 0.
        delimiter: Field delimiter used for both reading and writing.
        output_file_name: Base name of the chunk files.
        encoding: Text encoding of the input and the chunks. ``None`` keeps
            the platform default used by ``open``.

    Returns:
        None. Progress is reported with ``print``; the total shown is an
        estimate derived from the input file size.

    Raises:
        ValueError: If ``max_file_size_mb`` is not greater than 0.

    Notes:
        * Sizes are measured on the encoded bytes of the formatted rows, so
          quoting, the line terminator and multi-byte characters are counted.
        * A single row larger than the limit is still written, alone in its
          own chunk, which then exceeds the limit.
        * An input without any rows produces no chunk files.
    """
    if max_file_size_mb <= 0:
        raise ValueError("max_file_size_mb must be greater than 0")

    # Create the output directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)

    # Maximum chunk size in decimal bytes
    max_file_size = max_file_size_mb * 1000 * 1000

    # Approximate number of chunks, used only for the progress messages
    total_n_chunks = math.ceil(os.path.getsize(input_path) / max_file_size)

    # Formats a row exactly as it will be written, without writing it
    format_row = csv.writer(_RowFormatter(), delimiter=delimiter).writerow

    # newline='' lets the csv module handle line breaks inside quoted fields
    with open(input_path, 'r', newline='', encoding=encoding) as f_in:
        reader = csv.reader(f_in, delimiter=delimiter)

        # Get the header row; an empty input has nothing to split
        header_row = next(reader, None)
        if header_row is None:
            return

        # Reuse the input's resolved encoding so byte counts match the output
        out_encoding = f_in.encoding
        header_line = format_row(header_row)
        header_size = len(header_line.encode(out_encoding))

        current_chunk = 1
        current_file_size = header_size
        rows_in_chunk = 0
        f_out = _open_chunk(output_dir, output_file_name, current_chunk, header_line, out_encoding)

        try:
            for row in reader:
                line = format_row(row)
                row_size = len(line.encode(out_encoding))

                # Start a new chunk when this row would overflow the current
                # one; never rotate away from a chunk holding only the header.
                if rows_in_chunk and current_file_size + row_size > max_file_size:
                    f_out.close()
                    print(f"Saved {current_chunk} / {total_n_chunks}")

                    current_chunk += 1
                    f_out = _open_chunk(output_dir, output_file_name, current_chunk, header_line, out_encoding)
                    current_file_size = header_size
                    rows_in_chunk = 0

                f_out.write(line)
                current_file_size += row_size
                rows_in_chunk += 1
        finally:
            f_out.close()

    print(f"Saved {current_chunk} / {total_n_chunks}")

## Production

## Example of usage

In [16]:
# Arguments for the split: size limit per chunk (in decimal megabytes),
# the input file path and the output directory.
max_file_size_mb = 75
output_dir = os.path.join(os.getcwd(), "output")
input_path = os.path.join(os.getcwd(), "input", "amazon.csv")

# This file is comma-separated, so the delimiter is passed explicitly.
split_csv(
    input_path,
    output_dir,
    max_file_size_mb,
    delimiter=',',
)

Saved 1 / 2
Saved 2 / 2


## Test

In [17]:
import pandas as pd
import os
# Load the original (unsplit) input file for comparison with the chunks
input_path = os.path.join(os.getcwd(), "input", "amazon.csv")
df = pd.read_csv(input_path, sep=',')

# Number of data rows in the input
print(df.shape[0])

413840


In [19]:

# First five rows of the input data
df.head(n=5)

Unnamed: 0,Product Name,Brand Name,Price,Rating,Reviews,Review Votes
0,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,5,I feel so LUCKY to have found this used (phone...,1.0
1,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,4,"nice phone, nice up grade from my pantach revu...",0.0
2,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,5,Very pleased,0.0
3,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,4,It works good but it goes slow sometimes but i...,0.0
4,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,4,Great phone to replace my lost phone. The only...,0.0


In [20]:
# Path to the output folder
output_dir = os.path.join(os.getcwd(), "output")


def _chunk_sort_key(filename):
    """Sort key that orders chunk files by their trailing number.

    ``os.listdir`` returns names in arbitrary order, and a plain string sort
    would put ``chunk_10.csv`` before ``chunk_2.csv``. Names without a
    numeric suffix are placed last, in alphabetical order.
    """
    suffix = os.path.splitext(filename)[0].rsplit("_", 1)[-1]
    if suffix.isdecimal():
        return (0, int(suffix), filename)
    return (1, 0, filename)


# CSV files in the output folder, in chunk order
chunk_files = sorted(
    (name for name in os.listdir(output_dir) if name.endswith(".csv")),
    key=_chunk_sort_key,
)

# Read every chunk first and concatenate once; calling pd.concat inside the
# loop would re-copy all previously loaded rows on every iteration.
chunk_frames = [
    pd.read_csv(os.path.join(output_dir, name), delimiter=',')
    for name in chunk_files
]

# pd.concat raises on an empty list, so fall back to an empty DataFrame
concatenated_data = (
    pd.concat(chunk_frames, ignore_index=True) if chunk_frames else pd.DataFrame()
)

# Concatenated data len
print(len(concatenated_data))


413840


In [21]:
# First five rows of the data rebuilt from the chunks
concatenated_data.head(n=5)

Unnamed: 0,Product Name,Brand Name,Price,Rating,Reviews,Review Votes
0,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,5,I feel so LUCKY to have found this used (phone...,1.0
1,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,4,"nice phone, nice up grade from my pantach revu...",0.0
2,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,5,Very pleased,0.0
3,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,4,It works good but it goes slow sometimes but i...,0.0
4,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,4,Great phone to replace my lost phone. The only...,0.0
