In [3]:
# Cleaning and transposing CSV data into n-columns and saving in XLSX format
# Version 0.1
import os
import pandas as pd
import numpy as np
import re

In [4]:
# Location of the input CSV; the output file name is later derived from its base name.
# NOTE(review): this path is relative (no leading '/'), so it resolves against the
# notebook's working directory - confirm that is intended.
file_path = 'home/username/Documents/your_file.csv'

# header=None: the first line of the file is read as data, not as column names.
df = pd.read_csv(
    file_path,
    header=None,
)

In [6]:
# Drop columns that are entirely NaN, then drop every row that still contains a NaN.
df_cleaned = (
    df
    .dropna(axis='columns', how='all')
    .dropna(axis='index', how='any')
)

# Preview the first ten surviving rows.
df_cleaned.head(10)

In [232]:
# Column names for the transposed table; they are applied only when their count
# matches the number of columns produced by the reshape step.
headers = [
    'Header 1',
    'Header 2',
    'Header 3',
]

In [233]:
chunks = 3 # Number of consecutive values that are transposed into one output row

# Flatten every remaining cell (row-major) into a single 1-D sequence.
# The padding must be computed on the number of *values* being reshaped, not on
# the number of rows: with more than one column the two differ, and reshape
# would fail or mis-group. Padding the flat sequence also avoids concatenating
# an unnamed Series onto the frame, which aligns on column label 0 and creates
# a stray column whenever the surviving column is labelled differently.
flat_values = pd.Series(df_cleaned.to_numpy().ravel())

# Pad the tail with NaN so the total length is an exact multiple of `chunks`.
# reindex() fills the new positions with NaN and promotes the dtype as needed.
padding = -len(flat_values) % chunks
if padding:
    flat_values = flat_values.reindex(range(len(flat_values) + padding))

# Transpose the cleaned data: every `chunks` consecutive values become one row.
# df_cleaned itself is left untouched, so this cell can be re-run safely.
reshaped_data = flat_values.to_numpy().reshape(-1, chunks)
df_transposed = pd.DataFrame(reshaped_data)

In [234]:
# Apply the custom headers only when there is exactly one per column;
# otherwise keep the default column labels and warn.
if len(headers) != df_transposed.shape[1]:
    print("Warning: The number of headers does not match the number of columns in df_transposed.")
else:
    df_transposed.columns = headers


In [235]:
# Function to remove non-printable characters (including control characters)
def remove_non_printable(text):
    """Return ``text`` with all non-printable characters removed.

    A character is kept when ``str.isprintable()`` is true for it. This drops
    control characters (NUL, tab, newline, DEL, ...) while preserving printable
    non-ASCII text such as accented or CJK letters. The previous ASCII-only
    regex deleted those as well, silently corrupting data.

    Args:
        text: Any value. Only ``str`` instances are filtered.

    Returns:
        The filtered string when ``text`` is a ``str``; otherwise ``text``
        itself, unchanged (e.g. numbers, NaN, None).
    """
    if isinstance(text, str):
        return ''.join(ch for ch in text if ch.isprintable())
    return text


In [236]:
# Clean both headers and data by removing non-printable characters
df_transposed.columns = df_transposed.columns.map(remove_non_printable)  # Clean column headers

# DataFrame.apply hands each whole column (a Series) to the function, and
# remove_non_printable returns non-string arguments unchanged, so calling it
# directly would leave every cell as it was. Map it over the elements of each
# column instead so that every individual cell is cleaned.
df_transposed = df_transposed.apply(
    lambda column: column.map(remove_non_printable)
)  # Clean data cells


In [237]:
# Directory that receives the exported file; created on demand, and left
# untouched if it is already there.
output_dir = 'cleaned_data'
os.makedirs(name=output_dir, exist_ok=True)

In [238]:
# Build the output path: <output_dir>/<input file name without extension>_transposed.xlsx
source_name = os.path.basename(file_path)
base_name = os.path.splitext(source_name)[0]
output_file = os.path.join(output_dir, f"{base_name}_transposed.xlsx")

# Write df_transposed as an Excel workbook, with the header row and without the index.
df_transposed.to_excel(
    output_file,
    index=False,
    header=True,
)

print(f"File saved to {output_file}")

File saved to cleaned_data/compla_transposed.xlsx


In [128]:
# Alternatively, save df_transposed as a CSV file in the working directory, or (second variant below) as an Excel file in the cleaned_data directory
# base_name = os.path.splitext(os.path.basename(file_path))[0]
# output_file = f"{base_name}_transposed.csv"
# df_transposed.to_csv(output_file, index=False, header=False)

# output_file = os.path.join(output_dir, f"{base_name}_transposed.xlsx")
# df_transposed.to_excel(output_file, index=False, header=True)
