In [12]:
import pandas as pd
import dask.dataframe as dd
import modin as mpd
import ray as rd
import yaml
import os
import gzip
import time



In [6]:
#Read the CSV file using different methods
pandas_df = pd.read_csv('bmi.csv')
dask_df = dd.read_csv('bmi.csv')


In [14]:
#Presenting Computational Efficiency
# File path
file_path = 'bmi.csv'  # Replace with the actual file path

# Measure time for Pandas
start_time_pandas = time.time()
pandas_df = pd.read_csv(file_path)
pandas_time = time.time() - start_time_pandas

# Measure time for Dask
start_time_dask = time.time()
dask_df = dd.read_csv(file_path)
dask_time = time.time() - start_time_dask

# Print the loading times
print(f"Pandas Loading Time: {pandas_time:.3f} seconds")
print(f"Dask Loading Time: {dask_time:.3f} seconds")

Pandas Loading Time: 0.009 seconds
Dask Loading Time: 0.005 seconds


In [7]:
#Basic Data Validation
pandas_df.columns = pandas_df.columns.str.replace(r'[^\w\s]', '').str.strip()


In [8]:
#YAML file for column names and separator
columns_yaml = {
    'columns': list(pandas_df.columns),
    'separator': '|'
}

with open('columns.yaml', 'w') as yaml_file:
    yaml.dump(columns_yaml, yaml_file)

In [9]:
#Validate number of columns and column names
with open('columns.yaml', 'r') as yaml_file:
    yaml_data = yaml.load(yaml_file, Loader=yaml.FullLoader)

expected_columns = yaml_data['columns']
expected_separator = yaml_data['separator']

if set(expected_columns) == set(pandas_df.columns) and expected_separator == '|':
    print("Columns and separator match the YAML file.")
else:
    print("Columns or separator do not match the YAML file.")


Columns and separator match the YAML file.


In [10]:
#pipe-separated text file (|) in gz format
output_file = 'your_data_output.txt.gz'

with gzip.open(output_file, 'wt', encoding='utf-8') as f:
    pandas_df.to_csv(f, sep=expected_separator, index=False)

print(f"File '{output_file}' written successfully.")


File 'your_data_output.txt.gz' written successfully.


In [11]:
#File Summary
total_rows = len(pandas_df)
total_columns = len(pandas_df.columns)
file_size = os.path.getsize(output_file)

print(f"Total number of rows: {total_rows}")
print(f"Total number of columns: {total_columns}")
print(f"File size: {file_size} bytes")


Total number of rows: 741
Total number of columns: 5
File size: 10498 bytes
