# Installing pandas and pyarrow
pandas is used to manipulate and analyse tabular data; pyarrow provides the Parquet reading and writing used in this notebook.

In [1]:
!pip install pandas pyarrow



In [2]:
import os

import pandas as pd

# Converting the Parquet files to CSV format to read the dataset

In [5]:
import pandas as pd

def convert_parquet_to_csv(parquet_file, csv_file):
    """Read a Parquet file and write its contents to a CSV file.

    Parameters
    ----------
    parquet_file : str or path-like
        Source file, handed to ``pd.read_parquet``.
    csv_file : str, path-like or file-like
        Destination handed to ``DataFrame.to_csv``. When it is a filesystem
        path, missing parent directories are created before writing.

    Notes
    -----
    The DataFrame index is not written to the CSV (``index=False``).
    A confirmation line is printed after the file has been written.
    """
    df = pd.read_parquet(parquet_file)

    # to_csv does not create directories; without this a run in a checkout
    # that lacks the output folder fails with OSError. File-like targets
    # are left alone.
    if isinstance(csv_file, (str, os.PathLike)):
        os.makedirs(os.path.dirname(os.path.abspath(csv_file)), exist_ok=True)

    df.to_csv(csv_file, index=False)
    print(f"Converted {parquet_file} to {csv_file}")

# One (source Parquet, destination CSV) pair per daily file; converted in order.
daily_conversions = [
    ('data/output_data/employee_earnings/earnings_date=2022-02-10/employee_earnings.parquet', 'data/data_csv/output1.csv'),
    ('data/output_data/employee_earnings/earnings_date=2022-02-11/employee_earnings.parquet', 'data/data_csv/output2.csv'),
    ('data/output_data/employee_earnings/earnings_date=2022-02-12/employee_earnings.parquet', 'data/data_csv/output3.csv'),
    ('data/output_data/employee_earnings/earnings_date=2022-02-13/employee_earnings.parquet', 'data/data_csv/output4.csv'),
    ('data/output_data/employee_earnings/earnings_date=2022-02-14/employee_earnings.parquet', 'data/data_csv/output5.csv'),
]
for parquet_path, csv_path in daily_conversions:
    convert_parquet_to_csv(parquet_path, csv_path)

Converted data/output_data/employee_earnings/earnings_date=2022-02-10/employee_earnings.parquet to data/data_csv/output1.csv
Converted data/output_data/employee_earnings/earnings_date=2022-02-11/employee_earnings.parquet to data/data_csv/output2.csv
Converted data/output_data/employee_earnings/earnings_date=2022-02-12/employee_earnings.parquet to data/data_csv/output3.csv
Converted data/output_data/employee_earnings/earnings_date=2022-02-13/employee_earnings.parquet to data/data_csv/output4.csv
Converted data/output_data/employee_earnings/earnings_date=2022-02-14/employee_earnings.parquet to data/data_csv/output5.csv


## Creating two new files, each by summing the `earnings` column of two of the CSV files created above

### For file 1

In [6]:
import pandas as pd

def calculate_sum_csv_files(file1, file2, output_file):
    """Add the ``earnings`` columns of two CSV files and save the result.

    The output keeps every non-``earnings`` column of ``file1`` in its
    original order and places the summed ``earnings`` column last.

    Parameters
    ----------
    file1, file2 : str or path-like
        Input CSV files; each must contain an ``earnings`` column.
    output_file : str, path-like or file-like
        Destination handed to ``DataFrame.to_csv``. When it is a filesystem
        path, missing parent directories are created before writing.

    Raises
    ------
    KeyError
        If either input has no ``earnings`` column.

    Notes
    -----
    Rows are paired by position, not by any employee key: both frames carry
    the default index from ``read_csv`` and the addition aligns on it. Rows
    of ``file1`` without a counterpart in ``file2`` get NaN earnings; extra
    rows of ``file2`` are dropped.
    """
    df1 = pd.read_csv(file1)
    df2 = pd.read_csv(file2)

    total = df1['earnings'] + df2['earnings']

    # Dropping and re-adding the column moves `earnings` to the end while
    # leaving the remaining columns of file1 untouched.
    new_df = df1.drop(columns='earnings').assign(earnings=total)

    # to_csv does not create directories; a new earnings_date=... partition
    # folder would otherwise have to be created by hand first.
    if isinstance(output_file, (str, os.PathLike)):
        os.makedirs(os.path.dirname(os.path.abspath(output_file)), exist_ok=True)

    new_df.to_csv(output_file, index=False)
    print(f"Sum data written to {output_file}")

# Sum output1.csv and output2.csv into the earnings_date=2022-02-15 folder.
file1, file2 = 'data/data_csv/output1.csv', 'data/data_csv/output2.csv'
output_file = 'data/output_data/employee_earnings/earnings_date=2022-02-15/employee_earnings.csv'
calculate_sum_csv_files(file1=file1, file2=file2, output_file=output_file)


Sum data written to data/output_data/employee_earnings/earnings_date=2022-02-15/employee_earnings.csv


### For file 2

In [7]:
# Sum output3.csv and output4.csv into the earnings_date=2022-02-16 folder.
file1, file2 = 'data/data_csv/output3.csv', 'data/data_csv/output4.csv'
output_file = 'data/output_data/employee_earnings/earnings_date=2022-02-16/employee_earnings.csv'
calculate_sum_csv_files(file1=file1, file2=file2, output_file=output_file)


Sum data written to data/output_data/employee_earnings/earnings_date=2022-02-16/employee_earnings.csv


## Converting the summed CSV files to Parquet format

In [8]:
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq

def convert_csv_to_parquet(csv_file, parquet_file):
    """Read a CSV file and write its contents to a Parquet file via PyArrow.

    Parameters
    ----------
    csv_file : str or path-like
        Source file, handed to ``pd.read_csv``.
    parquet_file : str, path-like or file-like
        Destination handed to ``pyarrow.parquet.write_table``. When it is a
        filesystem path, missing parent directories are created first.

    Notes
    -----
    A confirmation line is printed after the file has been written.
    """
    df = pd.read_csv(csv_file)

    # Same guard as the other writers in this notebook: the Parquet writer
    # is not relied on to create the target folder. File-like targets are
    # left alone.
    if isinstance(parquet_file, (str, os.PathLike)):
        os.makedirs(os.path.dirname(os.path.abspath(parquet_file)), exist_ok=True)

    # Convert the DataFrame to a PyArrow Table, then write it as Parquet.
    table = pa.Table.from_pandas(df)
    pq.write_table(table, parquet_file)
    print(f"Converted {csv_file} to {parquet_file}")

# Write the summed 2022-02-15 CSV as a Parquet file in the same folder.
csv_file, parquet_file = (
    'data/output_data/employee_earnings/earnings_date=2022-02-15/employee_earnings.csv',
    'data/output_data/employee_earnings/earnings_date=2022-02-15/employee_earnings.parquet',
)
convert_csv_to_parquet(csv_file=csv_file, parquet_file=parquet_file)


Converted data/output_data/employee_earnings/earnings_date=2022-02-15/employee_earnings.csv to data/output_data/employee_earnings/earnings_date=2022-02-15/employee_earnings.parquet


In [9]:
# Write the summed 2022-02-16 CSV as a Parquet file in the same folder.
csv_file, parquet_file = (
    'data/output_data/employee_earnings/earnings_date=2022-02-16/employee_earnings.csv',
    'data/output_data/employee_earnings/earnings_date=2022-02-16/employee_earnings.parquet',
)
convert_csv_to_parquet(csv_file=csv_file, parquet_file=parquet_file)

Converted data/output_data/employee_earnings/earnings_date=2022-02-16/employee_earnings.csv to data/output_data/employee_earnings/earnings_date=2022-02-16/employee_earnings.parquet
