In [2]:
# Import the necessary functions from the modular files
from read_parquet import read_parquet_file, display_data
from write_parquet import create_dataframe, write_parquet_file
from convert_formats import convert_csv_to_parquet, convert_parquet_to_csv
from merge_parquet import merge_parquet_files
from data_analysis import perform_data_analysis
from query_parquet import query_and_filter_data

In [3]:
# Input and output locations used throughout the notebook.
# Every path lives under DATA_DIR, so pointing the notebook at a different
# folder only requires changing this one value. Paths remain plain strings
# joined with '/', so each name keeps exactly the value it had before.
DATA_DIR = 'files'

# Files that later cells pass to the helpers as sources
parquet_file = f'{DATA_DIR}/sample.parquet'
csv_file = f'{DATA_DIR}/sample.csv'
file1 = f'{DATA_DIR}/file1.parquet'
file2 = f'{DATA_DIR}/file2.parquet'

# Targets passed to the write, convert, and merge cells
output_parquet_file = f'{DATA_DIR}/output.parquet'
output_csv_file = f'{DATA_DIR}/output.csv'
merged_file = f'{DATA_DIR}/merged_file.parquet'

In [4]:
# Load the sample Parquet file and show it, using the `read_parquet_file` and
# `display_data` helpers imported from the local `read_parquet` module.
# The recorded output shows only rows 0-4, so `display_data` presumably prints
# a head-style preview -- confirm against the helper's implementation.
# NOTE(review): the preview includes name, email, ip_address and cc columns;
# confirm the sample data is synthetic before sharing the notebook with outputs.
df = read_parquet_file(parquet_file)
display_data(df)

    registration_dttm  id first_name last_name                     email  \
0 2016-02-03 07:55:29   1     Amanda    Jordan          ajordan0@com.com   
1 2016-02-03 17:04:03   2     Albert   Freeman           afreeman1@is.gd   
2 2016-02-03 01:09:31   3     Evelyn    Morgan   emorgan2@altervista.org   
3 2016-02-03 00:36:21   4     Denise     Riley          driley3@gmpg.org   
4 2016-02-03 05:05:31   5     Carlos     Burns  cburns4@miitbeian.gov.cn   

   gender      ip_address                cc       country  birthdate  \
0  Female     1.197.201.2  6759521864920116     Indonesia   3/8/1971   
1    Male  218.111.175.34                          Canada  1/16/1968   
2  Female    7.161.136.94  6767119071901597        Russia   2/1/1960   
3  Female   140.35.109.83  3576031598965625         China   4/8/1997   
4          169.113.235.40  5602256255204850  South Africa              

      salary                   title comments  
0   49756.53        Internal Auditor    1E+02  
1  150280.17  

In [5]:
# Build a DataFrame with the project helper `create_dataframe` and hand it,
# together with `output_parquet_file`, to `write_parquet_file` (presumably the
# destination path -- confirm in the write_parquet module).
# This binds the notebook-level name `df` to the newly created frame.
df = create_dataframe()
write_parquet_file(df, output_parquet_file)

In [6]:
# Convert between CSV and Parquet using the helpers from `convert_formats`.
# Going by the helper and variable names, the first call reads `csv_file` and
# writes `output_parquet_file`, and the second reads `parquet_file` and writes
# `output_csv_file` -- the argument order is assumed; confirm in convert_formats.
# NOTE(review): `output_parquet_file` is the same path passed to
# `write_parquet_file` in the writing cell, so this call presumably overwrites
# that file -- confirm this is intended, or use a separate target path.
convert_csv_to_parquet(csv_file, output_parquet_file)
convert_parquet_to_csv(parquet_file, output_csv_file)

In [7]:
# Combine `file1` and `file2` with `merge_parquet_files`, passing `merged_file`
# as the third argument (presumably the output path -- confirm in merge_parquet).
# Whether the helper merges on a key or appends rows is not visible from here.
merge_parquet_files(file1, file2, merged_file)

In [9]:
# Run the project's analysis helper on the sample Parquet file.
# The helper returns a pair, unpacked here into `mean_salary` and `filtered_df`;
# the filter criterion lives inside `perform_data_analysis` and is not visible
# in this notebook.
mean_salary, filtered_df = perform_data_analysis(parquet_file)
print('Mean salary:', mean_salary)
# Prints the frame's plain-text representation; the recorded output shows an
# abbreviated table (rows elided with `..`).
print(filtered_df)

Mean salary: 149005.35665236053
      registration_dttm    id first_name last_name                    email  \
1   2016-02-03 17:04:03     2     Albert   Freeman          afreeman1@is.gd   
2   2016-02-03 01:09:31     3     Evelyn    Morgan  emorgan2@altervista.org   
3   2016-02-03 00:36:21     4     Denise     Riley         driley3@gmpg.org   
5   2016-02-03 07:22:34     6    Kathryn     White       kwhite5@google.com   
7   2016-02-03 06:47:06     8      Harry    Howell      hhowell7@eepurl.com   
..                  ...   ...        ...       ...                      ...   
993 2016-02-03 01:14:13   994      Carol  Williams     cwilliamsrl@army.mil   
994 2016-02-03 00:18:26   995       Jose     Mccoy      jmccoyrm@elpais.com   
995 2016-02-03 10:30:59   996     Dennis    Harris     dharrisrn@eepurl.com   
996 2016-02-03 17:16:53   997     Gloria  Hamilton   ghamiltonro@rambler.ru   
999 2016-02-03 09:52:18  1000      Julie     Meyer      jmeyerrr@flavors.me   

     gender       i

In [10]:
# Filter the sample Parquet file by passing the expression 'salary > 50000'
# to `query_and_filter_data` (how the expression is evaluated is defined in
# the query_parquet module and not visible here).
# Note: this rebinds `filtered_df`, replacing the value assigned in the
# data-analysis cell.
filtered_df = query_and_filter_data(parquet_file, 'salary > 50000')
# Prints the frame's plain-text representation.
print(filtered_df)

      registration_dttm    id first_name last_name                    email  \
1   2016-02-03 17:04:03     2     Albert   Freeman          afreeman1@is.gd   
2   2016-02-03 01:09:31     3     Evelyn    Morgan  emorgan2@altervista.org   
3   2016-02-03 00:36:21     4     Denise     Riley         driley3@gmpg.org   
5   2016-02-03 07:22:34     6    Kathryn     White       kwhite5@google.com   
7   2016-02-03 06:47:06     8      Harry    Howell      hhowell7@eepurl.com   
..                  ...   ...        ...       ...                      ...   
993 2016-02-03 01:14:13   994      Carol  Williams     cwilliamsrl@army.mil   
994 2016-02-03 00:18:26   995       Jose     Mccoy      jmccoyrm@elpais.com   
995 2016-02-03 10:30:59   996     Dennis    Harris     dharrisrn@eepurl.com   
996 2016-02-03 17:16:53   997     Gloria  Hamilton   ghamiltonro@rambler.ru   
999 2016-02-03 09:52:18  1000      Julie     Meyer      jmeyerrr@flavors.me   

     gender       ip_address                  cc   