## Reading a Parquet file from an online source

In [1]:
import pandas as pd
import pyarrow.parquet as pq
import requests
import io

# URL of the Parquet file
url = 'https://github.com/Teradata/kylo/raw/master/samples/sample-data/parquet/userdata1.parquet'

# Download the Parquet file.
# - timeout: keeps the cell from hanging indefinitely on a stalled connection.
# - raise_for_status(): stops here with a clear HTTP error (4xx/5xx) instead of
#   handing an error page to the Parquet reader, which would fail with a
#   confusing parse error further down.
response = requests.get(url, timeout=30)
response.raise_for_status()

# Wrap the downloaded bytes in an in-memory, file-like buffer
parquet_file = io.BytesIO(response.content)

# Read the Parquet file into a pandas DataFrame
df = pd.read_parquet(parquet_file)

# Display the first few rows of the DataFrame
df.head()

Unnamed: 0,registration_dttm,id,first_name,last_name,email,gender,ip_address,cc,country,birthdate,salary,title,comments
0,2016-02-03 07:55:29,1,Amanda,Jordan,ajordan0@com.com,Female,1.197.201.2,6759521864920116.0,Indonesia,3/8/1971,49756.53,Internal Auditor,100.0
1,2016-02-03 17:04:03,2,Albert,Freeman,afreeman1@is.gd,Male,218.111.175.34,,Canada,1/16/1968,150280.17,Accountant IV,
2,2016-02-03 01:09:31,3,Evelyn,Morgan,emorgan2@altervista.org,Female,7.161.136.94,6767119071901597.0,Russia,2/1/1960,144972.51,Structural Engineer,
3,2016-02-03 00:36:21,4,Denise,Riley,driley3@gmpg.org,Female,140.35.109.83,3576031598965625.0,China,4/8/1997,90263.05,Senior Cost Accountant,
4,2016-02-03 05:05:31,5,Carlos,Burns,cburns4@miitbeian.gov.cn,,169.113.235.40,5602256255204850.0,South Africa,,,,


In [2]:
# Count the rows of the DataFrame (the length of its index)
num_rows = len(df.index)

# Report the row count
print(f'Number of rows: {num_rows}')


Number of rows: 1000


## Processing the DataFrame

### Get distinct values from the birthdate column

In [3]:
# Collect the distinct non-null values of the "birthdate" column.
# Note: dropna() only removes missing values; empty strings ('') are ordinary
# values and therefore still appear in the result.
birthdates_present = df['birthdate'].dropna()
distinct_birthdates = birthdates_present.unique()

# Show the distinct birthdates
distinct_birthdates


array(['3/8/1971', '1/16/1968', '2/1/1960', '4/8/1997', '', '2/25/1983',
       '12/18/1987', '3/1/1962', '3/27/1992', '1/28/1997', '8/12/1968',
       '8/15/1975', '6/27/1970', '12/20/1989', '4/13/1990', '1/15/1978',
       '6/5/1985', '1/23/1971', '6/5/1964', '12/10/1979', '9/22/1995',
       '9/21/1968', '5/20/1958', '7/21/1986', '11/13/1969', '2/6/1968',
       '8/20/1964', '8/8/1991', '1/28/1958', '3/8/1972', '12/19/1999',
       '8/28/1967', '9/28/1963', '4/8/1969', '4/6/1990', '12/12/1974',
       '5/29/1978', '11/1/1992', '8/26/1986', '6/26/1971', '9/6/1957',
       '5/7/1985', '8/23/1986', '7/21/1984', '2/15/1963', '11/28/1963',
       '12/13/1962', '5/27/1959', '4/10/1965', '4/5/1991', '5/6/1982',
       '7/25/1994', '5/3/1964', '8/29/1971', '2/13/1978', '11/6/1978',
       '4/11/1985', '12/20/1984', '3/9/1985', '9/24/1960', '9/30/1987',
       '9/12/1997', '11/4/1955', '6/5/1978', '11/4/1970', '10/8/1990',
       '10/28/1961', '12/23/1988', '5/25/1986', '6/24/2000', '2/1/196

### Create an "age" column and handle invalid dates

In [4]:
from datetime import datetime

# Parse the "birthdate" strings (month/day/year) into datetimes. Values that do not
# match the format (e.g. empty strings) are coerced to NaT (Not a Time).
parsed_birthdates = pd.to_datetime(df['birthdate'], format='%m/%d/%Y', errors='coerce')

# Replace the column and drop the rows whose birthdate could not be parsed.
# assign() returns a new frame (column order is preserved), so the frame that was
# loaded earlier is not modified in place.
df = df.assign(birthdate=parsed_birthdates).dropna(subset=['birthdate'])

# Reference date for the age calculation.
# NOTE: this is the date the cell is executed, so ages change between runs.
today = datetime.today()

# Vectorized age calculation: the difference in calendar years, minus one when this
# year's birthday has not occurred yet. Encoding (month, day) as month * 100 + day
# gives the same ordering as comparing the (month, day) tuples.
birth = df['birthdate'].dt
birthday_pending = (birth.month * 100 + birth.day) > (today.month * 100 + today.day)
df = df.assign(age=(today.year - birth.year - birthday_pending).astype('int64'))

# Display the first few rows to check the new "age" column
df.head()


Unnamed: 0,registration_dttm,id,first_name,last_name,email,gender,ip_address,cc,country,birthdate,salary,title,comments,age
0,2016-02-03 07:55:29,1,Amanda,Jordan,ajordan0@com.com,Female,1.197.201.2,6759521864920116.0,Indonesia,1971-03-08,49756.53,Internal Auditor,100.0,53
1,2016-02-03 17:04:03,2,Albert,Freeman,afreeman1@is.gd,Male,218.111.175.34,,Canada,1968-01-16,150280.17,Accountant IV,,56
2,2016-02-03 01:09:31,3,Evelyn,Morgan,emorgan2@altervista.org,Female,7.161.136.94,6767119071901597.0,Russia,1960-02-01,144972.51,Structural Engineer,,64
3,2016-02-03 00:36:21,4,Denise,Riley,driley3@gmpg.org,Female,140.35.109.83,3576031598965625.0,China,1997-04-08,90263.05,Senior Cost Accountant,,27
5,2016-02-03 07:22:34,6,Kathryn,White,kwhite5@google.com,Female,195.131.81.179,3583136326049310.0,Indonesia,1983-02-25,69227.11,Account Executive,,41


### Filter the DataFrame: keep users older than 30

In [5]:
# Simple processing example: keep only the rows whose "age" is strictly greater than 30
filtered_df = df.loc[df['age'].gt(30)]

# Show the first few rows of the filtered DataFrame
filtered_df.head()

Unnamed: 0,registration_dttm,id,first_name,last_name,email,gender,ip_address,cc,country,birthdate,salary,title,comments,age
0,2016-02-03 07:55:29,1,Amanda,Jordan,ajordan0@com.com,Female,1.197.201.2,6759521864920116.0,Indonesia,1971-03-08,49756.53,Internal Auditor,100.0,53
1,2016-02-03 17:04:03,2,Albert,Freeman,afreeman1@is.gd,Male,218.111.175.34,,Canada,1968-01-16,150280.17,Accountant IV,,56
2,2016-02-03 01:09:31,3,Evelyn,Morgan,emorgan2@altervista.org,Female,7.161.136.94,6767119071901597.0,Russia,1960-02-01,144972.51,Structural Engineer,,64
5,2016-02-03 07:22:34,6,Kathryn,White,kwhite5@google.com,Female,195.131.81.179,3583136326049310.0,Indonesia,1983-02-25,69227.11,Account Executive,,41
6,2016-02-03 08:33:08,7,Samuel,Holmes,sholmes6@foxnews.com,Male,232.234.81.197,3582641366974690.0,Portugal,1987-12-18,14247.62,Senior Financial Analyst,,36


In [6]:
# Count the rows that remain after filtering (the length of the filtered index)
filtered_num_rows = len(filtered_df.index)

# Report the filtered row count
print(f'Number of rows in filtered DataFrame: {filtered_num_rows}')


Number of rows in filtered DataFrame: 661


## Writing the processed DataFrame back to a Parquet file

In [7]:
# Persist the filtered DataFrame as a new Parquet file
output_path = 'filtered_user_data.parquet'
filtered_df.to_parquet(output_path)

# Round-trip check: load the file that was just written and preview it
df_written = pd.read_parquet(output_path)

df_written.head()

Unnamed: 0,registration_dttm,id,first_name,last_name,email,gender,ip_address,cc,country,birthdate,salary,title,comments,age
0,2016-02-03 07:55:29,1,Amanda,Jordan,ajordan0@com.com,Female,1.197.201.2,6759521864920116.0,Indonesia,1971-03-08,49756.53,Internal Auditor,100.0,53
1,2016-02-03 17:04:03,2,Albert,Freeman,afreeman1@is.gd,Male,218.111.175.34,,Canada,1968-01-16,150280.17,Accountant IV,,56
2,2016-02-03 01:09:31,3,Evelyn,Morgan,emorgan2@altervista.org,Female,7.161.136.94,6767119071901597.0,Russia,1960-02-01,144972.51,Structural Engineer,,64
5,2016-02-03 07:22:34,6,Kathryn,White,kwhite5@google.com,Female,195.131.81.179,3583136326049310.0,Indonesia,1983-02-25,69227.11,Account Executive,,41
6,2016-02-03 08:33:08,7,Samuel,Holmes,sholmes6@foxnews.com,Male,232.234.81.197,3582641366974690.0,Portugal,1987-12-18,14247.62,Senior Financial Analyst,,36
