In [1]:
import pandas as pd

In [2]:
# Load the cleaned patients CSV into a DataFrame for inspection.
df = pd.read_csv("patients_cleaned.csv")

In [3]:
# Preview the first rows to sanity-check the columns and value formats.
df.head()

Unnamed: 0,row_id,subject_id,gender,dob,dod,dod_hosp,dod_ssn,expire_flag
0,9467,10006,F,2094-03-05,2165-08-12,2165-08-12,2165-08-12,1
1,9472,10011,F,2090-06-05,2126-08-28,2126-08-28,,1
2,9474,10013,F,2038-09-03,2125-10-07,2125-10-07,2125-10-07,1
3,9478,10017,F,2075-09-21,2152-09-12,,2152-09-12,1
4,9479,10019,M,2114-06-20,2163-05-15,2163-05-15,2163-05-15,1


In [5]:
# Show column dtypes and non-null counts to see which columns need type conversion.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   row_id       100 non-null    int64 
 1   subject_id   100 non-null    int64 
 2   gender       100 non-null    object
 3   dob          100 non-null    object
 4   dod          100 non-null    object
 5   dod_hosp     70 non-null     object
 6   dod_ssn      77 non-null     object
 7   expire_flag  100 non-null    int64 
dtypes: int64(3), object(5)
memory usage: 6.4+ KB


In [6]:
import warnings

import pyarrow as pa
import pyarrow.parquet as pq

# Path to the cleaned patients CSV
csv_path = "patients_cleaned.csv"

# Load the data from the local CSV file
df = pd.read_csv(csv_path)

# Convert the date columns to datetime.
# errors='coerce' replaces every value that cannot be converted with NaT, so
# compare the null counts before and after parsing and warn when values that
# were present in the CSV have been silently lost.
date_cols = ['dob', 'dod', 'dod_hosp', 'dod_ssn']
for col in date_cols:
    parsed = pd.to_datetime(df[col], errors='coerce')
    n_lost = int(parsed.isna().sum() - df[col].isna().sum())
    if n_lost > 0:
        warnings.warn(
            f"{n_lost} non-null value(s) in column '{col}' could not be "
            "parsed as datetimes and were set to NaT"
        )
    df[col] = parsed

# Manually define schema in a Hive-compatible manner
schema = pa.schema([
    pa.field('row_id', pa.int64()),           # Hive: bigint
    pa.field('subject_id', pa.int64()),       # Hive: bigint
    pa.field('gender', pa.string()),          # Hive: string
    pa.field('dob', pa.timestamp('ns')),      # Hive: timestamp
    pa.field('dod', pa.timestamp('ns')),      # Hive: timestamp
    pa.field('dod_hosp', pa.timestamp('ns')), # Hive: timestamp
    pa.field('dod_ssn', pa.timestamp('ns')),  # Hive: timestamp
    pa.field('expire_flag', pa.int32())       # Hive: int
])

# Convert the DataFrame to a PyArrow table and apply the schema.
# preserve_index=False keeps the pandas index out of the table, so the column
# names always line up with the schema regardless of the frame's index;
# cast() raises if the column names do not match the schema.
table = pa.Table.from_pandas(df, preserve_index=False)
table = table.cast(schema)

# Save the file in Parquet format compatible with Hive.
# flavor='spark' applies pyarrow's Spark compatibility options (per the
# pyarrow docs this also enables the INT96 timestamp encoding).
parquet_path = "patients_cleaned.parquet"
pq.write_table(
    table,
    parquet_path,
    version='2.6',
    use_dictionary=True,
    compression='SNAPPY',
    flavor='spark'
)

