In [1]:
import os
# Tell the Google Cloud client libraries which service-account key file to use.
os.environ.update(GOOGLE_APPLICATION_CREDENTIALS='Key/pragmatic-bongo-404116-e2d94f71da27.json')

#### Data Transformation (Pulling Data From Storage First)

In [None]:
from google.cloud import storage
import pandas as pd
import io
import hashlib

# Where the raw collisions CSV lives in GCS; edit both values to use your own data.
bucket_name = "cis4400_homework"
source_blob_name = "motor_vehicle_collisions_1.csv"

# One client for the whole notebook, plus a handle to the bucket named above.
storage_client = storage.Client()
bucket = storage_client.bucket(bucket_name)

# Create a function to download the CSV file from GCS into memory
def download_blob_to_memory(bucket_name, source_blob_name):
    """Download a GCS object and return its contents as an in-memory stream.

    Args:
        bucket_name: Name of the bucket that holds the object.
        source_blob_name: Name (path) of the object inside that bucket.

    Returns:
        An ``io.BytesIO`` positioned at the start of the downloaded bytes.
    """
    # Resolve the bucket from the argument instead of the module-level
    # ``bucket`` so that the ``bucket_name`` parameter is actually honoured.
    target_bucket = storage_client.bucket(bucket_name)
    blob = target_bucket.blob(source_blob_name)
    return io.BytesIO(blob.download_as_bytes())

# Download the CSV file from GCS into memory
csv_memory = download_blob_to_memory(bucket_name, source_blob_name)

# Load the CSV data into a Pandas DataFrame
df = pd.read_csv(csv_memory)
# Replace spaces with underscores in the column names
df.columns = df.columns.str.replace(' ', '_')

# Parse the crash date once and reuse it for the formatted column and its parts
crash_dates = pd.to_datetime(df['CRASH_DATE'])

# 1. Unified date format YYYY-MM-DD
df['CRASH_DATE'] = crash_dates.dt.strftime('%Y-%m-%d')

# 2. Splitting the date into multiple units (Year, Month, Day)
df['YEAR'] = crash_dates.dt.year
df['MONTH'] = crash_dates.dt.month
df['DAY'] = crash_dates.dt.day

# 3. Removing NULL values
# Coerce to numeric first so that any non-numeric ZIP value becomes NaN as well,
# then replace every NaN with a placeholder value (99999).
# The result is assigned back to the column: calling fillna(inplace=True) on a
# selected column is deprecated chained assignment and does not update the
# DataFrame when pandas Copy-on-Write is enabled.
df['ZIP_CODE'] = pd.to_numeric(df['ZIP_CODE'], errors='coerce').fillna(99999)

# 4. Removing Duplicate rows
df.drop_duplicates(inplace=True)

# 5. Verify Data against data reference (ZIP codes should be integers)
# Safe after step 3: the column is numeric and contains no NaN.
df['ZIP_CODE'] = df['ZIP_CODE'].astype(int)

# 6. Correct data types for new facts generated
# As an example, we convert 'COLLISION_ID' to a string, as it is a unique identifier and not a numerical value
df['COLLISION_ID'] = df['COLLISION_ID'].astype(str)

# 7. Adding one or many columns
# Add a column indicating whether an accident resulted in injuries or not
# (rows with a missing injury count compare as False)
df['INJURIES'] = df['NUMBER_OF_PERSONS_INJURED'] > 0


def create_location_id(row):
    """Return a 10-character hexadecimal location key for one row.

    The key is the leading part of the SHA-1 digest of the row's ZIP_CODE
    followed directly by its BOROUGH, both rendered with ``str()``.
    More fields can be appended to the hashed text if needed.
    """
    key_text = str(row['ZIP_CODE']) + str(row['BOROUGH'])
    digest = hashlib.sha1(key_text.encode('utf-8')).hexdigest()
    # Only the first 10 hex characters are kept as the location ID
    return digest[:10]

# Create a DATE_ID using the YYYYMMDD format
df['DATE_ID'] = (
    df['YEAR'].astype(str)
    + df['MONTH'].astype(str).str.zfill(2)
    + df['DAY'].astype(str).str.zfill(2)
)

# Create a TIME_ID using the HHMM format from the 'CRASH_TIME' column.
# The colon is removed as a literal (not a regex), and the result is left-padded
# to four digits so that a time written with a single-digit hour, e.g. "2:39",
# becomes "0239" instead of "239". Values already in HH:MM form are unchanged.
df['TIME_ID'] = df['CRASH_TIME'].str.replace(':', '', regex=False).str.zfill(4)

# Create a LOCATION_ID using a combination of ZIP_CODE and BOROUGH
df['LOCATION_ID'] = df.apply(create_location_id, axis=1)