In [None]:
from io import StringIO
import os

import boto3
import pandas as pd

import matplotlib.pyplot as plt

# Raise pandas' "display.max_columns" option to 50 so wide DataFrames are not truncated when displayed.
pd.set_option("display.max_columns", 50)

In [None]:
# AWS credentials are taken from the environment; a variable that is not set yields None.
aws_access_key = os.environ.get("AWS_ACCESS_KEY")
aws_secret_key = os.environ.get("AWS_ACCESS_SECRET_KEY")

In [None]:
def read_csv_from_s3(bucket: str, path: str, filename: str) -> pd.DataFrame:
    """
    Downloads a single CSV object from S3 and parses it into a Pandas DataFrame.

    The object key is built by concatenating `path` and `filename` as-is, so
    `path` must already contain its trailing separator (e.g. "folder/").

    Parameters:
        bucket (str): The name of the S3 bucket.
        path (str): The key prefix of the file inside the bucket.
        filename (str): The name of the CSV file to be read.

    Returns:
        pd.DataFrame: The parsed content of the CSV file.
    """
    # The credentials are the module-level values read from the environment.
    client = boto3.client(
        "s3",
        aws_access_key_id=aws_access_key,
        aws_secret_access_key=aws_secret_key,
    )

    response = client.get_object(Bucket=bucket, Key=f"{path}{filename}")

    # The body is read completely and decoded as UTF-8 before parsing.
    csv_text = response["Body"].read().decode("utf-8")

    return pd.read_csv(StringIO(csv_text))

In [None]:
def read_csv_from_s3_folder(bucket: str, folder_path: str) -> pd.DataFrame:
    """
    Reads every CSV file stored under an S3 prefix and concatenates them into one DataFrame.

    Parameters:
        bucket (str): The name of the S3 bucket.
        folder_path (str): The key prefix ("folder") to scan for CSV files.

    Returns:
        pd.DataFrame: The rows of all CSV files found, concatenated with a fresh index.

    Raises:
        ValueError: If no CSV file exists under the given prefix.
    """
    # Build the client locally, the same way read_csv_from_s3 does, instead of
    # depending on a global `s3` that only exists once a later cell has been run.
    s3_client = boto3.client("s3", aws_access_key_id = aws_access_key, aws_secret_access_key = aws_secret_key)

    # A single list call returns at most 1000 keys; the paginator walks every page.
    paginator = s3_client.get_paginator("list_objects_v2")

    data_list = []

    for page in paginator.paginate(Bucket = bucket, Prefix = folder_path):
        # "Contents" is missing from the response when nothing matches the prefix.
        for file in page.get("Contents", []):
            file_key = file["Key"]

            # Skips folder placeholder keys (ending in "/") and every non-CSV object.
            if not file_key.endswith(".csv"):
                continue

            # Split the real key into prefix + file name so that objects located in
            # nested sub-folders are read from their actual key, not from
            # folder_path + filename.
            filename = file_key.rsplit("/", 1)[-1]
            path = file_key[: len(file_key) - len(filename)]

            data = read_csv_from_s3(bucket, path, filename)
            data_list.append(data)
            print(f"{filename} has been added.")

    if not data_list:
        # pd.concat([]) would fail with a message that does not name the cause.
        raise ValueError(f"No CSV files found in s3://{bucket}/{folder_path}")

    return pd.concat(data_list, ignore_index=True)

In [None]:
# Module-level S3 client, authenticated with the credentials read from the environment.
s3 = boto3.client(
    "s3",
    aws_access_key_id=aws_access_key,
    aws_secret_access_key=aws_secret_key,
)

# Source bucket and the key prefix of each dataset inside it.
bucket = "cubix-chicago-taxi-vi"

com_areas_path = "transformed_data/community_areas/"
company_path = "transformed_data/company/"
date_path = "transformed_data/date/"
payment_type_path = "transformed_data/payment_type/"
taxi_trips_path = "transformed_data/taxi_trips/"
weather_path = "transformed_data/weather/"


In [None]:
# Tables stored as one CSV file each.
com_ares = read_csv_from_s3(bucket=bucket, path=com_areas_path, filename="community_areas_master.csv")
company = read_csv_from_s3(bucket=bucket, path=company_path, filename="company_master.csv")
date = read_csv_from_s3(bucket=bucket, path=date_path, filename="date_dimension.csv")
payment_type = read_csv_from_s3(bucket=bucket, path=payment_type_path, filename="payment_type_master.csv")

# Datasets split over several CSV files: everything under the prefix is concatenated.
trips = read_csv_from_s3_folder(bucket=bucket, folder_path=taxi_trips_path)
weather = read_csv_from_s3_folder(bucket=bucket, folder_path=weather_path)

### Extra features derived from the hourly weather data

In [None]:
def precipitation(row):
    """
    Labels one weather row by its precipitation.

    Parameters:
        row: A mapping-like row exposing the "rain" and "precipitation" values.

    Returns:
        str: "Other" when the "rain" and "precipitation" values differ,
             "Rainy" when they are equal and "rain" is above zero, "No" otherwise.
    """
    # Any mismatch between the two measurements is reported as "Other".
    if row["rain"] != row["precipitation"]:
        return "Other"

    return "Rainy" if row["rain"] > 0 else "No"

In [None]:
# Hour component of each weather timestamp.
weather["hour"] = weather["datetime"].pipe(pd.to_datetime).dt.hour

# Row-wise label produced by precipitation(): "Rainy", "No" or "Other".
weather["is_rainy"] = weather.apply(precipitation, axis="columns")

weather.info()
weather.sample()

In [None]:
#com_ares.head()
#company.head()
#date.head()
#payment_type.head()


## Join them together

In [None]:
# Inner join: each trip gets the weather row whose "datetime" equals its
# "datetime_for_weather"; the duplicated right-hand key is dropped afterwards.
trips_full = (
    trips
    .merge(weather, left_on="datetime_for_weather", right_on="datetime", how="inner")
    .drop(columns="datetime")
)

In [None]:
# Inner join with the company table; the id column is dropped once the join is done.
trips_full = (
    trips_full
    .merge(company, on="company_id", how="inner")
    .drop(columns="company_id")
)

In [None]:
# Inner join with the payment type table; the id column is dropped once the join is done.
trips_full = (
    trips_full
    .merge(payment_type, on="payment_type_id", how="inner")
    .drop(columns="payment_type_id")
)

In [None]:
# Inner join on the pickup area code, then drop both key columns and rename
# the joined "community_name" so it is identified as the pickup community.
trips_full = (
    trips_full
    .merge(com_ares, left_on="pickup_community_area_id", right_on="area_code", how="inner")
    .drop(columns=["pickup_community_area_id", "area_code"])
    .rename(columns={"community_name": "pickup_community_name"})
)

In [None]:
# Inner join on the dropoff area code, then drop both key columns and rename
# the joined "community_name" so it is identified as the dropoff community.
trips_full = (
    trips_full
    .merge(com_ares, left_on="dropoff_community_area_id", right_on="area_code", how="inner")
    .drop(columns=["dropoff_community_area_id", "area_code"])
    .rename(columns={"community_name": "dropoff_community_name"})
)

In [None]:
# Both join keys are converted to datetime values before the date join.
date["date"] = date["date"].pipe(pd.to_datetime)
trips_full["trip_start_timestamp"] = trips_full["trip_start_timestamp"].pipe(pd.to_datetime)

# Calendar date of the trip start, converted back to a datetime column.
trips_full["trip_start_date"] = pd.to_datetime(trips_full["trip_start_timestamp"].dt.date)

In [None]:
# Inner join with the date table on the trip start date; the duplicated
# right-hand key is dropped afterwards.
trips_full = (
    trips_full
    .merge(date, left_on="trip_start_date", right_on="date", how="inner")
    .drop(columns="date")
)

In [None]:
# info() prints its summary by itself, whereas head() is only rendered when it is
# the last expression of the cell, so the preview has to come last.
trips_full.info()
trips_full.head()

#com_ares.head()

In [None]:
#date.head()
#date.info()