In [8]:
from io import StringIO
import os

import boto3
import pandas as pd

In [None]:
# Read the AWS credentials from the environment so that no secret is stored in the notebook.
# Each value is None when its environment variable is not set.
aws_access_key_id = os.environ.get('AWS_ACCESS_KEY')
aws_secretkey_id = os.environ.get('AWS_SECRET_KEY')

In [4]:
# Bucket that the reads in this notebook target.
bucket = 'cubix-chicago-taxi-zsigy'

# S3 client authenticated with the credentials read from the environment.
s3 = boto3.client(
    's3',
    aws_access_key_id=aws_access_key_id,
    aws_secret_access_key=aws_secretkey_id,
)

In [9]:
def read_csv_from_s3(bucket: str, path: str, filename: str) -> pd.DataFrame:
    """Download a csv file from an s3 bucket and parse it into a DataFrame.

    The object key is built from ``path`` and ``filename``. When ``path`` is
    non-empty and does not end with ``/``, the missing separator is added, so
    ``'folder'`` and ``'folder/'`` address the same object.

    The request is made with the module-level ``s3`` client. Errors raised by
    the client, by the utf-8 decoding or by pandas propagate unchanged.

    Args:
        bucket (str):
            The bucket where the file is.
        path (str):
            The folder of the file, with or without a trailing ``/``.
            An empty string means the file sits at the root of the bucket.
        filename (str):
            The name of the file.

    Returns:
        pd.DataFrame:
            The DataFrame of the downloaded file.
    """
    # Add the folder separator if the caller left it off.
    folder = path
    if folder and not folder.endswith('/'):
        folder = f'{folder}/'
    key = f'{folder}{filename}'

    response = s3.get_object(Bucket=bucket, Key=key)
    csv_text = response['Body'].read().decode('utf-8')

    # Wrapping the decoded text in StringIO lets pandas read it like a file.
    return pd.read_csv(StringIO(csv_text))


In [16]:
# Key prefixes (folders) of the transformed data inside the bucket.

# Folders of the master / dimension files.
community_areas_path = 'transformed_data/community_areas/'
company_path = 'transformed_data/company/'
date_path = 'transformed_data/date/'
payment_type_path = 'transformed_data/payment_types/'

# Folders that are listed and loaded file by file.
taxi_trips_path = 'transformed_data/taxi_trips/'
weather_path = 'transformed_data/weather/'

In [None]:
# Load each master / dimension file and print its first rows as a quick sanity check.
community_areas = read_csv_from_s3(
    bucket=bucket,
    path=community_areas_path,
    filename='community_areas_master.csv',
)
print(community_areas.head())

company = read_csv_from_s3(
    bucket=bucket,
    path=company_path,
    filename='company_master.csv',
)
print(company.head())

date = read_csv_from_s3(
    bucket=bucket,
    path=date_path,
    filename='date_dimension.csv',
)
print(date.head())

payment_types = read_csv_from_s3(
    bucket=bucket,
    path=payment_type_path,
    filename='payment_type_master.csv',
)
print(payment_types.head())

In [24]:
# Two separate, empty accumulators for the per-file DataFrames.
trips_list, weather_list = [], []

In [25]:
# Start from an empty list so that re-running this cell does not add the same files twice.
trips_list = []

# A single list call returns at most 1000 keys, so walk every page of the listing.
for page in s3.get_paginator('list_objects_v2').paginate(Bucket=bucket, Prefix=taxi_trips_path):
    # A page has no 'Contents' entry when nothing matches the prefix.
    for file in page.get('Contents', []):
        taxi_trip_file_key = file['Key']
        # Split the listed key into its folder and file name so the exact same key is read back.
        folder, _, filename = taxi_trip_file_key.rpartition('/')
        # Skip folder placeholders (empty file name) and anything that is not a csv file.
        if filename.strip() == '' or not filename.endswith('.csv'):
            continue
        trip = read_csv_from_s3(bucket=bucket, path=f'{folder}/', filename=filename)
        trips_list.append(trip)
        print(f'{filename} has been added.')


taxi_2023-11-18.csv has been added.
taxi_2023-11-19.csv has been added.
taxi_2023-11-20.csv has been added.
taxi_2023-11-21.csv has been added.
taxi_2023-11-22.csv has been added.
taxi_2023-11-23.csv has been added.
taxi_2023-11-24.csv has been added.
taxi_2023-11-25.csv has been added.
taxi_2023-11-26.csv has been added.
taxi_2023-11-27.csv has been added.
taxi_2023-11-28.csv has been added.
taxi_2023-11-29.csv has been added.
taxi_2023-11-30.csv has been added.
taxi_2023-12-01.csv has been added.
taxi_2023-12-02.csv has been added.
taxi_2023-12-03.csv has been added.
taxi_2023-12-04.csv has been added.
taxi_2023-12-05.csv has been added.
taxi_2023-12-06.csv has been added.


In [26]:
# Stack the per-file trip frames into one DataFrame with a fresh, continuous index.
trips = pd.concat(objs=trips_list, ignore_index=True)

In [30]:
# Show the (rows, columns) size of the combined trips DataFrame.
trips.shape

(278740, 20)

In [33]:
# Start from an empty list so that re-running this cell does not add the same files twice.
weather_list = []

# A single list call returns at most 1000 keys, so walk every page of the listing.
for page in s3.get_paginator('list_objects_v2').paginate(Bucket=bucket, Prefix=weather_path):
    # A page has no 'Contents' entry when nothing matches the prefix.
    for file in page.get('Contents', []):
        weather_file_key = file['Key']
        # Split the listed key into its folder and file name so the exact same key is read back.
        folder, _, filename = weather_file_key.rpartition('/')
        # Skip folder placeholders (empty file name) and anything that is not a csv file.
        if filename.strip() == '' or not filename.endswith('.csv'):
            continue
        weather = read_csv_from_s3(bucket=bucket, path=f'{folder}/', filename=filename)
        weather_list.append(weather)
        print(f'{filename} has been added.')


weather_2023-11-18.csv
weather_2023-11-18.csv has been added.
weather_2023-11-19.csv
weather_2023-11-19.csv has been added.
weather_2023-11-20.csv
weather_2023-11-20.csv has been added.
weather_2023-11-21.csv
weather_2023-11-21.csv has been added.
weather_2023-11-22.csv
weather_2023-11-22.csv has been added.
weather_2023-11-23.csv
weather_2023-11-23.csv has been added.
weather_2023-11-24.csv
weather_2023-11-24.csv has been added.
weather_2023-11-25.csv
weather_2023-11-25.csv has been added.
weather_2023-11-26.csv
weather_2023-11-26.csv has been added.
weather_2023-11-27.csv
weather_2023-11-27.csv has been added.
weather_2023-11-28.csv
weather_2023-11-28.csv has been added.
weather_2023-11-29.csv
weather_2023-11-29.csv has been added.
weather_2023-11-30.csv
weather_2023-11-30.csv has been added.
weather_2023-12-01.csv
weather_2023-12-01.csv has been added.
weather_2023-12-02.csv
weather_2023-12-02.csv has been added.
weather_2023-12-03.csv
weather_2023-12-03.csv has been added.
weather_

In [35]:
# Stack the per-file weather frames into one DataFrame with a fresh, continuous index.
weather = pd.concat(objs=weather_list, ignore_index=True)

In [37]:
# Print a summary of the combined weather DataFrame: columns, non-null counts and dtypes.
weather.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 456 entries, 0 to 455
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   datetime       456 non-null    object 
 1   temperature    456 non-null    float64
 2   wind_speed     456 non-null    float64
 3   rain           456 non-null    float64
 4   precipitation  456 non-null    float64
dtypes: float64(4), object(1)
memory usage: 17.9+ KB
