In [35]:
from io import StringIO
import os

import boto3
import pandas as pd

import matplotlib.pyplot as plt

# Show up to 50 columns when a wide DataFrame is displayed.
pd.set_option("display.max_columns", 50)

In [None]:
# AWS credentials come from the environment; a missing variable yields None.
aws_access_key = os.environ.get("AWS_ACCESS_KEY")
aws_secret_key = os.environ.get("AWS_ACCESS_SECRET_KEY")

In [None]:
def read_csv_from_s3(bucket: str, path: str, filename: str) -> pd.DataFrame:
    """
    Reads a CSV file from an Amazon S3 bucket and returns it as a Pandas DataFrame.

    The object key is built as ``path`` + ``filename``; a single "/" is inserted
    between them when a non-empty ``path`` does not already end with one.
    Credentials are taken from the module-level ``aws_access_key`` and
    ``aws_secret_key`` variables.

    Parameters:
        bucket (str): The name of the S3 bucket.
        path (str): The path within the S3 bucket where the file is located.
        filename (str): The name of the CSV file to be read.

    Returns:
        pd.DataFrame: A DataFrame containing the data from the CSV file.
    """
    s3_client = boto3.client(
        "s3",
        aws_access_key_id=aws_access_key,
        aws_secret_access_key=aws_secret_key,
    )

    # Tolerate a prefix passed without its trailing slash; bare concatenation
    # would otherwise glue the folder name and the file name into a wrong key.
    if path and not path.endswith("/"):
        path = f"{path}/"
    key = f"{path}{filename}"

    # Named `response` / `csv_text` so the builtin `object` is not shadowed.
    response = s3_client.get_object(Bucket=bucket, Key=key)
    csv_text = response["Body"].read().decode("utf-8")

    return pd.read_csv(StringIO(csv_text))

In [None]:
def read_csv_from_s3_folder(bucket: str, folder_path: str) -> pd.DataFrame:
    """
    Reads every CSV object under an S3 prefix and concatenates them into one DataFrame.

    Parameters:
        bucket (str): The name of the S3 bucket.
        folder_path (str): The key prefix ("folder") that is scanned for CSV files.

    Returns:
        pd.DataFrame: The rows of all CSV files found, with a fresh 0..n-1 index.

    Raises:
        ValueError: If no CSV object exists under the given prefix.
    """
    # Build the client here (same arguments as read_csv_from_s3) so the function
    # does not depend on a global `s3` created in a later notebook cell.
    s3_client = boto3.client(
        "s3",
        aws_access_key_id=aws_access_key,
        aws_secret_access_key=aws_secret_key,
    )

    # A single list call returns at most 1000 keys; the paginator walks every page.
    paginator = s3_client.get_paginator("list_objects_v2")

    data_list = []
    for page in paginator.paginate(Bucket=bucket, Prefix=folder_path):
        # "Contents" is missing from the response when nothing matches the prefix.
        for entry in page.get("Contents", []):
            # Split the real key instead of re-gluing folder_path + filename, so
            # nested keys and prefixes without a trailing slash are fetched correctly.
            key_dir, slash, filename = entry["Key"].rpartition("/")

            # Skip "folder" placeholder keys (empty name) and non-CSV objects.
            if not filename.strip() or not filename.endswith(".csv"):
                continue

            data_list.append(read_csv_from_s3(bucket, f"{key_dir}{slash}", filename))
            print(f"{filename} has been added.")

    if not data_list:
        raise ValueError(f"No CSV files found under s3://{bucket}/{folder_path}")

    return pd.concat(data_list, ignore_index=True)

In [None]:
# Module-level S3 client, built from the credentials read above.
s3 = boto3.client(
    "s3",
    aws_access_key_id=aws_access_key,
    aws_secret_access_key=aws_secret_key,
)

bucket = "cubix-chicago-taxi-vi"

# Key prefixes of the individual datasets inside the bucket.
transformed_prefix = "transformed_data"
com_areas_path = f"{transformed_prefix}/community_areas/"
company_path = f"{transformed_prefix}/company/"
date_path = f"{transformed_prefix}/date/"
payment_type_path = f"{transformed_prefix}/payment_type/"
taxi_trips_path = f"{transformed_prefix}/taxi_trips/"
weather_path = f"{transformed_prefix}/weather/"


In [None]:
# Single-file tables are read directly by file name.
com_ares = read_csv_from_s3(
    bucket=bucket, path=com_areas_path, filename="community_areas_master.csv"
)
company = read_csv_from_s3(
    bucket=bucket, path=company_path, filename="company_master.csv"
)
date = read_csv_from_s3(
    bucket=bucket, path=date_path, filename="date_dimension.csv"
)
payment_type = read_csv_from_s3(
    bucket=bucket, path=payment_type_path, filename="payment_type_master.csv"
)

# Trips and weather are spread over several CSV files; read the whole folder.
trips = read_csv_from_s3_folder(bucket=bucket, folder_path=taxi_trips_path)
weather = read_csv_from_s3_folder(bucket=bucket, folder_path=weather_path)

### Extra features derived from the hourly weather data

In [None]:
def precipitation(row):
    """
    Classify one weather row by its precipitation.

    Returns "Other" when the "rain" and "precipitation" values differ,
    "Rainy" when they are equal and the rain value is above zero,
    and "No" in every remaining case.
    """
    rain = row["rain"]

    # Any mismatch between the two values is classed as "Other".
    if rain != row["precipitation"]:
        return "Other"

    return "Rainy" if rain > 0 else "No"

In [None]:
# Add the hour of day and the precipitation class to every weather record.
weather = weather.assign(
    hour=pd.to_datetime(weather["datetime"]).dt.hour,
    is_rainy=lambda frame: frame.apply(precipitation, axis=1),
)
weather.info()
weather.sample()

In [None]:
#com_ares.head()
#company.head()
#date.head()
#payment_type.head()


## Join the tables together

In [None]:
# Attach the matching weather record to each trip; the weather-side key
# column "datetime" is dropped once the join is done.
trips_full = trips.merge(
    weather,
    how="inner",
    left_on="datetime_for_weather",
    right_on="datetime",
).drop(columns=["datetime"])

In [None]:
# Attach the company columns, then drop the id that only served as the join key.
trips_full = (
    trips_full
    .merge(company, how="inner", on="company_id")
    .drop(columns=["company_id"])
)

In [None]:
# Attach the payment type columns, then drop the id that only served as the join key.
trips_full = (
    trips_full
    .merge(payment_type, how="inner", on="payment_type_id")
    .drop(columns=["payment_type_id"])
)

In [None]:
# Attach the community area of the pickup, drop both key columns and
# label the joined name column as the pickup one.
trips_full = (
    trips_full
    .merge(com_ares, how="inner", left_on="pickup_community_area_id", right_on="area_code")
    .drop(columns=["pickup_community_area_id", "area_code"])
    .rename(columns={"community_name": "pickup_community_name"})
)

In [None]:
# Attach the community area of the dropoff, drop both key columns and
# label the joined name column as the dropoff one.
trips_full = (
    trips_full
    .merge(com_ares, how="inner", left_on="dropoff_community_area_id", right_on="area_code")
    .drop(columns=["dropoff_community_area_id", "area_code"])
    .rename(columns={"community_name": "dropoff_community_name"})
)

In [None]:
# Parse both date columns as datetimes so they can be compared in the date join.
date["date"] = pd.to_datetime(date["date"])
trips_full["trip_start_timestamp"] = pd.to_datetime(trips_full["trip_start_timestamp"])

# Midnight of the day on which the trip started. normalize() stays vectorised,
# whereas going through .dt.date builds an object column of Python dates that
# then has to be parsed back into datetimes.
# NOTE(review): assumes the timestamps are timezone-naive - confirm.
trips_full["trip_start_date"] = trips_full["trip_start_timestamp"].dt.normalize()

In [None]:
# Attach the date dimension by the trip's start day; the dimension-side key
# column "date" is dropped once the join is done.
trips_full = (
    trips_full
    .merge(date, how="inner", left_on="trip_start_date", right_on="date")
    .drop(columns=["date"])
)

In [None]:
# info() prints its summary as a side effect; head() has to be the last
# expression of the cell, otherwise its value is discarded and never displayed.
trips_full.info()
trips_full.head()

In [None]:
#date.head()
#date.info()

## Visualisation

### Analysing the weekdays

In [None]:
# Count the trips of every calendar day first, then average those daily
# counts per weekday.
trips_per_day = (
    trips_full
    .groupby(["trip_start_date", "day_of_week"])
    .agg(trip_count=("trip_id", "count"))
)
trips_per_weekday = trips_per_day.groupby("day_of_week")["trip_count"].mean()

trips_per_weekday

In [None]:
# Plotting the data on an explicit figure/axes pair
figure, ax = plt.subplots(figsize=(14, 8))
trips_per_weekday.plot(kind="bar", ax=ax, fontsize=12)

# Adding titles and labels
ax.set_xlabel("Weekday", fontsize=13)
ax.set_ylabel("Count of the trips", fontsize=13)
ax.set_title("Daily trip counts", fontsize=20, pad=18)

# Show the plot, then save the same figure object into a file
plt.show()
figure.savefig("../img/daily_trip_count.svg", dpi=100)

### Effect of rain on the hourly trips

In [None]:
#trips_full.head(3)

In [None]:
# Only workdays, to eliminate the difference caused by the weekend
workday_trips = trips_full[~trips_full["is_weekend"]]

# Trips counted per weather timestamp, keeping hour and precipitation class
datetime_trips = (
    workday_trips
    .groupby(["datetime_for_weather", "hour", "is_rainy"])
    .agg(trip_count=("trip_id", "count"))
)
del workday_trips

# Average of those counts for every (hour, precipitation class) pair
hourly_trips = (
    datetime_trips
    .groupby(["hour", "is_rainy"])
    .agg(hourly_trips=("trip_count", "mean"))
    .reset_index()
)

# Separate data for rainy, non-rainy and other-precipitation hours
rainy_trips = hourly_trips.loc[hourly_trips["is_rainy"] == "Rainy"]
non_rainy_trips = hourly_trips.loc[hourly_trips["is_rainy"] == "No"]
trips_with_other_precipitation = hourly_trips.loc[hourly_trips["is_rainy"] == "Other"]

rainy_trips

In [None]:
# Plotting the three precipitation classes on one explicit axes
figure, ax = plt.subplots(figsize=(14, 7))
ax.plot(
    non_rainy_trips['hour'], non_rainy_trips['hourly_trips'],
    label='Non-Rainy', marker='x', color='#FF5733',
)
ax.plot(
    rainy_trips['hour'], rainy_trips['hourly_trips'],
    label='Rainy', marker='o', color='#4544FF',
)
ax.plot(
    trips_with_other_precipitation['hour'], trips_with_other_precipitation['hourly_trips'],
    label='Other precip.', marker='*', color='green', markersize=8,
)

# Adding titles and labels
ax.set_title('Average number of the Trips per Hour (only weekdays)', fontsize=20, pad=18)
ax.set_xlabel('Hour of the Day')
ax.set_ylabel('Average Number of Hourly trips')
ax.legend()

# Grid plus one x-axis tick for each of the 24 hours
ax.grid(True)
ax.set_xticks(range(0, 24))

# Show the plot, then save the same figure object into a file
plt.show()
figure.savefig("../img/hourly_trip_count.svg", dpi=100)

### Analysing the companies

In [None]:
# Counting the unique taxis per company: keep one row per (company, taxi)
# pair, count the rows of each company and keep the eight largest counts.
taxis = trips_full[["company", "taxi_id"]].drop_duplicates()
taxis_per_company = taxis.groupby("company")["taxi_id"].count().nlargest(8)
taxis_per_company

In [None]:
# Plotting the data on an explicit figure/axes pair
figure, ax = plt.subplots(figsize=(14, 8))
taxis_per_company.plot(kind="bar", ax=ax, fontsize=12)

# Adding titles and labels
ax.set_xlabel("Company", fontsize=13)
ax.set_ylabel("No. of cars", fontsize=13)
ax.set_title("No. of the used cars per company", fontsize=20)

# Extra bottom margin for the tick labels below the bars
figure.subplots_adjust(bottom=0.41)

# Show the plot, then save the same figure object into a file
plt.show()
figure.savefig("../img/companies_taxi_count.svg", dpi=100)

In [None]:
# The seven companies with the largest number of trips
trips_per_company = (
    trips_full
    .groupby("company")["trip_id"]
    .count()
    .nlargest(7)
)
trips_per_company

In [None]:
# Plotting the data on an explicit figure/axes pair
figure, ax = plt.subplots(figsize=(14, 8))
trips_per_company.plot(kind="bar", ax=ax, fontsize=12)

# Adding titles and labels
ax.set_xlabel("Company", fontsize=13)
ax.set_ylabel("Count of the trips", fontsize=13)
ax.set_title("Count of the trips per company", fontsize=20)

# Extra bottom margin for the tick labels below the bars
figure.subplots_adjust(bottom=0.41)

# Show the plot, then save the same figure object into a file
plt.show()
figure.savefig("../img/companies_trip_count.svg", dpi=100)


In [None]:
# The seven companies with the longest total distance
trips_miles_per_company = (
    trips_full
    .groupby("company")["trip_miles"]
    .sum()
    .nlargest(7)
)
trips_miles_per_company

In [None]:
# Plotting the data on an explicit figure/axes pair
figure, ax = plt.subplots(figsize=(14, 8))
trips_miles_per_company.plot(kind="bar", ax=ax, fontsize=12)

# Adding titles and labels
ax.set_xlabel("Company", fontsize=13)
ax.set_ylabel("Sum of miles", fontsize=13)
ax.set_title("Total distance per company", fontsize=20)

# Extra bottom margin for the tick labels below the bars
figure.subplots_adjust(bottom=0.41)

# Show the plot, then save the same figure object into a file
plt.show()
figure.savefig("../img/companies_total_miles.svg", dpi=100)

In [None]:
# The seven companies with the biggest income (sum of trip_total)
trips_income_per_company = (
    trips_full
    .groupby("company")["trip_total"]
    .sum()
    .nlargest(7)
)
trips_income_per_company

In [None]:
# Plotting the data on an explicit figure/axes pair
figure, ax = plt.subplots(figsize=(14, 8))
trips_income_per_company.plot(kind="bar", ax=ax, fontsize=12)

# Adding titles and labels
ax.set_xlabel("Company", fontsize=13)
ax.set_ylabel("Sum of total fare", fontsize=13)
ax.set_title("Total income per company", fontsize=20)

# Extra bottom margin for the tick labels below the bars
figure.subplots_adjust(bottom=0.41)

# Show the plot, then save the same figure object into a file
plt.show()
figure.savefig("../img/companies_total_income.svg", dpi=100)