In [10]:
import csv
import pandas as pd
from datetime import datetime, timedelta
import os
from time import time
import glob

In [11]:
# Timestamp taken before any work starts; the last cell subtracts it from the
# current time to report how long the whole notebook took.
start = time()

In [12]:
# Columns summarised for every file and the statistics computed for each.
# The order here is the column order of the reduced CSV; it must stay stable
# because the merge step appends files without re-writing the header.
_RIDE_STATS = (
    ("passenger_count", ("mean", "median")),
    ("trip_distance", ("sum", "mean", "median")),
    ("tip_amount", ("sum", "mean", "median")),
    ("humidity", ("mean", "median")),
    ("pressure", ("mean", "median")),
    ("wind_speed", ("mean", "median")),
)


def _ride_file_tag(tag_value):
    """Return the form of ``tag_value`` that is embedded in the reduced filename.

    Non-string tags are returned unchanged. String tags are parsed as a date
    ("YYYY-MM-DD") or a timestamp ("YYYY-MM-DD HH:MM:SS") and re-formatted as
    "YYYYMMDD_HH"; a date without a time therefore ends in "_00".
    """
    if not isinstance(tag_value, str):
        return tag_value

    # Exactly 10 characters is taken to mean a bare date; everything else is
    # parsed as a full timestamp.
    source_format = "%Y-%m-%d" if len(tag_value) == 10 else "%Y-%m-%d %H:%M:%S"
    return datetime.strptime(tag_value, source_format).strftime("%Y%m%d_%H")


def reduce_rides_data(files, tag_column, tag_name, filename_template):
    """Reduce each rides CSV in ``files`` to a one-row summary CSV.

    For every input file the ride count plus the statistics listed in
    ``_RIDE_STATS`` are computed over all rows and written, together with the
    tag, to ``filename_template.format(tag_name, file_tag)``.

    Args:
        files: Iterable of paths to CSV files to summarise.
        tag_column: Column holding the partition tag. Only the first row's
            value is read (presumably every row of a file shares it; confirm
            against the step that produces these files).
        tag_name: Label stored in the ``tag`` column and used as the first
            placeholder of ``filename_template``.
        filename_template: ``str.format`` template with two positional
            placeholders: ``{0}`` for ``tag_name`` and ``{1}`` for the file tag.

    Files that contain a header but no rows are skipped with a warning, since
    there is no tag value to name the output after. The output directory is
    created when it does not exist yet.

    Raises:
        ValueError: If a string tag is neither a "YYYY-MM-DD" date nor a
            "YYYY-MM-DD HH:MM:SS" timestamp.
        KeyError: If an expected column is missing from an input file.
    """
    import warnings

    for filename in files:
        df = pd.read_csv(filename)

        # Without rows there is no first tag value; `.iloc[0]` would raise
        # IndexError and abort the remaining files.
        if df.empty:
            warnings.warn(f"Skipping {filename}: file has no rows to reduce", stacklevel=2)
            continue

        tag_value = df[tag_column].iloc[0]

        reduced = {
            "tag": tag_name,
            "tag_value": tag_value,
            "rides_count": len(df),
        }
        for column, stats in _RIDE_STATS:
            for stat in stats:
                reduced[f"{column}_{stat}"] = getattr(df[column], stat)()

        reduced_filename = filename_template.format(tag_name, _ride_file_tag(tag_value))

        # `to_csv` does not create missing parent directories.
        os.makedirs(os.path.dirname(reduced_filename) or ".", exist_ok=True)
        pd.DataFrame([reduced]).to_csv(reduced_filename, index=False)

In [13]:
def merge_files(files, dest_filename):
    """Append every CSV in ``files`` to ``dest_filename`` and delete the sources.

    The destination is created, with a header row, by the first file merged
    when it does not exist yet. Later files are appended without a header, so
    all inputs are expected to share the same columns in the same order.

    Args:
        files: Iterable of CSV paths to merge. Each one is removed once its
            rows have been written to the destination.
        dest_filename: Path of the merged CSV. An existing file is appended
            to, not overwritten.

    An entry of ``files`` that resolves to ``dest_filename`` itself is skipped:
    merging it would append the destination to itself and then delete it.
    """
    dest_path = os.path.realpath(dest_filename)

    for file in files:
        # A wildcard such as "days*.csv" also matches an already merged
        # "days.csv"; never treat the destination as one of its own sources.
        if os.path.realpath(file) == dest_path:
            continue

        df = pd.read_csv(file)

        # Only the write that creates the destination emits the header row.
        dest_exists = os.path.exists(dest_filename)
        df.to_csv(
            dest_filename,
            mode="a" if dest_exists else "w",
            header=not dest_exists,
            index=False,
        )

        os.remove(file)

Reduce mapped files

In [14]:
# Input files of the reduce step, collected per partition prefix.
processing_glob = "./data/processing/{0}*.csv"
files_by_days, files_by_hours, files_by_weekdays = (
    glob.glob(processing_glob.format(prefix))
    for prefix in ("days", "hours", "weekdays")
)

# Output path of a reduced file: {0} is the tag name, {1} the file tag.
file_reduced_template = "./data/reduced/{0}_{1}.csv"

In [15]:
# Reduce every partition in turn: (input files, tag column, tag name).
for mapped_files, tag_column, tag_name in (
    (files_by_days, "by_day", "days"),
    (files_by_hours, "by_hour", "hours"),
    (files_by_weekdays, "by_weekday", "weekdays"),
):
    reduce_rides_data(mapped_files, tag_column, tag_name, file_reduced_template)

Merge reduced files

In [16]:
# Reduced files are named "<tag>_<file tag>.csv", so match on the "<tag>_"
# prefix. The looser "<tag>*.csv" would also pick up a merged "<tag>.csv" left
# over from an earlier run and feed the merge target back in as a source.
# glob returns matches in arbitrary order; sorting keeps the row order of the
# merged files reproducible between runs.
files_by_days = sorted(glob.glob("./data/reduced/days_*.csv"))
files_by_hours = sorted(glob.glob("./data/reduced/hours_*.csv"))
files_by_weekdays = sorted(glob.glob("./data/reduced/weekdays_*.csv"))

# Path of a merged file: {0} is the tag name.
file_merged_template = "./data/reduced/{0}.csv"

In [17]:
# Merge each partition's reduced files into a single "<tag>.csv".
for merged_tag, reduced_files in (
    ("days", files_by_days),
    ("hours", files_by_hours),
    ("weekdays", files_by_weekdays),
):
    merge_files(reduced_files, file_merged_template.format(merged_tag))

In [18]:
# Total wall-clock time since the timer cell near the top was executed.
elapsed = time() - start
print(f"Done in {elapsed} sec")

Done in 120.64549231529236 sec
