# Pre-analysis

`convert_time()` converts the TIME field to an integer representing the time of day (in 5-minute increments), and `extract_date()` extracts only the date from the TIME field.

`plot_average_day()` takes the station number to plot and calculates the average population of the station over the course of a day.

In [None]:
def convert_time(x):
    """
    Map a TIME value from the CSV onto its 5-minute slot of the day.

    The value is split on a single space and the second piece is read as
    'HH:MM[:...]'; only the hour and minute parts are used. The result is
    the number of whole 5-minute increments since midnight (0 to 287 for
    valid clock times), for example
    - 00:00 -> 0
    - 00:10 -> 2
    - 02:20 -> 28
    """
    clock = x.split(' ')[1]
    fields = clock.split(':')
    hours = int(fields[0])
    minutes = int(fields[1])

    # 12 slots per hour; a partially elapsed slot is rounded down.
    return math.floor(hours * 12 + minutes / 5)


def extract_date(x):
    """
    Return the part of a TIME value before its first space (the date).

    If the value contains no space, it is returned unchanged.
    """
    date_part, _, _ = x.partition(' ')
    return date_part


def extract_year(x):
    """
    Return the year of a TIME value as a string.

    The year is whatever precedes the first '-' in the part of the value
    before its first space, e.g. '2020-01-01 00:10:00' -> '2020'.
    """
    date_part, _, _ = x.partition(' ')
    year_part, _, _ = date_part.partition('-')
    return year_part


def plot_average_day(station, year=0):
    """
    Plot a station's average number of available bikes per time of day.

    Reads the station's CSV file, assigns every reading to its 5-minute
    slot of the day with convert_time(), averages 'AVAILABLE BIKES' within
    each slot and adds the resulting curve to the current matplotlib plot.
    Nothing is displayed here; call show_average_day_plot() afterwards.

    Parameters:
    - station: station number, used to build the CSV file name and the
      legend label.
    - year: when non-zero, only readings whose TIME year equals this value
      are averaged, and the year is appended to the legend label.

    If the station file does not exist, or no readings exist for the
    requested year, a message is printed and nothing is plotted.
    Missing 'AVAILABLE BIKES' values are ignored when averaging.
    """
    filename = './datasets/bss/dublin/reorg/station_' + str(station) + '.csv'

    if not os.path.exists(filename):
        print("Station file " + str(station) + " does not exist")
        return

    dataset = pandas.read_csv(filename, usecols=['TIME', 'AVAILABLE BIKES'])

    # Filter by year BEFORE working out the time slots: a slot that has no
    # readings in the requested year must not appear at all, otherwise its
    # average would be a division by zero.
    if year != 0:
        dataset = dataset.loc[dataset['TIME'].apply(extract_year) == str(year)]
        if dataset.empty:
            print("No data exists for " + str(year))
            return

    slots = dataset['TIME'].apply(convert_time)

    # Per-slot mean; groupby returns the slots in ascending order. The cast
    # guarantees a numeric dtype so a header-only file still yields an
    # empty curve rather than an error.
    averages = dataset['AVAILABLE BIKES'].astype(float).groupby(slots).mean()

    label = 'S' + str(station)
    if year != 0:
        label = label + ',' + str(year)

    plt.plot(averages.index, averages.values, label=label)


def show_average_day_plot():
    """
    Label the axes of the current plot, add the legend and display it.

    The x axis is measured in 5-minute slots (12 per hour), so a tick is
    placed every 4 hours (48 slots) from 00:00 to 24:00.
    """
    tick_hours = range(0, 25, 4)
    positions = [hour * 12 for hour in tick_hours]
    tick_labels = ['%02d:00' % hour for hour in tick_hours]

    plt.xlabel('Time')
    plt.ylabel('Available bikes')
    plt.xticks(positions, tick_labels)

    # Comment this out to disable legend
    plt.legend()

    plt.show()