## Citi Bike data processing: Part 4

Objectives:

- Extract the station data into a separate table with one row per station id, containing:
  - name
  - id
  - longitude
  - latitude




In [1]:
import pandas as pd
from pathlib import Path

In [3]:
def agg_month(year, month):
    """Collect the distinct stations that appear in one month of trip data.

    The month's CSV is read from ``Resources/<YYYYMM>-citibike-tripdata.csv``.
    Start-station and end-station columns are extracted separately, rows with
    any missing value are dropped, the two sets are stacked (start rows
    first), coordinates are rounded to 6 decimals and only the first row per
    station ``id`` is kept.

    Parameters
    ----------
    year : int
        Four-digit year of the file to read.
    month : int
        Month number (1-12) of the file to read.

    Returns
    -------
    pandas.DataFrame
        Columns ``id``, ``name`` (object) and ``lat``, ``lon`` (float64), one
        row per station id. Column order follows the file layout:
        ``name, id, lat, lon`` from 202102 onwards and ``id, name, lat, lon``
        before that.
    """
    # number with year and month together, e.g. 202102
    yearmonth = (year * 100) + month
    file_path = Path('Resources', f'{yearmonth}-citibike-tripdata.csv')

    # the file layout changed with the 202102 file: positions and field order differ
    if yearmonth >= 202102:
        fields = ['name', 'id', 'lat', 'lon']
        start_columns = [4, 5, 8, 9]
        end_columns = [6, 7, 10, 11]
    else:
        fields = ['id', 'name', 'lat', 'lon']
        start_columns = [3, 4, 5, 6]
        end_columns = [7, 8, 9, 10]
    field_dtypes = {
        'id': 'object',
        'name': 'object',
        'lat': 'float64',
        'lon': 'float64'
    }

    # give every needed file position a temporary, side-prefixed label so that
    # both sides can be parsed in a single pass over the file
    labels = {}
    dtypes = {}
    for prefix, positions in (('start', start_columns), ('end', end_columns)):
        for position, field in zip(positions, fields):
            labels[position] = f'{prefix}_{field}'
            dtypes[f'{prefix}_{field}'] = field_dtypes[field]
    # read_csv returns the selected columns in file order, so the names are
    # listed in ascending position order to line up with them
    used_positions = sorted(labels)

    # read the file once (the header row is discarded in favour of `names`)
    raw = pd.read_csv(
        file_path,
        header=0,
        usecols=used_positions,
        names=[labels[position] for position in used_positions],
        dtype=dtypes
    )

    # split into start/end station tables; drop NAs and exact duplicates per side
    sides = []
    for prefix in ('start', 'end'):
        renames = {f'{prefix}_{field}': field for field in fields}
        side = (
            raw[list(renames)]
            .rename(columns=renames)
            .dropna()
            .drop_duplicates(ignore_index=True)
        )
        sides.append(side)

    # stack both sides and keep the first row seen for each station id
    df = pd.concat(objs=sides, ignore_index=True)
    df = df.round({'lat': 6, 'lon': 6})
    df = df.drop_duplicates(subset=['id'], ignore_index=True)
    return df

In [4]:
def agg_year(year):
    """Combine the monthly station tables of one year into a single table.

    Calls ``agg_month`` for every month of ``year`` (2013 starts in June),
    stacks the results in month order and keeps, for each station ``id``, the
    row from the latest month in which it appears.

    Parameters
    ----------
    year : int
        Four-digit year to aggregate.

    Returns
    -------
    pandas.DataFrame
        The stacked monthly rows de-duplicated on ``id`` (last occurrence
        kept), with a fresh 0..n-1 index.
    """
    # 2013 starts in june (6)
    first_month = 6 if year == 2013 else 1
    # gather the monthly frames and concatenate once: avoids seeding the result
    # with an empty DataFrame and re-copying earlier months on every iteration
    monthly_frames = []
    for month in range(first_month, 13):
        print(f'  Aggregating month: {month}')
        monthly_frames.append(agg_month(year, month))
    df = pd.concat(objs=monthly_frames, ignore_index=True)
    # keep='last' so a station's most recent row of the year is retained
    df = df.drop_duplicates(subset=['id'], keep='last', ignore_index=True)
    return df


In [5]:
# Build the station table for the whole period: one row per station id,
# taken from the most recent year in which that id appears.
# Yearly frames are collected and concatenated once after the loop instead of
# being appended to an initially empty DataFrame on every iteration.
year_frames = []
# Distinct station ids seen so far. After de-duplicating on 'id' the combined
# table has exactly one row per distinct id, so the size of this set is the
# running row count (ids are never null: agg_month drops rows with NAs).
station_ids = set()
# loop through all years
for year in range(2013, 2024):
    print(f'Aggregating year: {year}')
    # run agg_year for each year's data
    year_df = agg_year(year)
    print(f'  --\n  {len(year_df)} rows in {year}')
    year_frames.append(year_df)
    station_ids.update(year_df['id'])
    print(f'  {len(station_ids)} total rows so far\n  --')
# concatenate all years, keeping the latest row for each station id
df = pd.concat(objs=year_frames, ignore_index=True)
df = df.drop_duplicates(subset=['id'], keep='last', ignore_index=True)
print(f'Done! {len(df)} final rows')


Aggregating year: 2013
  Aggregating month: 6
  Aggregating month: 7
  Aggregating month: 8
  Aggregating month: 9
  Aggregating month: 10
  Aggregating month: 11
  Aggregating month: 12
  --
  338 rows in 2013
  338 total rows so far
  --
Aggregating year: 2014
  Aggregating month: 1
  Aggregating month: 2
  Aggregating month: 3
  Aggregating month: 4
  Aggregating month: 5
  Aggregating month: 6
  Aggregating month: 7
  Aggregating month: 8
  Aggregating month: 9
  Aggregating month: 10
  Aggregating month: 11
  Aggregating month: 12
  --
  332 rows in 2014
  338 total rows so far
  --
Aggregating year: 2015
  Aggregating month: 1
  Aggregating month: 2
  Aggregating month: 3
  Aggregating month: 4
  Aggregating month: 5
  Aggregating month: 6
  Aggregating month: 7
  Aggregating month: 8
  Aggregating month: 9
  Aggregating month: 10
  Aggregating month: 11
  Aggregating month: 12
  --
  498 rows in 2015
  500 total rows so far
  --
Aggregating year: 2016
  Aggregating month: 1
  Ag

In [6]:
# Summarise the combined station table: row count, columns, non-null counts and dtypes
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3853 entries, 0 to 3852
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   id      3853 non-null   object 
 1   name    3853 non-null   object 
 2   lat     3853 non-null   float64
 3   lon     3853 non-null   float64
dtypes: float64(2), object(2)
memory usage: 120.5+ KB


In [8]:
# Write the station table to a CSV in the current working directory,
# with a header row and without the DataFrame index column
df.to_csv('201306-202312-citibike-tripdata-station_info.csv', index=False, header=True)