# 2019 Sampling

In [99]:
# Import packages
import pandas as pd
import numpy as np
from os import listdir

In [81]:
def sampling(directory, sample_size, random_state):
    """Draw a random sample of rows from every parquet file in a directory.

    The entries of ``directory`` are processed in sorted name order. From each
    file at most ``sample_size`` rows are drawn without replacement; a file
    with fewer rows than ``sample_size`` contributes all of its rows (in
    shuffled order). The per-file samples are stacked in file order into one
    frame with a fresh 0..n-1 index.

    Parameters
    ----------
    directory : str
        Folder holding the input files. A trailing path separator is optional.
    sample_size : int
        Maximum number of rows to draw from each file.
    random_state
        Seed forwarded unchanged to ``DataFrame.sample`` for every file, so
        repeated calls with the same inputs give the same sample.

    Returns
    -------
    pandas.DataFrame
        The concatenated samples, or an empty frame when the directory
        contains no files to read.
    """
    # Function-scope import: the notebook's import cell only brings in `listdir`.
    from os.path import join

    # Dot-prefixed entries (e.g. `.DS_Store`) are OS/editor artefacts, not data
    # files; feeding them to `read_parquet` would abort the whole run.
    files_list = sorted(
        entry for entry in listdir(directory) if not entry.startswith('.')
    )
    file_count = len(files_list)

    print("In progress ...")
    print("------")

    # Collect the per-file samples and concatenate once at the end; growing a
    # DataFrame with `concat` inside the loop re-copies all previous rows on
    # every iteration.
    collected = []
    for i, file in enumerate(files_list):
        # First six characters of the file name are used as the progress label.
        name = file[:6]

        # `join` inserts the separator only when `directory` lacks one.
        df = pd.read_parquet(join(directory, file))

        # Never request more rows than the file has (sampling is without
        # replacement, so that would raise).
        n_rows = min(sample_size, len(df))
        collected.append(df.sample(n=n_rows, random_state=random_state))

        # Drop the reference to the full frame before the next file is loaded.
        del df

        # Note: the counter is zero-based (0/N ... N-1/N).
        print(str(i) + '/' + str(file_count), name+' - done')

    print("------")
    print("Done!")

    # `pd.concat` rejects an empty list, so keep returning an empty frame
    # when there was nothing to read.
    if not collected:
        return pd.DataFrame()

    return pd.concat(collected, ignore_index=True)

In [93]:
# Sampling configuration: input folder, rows drawn per file, and RNG seed
directory = '../Data_2019_Converted/'
sample_size = 50_000
random_state = 0

# Build the combined 2019 sample, then store it as an uncompressed parquet file
sample_2019 = sampling(
    directory=directory,
    sample_size=sample_size,
    random_state=random_state,
)
sample_2019.to_parquet(
    '../Data_2019_Sample/CB-2019-sample.parquet',
    compression=None,
)

In progress ...
------
0/12 201901 - done
1/12 201902 - done
2/12 201903 - done
3/12 201904 - done
4/12 201905 - done
5/12 201906 - done
6/12 201907 - done
7/12 201908 - done
8/12 201909 - done
9/12 201910 - done
10/12 201911 - done
11/12 201912 - done
------
Done!


In [94]:
# (rows, columns) of the combined sample.
sample_2019.shape

(600000, 32)

In [95]:
# Per-column non-null counts and dtypes of the combined sample.
sample_2019.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 600000 entries, 0 to 599999
Data columns (total 32 columns):
 #   Column              Non-Null Count   Dtype         
---  ------              --------------   -----         
 0   starttime           600000 non-null  datetime64[ns]
 1   stoptime            600000 non-null  datetime64[ns]
 2   start station id    600000 non-null  float64       
 3   start station name  600000 non-null  object        
 4   end station id      600000 non-null  float64       
 5   end station name    600000 non-null  object        
 6   usertype            600000 non-null  object        
 7   birth year          600000 non-null  int64         
 8   distance_km         600000 non-null  float64       
 9   start_date          600000 non-null  datetime64[ns]
 10  stop_date           600000 non-null  datetime64[ns]
 11  start_dayofyear     600000 non-null  int64         
 12  start_quarter       600000 non-null  int64         
 13  start_month         600000 no

In [96]:
# First rows of the combined sample; these come from the first input file in sorted name order.
sample_2019.head()

Unnamed: 0,starttime,stoptime,start station id,start station name,end station id,end station name,usertype,birth year,distance_km,start_date,...,stop_month,stop_week,stop_dayofmonth,stop_weekday,stop_hour,stop_minute,stop_weekend,trip_minutes,speed_kmh,gender
0,2019-01-23 18:15:03.978,2019-01-23 18:19:16.527,3093.0,N 6 St & Bedford Ave,460.0,S 4 St & Wythe Ave,Subscriber,1981,0.805677,2019-01-23,...,1,4,23,2,18,19,0,4,11.509673,0
1,2019-01-12 10:46:01.862,2019-01-12 10:59:30.170,364.0,Lafayette Ave & Classon Ave,366.0,Clinton Ave & Myrtle Ave,Subscriber,1988,0.869951,2019-01-12,...,1,2,12,5,10,59,1,13,3.87602,0
2,2019-01-05 19:57:15.559,2019-01-05 20:06:18.371,3321.0,Clinton St & Union St,274.0,Lafayette Ave & Fort Greene Pl,Subscriber,1987,1.834488,2019-01-05,...,1,1,5,5,20,6,1,9,12.184792,0
3,2019-01-03 19:11:40.733,2019-01-03 19:19:49.365,465.0,Broadway & W 41 St,484.0,W 44 St & 5 Ave,Subscriber,1991,0.542308,2019-01-03,...,1,1,3,3,19,19,0,8,4.000633,0
4,2019-01-05 23:29:41.240,2019-01-05 23:35:31.917,3372.0,E 74 St & 1 Ave,3147.0,E 85 St & 3 Ave,Subscriber,1968,1.006994,2019-01-05,...,1,1,5,5,23,35,1,5,10.357656,0


In [97]:
# Last rows of the combined sample; these come from the last input file in sorted name order.
sample_2019.tail()

Unnamed: 0,starttime,stoptime,start station id,start station name,end station id,end station name,usertype,birth year,distance_km,start_date,...,stop_month,stop_week,stop_dayofmonth,stop_weekday,stop_hour,stop_minute,stop_weekend,trip_minutes,speed_kmh,gender
599995,2019-12-03 16:38:53.412,2019-12-03 16:49:30.286,359.0,E 47 St & Park Ave,492.0,W 33 St & 7 Ave,Subscriber,1988,1.449446,2019-12-03,...,12,49,3,1,16,49,0,10,8.204412,0
599996,2019-12-07 15:38:57.389,2019-12-07 15:47:15.810,489.0,10 Ave & W 28 St,3641.0,Broadway & W 25 St,Subscriber,1969,1.369194,2019-12-07,...,12,49,7,5,15,47,1,8,9.897791,0
599997,2019-12-30 20:04:48.114,2019-12-30 20:24:50.699,393.0,E 5 St & Avenue C,497.0,E 17 St & Broadway,Subscriber,1992,1.781358,2019-12-30,...,12,1,30,0,20,24,0,20,5.335183,0
599998,2019-12-15 10:25:39.086,2019-12-15 10:50:36.583,3374.0,Central Park North & Adam Clayton Powell Blvd,468.0,Broadway & W 56 St,Customer,1973,4.402818,2019-12-15,...,12,50,15,6,10,50,1,24,10.587939,0
599999,2019-12-05 22:15:33.473,2019-12-05 22:26:11.877,3521.0,Lenox Ave & W 111 St,3390.0,E 109 St & 3 Ave,Subscriber,1981,0.978921,2019-12-05,...,12,49,5,3,22,26,0,10,5.523693,1


In [98]:
# Check share of female cyclists
# `gender` is counted per integer code: bin 0 -> `neg`, bin 1 -> `pos` (reported as female).
# `minlength=2` keeps both bins present even if the sample contains no rows with
# gender == 1; without it the two-name unpacking raises ValueError in that case.
# Any code above 1 still fails the unpacking loudly instead of being miscounted.
neg, pos = np.bincount(sample_2019['gender'], minlength=2)
total = neg + pos
print('Examples:\n    Total: {}\n    Positive (female): {} ({:.2f}% of total)\n'.format(
    total, pos, 100 * pos / total))

Examples:
    Total: 600000
    Positive (female): 152055 (25.34% of total)

