In [2]:
import pandas as pd
import numpy as np
from random import sample
from random import seed
import glob
%matplotlib inline
import matplotlib.pyplot as plt

In [None]:
https://www.graphviz.org/pdf/dotguide.pdf
https://victorzhou.com/blog/gini-impurity/

In [19]:
#Helper Functions Section

def pd_read_downsample(filename, per):
    n = sum(1 for line in open(filename)) - 1 #number of records in file (excludes header)
    s = round(n*per)
    skip = sorted(sample(range(1,n+1), n-s)) #the 0-indexed header will not be included in the skip list
    df = pd.read_csv(filename, parse_dates = [], skiprows=skip)
    return df
    #unclear if creating a downsampled csv file would be useful here or to just retain the
    #downsampled dataframe object
    df.to_csv(downsampled_filename)
    
#helper function to print columns with missing and the percentage missingness:
def colpercent(df):
    print("Total NaN in Dataframe: " , df.isnull().sum().sum())
    print("Percent Missingness in Dataframe: ", 100*df.isnull().sum().sum()/(len(df.index)*len(df.columns)))
    print('-'*55)
    percentnulldf = df.isnull().sum()/(df.isnull().sum()+df.notna().sum())
    print("Percent Missingness by Columns:")
    print(100*percentnulldf[percentnulldf>0].sort_values(ascending=False))
    
#printout to help view levels within features with missingness
def colpercount(df):
    percentnulldf = df.isnull().sum()/(df.isnull().sum()+df.notna().sum())
    percent_ordered_df=percentnulldf[percentnulldf>0].sort_values(ascending=False)
    for i in range(len(percent_ordered_df)):
        print(percent_ordered_df.index[i])
        print('-'*15)
        print(df[percent_ordered_df.index[i]].value_counts())
        print('-'*55)

#helper function to print out percentage of zeroes by column
def zeroper(df, value):
    l=[]
    columns=[]
    for i in range(len(df.columns)):
        if 0 in df[df.columns[i]].value_counts():
            if 100*df[df.columns[i]].value_counts().loc[0]/len(df[df.columns[i]])>value:
                l.append((df.columns[i], 100*df[df.columns[i]].value_counts().loc[0]/len(df[df.columns[i]])))
            else:
                pass
        else:
            pass
    
    print(len(l))    
    print('-'*55)
    for j in range(len(l)):
        columns.append(l[j][0])
        print('Percent of zeroes: ', l[j])
        print('-'*55)
    print(columns)
    return columns
    
#helper functions to characterize missingness by row and column
def data_eval(df):
    for i in range(len(df.columns)):
        print('-'*50)
        print('Column Name: ', df.columns[i])
        if (df[df.columns[i]].dtypes == 'float64' or df[df.columns[i]].dtypes == 'int64') and df[df.columns[i]][df[df.columns[i]]<0].count()>0:
            print('Number of negatives: ', df[df.columns[i]][df[df.columns[i]]<0].count())
        if df[df.columns[i]][df[df.columns[i]]=='None'].count() > 0:
            print('Number of None strings: ', df[df.columns[i]][df[df.columns[i]]=='None'].count())
        if df[df.columns[i]][df[df.columns[i]]==''].count() > 0:
            print('Number of empty strings: ', df[df.columns[i]][df[df.columns[i]]==''].count())
        else:
            print('Column ' + str(i) + ' has no negatives, empty strings or Nones')


#generates list of percentage missingness by row
def row_na_list(df, value):
    l=[]
    for i in range(len(df.index)) :
        if df.iloc[i].isnull().sum() > value:
            #print(i, df.iloc[i].isnull().sum())
            l.append(i)
    return l

#helper function to retrieve row and column index labels for correlation matrix values
#for greater than value when value>0 and less than value when value<0
#and prints out the values that correspond to those indices
def index_retrieve(df, value, measure):
    poslist = list()
    # Get bool dataframe with True at positions where the given value exists and filter out on-diagonal elements
    if measure == 'spearman':
        if value>0:
            result = df.corr(method = measure)[df.corr(method = measure)!=1][df.corr(method = measure)>value].isna().isin([value])
        if value<0:
            result = df.corr(method = measure)[df.corr(method = measure)!=1][df.corr(method = measure)<value].isna().isin([value])
        else:
            pass
    elif measure == 'pearson':
        if value>0:
            result = df.corr(method = measure)[df.corr(method = measure)!=1][df.corr(method = measure)>value].isna().isin([value])
        elif value<0:
            result = df.corr(method = measure)[df.corr(method = measure)!=1][df.corr(method = measure)<value].isna().isin([value])
        else:
            pass
    # Get list of columns that contains the value
    series = result.any()
    columnNames = list(series[series == True].index)
    # Iterate over list of columns and fetch the rows indexes where value exists
    for col in columnNames:
        rows = list(result[col][result[col] == True].index)
        for row in rows:
            poslist.append((row, col))
    # Return a list of tuples indicating the positions of value in the dataframe
    
    if value > 0:
        print('Number of correlations with value greater than ' + str(value) + ': ' + str(len(poslist)))
    if value < 0:
        print('Number of correlations with value less than ' + str(value) + ': ' + str(len(poslist)))
    else:
        pass
    for i in range(len(poslist)):
        print('-'*40)
        print('index labels: ', poslist[i][0], poslist[i][1])
        print('value at index: ', df.corr().loc[poslist[i]])
    return poslist

Alternative using bash:

For mac users who don't find shuf right away, please brew install first with brew install coreutils and then use the equivalence gshuf. This solution is much faster than calling random.

This is not in Pandas, but it achieves the same result much faster through bash, while not reading the entire file into memory:

shuf -n 100000 data/original.tsv > data/sample.tsv

The shuf command will shuffle the input and the and the -n argument indicates how many lines we want in the output.

Relevant question: https://unix.stackexchange.com/q/108581

Benchmark on a 7M lines csv available here (2008):

Top answer:

def pd_read():
    filename = "2008.csv"
    n = sum(1 for line in open(filename)) - 1 #number of records in file (excludes header)
    s = 100000 #desired sample size
    skip = sorted(random.sample(range(1,n+1),n-s)) #the 0-indexed header will not be included in the skip list
    df = pandas.read_csv(filename, skiprows=skip)
    df.to_csv("temp.csv")

Timing for pandas:

%time pd_read()
CPU times: user 18.4 s, sys: 448 ms, total: 18.9 s
Wall time: 18.9 s

While using shuf:

time shuf -n 100000 2008.csv > temp.csv

real    0m1.583s
user    0m1.445s
sys     0m0.136s


from the following:
https://stackoverflow.com/questions/22258491/read-a-small-random-sample-from-a-big-csv-file-into-a-python-data-frame

In [6]:
csvlist[2]

'2013-09 - Citi Bike trip data.csv'

In [53]:
seed(0)
my_dfs = []
csvlist = [f for f in glob.glob("*.csv")]
i=1

for csv in csvlist:
    df_downsample = pd_read_downsample(csv, 0.05)
    df_downsample.columns = map(str.lower, df_downsample.columns)
    df_downsample.columns = df_downsample.columns.str.replace(" ", "")
    print('Now on loop', i, 'with number of features being', len(df_downsample.columns))
#     print('-'*55)
#     print('Feature names: ', df_downsample.columns)
    my_dfs.append(df_downsample)
    i = i + 1

fin_dat = pd.concat(my_dfs, axis=0)
fin_dat.to_csv('concat_file_fin.csv')

Now on loop 1 with number of features being 15
Now on loop 2 with number of features being 15
Now on loop 3 with number of features being 15
Now on loop 4 with number of features being 15
Now on loop 5 with number of features being 15
Now on loop 6 with number of features being 15
Now on loop 7 with number of features being 15
Now on loop 8 with number of features being 15
Now on loop 9 with number of features being 15
Now on loop 10 with number of features being 15
Now on loop 11 with number of features being 15
Now on loop 12 with number of features being 15
Now on loop 13 with number of features being 15
Now on loop 14 with number of features being 15
Now on loop 15 with number of features being 15
Now on loop 16 with number of features being 15
Now on loop 17 with number of features being 15
Now on loop 18 with number of features being 15
Now on loop 19 with number of features being 15
Now on loop 20 with number of features being 15
Now on loop 21 with number of features being 15
N

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  app.launch_new_instance()


In [3]:
fin_dat = pd.read_csv('concat_file_fin.csv')

  interactivity=interactivity, compiler=compiler, result=result)


In [6]:
fin_dat = fin_dat.reset_index(drop=True)
fin_dat = fin_dat.drop(['Unnamed: 0', 'unnamed:0'], axis=1)

In [7]:
fin_dat.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4705843 entries, 0 to 4705842
Data columns (total 15 columns):
bikeid                   int64
birthyear                object
endstationid             float64
endstationlatitude       float64
endstationlongitude      float64
endstationname           object
gender                   int64
startstationid           float64
startstationlatitude     float64
startstationlongitude    float64
startstationname         object
starttime                object
stoptime                 object
tripduration             int64
usertype                 object
dtypes: float64(6), int64(3), object(6)
memory usage: 538.5+ MB


In [8]:
print('%d locations'%fin_dat.startstationlatitude.nunique())

1218 locations


In [10]:
fin_dat['starttime'] = pd.to_datetime(fin_dat.starttime)
fin_dat['stoptime'] = pd.to_datetime(fin_dat.stoptime)

In [62]:
fin_dat.shape

(4705843, 15)

In [75]:
fin_dat.head()

Unnamed: 0,bikeid,birthyear,endstationid,endstationlatitude,endstationlongitude,endstationname,gender,startstationid,startstationlatitude,startstationlongitude,startstationname,starttime,stoptime,tripduration,usertype
0,16852,1991.0,252.0,40.732264,-73.998522,MacDougal St & Washington Sq,1,312.0,40.722055,-73.989111,Allen St & Stanton St,2017-09-01 00:04:15,2017-09-01 00:11:18,422,Subscriber
1,27730,1983.0,468.0,40.765265,-73.981923,Broadway & W 56 St,1,478.0,40.760301,-73.998842,11 Ave & W 41 St,2017-09-01 00:06:19,2017-09-01 00:17:09,650,Subscriber
2,28032,1989.0,395.0,40.68807,-73.984106,Bond St & Schermerhorn St,1,237.0,40.730473,-73.986724,E 11 St & 2 Ave,2017-09-01 00:07:34,2017-09-01 00:29:49,1334,Subscriber
3,15651,1989.0,3449.0,40.721463,-73.948009,Eckford St & Engert Ave,2,3090.0,40.717746,-73.956001,N 8 St & Driggs Ave,2017-09-01 00:10:49,2017-09-01 00:13:52,182,Subscriber
4,17018,1900.0,402.0,40.740343,-73.989551,Broadway & E 22 St,0,325.0,40.736245,-73.984738,E 19 St & 3 Ave,2017-09-01 00:11:00,2017-09-01 00:15:39,278,Customer


In [10]:
#na survey of data after import
colpercent(fin_dat)

Total NaN in Dataframe:  258745
Percent Missingness in Dataframe:  0.3665584820119725
-------------------------------------------------------
Percent Missingness by Columns:
birthyear              5.356660
usertype               0.055697
endstationname         0.021505
endstationid           0.021505
endstationlongitude    0.018573
endstationlatitude     0.018573
startstationname       0.002933
startstationid         0.002933
dtype: float64


In [11]:
#birthyear column na handling and imputation

#the number of na versus not na
print(fin_dat.birthyear.isnull().value_counts())

#this indicates that some of the years are cast as floats while others are strings
print(fin_dat.birthyear[fin_dat.birthyear.notnull()].value_counts()[200:].index)

#data is inconsistent in some na's being whitespace and others being actual na's, so replace for consistency
fin_dat.birthyear = fin_dat.birthyear.replace('\\N', np.nan)
fin_dat.birthyear.fillna(1900.0, inplace=True)
fin_dat.birthyear = fin_dat.birthyear[fin_dat.birthyear.notnull()].astype('float16')
print('imputation complete')

False    4453767
True      252076
Name: birthyear, dtype: int64
Index([  1888.0,   '1939', '1943.0',   1930.0, '1940.0',   '1935',     1887,
         1910.0,   '1937',   '1936',     1899,   1912.0, '1938.0',   '1899',
         '1934',     1924,   '1901',   1917.0,   1921.0,   1889.0,   1926.0,
       '2000.0',   '2003', '1900.0',   '1933',     1918,     1886,   '1932',
         1895.0,     1929,   '1922', '1939.0',   1931.0,   1893.0,   '1923',
         1890.0,   '1888',     1896,     1919,     1907,   1894.0,   '1926',
         '1910',   1928.0, '1935.0',     1909,   '1917',   '1921',     1913,
           1911,   '1924', '1934.0', '1936.0', '1901.0',   1915.0, '1885.0',
         1927.0, '1899.0',   '1907',   '1927',   '1930',   1922.0, '1922.0',
         '1890',   '1929', '1932.0',   '1885', '1930.0',   '1913',   '1887',
       '1937.0',   1916.0, '1910.0', '1921.0',   '1886',   1920.0,   '1918',
         '1931',   '1896', '1926.0', '2001.0',   '1912',   '1894', '1933.0',
           1

In [12]:
#birthyear column imputation for implausibly elderly

#these ages seem implausible (Citibike starting in 2013 would put the most elderly rider at 94), 
#so we replace them with 1900.0
birth_index = fin_dat.birthyear[fin_dat.birthyear.sort_values(ascending=False)<1920.0].index
print(fin_dat.birthyear[birth_index].unique())
fin_dat.birthyear[birth_index] = fin_dat.birthyear[birth_index].replace(list(fin_dat.birthyear[birth_index].unique()), 1900.0)
print('imputation complete')

[1900. 1912. 1885. 1901. 1894. 1893. 1917. 1910. 1888. 1895. 1899. 1886.
 1896. 1918. 1887. 1907. 1913. 1915. 1916. 1889. 1890. 1911. 1897. 1919.
 1909. 1857. 1905.]
imputation complete


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys


In [13]:
#usertype column na handling and imputation

#alternative is to look into proportional imputation by class
fin_dat.usertype.fillna('Unknown', inplace=True)
print('imputation complete')

imputation complete


In [14]:
#endstation na handling and imputation

#the missing station names, id, lat and long indexes coincide, so these observations cannot be used for route analysis and should be removed
print(fin_dat.endstationname[fin_dat.endstationname.isnull()].index)
print(fin_dat.endstationid[fin_dat.endstationid.isnull()].index)
print(fin_dat.endstationlatitude[fin_dat.endstationlatitude.isnull()].index)
print(fin_dat.endstationlongitude[fin_dat.endstationlongitude.isnull()].index)

fin_dat.drop(fin_dat[fin_dat.endstationid.isnull()].index, axis=0, inplace=True)

print('imputation complete')

Int64Index([  95006,   97403,   98193,   98971,   99583,   99592,  100431,
             101258,  101935,  105703,
            ...
            4117249, 4121669, 4122217, 4122980, 4123457, 4126173, 4147953,
            4158820, 4159321, 4164415],
           dtype='int64', length=1012)
Int64Index([  95006,   97403,   98193,   98971,   99583,   99592,  100431,
             101258,  101935,  105703,
            ...
            4117249, 4121669, 4122217, 4122980, 4123457, 4126173, 4147953,
            4158820, 4159321, 4164415],
           dtype='int64', length=1012)
Int64Index([1952760, 1952776, 1952788, 1952860, 1952863, 1952865, 1952879,
            1952905, 1952908, 1952923,
            ...
            3465103, 3465133, 3465146, 3465216, 3465355, 3465357, 3465389,
            3465396, 3465430, 3465448],
           dtype='int64', length=874)
Int64Index([1952760, 1952776, 1952788, 1952860, 1952863, 1952865, 1952879,
            1952905, 1952908, 1952923,
            ...
            3465103

In [15]:
#startstation na handling and imputation

#the missing station names and id indexes coincide, so these observations cannot be used for route analysis and should be removed
print(fin_dat.startstationname[fin_dat.startstationname.isnull()].index)
print(fin_dat.startstationid[fin_dat.startstationid.isnull()].index)

fin_dat.drop(fin_dat[fin_dat.startstationid.isnull()].index, axis=0, inplace=True)

print('imputation complete')

Int64Index([], dtype='int64')
Int64Index([], dtype='int64')
imputation complete


In [16]:
#now that imputations and removals are completed, we need to reset index

fin_dat = fin_dat.reset_index(drop=True)

In [20]:
#to verify all nas are gone
colpercent(fin_dat)

Total NaN in Dataframe:  0
Percent Missingness in Dataframe:  0.0
-------------------------------------------------------
Percent Missingness by Columns:
Series([], dtype: float64)


Initial Notes:
1. Need to convert starttime and stoptime to datetime format or equivalent. This can be done automatically while reading in  with the parse_dates method, but would be faster to do selectively after downsampling. The former may be unavoidable if the groupby needs to be done by month though
2. Implement downsampling, ideally read in directly as such

In [21]:
#timeDelta method for computing time of travel
print((fin_dat.stoptime - fin_dat.starttime).dt.total_seconds().sort_values(ascending=True)[:20])

#index values of concern:
# 1978590   
# 3589117   
# 2804411  
# 2804412   
# 3589116  
# 3589115   
# 3589118 
# 3589112 

#for more information if method is insufficiently time-efficient
#https://pandas.pydata.org/pandas-docs/stable/user_guide/timedeltas.html
#https://stackoverflow.com/questions/22923775/calculate-pandas-dataframe-time-difference-between-two-columns-in-hours-and-minu
#https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.Series.dt.total_seconds.html

#error in starttime and stoptime labeling, need to reverse what is starttime and what is stoptime
#we will assume here that all station labels are correct, simply start and stoptimes were inserted incorrectly

#fin_dat.iloc[551566,:].starttime, fin_dat.iloc[551566,:].stoptime = fin_dat.iloc[551566,:].stoptime, fin_dat.iloc[551566,:].starttime
start = fin_dat.iloc[551566,:].starttime
stop = fin_dat.iloc[551566,:].stoptime

print(start)
print(stop)

# fin_dat.iloc[551566,:].starttime = fin_dat.iloc[551566,:].starttime.replace(minute=stop.minute, second=stop.second)
# fin_dat.iloc[551566,:].stoptime = fin_dat.iloc[551566,:].stoptime.replace(minute=start.minute, second=start.second)

fin_dat.iloc[551566,:].starttime = fin_dat.iloc[551566,:].starttime.replace(minute=stop.minute, second=stop.second)
#fin_dat.iloc[551566,:].stoptime.replace(minute=stop.minute, second=stop.second)

1978590   -3434.0
3589117   -3226.0
2804411   -3170.0
2804412   -3129.0
3589116   -2301.0
3589115   -1951.0
3589118    -959.0
3589112    -168.0
2520413      60.0
849863       60.0
3733825      60.0
3738704      60.0
850119       60.0
2348093      60.0
4080806      60.0
868634       60.0
3738879      60.0
3817906      60.0
891442       60.0
1756041      60.0
dtype: float64
2019-08-03 15:06:41.843000
2019-08-03 15:26:49.096000


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value


In [22]:
diff=fin_dat.tripduration-(fin_dat.stoptime - fin_dat.starttime).dt.total_seconds()
#this indicates that only around 1/4 of the observations are in agreement
print(len(diff[diff==0]))
print('-'*45)
#a large majority of the observations are either in agreement or the differences are due to rounding error
#these are sufficiently close that we can for our purposes just keep the values we already have in tripduration
print(len(diff[abs(diff)<=1]))
print('-'*45)
#only around 42000 observations have actual errors
print(diff[abs(diff>1)])

1392818
---------------------------------------------
4461838
---------------------------------------------
391554     3600.0
391555     3600.0
770833     3599.0
776863     3600.0
795517     3600.0
            ...  
4112921      21.0
4112922      19.0
4112923       4.0
4583493    3600.0
4631912    3599.0
Length: 41787, dtype: float64


In [23]:
error_index = diff[abs(diff>1)].index
#there are no negative values here
print(fin_dat.tripduration[error_index][fin_dat.tripduration[error_index]<0])
print('-'*45)

#confirming the sum is as it should be
print(fin_dat.iloc[error_index, -2])
print('-'*45)
print(fin_dat.tripduration[error_index] + diff[abs(diff>1)])
print(fin_dat.tripduration[error_index])

Series([], Name: tripduration, dtype: int64)
---------------------------------------------
391554        4451
391555        4178
770833     1309033
776863     1199464
795517       49206
            ...   
4112921        741
4112922        259
4112923        304
4583493    8461569
4631912    2518958
Name: tripduration, Length: 41787, dtype: int64
---------------------------------------------
391554        8051.0
391555        7778.0
770833     1312632.0
776863     1203064.0
795517       52806.0
             ...    
4112921        762.0
4112922        278.0
4112923        308.0
4583493    8465169.0
4631912    2522557.0
Length: 41787, dtype: float64
391554        4451
391555        4178
770833     1309033
776863     1199464
795517       49206
            ...   
4112921        741
4112922        259
4112923        304
4583493    8461569
4631912    2518958
Name: tripduration, Length: 41787, dtype: int64


In [24]:
#made correction to update tripduration values with error
fin_dat.tripduration[error_index] = fin_dat.tripduration[error_index] + diff[abs(diff>1)]
print('imputation complete')

imputation complete


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [25]:
datetime_feat = ['birthyear','starttime', 'stoptime']
num_nom_feat = ['bikeid','endstationid','startstationid','gender']
cat_nom_feat = ['endstationname','startstationname','usertype']
nom_feat = num_nom_feat + cat_nom_feat

In [26]:
float_to_int = ['startstationid','endstationid','tripduration', 'bikeid', 'gender']

#change to int16 to reduce memory
fin_dat.birthyear = fin_dat.birthyear.astype('float16').astype('int16')
fin_dat[float_to_int] = fin_dat[float_to_int].apply(lambda x: x.astype('int16'))

#and for nominal features
fin_dat[cat_nom_feat] = fin_dat[cat_nom_feat].apply(lambda x: x.astype('category'))

In [27]:
#location of observations where lat and/or long don't make any sense
observe = [4455178, 2022803, 2051817, 3007606, 2022782, 2035624, 3539070, 956266]

#the entries have been left as zeroes
fin_dat.iloc[observe,:]

Unnamed: 0,bikeid,birthyear,endstationid,endstationlatitude,endstationlongitude,endstationname,gender,startstationid,startstationlatitude,startstationlongitude,startstationname,starttime,stoptime,tripduration,usertype
4455178,15443,1900,173,40.760683,-73.984527,Broadway & W 49 St,0,2006,40.765909,-73.976342,Central Park S & 6 Ave,2016-05-28 19:22:04,2016-05-28 19:41:40,1176,Customer
2022803,18488,1964,168,40.739713,-73.994564,W 18 St & 6 Ave,1,2012,40.739445,-73.976806,E 27 St & 1 Ave,2017-11-27 12:14:56,2017-11-27 12:25:43,647,Subscriber
2051817,20860,1949,168,40.739713,-73.994564,W 18 St & 6 Ave,1,442,40.746647,-73.993915,W 27 St & 7 Ave,2016-12-13 06:53:22,2016-12-13 06:58:47,324,Subscriber
3007606,16202,1983,365,40.682232,-73.961458,Fulton St & Grand Ave,2,324,40.689888,-73.981013,DeKalb Ave & Hudson Ave,2014-12-25 00:51:39,2014-12-25 01:01:08,569,Subscriber
2022782,17872,1960,468,40.765265,-73.981923,Broadway & W 56 St,1,3172,40.778567,-73.97755,W 74 St & Columbus Ave,2017-11-27 11:59:33,2017-11-27 12:10:15,641,Subscriber
2035624,14551,1989,236,40.728419,-73.98714,St Marks Pl & 2 Ave,2,472,40.745712,-73.981948,E 32 St & Park Ave,2016-12-02 16:47:43,2016-12-02 16:58:21,637,Subscriber
3539070,28283,1973,297,40.734232,-73.986923,E 15 St & 3 Ave,1,461,40.735877,-73.98205,E 20 St & 2 Ave,2017-12-05 07:31:44,2017-12-05 07:36:28,283,Subscriber
956266,21403,1992,448,40.756604,-73.997901,W 37 St & 10 Ave,1,490,40.751551,-73.993934,8 Ave & W 33 St,2016-10-11 18:21:17,2016-10-11 18:25:29,252,Subscriber


In [28]:
#this fixes most
fin_dat.loc[fin_dat['endstationlatitude'] == 0, 'endstationlatitude'] = 40.75
fin_dat.loc[fin_dat['endstationlongitude'] == 0, 'endstationlongitude'] = -74.0
fin_dat.loc[fin_dat['startstationlongitude']==0, 'startstationlongitude'] = -74.0
fin_dat.loc[fin_dat['startstationlatitude'] == 0, 'startstationlatitude'] = 40.75

#faulty for startstationname (WS, don't use), so we drop it
fin_dat.drop(4455178, axis=0, inplace=True)
fin_dat.reset_index(inplace=True, drop=True)

In [29]:
#negative bikeids is an issue?
print((fin_dat.bikeid<0).value_counts())
print(len(fin_dat.loc[fin_dat.bikeid<0].bikeid.value_counts().index))
fin_dat.loc[fin_dat.bikeid<0].bikeid.value_counts().index

False    4105003
True      599827
Name: bikeid, dtype: int64
7580


Int64Index([-31752, -31231, -31563, -31261, -31445, -31380, -31730, -31646,
            -32656, -31410,
            ...
            -27903, -23512, -25922, -28384, -26932, -27589, -27584, -31809,
            -27111, -28929],
           dtype='int64', length=7580)

In [30]:
fin_dat.drop(fin_dat.loc[fin_dat.bikeid <0].index, inplace=True)
fin_dat.reset_index(inplace=True, drop=True)

In [31]:
#there are some addition values that may need removal
#Hs Don't Use, WS Don't Use, NYCBS Test
fin_dat.drop(fin_dat[fin_dat.startstationname=="Hs Don't Use"].index, axis=0, inplace=True)
fin_dat.drop(fin_dat[fin_dat.startstationname=="WS Don't Use"].index, axis=0, inplace=True)
fin_dat.drop(fin_dat[fin_dat.startstationname=="NYCBS Test"].index, axis=0, inplace=True)
fin_dat.drop(fin_dat[fin_dat.endstationname=="NYCBS Test"].index, axis=0, inplace=True)
fin_dat.reset_index(inplace=True, drop=True)

In [32]:
#print(fin_dat.tripduration.sort_values(ascending=True)[:10000])
#issue with riders getting bike and then shortly after returning them to same dock station without real use.
#print(fin_dat.iloc[663912,:])

#remove observations where cyclists return bike to same dock
#fin_dat.drop(fin_dat.iloc[fin_dat.startstationid == fin_dat.endstationid], axis=1)
fin_dat.loc[fin_dat.startstationid == fin_dat.endstationid, :].index
print(len(fin_dat.drop(fin_dat.loc[fin_dat.startstationid == fin_dat.endstationid, :].index, axis=0)))
print(len(fin_dat))

#how do I determine which subset of these rides are not actual rides (rider misuse) or actual rides
#i.e., cyclists returning to the same location they started after riding for a time
fin_dat.loc[fin_dat.startstationid == fin_dat.endstationid, :].tripduration.sort_values()[:500]

#we will just cut them all to be conservative
fin_dat.drop(fin_dat.loc[fin_dat.startstationid == fin_dat.endstationid, :].index, axis=0, inplace=True)
print('imputation complete')

4017571
4104983
imputation complete


In [38]:
fin_dat = fin_dat.drop_duplicates()

In [39]:
fin_dat.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4015035 entries, 0 to 4104982
Data columns (total 15 columns):
bikeid                   int16
birthyear                int16
endstationid             int16
endstationlatitude       float64
endstationlongitude      float64
endstationname           category
gender                   int16
startstationid           int16
startstationlatitude     float64
startstationlongitude    float64
startstationname         category
starttime                datetime64[ns]
stoptime                 datetime64[ns]
tripduration             int16
usertype                 category
dtypes: category(3), datetime64[ns](2), float64(4), int16(6)
memory usage: 279.6 MB


In [41]:
fin_dat.shape

(4015035, 15)

In [42]:
print('%d locations'%fin_dat.startstationlatitude.nunique())

1183 locations


In [43]:
fin_dat.to_csv('citibike_final.csv', index=False)