https://softwarerecs.stackexchange.com/questions/7463/fastest-python-library-to-read-a-csv-file

* pandas.io.parsers.read_csv beats everybody else, 
* NumPy's loadtxt is impressively slow and 
* NumPy's from_file and load impressively fast.

```
Windows: 
open_with_python_csv: 1.57318865672 seconds
open_with_python_csv_list: 1.35567931732 seconds
open_with_python_csv_cast_as_float: 3.0801260484 seconds
open_with_numpy_loadtxt: 14.4942111801 seconds
open_with_pandas_read_csv: 0.371965476805 seconds
Numpy fromfile: 0.0130216095713 seconds
Numpy load: 0.0245501650124 seconds

Ubuntu:
open_with_python_csv: 1.93 seconds
open_with_python_csv_list: 1.52 seconds
open_with_python_csv_cast_as_float: 3.19 seconds
open_with_numpy_loadtxt: 7.47 seconds
open_with_pandas_read_csv: 0.35 seconds
Numpy fromfile: 0.01 seconds
Numpy load: 0.02 seconds
```

In [6]:
import csv
import os
import cProfile
import time
import numpy
import pandas
import warnings

# Make sure those files in the same folder as benchmark_python.py
# As the name indicates:
# - '1col.csv' is a CSV file with 1 column
# - '3col.csv' is a CSV file with 3 column
filename1 = '1col.csv'
filename3 = '3col.csv'

filename3 = "pandas-cookbook/data/bikes-2.csv"
csv_encoding = "latin-1"

filename3 = "pandas-cookbook/data/weather_2012.csv"
csv_encoding = "utf-8"


csv_delimiter = ','
debug = False

In [9]:
def open_with_python_csv(filename):
    '''
    https://docs.python.org/2/library/csv.html
    
    fastcsv is meant for python 2, with python 3, just use csv
    https://github.com/draftcode/fastcsv
    '''
    data =[]
    with open(filename, 'r', encoding=csv_encoding) as csvfile:
        csvreader = csv.reader(csvfile, delimiter=csv_delimiter, quotechar='|')
        for row in csvreader:
            data.append(row)    
    return data

In [39]:
def open_with_python_csv_list(filename):
    '''
    https://docs.python.org/2/library/csv.html
    '''
    data =[]
    with open(filename, 'r', encoding=csv_encoding) as csvfile:
        csvreader = csv.reader(csvfile, delimiter=csv_delimiter, quotechar='|')
        data = list(csvreader)    
    return data

In [None]:
def open_with_python_csv_cast_as_float(filename):
    '''
    https://docs.python.org/2/library/csv.html
    '''
    data =[]
    with open(filename, 'rb') as csvfile:
        csvreader = csv.reader(csvfile, delimiter=csv_delimiter, quotechar='|')
        for row in csvreader:
            data.append(map(float, row))    
    return data

In [30]:
def open_with_numpy_loadtxt(filename):
    '''
    http://stackoverflow.com/questions/4315506/load-csv-into-2d-matrix-with-numpy-for-plotting
    '''
    data = numpy.loadtxt(open(filename,'rb'),delimiter=csv_delimiter,skiprows=0)
    return data

def open_with_pandas_read_csv(filename):
    df = pandas.read_csv(filename, sep=csv_delimiter)
    data = df.values
    return data    


def benchmark(function_name):  
    start_time = time.clock()
    #data = function_name(filename1)       
    #if debug: print data[0] 
    data = function_name(filename3)
    if debug: print (data[0])
    print (function_name.__name__ + ': ' + str(time.clock() - start_time), "seconds")
    return time.clock() - start_time


def benchmark_numpy_fromfile():
    '''
    http://docs.scipy.org/doc/numpy/reference/generated/numpy.fromfile.html
    Do not rely on the combination of tofile and fromfile for data storage, 
    as the binary files generated are are not platform independent.
    In particular, no byte-order or data-type information is saved.
    Data can be stored in the platform independent .npy format using
    save and load instead.

    Note that fromfile will create a one-dimensional array containing your data,
    so you might need to reshape it afterward.
    '''
    #ignore the 'tmpnam is a potential security risk to your program' warning
    with warnings.catch_warnings():
        warnings.simplefilter('ignore', RuntimeWarning)
        fname1 = os.tmpnam()
        fname3 = os.tmpnam()

    # data = open_with_numpy_loadtxt(filename1)
    # if debug: print data[0]
    data.tofile(fname1)
    data = open_with_numpy_loadtxt(filename3)
    if debug: print (data[0])
    data.tofile(fname3)
    if debug: print (data.shape)
    fname3shape = data.shape
    start_time = time.clock()
    data = numpy.fromfile(fname1, dtype=numpy.float64) # you might need to switch to float32. List of types: http://docs.scipy.org/doc/numpy/reference/arrays.dtypes.html
    if debug: print (len(data), data[0], data.shape)
    data = numpy.fromfile(fname3, dtype=numpy.float64)
    data = data.reshape(fname3shape)
    if debug: print (len(data), data[0], data.shape)   
    print ('Numpy fromfile: ' + str(time.clock() - start_time), "seconds")

def benchmark_numpy_save_load():
    '''
    http://docs.scipy.org/doc/numpy/reference/generated/numpy.fromfile.html
    Do not rely on the combination of tofile and fromfile for data storage, 
    as the binary files generated are are not platform independent.
    In particular, no byte-order or data-type information is saved.
    Data can be stored in the platform independent .npy format using
    save and load instead.

    Note that fromfile will create a one-dimensional array containing your data,
    so you might need to reshape it afterward.
    '''
    #ignore the 'tmpnam is a potential security risk to your program' warning
    with warnings.catch_warnings():
        warnings.simplefilter('ignore', RuntimeWarning)
        fname1 = os.tmpnam()
        fname3 = os.tmpnam()

    # data = open_with_numpy_loadtxt(filename1)
    # if debug: print data[0]    
    numpy.save(fname1, data)    
    data = open_with_numpy_loadtxt(filename3)
    if debug: print (data[0])    
    numpy.save(fname3, data)    
    if debug: print (data.shape)
    fname3shape = data.shape
    start_time = time.clock()
    data = numpy.load(fname1 + '.npy')
    if debug: print (len(data), data[0], data.shape)
    data = numpy.load(fname3 + '.npy')
    #data = data.reshape(fname3shape)
    if debug: print (len(data), data[0], data.shape)   
    print ('Numpy load: ' + str(time.clock() - start_time), "seconds")

In [31]:
data = open_with_python_csv(filename3)

data[:3]

In [33]:
data = open_with_pandas_read_csv(filename3)

data[:3]

In [40]:
data = open_with_python_csv_list(filename3)

data[:3]

[['Date/Time',
  'Temp (C)',
  'Dew Point Temp (C)',
  'Rel Hum (%)',
  'Wind Spd (km/h)',
  'Visibility (km)',
  'Stn Press (kPa)',
  'Weather'],
 ['2012-01-01 00:00:00', '-1.8', '-3.9', '86', '4', '8.0', '101.24', 'Fog'],
 ['2012-01-01 01:00:00', '-1.8', '-3.7', '87', '4', '8.0', '101.24', 'Fog']]

In [41]:
def main():
    number_of_runs = 3
    results = []

    benchmark_functions = ['benchmark(open_with_python_csv)', 
                           'benchmark(open_with_python_csv_list)',
                           'benchmark(open_with_pandas_read_csv)'
                           # 'benchmark(open_with_python_csv_cast_as_float)',
                           # 'benchmark(open_with_numpy_loadtxt)',
                           # 'benchmark_numpy_fromfile()',
                           # 'benchmark_numpy_save_load()'
                          ]
    # Compute benchmark
    for run_number in range(number_of_runs):
        run_results = []
        for benchmark_function in benchmark_functions:
            run_results.append(eval(benchmark_function))
            results.append(run_results)

    # Display benchmark's results
    print (results)
    results = numpy.array(results)
    # numpy.set_printoptions(precision=10) # http://stackoverflow.com/questions/2891790/pretty-printing-of-numpy-array
    # numpy.set_printoptions(suppress=True)  # suppress suppresses the use of scientific notation for small numbers:
    print (numpy.mean(results, axis=0))
    print (numpy.std(results, axis=0))  

    #Another library, but not free: https://store.continuum.io/cshop/iopro/

In [42]:
main()

open_with_python_csv: 0.027123000000000452 seconds
open_with_python_csv_list: 0.06939399999999996 seconds
open_with_pandas_read_csv: 0.021934000000000786 seconds
open_with_python_csv: 0.018088000000000548 seconds
open_with_python_csv_list: 0.014740000000000641 seconds
open_with_pandas_read_csv: 0.021769000000000815 seconds
open_with_python_csv: 0.01655400000000018 seconds
open_with_python_csv_list: 0.014777999999999736 seconds
open_with_pandas_read_csv: 0.021938000000000457 seconds
[[0.029633999999999716, 0.06950700000000065, 0.022029999999999994], [0.029633999999999716, 0.06950700000000065, 0.022029999999999994], [0.029633999999999716, 0.06950700000000065, 0.022029999999999994], [0.018207000000000306, 0.01484400000000008, 0.021862000000000492], [0.018207000000000306, 0.01484400000000008, 0.021862000000000492], [0.018207000000000306, 0.01484400000000008, 0.021862000000000492], [0.016655000000000086, 0.01499999999999968, 0.022104000000000568], [0.016655000000000086, 0.01499999999999968,