In [1]:
import dask.dataframe as dd

In [2]:
help(dd.read_csv)

Help on function read_csv in module dask.dataframe.io.csv:

read_csv(urlpath, blocksize=64000000, collection=True, lineterminator=None, compression=None, sample=256000, enforce=False, assume_missing=False, storage_options=None, include_path_column=False, **kwargs)
    Read CSV files into a Dask.DataFrame
    
    This parallelizes the :func:`pandas.read_csv` function in the following ways:
    
    - It supports loading many files at once using globstrings:
    
        >>> df = dd.read_csv('myfiles.*.csv')  # doctest: +SKIP
    
    - In some cases it can break up large files:
    
        >>> df = dd.read_csv('largefile.csv', blocksize=25e6)  # 25MB chunks  # doctest: +SKIP
    
    - It can read CSV files from external resources (e.g. S3, HDFS) by
      providing a URL:
    
        >>> df = dd.read_csv('s3://bucket/myfiles.*.csv')  # doctest: +SKIP
        >>> df = dd.read_csv('hdfs:///myfiles.*.csv')  # doctest: +SKIP
        >>> df = dd.read_csv('hdfs://namenode.example.com/myfil

In [3]:
import sys

In [4]:
if sys.platform == 'win32' : 
    df = dd.read_csv(r"D:\data\LANL-Earthquake-Prediction/train.csv")
else :
    df = dd.read_csv("../../data/earthquake/train.csv")

In [5]:
df

Unnamed: 0_level_0,acoustic_data,time_to_failure
npartitions=150,Unnamed: 1_level_1,Unnamed: 2_level_1
,int64,float64
,...,...
...,...,...
,...,...
,...,...


In [6]:
df.acoustic_data.count().compute()

629145480

In [7]:
df.time_to_failure.mean().compute()

5.678291712978874

In [8]:
df.shape[0].compute()

629145480

In [9]:
len(df)

629145480

In [10]:
df.tail()

Unnamed: 0,acoustic_data,time_to_failure
1304332,7,9.759796
1304333,9,9.759796
1304334,10,9.759796
1304335,6,9.759796
1304336,5,9.759796


In [11]:
help(df.loc)

Help on _LocIndexer in module dask.dataframe.indexing object:

class _LocIndexer(_IndexerBase)
 |  _LocIndexer(obj)
 |  
 |  Helper class for the .loc accessor
 |  
 |  Method resolution order:
 |      _LocIndexer
 |      _IndexerBase
 |      builtins.object
 |  
 |  Methods defined here:
 |  
 |  __getitem__(self, key)
 |  
 |  ----------------------------------------------------------------------
 |  Methods inherited from _IndexerBase:
 |  
 |  __init__(self, obj)
 |      Initialize self.  See help(type(self)) for accurate signature.
 |  
 |  ----------------------------------------------------------------------
 |  Data descriptors inherited from _IndexerBase:
 |  
 |  __dict__
 |      dictionary for instance variables (if defined)
 |  
 |  __weakref__
 |      list of weak references to the object (if defined)



In [12]:
df.loc[2647247].count().compute()

acoustic_data      149
time_to_failure    149
dtype: int64

In [13]:
df.acoustic_data.nlargest(40).compute()

2647247    5444
2647248    5310
2647246    5188
1056717    5115
1056718    4998
1238736    4886
2647249    4855
1238737    4854
1056716    4799
2258797    4610
2647056    4600
2258798    4570
2647245    4561
1238735    4561
1056719    4524
2647057    4445
1238738    4415
2647055    4388
2258796    4379
215814     4317
1466215    4281
2258799    4246
215398     4198
1056804    4177
1056805    4175
215397     4159
2647250    4127
1466214    4115
1056715    4094
215813     4091
215815     4088
215456     4077
215684     4066
2647058    4041
1466216    4040
215685     3986
1466298    3958
215457     3954
2488788    3914
2258795    3886
Name: acoustic_data, dtype: int64

In [14]:
df.acoustic_data.nsmallest(40).compute()

805458    -5515
805457    -5474
805459    -5251
805456    -5135
1238700   -5008
1238699   -4981
1238698   -4769
1238701   -4755
805460    -4685
370866    -4621
370865    -4559
805455    -4524
2929943   -4482
1238697   -4406
215448    -4392
2929942   -4379
370867    -4312
3115867   -4299
215447    -4283
2929944   -4281
1238702   -4247
851404    -4203
3115868   -4110
370864    -4094
215948    -4087
1056984   -4079
215949    -4075
851403    -3988
2929941   -3980
3115866   -3979
1238606   -3975
1238696   -3968
851405    -3926
1056985   -3917
215449    -3912
1466907   -3905
370651    -3881
1238607   -3863
805461    -3862
215947    -3846
Name: acoustic_data, dtype: int64

In [15]:
df_temp = df.loc[2647200: 2647400, :].compute()

In [16]:
2647400 - 2647200

200

In [17]:
df_temp.head()

Unnamed: 0,acoustic_data,time_to_failure
2647200,7,0.781799
2647201,10,0.781799
2647202,5,0.781799
2647203,5,0.781799
2647204,9,0.781799


In [18]:
df_temp.tail()

Unnamed: 0,acoustic_data,time_to_failure
2647396,1,10.519496
2647397,4,10.519496
2647398,5,10.519496
2647399,4,10.519496
2647400,4,10.519496


In [19]:
df_temp.shape

(29949, 2)

In [20]:
len(df_temp.index)

29949

In [21]:
df_temp.loc[2647201]

Unnamed: 0,acoustic_data,time_to_failure
2647201,10,0.781799
2647201,7,11.266298
2647201,4,10.197997
2647201,2,9.089296
2647201,6,7.982796
2647201,1,6.877300
2647201,3,5.773900
2647201,3,4.672699
2647201,2,3.573598
2647201,4,2.471298


In [22]:
df_temp.duplicated

<bound method DataFrame.duplicated of          acoustic_data  time_to_failure
2647200              7         0.781799
2647201             10         0.781799
2647202              5         0.781799
2647203              5         0.781799
2647204              9         0.781799
2647205             10         0.781799
2647206              8         0.781799
2647207              6         0.781799
2647208              1         0.781799
2647209             -4         0.781799
2647210              6         0.781799
2647211              6         0.781799
2647212             -1         0.781799
2647213              3         0.781799
2647214              1         0.781799
2647215              5         0.781799
2647216              5         0.781799
2647217              7         0.781799
2647218             12         0.781799
2647219             11         0.781799
2647220              6         0.781799
2647221              5         0.781799
2647222              7         0.781799
26

In [23]:
len(df_temp.index)

29949

In [24]:
df_temp.shape

(29949, 2)

In [25]:
df_temp.sort_values(by='acoustic_data')

Unnamed: 0,acoustic_data,time_to_failure
2647265,-3486,0.3203
2647264,-3434,0.3203
2647266,-3310,0.3203
2647263,-3181,0.3203
2647267,-2900,0.3203
2647262,-2825,0.3203
2647235,-2769,0.3203
2647234,-2733,0.3203
2647236,-2590,0.3203
2647233,-2476,0.3203
