# Ruby Kaigi 2016, Kyoto

# Basic demonstration of daru

# daru (Data Analysis in RUby)

## daru is a Ruby gem for analysis, manipulation and cleaning of data. It works well with all the above gems and makes it very easy to perform complex data analysis, cleaning and visualization tasks.

In [1]:
require 'daru'

"if(window['d3'] === undefined ||\n   window['Nyaplot'] === undefined){\n    var path = {\"d3\":\"https://cdnjs.cloudflare.com/ajax/libs/d3/3.5.5/d3.min\",\"downloadable\":\"http://cdn.rawgit.com/domitry/d3-downloadable/master/d3-downloadable\"};\n\n\n\n    var shim = {\"d3\":{\"exports\":\"d3\"},\"downloadable\":{\"exports\":\"downloadable\"}};\n\n    require.config({paths: path, shim:shim});\n\n\nrequire(['d3'], function(d3){window['d3']=d3;console.log('finished loading d3');require(['downloadable'], function(downloadable){window['downloadable']=downloadable;console.log('finished loading downloadable');\n\n\tvar script = d3.select(\"head\")\n\t    .append(\"script\")\n\t    .attr(\"src\", \"http://cdn.rawgit.com/domitry/Nyaplotjs/master/release/nyaplot.js\")\n\t    .attr(\"async\", true);\n\n\tscript[0][0].onload = script[0][0].onreadystatechange = function(){\n\n\n\t    var event = document.createEvent(\"HTMLEvents\");\n\t    event.initEvent(\"load_nyaplot\",false,false);\n\t    win

true

## A simple case of `Daru::Vector`

In [2]:
vec = Daru::Vector.new([1,2,3,4,5])

Daru::Vector(5),Daru::Vector(5).1
0,1
1,2
2,3
3,4
4,5


## Initializing a Vector with a `Daru::Index`

In [3]:
index = Daru::Index.new(['ichi','ni','san','shi','go','roku'])
vec = Daru::Vector.new([1,2,3,4,5,6], index: index)

Daru::Vector(6),Daru::Vector(6).1
ichi,1
ni,2
san,3
shi,4
go,5
roku,6


### Select an element by index

In [4]:
vec['san']

3

### Select multiple elements by index

In [5]:
vec['ichi', 'go']

Daru::Vector(2),Daru::Vector(2).1
ichi,1
go,5


### Select a Range of elements

In [6]:
vec['san'..'go']

Daru::Vector(3),Daru::Vector(3).1
san,3
shi,4
go,5


## A simple case of Daru::DataFrame

In [7]:
# Build a 5-row frame from a Hash of column name => column values.
# The columns are supplied as an Array, a Range and a generated Array.
df = Daru::DataFrame.new(
  {
    a: [1342, 223, 3422, 4234, 5332],
    b: 'a'..'e',
    c: (0...5).to_a,
    d: %w[Junmai-shu Honjozo-shu Ginjo-shu Daiginjo-shu Namazake]
  }
)
# Evaluate the frame last so the notebook renders it.
df

Daru::DataFrame(5x4),Daru::DataFrame(5x4),Daru::DataFrame(5x4),Daru::DataFrame(5x4),Daru::DataFrame(5x4)
Unnamed: 0_level_1,a,b,c,d
0,1342,a,0,Junmai-shu
1,223,b,1,Honjozo-shu
2,3422,c,2,Ginjo-shu
3,4234,d,3,Daiginjo-shu
4,5332,e,4,Namazake


### Select a single column with the `#[]` operator

In [8]:
df[:b]

Daru::Vector(5),Daru::Vector(5)
Unnamed: 0_level_1,b
0,a
1,b
2,c
3,d
4,e


### Select multiple columns by specifying a Range

In [9]:
df[:b..:d]

Daru::DataFrame(5x3),Daru::DataFrame(5x3),Daru::DataFrame(5x3),Daru::DataFrame(5x3)
Unnamed: 0_level_1,b,c,d
0,a,0,Junmai-shu
1,b,1,Honjozo-shu
2,c,2,Ginjo-shu
3,d,3,Daiginjo-shu
4,e,4,Namazake


### Or separate by commas

In [10]:
df[:a, :c]

Daru::DataFrame(5x2),Daru::DataFrame(5x2),Daru::DataFrame(5x2)
Unnamed: 0_level_1,a,c
0,1342,0
1,223,1
2,3422,2
3,4234,3
4,5332,4


### Rows can be accessed with the `#row[]` method.

In [11]:
df.row[3]

Daru::Vector(4),Daru::Vector(4)
Unnamed: 0_level_1,3
a,4234
b,d
c,3
d,Daiginjo-shu


### It also supports multiple retrieval with Range or commas

In [12]:
df.row[1..4]

Daru::DataFrame(4x4),Daru::DataFrame(4x4),Daru::DataFrame(4x4),Daru::DataFrame(4x4),Daru::DataFrame(4x4)
Unnamed: 0_level_1,a,b,c,d
1,223,b,1,Honjozo-shu
2,3422,c,2,Ginjo-shu
3,4234,d,3,Daiginjo-shu
4,5332,e,4,Namazake


### The default alphabetical ordering of columns can be changed with the `:order` option.

In [13]:
# `order:` lists the column names in the left-to-right order they should appear.
data_frame = Daru::DataFrame.new(
  {
    a: (1..5).to_a,
    b: 'a'..'e',
    c: (0...5).to_a,
    d: %w[Junmai-shu Honjozo-shu Ginjo-shu Daiginjo-shu Namazake]
  },
  order: %i[b d c a]
)

Daru::DataFrame(5x4),Daru::DataFrame(5x4),Daru::DataFrame(5x4),Daru::DataFrame(5x4),Daru::DataFrame(5x4)
Unnamed: 0_level_1,b,d,c,a
0,a,Junmai-shu,0,1
1,b,Honjozo-shu,1,2
2,c,Ginjo-shu,2,3
3,d,Daiginjo-shu,3,4
4,e,Namazake,4,5


### You can also specify a row index with the `:index` option

In [14]:
# `order:` fixes the column order; `index:` supplies one label per row.
data_frame = Daru::DataFrame.new(
  {
    a: (1..5).to_a,
    b: 'a'..'e',
    c: (0...5).to_a,
    d: %w[Junmai-shu Honjozo-shu Ginjo-shu Daiginjo-shu Namazake]
  },
  order: %i[b d c a],
  index: %w[ichi ni san shi go]
)

Daru::DataFrame(5x4),Daru::DataFrame(5x4),Daru::DataFrame(5x4),Daru::DataFrame(5x4),Daru::DataFrame(5x4)
Unnamed: 0_level_1,b,d,c,a
ichi,a,Junmai-shu,0,1
ni,b,Honjozo-shu,1,2
san,c,Ginjo-shu,2,3
shi,d,Daiginjo-shu,3,4
go,e,Namazake,4,5


DataFrame has powerful indexing capabilities:

* `Daru::Index`
* `Daru::MultiIndex`
* `Daru::DateTimeIndex`
* `Daru::CategoricalIndex`

## A MultiIndex is a hierarchical index

In [15]:
# Each inner array is one index entry; its elements are the labels for the
# successive levels of the hierarchy.
multi_index = Daru::MultiIndex.from_tuples(
  [
    %i[a b c],
    %i[a b d],
    %i[a b p],
    %i[a q p],
    %i[b r f],
    %i[c o t],
    %i[c p w]
  ]
)

Daru::MultiIndex(7x3),Daru::MultiIndex(7x3),Daru::MultiIndex(7x3)
a,b,c
a,b,d
a,b,p
a,q,p
b,r,f
c,o,t
c,p,w


### It allows you to create and query hierarchically named data

In [16]:
# Seven values -- 1, 2, 3 repeated twice, then 66 -- one per entry of multi_index.
vec = Daru::Vector.new(([1, 2, 3] * 2).push(66), index: multi_index)

Daru::Vector(7),Daru::Vector(7).1,Daru::Vector(7).2,Daru::Vector(7).3
a,b,c,1
a,b,d,2
a,b,p,3
a,q,p,1
b,r,f,2
c,o,t,3
c,p,w,66


### You can select data by specifying the hierarchy in the #[] method.

In [17]:
vec[:a, :b, :p]

3

### Specifying an incomplete hierarchical index will return data under that index.

In [18]:
vec[:a]

Daru::Vector(4),Daru::Vector(4).1,Daru::Vector(4).2
b,c,1
b,d,2
b,p,3
q,p,1


## The DateTimeIndex allows you to index timestamp-based data like stock prices

## Create a `Daru::DateTimeIndex` with a frequency of 3 minutes.

In [19]:
# NOTE: in this frequency string 'M' stands for minutes, so '3M' produces one
# entry every 3 minutes between the start and end dates (not every 3 months).
date_time = Daru::DateTimeIndex.date_range(start: '2011', end: '2011-3', freq: '3M')

#<Daru::DateTimeIndex(28321, frequency=3M) 2011-01-01T00:00:00+00:00...2011-03-01T00:00:00+00:00>

In [20]:
# Fill the vector with the constant 15, one value per timestamp in date_time.
constant_values = Array.new(date_time.size, 15)
vec = Daru::Vector.new(constant_values, index: date_time)
# Display only the leading entries.
vec.head

Daru::Vector(10),Daru::Vector(10).1
2011-01-01T00:00:00+00:00,15
2011-01-01T00:03:00+00:00,15
2011-01-01T00:06:00+00:00,15
2011-01-01T00:09:00+00:00,15
2011-01-01T00:12:00+00:00,15
2011-01-01T00:15:00+00:00,15
2011-01-01T00:18:00+00:00,15
2011-01-01T00:21:00+00:00,15
2011-01-01T00:24:00+00:00,15
2011-01-01T00:27:00+00:00,15


### Query data based on a date range.

In [21]:
vec['2011-1'..'2011-2-10']

Daru::Vector(19680),Daru::Vector(19680).1
2011-01-01T00:00:00+00:00,15
2011-01-01T00:03:00+00:00,15
2011-01-01T00:06:00+00:00,15
2011-01-01T00:09:00+00:00,15
2011-01-01T00:12:00+00:00,15
2011-01-01T00:15:00+00:00,15
2011-01-01T00:18:00+00:00,15
2011-01-01T00:21:00+00:00,15
2011-01-01T00:24:00+00:00,15
2011-01-01T00:27:00+00:00,15


### Get the data for a specific day.

In [22]:
vec['2011-1-21']

Daru::Vector(480),Daru::Vector(480).1
2011-01-21T00:00:00+00:00,15
2011-01-21T00:03:00+00:00,15
2011-01-21T00:06:00+00:00,15
2011-01-21T00:09:00+00:00,15
2011-01-21T00:12:00+00:00,15
2011-01-21T00:15:00+00:00,15
2011-01-21T00:18:00+00:00,15
2011-01-21T00:21:00+00:00,15
2011-01-21T00:24:00+00:00,15
2011-01-21T00:27:00+00:00,15


## A `Daru::CategoricalIndex` can be used if data is purely categorical.

In [23]:
require 'open-uri'

true

In [24]:
# Download the WHO country dataset and load it into a DataFrame.
# URI.open (provided by open-uri) is called explicitly: passing a URL to
# Kernel#open was deprecated in Ruby 2.7 and no longer works on Ruby 3.0+.
content = URI.open('https://d37djvu3ytnwxt.cloudfront.net/asset-v1:MITx+15.071x_3+1T2016+type@asset+block/WHO.csv')
df = Daru::DataFrame.from_csv content
# Show only the columns at positions 0 through 5.
df[0..5]

Daru::DataFrame(194x6),Daru::DataFrame(194x6),Daru::DataFrame(194x6),Daru::DataFrame(194x6),Daru::DataFrame(194x6),Daru::DataFrame(194x6),Daru::DataFrame(194x6)
Unnamed: 0_level_1,Country,Region,Population,Under15,Over60,FertilityRate
0,Afghanistan,Eastern Mediterranean,29825,47.42,3.82,5.4
1,Albania,Europe,3162,21.33,14.93,1.75
2,Algeria,Africa,38482,27.42,7.17,2.83
3,Andorra,Europe,78,15.2,22.86,
4,Angola,Africa,20821,47.58,3.84,6.1
5,Antigua and Barbuda,Americas,89,25.96,12.35,2.12
6,Argentina,Americas,41087,24.42,14.97,2.2
7,Armenia,Europe,2969,20.34,14.06,1.74
8,Australia,Western Pacific,23050,18.95,19.46,1.89
9,Austria,Europe,8464,14.51,23.52,1.44


### Setting the Index of the dataframe to a Categorical Index.

In [25]:
# Index the rows by the values of the 'Region' column.
# `.to_a` is applied to the column, and the resulting Array is passed to `new`;
# the call is written without a space before `(` so that this is unambiguous.
df.index = Daru::CategoricalIndex.new(df['Region'].to_a)

#<Daru::CategoricalIndex(194): {Eastern Mediterranean, Europe, Africa, Europe, Africa, Americas, Americas, Europe, Western Pacific, Europe, Europe, Americas, Eastern Mediterranean, South-East Asia, Americas, Europe, Europe, Americas, Africa, South-East Asia ... Africa}>

### To see all the categories of Regions use `#categories`.

In [26]:
df.index.categories
df

Daru::DataFrame(194x13),Daru::DataFrame(194x13),Daru::DataFrame(194x13),Daru::DataFrame(194x13),Daru::DataFrame(194x13),Daru::DataFrame(194x13),Daru::DataFrame(194x13),Daru::DataFrame(194x13),Daru::DataFrame(194x13),Daru::DataFrame(194x13),Daru::DataFrame(194x13),Daru::DataFrame(194x13),Daru::DataFrame(194x13),Daru::DataFrame(194x13)
Unnamed: 0_level_1,Country,Region,Population,Under15,Over60,FertilityRate,LifeExpectancy,ChildMortality,CellularSubscribers,LiteracyRate,GNI,PrimarySchoolEnrollmentMale,PrimarySchoolEnrollmentFemale
Eastern Mediterranean,Afghanistan,Eastern Mediterranean,29825,47.42,3.82,5.4,60,98.5,54.26,,1140,,
Europe,Albania,Europe,3162,21.33,14.93,1.75,74,16.7,96.39,,8820,,
Africa,Algeria,Africa,38482,27.42,7.17,2.83,73,20,98.99,,8310,98.2,96.4
Europe,Andorra,Europe,78,15.2,22.86,,82,3.2,75.49,,,78.4,79.4
Africa,Angola,Africa,20821,47.58,3.84,6.1,51,163.5,48.38,70.1,5230,93.1,78.2
Americas,Antigua and Barbuda,Americas,89,25.96,12.35,2.12,75,9.9,196.41,99.0,17900,91.1,84.5
Americas,Argentina,Americas,41087,24.42,14.97,2.2,76,14.2,134.92,97.8,17130,,
Europe,Armenia,Europe,2969,20.34,14.06,1.74,71,16.4,103.57,99.6,6100,,
Western Pacific,Australia,Western Pacific,23050,18.95,19.46,1.89,82,4.9,108.34,,38110,96.9,97.5
Europe,Austria,Europe,8464,14.51,23.52,1.44,81,4,154.78,,42050,,


### Let's see how many countries lie in the 'Africa' region.

In [27]:
df.row['Africa'].size

46

### To get mean value of the Life Expectancy in Europe:

In [28]:
df.row['Europe']['LifeExpectancy'].mean

76.73584905660377

# Data Querying

## Data can be queried with an Arel-like querying syntax

In [29]:
# One row per country: its typical beverage and the share of the population that drinks.
df = Daru::DataFrame.new(
  {
    'Country' => %w[Japan India USA Canada Germany Russia China Cuba],
    'Beverage' => %w[Sake Toddy Beer Beer Beer Vodka Beer Rum],
    'Drinking Population (%)' => [75, 50, 43, 60, 80, 70, 54, 90]
  }
)

Daru::DataFrame(8x3),Daru::DataFrame(8x3),Daru::DataFrame(8x3),Daru::DataFrame(8x3)
Unnamed: 0_level_1,Beverage,Country,Drinking Population (%)
0,Sake,Japan,75
1,Toddy,India,50
2,Beer,USA,43
3,Beer,Canada,60
4,Beer,Germany,80
5,Vodka,Russia,70
6,Beer,China,54
7,Rum,Cuba,90


## It supports all logical operations on data.

In [30]:
df.where(df['Beverage'].eq('Beer'))

Daru::DataFrame(4x3),Daru::DataFrame(4x3),Daru::DataFrame(4x3),Daru::DataFrame(4x3)
Unnamed: 0_level_1,Beverage,Country,Drinking Population (%)
2,Beer,USA,43
3,Beer,Canada,60
4,Beer,Germany,80
6,Beer,China,54


## Conditions can be combined with the `&` (AND) and `|` (OR) operators.

In [31]:
df.where(df['Beverage'].eq('Beer') | df['Drinking Population (%)'].lt(70))

Daru::DataFrame(5x3),Daru::DataFrame(5x3),Daru::DataFrame(5x3),Daru::DataFrame(5x3)
Unnamed: 0_level_1,Beverage,Country,Drinking Population (%)
1,Toddy,India,50
2,Beer,USA,43
3,Beer,Canada,60
4,Beer,Germany,80
6,Beer,China,54


# Basic Operations on Data

## Sorting

### Sorting can be done with `DataFrame#sort`. It supports missing data in the Vector(s).

In [32]:
df['Drinking Population (%)'] = [75, 50, nil, 60, nil, 70, 54, nil]

[75, 50, nil, 60, nil, 70, 54, nil]

In [33]:
df.sort(['Drinking Population (%)'])

Daru::DataFrame(8x3),Daru::DataFrame(8x3),Daru::DataFrame(8x3),Daru::DataFrame(8x3)
Unnamed: 0_level_1,Beverage,Country,Drinking Population (%)
2,Beer,USA,
4,Beer,Germany,
7,Rum,Cuba,
1,Toddy,India,50.0
6,Beer,China,54.0
3,Beer,Canada,60.0
5,Vodka,Russia,70.0
0,Sake,Japan,75.0


In [34]:
df.sort(['Drinking Population (%)', 'Beverage'], ascending: false)

Daru::DataFrame(8x3),Daru::DataFrame(8x3),Daru::DataFrame(8x3),Daru::DataFrame(8x3)
Unnamed: 0_level_1,Beverage,Country,Drinking Population (%)
7,Rum,Cuba,
2,Beer,USA,
4,Beer,Germany,
0,Sake,Japan,75.0
5,Vodka,Russia,70.0
3,Beer,Canada,60.0
6,Beer,China,54.0
1,Toddy,India,50.0


## Basic statistics

### Various methods let you perform statistics on vectors even when missing data is present.

In [35]:
df.mean

Daru::Vector(1),Daru::Vector(1)
Unnamed: 0_level_1,mean
Drinking Population (%),61.8


In [36]:
df.describe

Daru::DataFrame(5x1),Daru::DataFrame(5x1)
Unnamed: 0_level_1,Drinking Population (%)
count,5.0
mean,61.8
std,10.545141061171254
min,50.0
max,75.0


## Filtering

### `DataFrame#filter_rows` allows filtering rows based on certain conditions.

In [37]:
# Keep the rows for Japan plus every row whose beverage is Beer or Vodka.
# No nil guard is needed on 'Country': `nil == 'Japan'` is simply false.
df.filter_rows do |row|
  row['Country'] == 'Japan' || %w[Beer Vodka].include?(row['Beverage'])
end

Daru::DataFrame(6x3),Daru::DataFrame(6x3),Daru::DataFrame(6x3),Daru::DataFrame(6x3)
Unnamed: 0_level_1,Beverage,Country,Drinking Population (%)
0,Sake,Japan,75.0
2,Beer,USA,
3,Beer,Canada,60.0
4,Beer,Germany,
5,Vodka,Russia,70.0
6,Beer,China,54.0


### `DataFrame#recode_rows` allows changing rows based on certain conditions.

In [38]:
# Prefix the beverage with "Mild" wherever a known drinking percentage is below 60.
# Rows whose percentage is missing (nil) are passed through unchanged.
df.recode_rows do |row|
  percentage = row['Drinking Population (%)']
  row['Beverage'] = "Mild #{row['Beverage']}" if !percentage.nil? && percentage < 60

  # Hand the (possibly modified) row back from the block.
  row
end

Daru::DataFrame(8x3),Daru::DataFrame(8x3),Daru::DataFrame(8x3),Daru::DataFrame(8x3)
Unnamed: 0_level_1,Beverage,Country,Drinking Population (%)
0,Sake,Japan,75.0
1,Mild Toddy,India,50.0
2,Beer,USA,
3,Beer,Canada,60.0
4,Beer,Germany,
5,Vodka,Russia,70.0
6,Mild Beer,China,54.0
7,Rum,Cuba,


## Joining

### SQL-style joins can be performed with the `DataFrame#join` method.

In [39]:
# Two small frames that share the :id and :name column names.
left = Daru::DataFrame.new(
  {
    id: [1, 2, 3, 4],
    name: %w[Pirate Monkey Ninja Spaghetti]
  }
)
right = Daru::DataFrame.new(
  {
    id: [1, 2, 3, 4],
    name: ['Rutabaga', 'Pirate', 'Darth Vader', 'Ninja']
  }
)

# Inner join on :name: only names present in both frames remain in the result.
left.join(right, on: [:name], how: :inner)

Daru::DataFrame(2x3),Daru::DataFrame(2x3),Daru::DataFrame(2x3),Daru::DataFrame(2x3)
Unnamed: 0_level_1,id_1,name,id_2
0,3,Ninja,4
1,1,Pirate,2


## Aggregation with group_by

In [40]:
# Sample data: :a and :b act as grouping keys, :c and :d hold the numbers to aggregate.
df = Daru::DataFrame.new(
  {
    a: %w[foo bar foo bar foo bar foo foo],
    b: %w[one one two three two two one three],
    c: [1, 2, 3, 1, 3, 6, 3, 8],
    d: [11, 22, 33, 44, 55, 66, 77, 88]
  }
)

Daru::DataFrame(8x4),Daru::DataFrame(8x4),Daru::DataFrame(8x4),Daru::DataFrame(8x4),Daru::DataFrame(8x4)
Unnamed: 0_level_1,a,b,c,d
0,foo,one,1,11
1,bar,one,2,22
2,foo,two,3,33
3,bar,three,1,44
4,foo,two,3,55
5,bar,two,6,66
6,foo,one,3,77
7,foo,three,8,88


In [41]:
# Group the rows by the combination of values in columns :a and :b.
groups = df.group_by(%i[a b])
# Inspect which row positions fall under each [a, b] key.
groups.groups

{["bar", "one"]=>[1], ["bar", "three"]=>[3], ["bar", "two"]=>[5], ["foo", "one"]=>[0, 6], ["foo", "three"]=>[7], ["foo", "two"]=>[2, 4]}

### The `GroupBy#first` method will show the first rows of each group.

In [42]:
groups.first

Daru::DataFrame(6x4),Daru::DataFrame(6x4),Daru::DataFrame(6x4),Daru::DataFrame(6x4),Daru::DataFrame(6x4)
Unnamed: 0_level_1,a,b,c,d
1,bar,one,2,22
3,bar,three,1,44
5,bar,two,6,66
0,foo,one,1,11
7,foo,three,8,88
2,foo,two,3,33


### Obtain a summary of the grouped data with `#mean`.

In [43]:
groups.mean

Daru::DataFrame(6x2),Daru::DataFrame(6x2),Daru::DataFrame(6x2),Daru::DataFrame(6x2)
Unnamed: 0_level_1,Unnamed: 1_level_1,c,d
bar,one,2.0,22.0
bar,three,1.0,44.0
bar,two,6.0,66.0
foo,one,2.0,44.0
foo,three,8.0,88.0
foo,two,3.0,44.0


## Visualization

### Daru can be used with nyaplot for generating interactive graphs.

In [44]:
# Ten observations; the city and staff columns alternate between two values.
df = Daru::DataFrame.new(
  {
    temperature: [30.4, 23.5, 44.5, 20.3, 34, 24, 31.45, 28.34, 37, 24],
    sales: [350, 150, 500, 200, 480, 250, 330, 400, 420, 560],
    city: %w[Pune Delhi] * 5,
    staff: [15, 20] * 5
  }
)
# Evaluate the frame last so the notebook renders it.
df

Daru::DataFrame(10x4),Daru::DataFrame(10x4),Daru::DataFrame(10x4),Daru::DataFrame(10x4),Daru::DataFrame(10x4)
Unnamed: 0_level_1,city,sales,staff,temperature
0,Pune,350,15,30.4
1,Delhi,150,20,23.5
2,Pune,500,15,44.5
3,Delhi,200,20,20.3
4,Pune,480,15,34.0
5,Delhi,250,20,24.0
6,Pune,330,15,31.45
7,Delhi,400,20,28.34
8,Pune,420,15,37.0
9,Delhi,560,20,24.0


### Plotting the temperature of the city vs. the sales of ice cream.

In [45]:
# Scatter plot of sales against temperature. The block receives the plot and
# its diagram so that both can be customised.
df.plot(type: :scatter, x: :temperature, y: :sales) do |plot, diagram|
  # Axis titles and visible ranges.
  plot.x_label("Temperature")
  plot.y_label("Sales")
  plot.yrange([100, 600])
  plot.xrange([15, 50])

  # Tooltip contents: the city and staff values of each point.
  diagram.tooltip_contents(%i[city staff])
  # Set the color scheme for this diagram.
  diagram.color(Nyaplot::Colors.qual)
  # Color each point according to the city it belongs to.
  diagram.fill_by(:city)
  # Choose each point's shape according to the city it belongs to.
  diagram.shape_by(:city)
end