Skip to content

Commit

Permalink
Document functions
Browse files Browse the repository at this point in the history
  • Loading branch information
Steve Bentley committed Jun 15, 2017
1 parent 4c50169 commit b861313
Show file tree
Hide file tree
Showing 8 changed files with 180 additions and 23 deletions.
28 changes: 25 additions & 3 deletions dataframe.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,17 @@ import (
"sync"
)

// A DataFrame is a slice of *Series, one Series per column. Because each
// Series contains a slice of float64, a DataFrame can be thought of
// as a two-dimensional table of data, somewhat like a spreadsheet.
type DataFrame []*Series

// NewDataFrame creates a DataFrame from a 2 dimensional string slice.
// NewDataFrame creates a DataFrame from a 2 dimensional string slice, converting
// all data values to float64. If any values in the first row cannot be converted to
// a float64, then the first row is treated as containing headers and is used to set
// the column name of each Series. If a value (excluding values in the first row) cannot
// be converted to a float64, then the Series is marked as holding categorical data
// and will not be used for numeric calculations.
func NewDataFrame(data [][]string) (*DataFrame, error) {
if !columnCountsMatch(data) {
return nil, errors.New("not all rows have the same number of columns")
Expand All @@ -28,7 +36,6 @@ func NewDataFrame(data [][]string) (*DataFrame, error) {
}

d := DataFrame{}

for x := 0; x < len(data[0]); x++ {
s := createSeries(headers[x], data, x)
d = append(d, s)
Expand Down Expand Up @@ -172,7 +179,7 @@ func (d *DataFrame) String() string {
for r := 0; r < rows; r++ {
for c := 0; c < columns; c++ {
if df[c].IsCategorical() == true {
output += fmt.Sprintf(" %"+strconv.Itoa(colWidths[c]-3)+"s ", df[c].CategoricalLabels[df[c].Values[r]])
output += fmt.Sprintf(" %"+strconv.Itoa(colWidths[c]-3)+"s ", df[c].categoricalLabels[df[c].Values[r]])
} else {
output += fmt.Sprintf(" %"+strconv.Itoa(colWidths[c]-3)+".2f ", df[c].Values[r])
}
Expand All @@ -193,6 +200,21 @@ func (d *DataFrame) Standardize() {
}
}

// Describe returns a summary of the statistical properties of every
// non-categorical Series in the DataFrame, in column order. Categorical
// Series are skipped. The returned slice is never nil; it is empty when
// the DataFrame contains no non-categorical Series.
func (d *DataFrame) Describe() []Summary {
	// At most one Summary per column, so size the backing array once.
	summaries := make([]Summary, 0, len(*d))

	for _, series := range *d {
		if series.IsCategorical() {
			continue
		}
		summaries = append(summaries, series.Describe())
	}

	return summaries
}

func (d *DataFrame) toRow(i int) []float64 {
r := []float64{}

Expand Down
4 changes: 2 additions & 2 deletions dataframe_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -27,8 +27,8 @@ func TestCreateDataFrameWithCategoricalData(t *testing.T) {
assert.Equal(t, nil, err, "error is not nil")
s := (*df)[3]
assert.Equal(t, true, s.IsCategorical(), "series does not contain categorical data")
assert.Equal(t, 2, len(s.CategoricalLabels), "wrong number of category labels created")
assert.Equal(t, 2, len(s.CategoricalValues), "wrong number of category values created")
assert.Equal(t, 2, len(s.categoricalLabels), "wrong number of category labels created")
assert.Equal(t, 2, len(s.categoricalValues), "wrong number of category values created")
}

func TestStringFullFrame(t *testing.T) {
Expand Down
13 changes: 13 additions & 0 deletions doc.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
// Package gander provides DataFrames and Series to manipulate tabular data. It is based
// on the excellent Python Pandas package (http://pandas.pydata.org/).
// A DataFrame can be thought of as being similar to a spreadsheet, in that it holds
// rows and columns of data.
//
// Data is loaded into a DataFrame from a csv file either from
// a url, or from a file path. If all the fields of the top row of the csv contain
// non-numeric data then the top row is assumed to be column headings.
//
// Each column of the DataFrame is held as a Series object, which is made up of a
// slice of float64s, and the name of the column. Categorical (non-numeric) data
// can also be held in a Series, but no calculations can be carried out on it.
package gander
81 changes: 81 additions & 0 deletions doc_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
package gander

import (
"fmt"
"log"
)

// ExampleLoadCSVFromPath loads testdata/MOCK_DATA.csv from the file system
// and prints the value reported by Columns for the resulting DataFrame.
func ExampleLoadCSVFromPath() {
	frame, loadErr := LoadCSVFromPath("testdata/MOCK_DATA.csv")
	if loadErr != nil {
		log.Panic(loadErr)
	}

	fmt.Println(frame.Columns())
	// Output: 6
}

// ExampleLoadCSVFromURL downloads the iris training CSV over http and
// prints the value reported by Columns for the resulting DataFrame.
func ExampleLoadCSVFromURL() {
	const source = "http://download.tensorflow.org/data/iris_training.csv"

	frame, loadErr := LoadCSVFromURL(source)
	if loadErr != nil {
		log.Panic(loadErr)
	}

	fmt.Println(frame.Columns())
	// Output: 5
}

// ExampleNewDataFrame builds a DataFrame from a header row followed by four
// rows of numeric strings and prints the value reported by Rows.
func ExampleNewDataFrame() {
	df, err := NewDataFrame(
		[][]string{
			{"a", "b", "c", "d", "e"},
			{"1", "2", "3", "4", "5"},
			{"3", "5", "2", "2", "4"},
			{"7", "6", "1", "3", "3"},
			{"4", "2", "4", "7", "6"},
		})
	// NewDataFrame returns a nil DataFrame alongside an error, so stop here
	// instead of dereferencing nil below.
	if err != nil {
		log.Panic(err)
	}
	fmt.Printf("%v\n", df.Rows())
	// Output: 4
}

// ExampleDataFrame_DropColumns builds a five-column DataFrame, drops the
// columns at indexes 0 and 2, and prints the value reported by Columns.
func ExampleDataFrame_DropColumns() {
	df, err := NewDataFrame(
		[][]string{
			{"a", "b", "c", "d", "e"},
			{"1", "2", "3", "4", "5"},
			{"3", "5", "2", "2", "4"},
			{"7", "6", "1", "3", "3"},
			{"4", "2", "4", "7", "6"},
		})
	// NewDataFrame returns a nil DataFrame alongside an error, so stop here
	// instead of dereferencing nil below.
	if err != nil {
		log.Panic(err)
	}
	df.DropColumns(0, 2)
	fmt.Printf("%v\n", df.Columns())
	// Output: 3
}

// ExampleDataFrame_DropColumnsByName builds a five-column DataFrame, drops
// the columns named "b", "d" and "e", and prints the value reported by
// Columns.
func ExampleDataFrame_DropColumnsByName() {
	df, err := NewDataFrame(
		[][]string{
			{"a", "b", "c", "d", "e"},
			{"1", "2", "3", "4", "5"},
			{"3", "5", "2", "2", "4"},
			{"7", "6", "1", "3", "3"},
			{"4", "2", "4", "7", "6"},
		})
	// NewDataFrame returns a nil DataFrame alongside an error, so stop here
	// instead of dereferencing nil below.
	if err != nil {
		log.Panic(err)
	}
	df.DropColumnsByName("b", "d", "e")
	fmt.Printf("%v\n", df.Columns())
	// Output: 2
}

// ExampleDataFrame_DropColumnsWhere builds a five-column DataFrame, drops
// every column for which the predicate returns true (here, the column
// named "c"), and prints the value reported by Columns.
func ExampleDataFrame_DropColumnsWhere() {
	df, err := NewDataFrame(
		[][]string{
			{"a", "b", "c", "d", "e"},
			{"1", "2", "3", "4", "5"},
			{"3", "5", "2", "2", "4"},
			{"7", "6", "1", "3", "3"},
			{"4", "2", "4", "7", "6"},
		})
	// NewDataFrame returns a nil DataFrame alongside an error, so stop here
	// instead of dereferencing nil below.
	if err != nil {
		log.Panic(err)
	}
	df.DropColumnsWhere(func(s *Series) bool {
		return s.Name == "c"
	})
	fmt.Printf("%v\n", df.Columns())
	// Output: 4
}
27 changes: 23 additions & 4 deletions gander.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,16 +2,34 @@ package gander

import (
"encoding/csv"
"github.com/tophatsteve/urlreader"
"io"
"os"

"github.com/tophatsteve/urlreader"
)

// A Summary describes the statistical properties of a Series.
// It is the value produced by the Describe method of a Series.
type Summary struct {
// Name is the name of the Series that was summarised.
Name string
// Mean is the value returned by the Series' Mean method.
Mean float64
// Median is the value returned by the Series' Median method.
Median float64
// Mode is the value returned by the Series' Mode method. It is a slice
// because a Series could have more than one mode.
Mode []float64
// Min is the value returned by the Series' Min method.
Min float64
// Max is the value returned by the Series' Max method.
Max float64
// StdDev is the value returned by the Series' StdDev method.
StdDev float64
// Variance is the value returned by the Series' Variance method.
Variance float64
}

// LoadCSVFromURL creates a DataFrame by loading a CSV file
// from a specific URL. Note: at the moment this does not
// support https.
//
// The URL is wrapped in a urlreader reader, which is then handed to the
// package's reader-based CSV loader; that loader's result and error are
// returned unchanged.
func LoadCSVFromURL(url string) (*DataFrame, error) {
u := urlreader.NewReader(url)
// NOTE(review): two return statements follow and only the first can run.
// They look like the before/after sides of a rename of the reader-based
// loader — confirm which spelling this file should keep.
return LoadCSVFromReader(u)
return loadCSVFromReader(u)
}

// LoadCSVFromPath creates a DataFrame by loading a csv file
// from a specific file system path.
func LoadCSVFromPath(path string) (*DataFrame, error) {
f, err := os.Open(path)
if err != nil {
Expand All @@ -20,12 +38,13 @@ func LoadCSVFromPath(path string) (*DataFrame, error) {

defer f.Close()

return LoadCSVFromReader(f)
return loadCSVFromReader(f)
}

func LoadCSVFromReader(reader io.Reader) (*DataFrame, error) {
func loadCSVFromReader(reader io.Reader) (*DataFrame, error) {
r := csv.NewReader(reader)
data, err := r.ReadAll()

if err != nil {
return nil, err
}
Expand Down
2 changes: 0 additions & 2 deletions parser.go
Original file line number Diff line number Diff line change
@@ -1,7 +1,5 @@
package gander

// Load data and convert to a DataFrame

import (
"strconv"
)
Expand Down
44 changes: 34 additions & 10 deletions series.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,13 +6,16 @@ import (
"sort"
)

// A Series represents a column of data in a DataFrame.
type Series struct {
// Name is the name of the column.
Name string
// Values holds the column's data. For a Series built by
// NewCategoricalSeries these are the category numbers assigned to each
// distinct string, not measurements.
Values []float64
// NOTE(review): the two category maps below appear under both an exported
// and an unexported spelling — presumably the before/after sides of a
// rename; confirm which pair is live.
//
// The labels map takes a category number back to its original string; the
// values map takes an original string to its category number. A non-nil
// labels map is what marks the Series as categorical.
CategoricalLabels map[float64]string
CategoricalValues map[string]float64
categoricalLabels map[float64]string
categoricalValues map[string]float64
}

// NewSeries creates a new Series with the specified name
// and values.
func NewSeries(name string, values []float64) *Series {
s := Series{}
s.Name = name
Expand All @@ -25,22 +28,27 @@ func NewSeries(name string, values []float64) *Series {
return &s
}

// NewCategoricalSeries creates a new Series to contain categorical
// data. The data is passed in as a slice of strings. Internally
// the string values are converted to float64 and a map is maintained
// so that the original values can always be retrieved. No statistical
// operations can be carried out on a categorical series.
func NewCategoricalSeries(name string, values []string) *Series {
categoryNumber := 0.0
s := Series{}
s.CategoricalLabels = make(map[float64]string)
s.CategoricalValues = make(map[string]float64)
s.categoricalLabels = make(map[float64]string)
s.categoricalValues = make(map[string]float64)
s.Name = name

s.Values = []float64{}

for _, v := range values {
if i, ok := s.CategoricalValues[v]; ok == true {
if i, ok := s.categoricalValues[v]; ok == true {
s.Values = append(s.Values, i)
} else {
s.Values = append(s.Values, categoryNumber)
s.CategoricalValues[v] = categoryNumber
s.CategoricalLabels[categoryNumber] = v
s.categoricalValues[v] = categoryNumber
s.categoricalLabels[categoryNumber] = v
categoryNumber += 1
}
}
Expand Down Expand Up @@ -80,7 +88,8 @@ func (s *Series) Median() float64 {
return v[(len(v) / 2)]
}

// Mode finds the mode of all the values in the Series.
// Mode finds the mode of all the values in the Series. This returns
// a slice of float64 because a Series could have more than one mode.
func (s *Series) Mode() []float64 {
m := []float64{}
c := count(s.Values)
Expand Down Expand Up @@ -120,7 +129,7 @@ func (s *Series) StdDev() float64 {
}

// IsCategorical reports whether the Series holds categorical data, which is
// the case when its category label map is non-nil.
func (s *Series) IsCategorical() bool {
// NOTE(review): two return statements follow and only the first can run;
// they look like the before/after sides of a field rename — confirm which
// spelling this file should keep.
return s.CategoricalLabels != nil
return s.categoricalLabels != nil
}

// Max returns the maximum value in the Series.
Expand Down Expand Up @@ -179,7 +188,7 @@ func (s *Series) Hist() (map[string]int, error) {
r := make(map[string]int)

for _, v := range s.Values {
c := s.CategoricalLabels[v]
c := s.categoricalLabels[v]
if _, ok := r[c]; ok {
r[c] += 1
} else {
Expand All @@ -190,6 +199,21 @@ func (s *Series) Hist() (map[string]int, error) {
return r, nil
}

// Describe returns a Summary of the statistical properties of the Series:
// its name, mean, median, mode(s), minimum, maximum, standard deviation
// and variance.
func (s *Series) Describe() Summary {
	// Fields are listed in the order the statistics are computed.
	return Summary{
		Name:     s.Name,
		Mean:     s.Mean(),
		Median:   s.Median(),
		Mode:     s.Mode(),
		Min:      s.Min(),
		Max:      s.Max(),
		StdDev:   s.StdDev(),
		Variance: s.Variance(),
	}
}

func sum(r []float64) float64 {
t := 0.0

Expand Down
4 changes: 2 additions & 2 deletions series_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -60,8 +60,8 @@ func TestNewCategoricalSeries(t *testing.T) {
assert.Equal(t, "MySeries", s.Name, "column name is not correct")
assert.Equal(t, true, s.IsCategorical(), "column is not categorical")
assert.Equal(t, 10, len(s.Values), "wrong number of values")
assert.Equal(t, 4, len(s.CategoricalLabels), "wrong number of category labels")
assert.Equal(t, 4, len(s.CategoricalValues), "wrong number of category values")
assert.Equal(t, 4, len(s.categoricalLabels), "wrong number of category labels")
assert.Equal(t, 4, len(s.categoricalValues), "wrong number of category values")
}

func TestSeriesSum(t *testing.T) {
Expand Down

0 comments on commit b861313

Please sign in to comment.