From b8613130392b7877b2eab40de174c4e777b67cd9 Mon Sep 17 00:00:00 2001 From: Steve Bentley Date: Thu, 15 Jun 2017 11:16:40 +0100 Subject: [PATCH] Document functions --- dataframe.go | 28 ++++++++++++++-- dataframe_test.go | 4 +-- doc.go | 13 ++++++++ doc_test.go | 81 +++++++++++++++++++++++++++++++++++++++++++++++ gander.go | 27 +++++++++++++--- parser.go | 2 -- series.go | 44 +++++++++++++++++++------ series_test.go | 4 +-- 8 files changed, 180 insertions(+), 23 deletions(-) create mode 100644 doc.go create mode 100644 doc_test.go diff --git a/dataframe.go b/dataframe.go index c6094a5..4b20e3b 100644 --- a/dataframe.go +++ b/dataframe.go @@ -8,9 +8,17 @@ import ( "sync" ) +// A DataFrame is a slice of *Series. As a Series +// contains a slice of float64, a DataFrame can be thought of +// as a two dimensional table of data, somewhat like a spreadsheet. type DataFrame []*Series -// NewDataFrame creates a DataFrame from a 2 dimensional string slice. +// NewDataFrame creates a DataFrame from a 2 dimensional string slice, converting +// all data values to float64. If any values in the first row cannot be converted to +// a float64, then the first row is treated as containing headers and is used to set +// the column name of each Series. If a value (excluding values in the first row) cannot +// be converted to a float64, then the Series is marked as holding categorical data +// and will not be used for numeric calculations. 
func NewDataFrame(data [][]string) (*DataFrame, error) { if !columnCountsMatch(data) { return nil, errors.New("not all rows have the same number of columns") @@ -28,7 +36,6 @@ func NewDataFrame(data [][]string) (*DataFrame, error) { } d := DataFrame{} - for x := 0; x < len(data[0]); x++ { s := createSeries(headers[x], data, x) d = append(d, s) @@ -172,7 +179,7 @@ func (d *DataFrame) String() string { for r := 0; r < rows; r++ { for c := 0; c < columns; c++ { if df[c].IsCategorical() == true { - output += fmt.Sprintf(" %"+strconv.Itoa(colWidths[c]-3)+"s ", df[c].CategoricalLabels[df[c].Values[r]]) + output += fmt.Sprintf(" %"+strconv.Itoa(colWidths[c]-3)+"s ", df[c].categoricalLabels[df[c].Values[r]]) } else { output += fmt.Sprintf(" %"+strconv.Itoa(colWidths[c]-3)+".2f ", df[c].Values[r]) } @@ -193,6 +200,21 @@ func (d *DataFrame) Standardize() { } } +// Describe returns a summary of the statistical properties +// of all the non-categorical Series in the DataFrame. +func (d *DataFrame) Describe() []Summary { + s := []Summary{} + + for _, v := range *d { + if v.IsCategorical() == false { + vs := v.Describe() + s = append(s, vs) + } + } + + return s +} + func (d *DataFrame) toRow(i int) []float64 { r := []float64{} diff --git a/dataframe_test.go b/dataframe_test.go index b0ddb9a..395fcc1 100644 --- a/dataframe_test.go +++ b/dataframe_test.go @@ -27,8 +27,8 @@ func TestCreateDataFrameWithCategoricalData(t *testing.T) { assert.Equal(t, nil, err, "error is not nil") s := (*df)[3] assert.Equal(t, true, s.IsCategorical(), "series does not contain categorical data") - assert.Equal(t, 2, len(s.CategoricalLabels), "wrong number of category labels created") - assert.Equal(t, 2, len(s.CategoricalValues), "wrong number of category values created") + assert.Equal(t, 2, len(s.categoricalLabels), "wrong number of category labels created") + assert.Equal(t, 2, len(s.categoricalValues), "wrong number of category values created") } func TestStringFullFrame(t *testing.T) { diff --git a/doc.go b/doc.go 
new file mode 100644 index 0000000..9ebfb38 --- /dev/null +++ b/doc.go @@ -0,0 +1,13 @@ +// Package gander provides DataFrames and Series to manipulate tabular data. It is based +// on the excellent Python Pandas package (http://pandas.pydata.org/). +// A DataFrame can be thought of as being similar to a spreadsheet, in that it holds +// rows and columns of data. +// +// Data is loaded into a DataFrame from a csv file either from +// a url, or from a file path. If all the fields of the top row of the csv contain +// non-numeric data then the top row is assumed to be column headings. +// +// Each column of the DataFrame is held as a Series object, which is made up of a +// slice of float64s, and the name of the column. Categorical (non-numeric) data +// can also be held in a Series, but no calculations can be carried out on it. +package gander diff --git a/doc_test.go b/doc_test.go new file mode 100644 index 0000000..7bab746 --- /dev/null +++ b/doc_test.go @@ -0,0 +1,81 @@ +package gander + +import ( + "fmt" + "log" +) + +func ExampleLoadCSVFromPath() { + df, err := LoadCSVFromPath("testdata/MOCK_DATA.csv") + if err != nil { + log.Panic(err) + } + fmt.Printf("%v\n", df.Columns()) + // Output: 6 +} + +func ExampleLoadCSVFromURL() { + df, err := LoadCSVFromURL("http://download.tensorflow.org/data/iris_training.csv") + if err != nil { + log.Panic(err) + } + fmt.Printf("%v\n", df.Columns()) + // Output: 5 +} + +func ExampleNewDataFrame() { + df, _ := NewDataFrame( + [][]string{ + {"a", "b", "c", "d", "e"}, + {"1", "2", "3", "4", "5"}, + {"3", "5", "2", "2", "4"}, + {"7", "6", "1", "3", "3"}, + {"4", "2", "4", "7", "6"}, + }) + fmt.Printf("%v\n", df.Rows()) + // Output: 4 +} + +func ExampleDataFrame_DropColumns() { + df, _ := NewDataFrame( + [][]string{ + {"a", "b", "c", "d", "e"}, + {"1", "2", "3", "4", "5"}, + {"3", "5", "2", "2", "4"}, + {"7", "6", "1", "3", "3"}, + {"4", "2", "4", "7", "6"}, + }) + df.DropColumns(0, 2) + fmt.Printf("%v\n", df.Columns()) + // Output: 3 +} + 
+func ExampleDataFrame_DropColumnsByName() { + df, _ := NewDataFrame( + [][]string{ + {"a", "b", "c", "d", "e"}, + {"1", "2", "3", "4", "5"}, + {"3", "5", "2", "2", "4"}, + {"7", "6", "1", "3", "3"}, + {"4", "2", "4", "7", "6"}, + }) + df.DropColumnsByName("b", "d", "e") + fmt.Printf("%v\n", df.Columns()) + // Output: 2 +} + +func ExampleDataFrame_DropColumnsWhere() { + df, _ := NewDataFrame( + [][]string{ + {"a", "b", "c", "d", "e"}, + {"1", "2", "3", "4", "5"}, + {"3", "5", "2", "2", "4"}, + {"7", "6", "1", "3", "3"}, + {"4", "2", "4", "7", "6"}, + }) + df.DropColumnsWhere(func(s *Series) bool { + return s.Name == "c" + }) + fmt.Printf("%v\n", df.Columns()) + // Output: 4 +} diff --git a/gander.go b/gander.go index 28533df..f626c5b 100644 --- a/gander.go +++ b/gander.go @@ -2,16 +2,34 @@ package gander import ( "encoding/csv" - "github.com/tophatsteve/urlreader" "io" "os" + + "github.com/tophatsteve/urlreader" ) +// A Summary describes the statistical properties of a Series. +type Summary struct { + Name string + Mean float64 + Median float64 + Mode []float64 + Min float64 + Max float64 + StdDev float64 + Variance float64 +} + +// LoadCSVFromURL creates a DataFrame by loading a csv file +// from a specific url. Note: at the moment this does not +// support https. func LoadCSVFromURL(url string) (*DataFrame, error) { u := urlreader.NewReader(url) - return LoadCSVFromReader(u) + return loadCSVFromReader(u) } +// LoadCSVFromPath creates a DataFrame by loading a csv file +// from a specific file system path. 
func LoadCSVFromPath(path string) (*DataFrame, error) { f, err := os.Open(path) if err != nil { @@ -20,12 +38,13 @@ func LoadCSVFromPath(path string) (*DataFrame, error) { defer f.Close() - return LoadCSVFromReader(f) + return loadCSVFromReader(f) } -func LoadCSVFromReader(reader io.Reader) (*DataFrame, error) { +func loadCSVFromReader(reader io.Reader) (*DataFrame, error) { r := csv.NewReader(reader) data, err := r.ReadAll() + if err != nil { return nil, err } diff --git a/parser.go b/parser.go index 475e7c3..11e6788 100644 --- a/parser.go +++ b/parser.go @@ -1,7 +1,5 @@ package gander -// Load data and convert to a DataFrame - import ( "strconv" ) diff --git a/series.go b/series.go index c8f38bc..48cb2d1 100644 --- a/series.go +++ b/series.go @@ -6,13 +6,16 @@ import ( "sort" ) +// A Series represents a column of data in a DataFrame. type Series struct { Name string Values []float64 - CategoricalLabels map[float64]string - CategoricalValues map[string]float64 + categoricalLabels map[float64]string + categoricalValues map[string]float64 } +// NewSeries creates a new Series with the specified name +// and values. func NewSeries(name string, values []float64) *Series { s := Series{} s.Name = name @@ -25,22 +28,27 @@ func NewSeries(name string, values []float64) *Series { return &s } +// NewCategoricalSeries create a new Series to contain categorical +// data. The data is passed in as a slice of strings. Internally +// the string values are converted to float64 and a map is maintained +// so that the original values can always be retrieved. No statistical +// operations can be carried out on a categorical series. 
func NewCategoricalSeries(name string, values []string) *Series { categoryNumber := 0.0 s := Series{} - s.CategoricalLabels = make(map[float64]string) - s.CategoricalValues = make(map[string]float64) + s.categoricalLabels = make(map[float64]string) + s.categoricalValues = make(map[string]float64) s.Name = name s.Values = []float64{} for _, v := range values { - if i, ok := s.CategoricalValues[v]; ok == true { + if i, ok := s.categoricalValues[v]; ok == true { s.Values = append(s.Values, i) } else { s.Values = append(s.Values, categoryNumber) - s.CategoricalValues[v] = categoryNumber - s.CategoricalLabels[categoryNumber] = v + s.categoricalValues[v] = categoryNumber + s.categoricalLabels[categoryNumber] = v categoryNumber += 1 } } @@ -80,7 +88,8 @@ func (s *Series) Median() float64 { return v[(len(v) / 2)] } -// Mode finds the mode of all the values in the Series. +// Mode finds the mode of all the values in the Series. This returns +// a slice of float64 because a Series could have more than one mode. func (s *Series) Mode() []float64 { m := []float64{} c := count(s.Values) @@ -120,7 +129,7 @@ func (s *Series) StdDev() float64 { } func (s *Series) IsCategorical() bool { - return s.CategoricalLabels != nil + return s.categoricalLabels != nil } // Max returns the maximum value in the Series. @@ -179,7 +188,7 @@ func (s *Series) Hist() (map[string]int, error) { r := make(map[string]int) for _, v := range s.Values { - c := s.CategoricalLabels[v] + c := s.categoricalLabels[v] if _, ok := r[c]; ok { r[c] += 1 } else { @@ -190,6 +199,21 @@ func (s *Series) Hist() (map[string]int, error) { return r, nil } +// Describe returns a summary of the statistical properties +// of the Series. 
+func (s *Series) Describe() Summary { + r := Summary{Name: s.Name} + r.Mean = s.Mean() + r.Median = s.Median() + r.Mode = s.Mode() + r.Min = s.Min() + r.Max = s.Max() + r.StdDev = s.StdDev() + r.Variance = s.Variance() + + return r +} + func sum(r []float64) float64 { t := 0.0 diff --git a/series_test.go b/series_test.go index 1ca9049..58faed6 100644 --- a/series_test.go +++ b/series_test.go @@ -60,8 +60,8 @@ func TestNewCategoricalSeries(t *testing.T) { assert.Equal(t, "MySeries", s.Name, "column name is not correct") assert.Equal(t, true, s.IsCategorical(), "column is not categorical") assert.Equal(t, 10, len(s.Values), "wrong number of values") - assert.Equal(t, 4, len(s.CategoricalLabels), "wrong number of category labels") - assert.Equal(t, 4, len(s.CategoricalValues), "wrong number of category values") + assert.Equal(t, 4, len(s.categoricalLabels), "wrong number of category labels") + assert.Equal(t, 4, len(s.categoricalValues), "wrong number of category values") } func TestSeriesSum(t *testing.T) {