Document functions

tophatsteve · Jun 15, 2017 · b861313 · b861313
1 parent 4c50169
commit b861313
Show file tree

Hide file tree

Showing 8 changed files with 180 additions and 23 deletions.
diff --git a/dataframe.go b/dataframe.go
@@ -8,9 +8,17 @@ import (
 	"sync"
 )
 
+// A DataFrame is a slice of *Series. As a Series
+// contains a slice of float64, a DataFrame can be thought of
+// as a two dimensional table of data, somewhat like a spreadsheet.
 type DataFrame []*Series
 
-// NewDataFrame creates a DataFrame from a 2 dimensional string slice.
+// NewDataFrame creates a DataFrame from a 2 dimensional string slice, converting
+// all data values to float64. If any values in the first row cannot be converted to
+// a float64, then the first row is treated as containing headers and is used to set
+// the column name of each Series. If a value (excluding values in the first row) cannot
+// be converted to a float64, then the Series is marked as holding categorical data
+// and will not be used for numeric calculations.
 func NewDataFrame(data [][]string) (*DataFrame, error) {
 	if !columnCountsMatch(data) {
 		return nil, errors.New("not all rows have the same number of columns")
@@ -28,7 +36,6 @@ func NewDataFrame(data [][]string) (*DataFrame, error) {
 	}
 
 	d := DataFrame{}
-
 	for x := 0; x < len(data[0]); x++ {
 		s := createSeries(headers[x], data, x)
 		d = append(d, s)
@@ -172,7 +179,7 @@ func (d *DataFrame) String() string {
 	for r := 0; r < rows; r++ {
 		for c := 0; c < columns; c++ {
 			if df[c].IsCategorical() == true {
-				output += fmt.Sprintf(" %"+strconv.Itoa(colWidths[c]-3)+"s  ", df[c].CategoricalLabels[df[c].Values[r]])
+				output += fmt.Sprintf(" %"+strconv.Itoa(colWidths[c]-3)+"s  ", df[c].categoricalLabels[df[c].Values[r]])
 			} else {
 				output += fmt.Sprintf(" %"+strconv.Itoa(colWidths[c]-3)+".2f  ", df[c].Values[r])
 			}
@@ -193,6 +200,21 @@ func (d *DataFrame) Standardize() {
 	}
 }
 
+// Describe returns a summary of the statistical properties
+// of every non-categorical Series in the DataFrame.
+func (d *DataFrame) Describe() []Summary {
+	s := []Summary{}
+
+	for _, v := range *d {
+		if v.IsCategorical() == false {
+			vs := v.Describe()
+			s = append(s, vs)
+		}
+	}
+
+	return s
+}
+
 func (d *DataFrame) toRow(i int) []float64 {
 	r := []float64{}
 

diff --git a/dataframe_test.go b/dataframe_test.go
@@ -27,8 +27,8 @@ func TestCreateDataFrameWithCategoricalData(t *testing.T) {
 	assert.Equal(t, nil, err, "error is not nil")
 	s := (*df)[3]
 	assert.Equal(t, true, s.IsCategorical(), "series does not contain categorical data")
-	assert.Equal(t, 2, len(s.CategoricalLabels), "wrong number of category labels created")
-	assert.Equal(t, 2, len(s.CategoricalValues), "wrong number of category values created")
+	assert.Equal(t, 2, len(s.categoricalLabels), "wrong number of category labels created")
+	assert.Equal(t, 2, len(s.categoricalValues), "wrong number of category values created")
 }
 
 func TestStringFullFrame(t *testing.T) {

diff --git a/doc.go b/doc.go
@@ -0,0 +1,13 @@
+// Package gander provides DataFrames and Series to manipulate tabular data. It is based
+// on the excellent Python Pandas package (http://pandas.pydata.org/).
+// A DataFrame can be thought of as being similar to a spreadsheet, in that it holds
+// rows and columns of data.
+//
+// Data is loaded into a DataFrame from a csv file either from
+// a url, or from a file path. If all the fields of the top row of the csv contain
+// non-numeric data then the top row is assumed to be column headings.
+//
+// Each column of the DataFrame is held as a Series object, which is made up of a
+// slice of float64s, and the name of the column. Categorical (non-numeric) data
+// can also be held in a Series, but no calculations can be carried out on it.
+package gander
diff --git a/doc_test.go b/doc_test.go
@@ -0,0 +1,81 @@
+package gander
+
+import (
+	"fmt"
+	"log"
+)
+
+func ExampleLoadCSVFromPath() {
+	df, err := LoadCSVFromPath("testdata/MOCK_DATA.csv")
+	if err != nil {
+		log.Panic(err)
+	}
+	fmt.Printf("%v\n", df.Columns())
+	// Output: 6
+}
+
+func ExampleLoadCSVFromURL() {
+	df, err := LoadCSVFromURL("http://download.tensorflow.org/data/iris_training.csv")
+	if err != nil {
+		log.Panic(err)
+	}
+	fmt.Printf("%v\n", df.Columns())
+	// Output: 5
+}
+
+func ExampleNewDataFrame() {
+	df, _ := NewDataFrame(
+		[][]string{
+			{"a", "b", "c", "d", "e"},
+			{"1", "2", "3", "4", "5"},
+			{"3", "5", "2", "2", "4"},
+			{"7", "6", "1", "3", "3"},
+			{"4", "2", "4", "7", "6"},
+		})
+	fmt.Printf("%v\n", df.Rows())
+	// Output: 4
+}
+
+func ExampleDataFrame_DropColumns() {
+	df, _ := NewDataFrame(
+		[][]string{
+			{"a", "b", "c", "d", "e"},
+			{"1", "2", "3", "4", "5"},
+			{"3", "5", "2", "2", "4"},
+			{"7", "6", "1", "3", "3"},
+			{"4", "2", "4", "7", "6"},
+		})
+	df.DropColumns(0, 2)
+	fmt.Printf("%v\n", df.Columns())
+	// Output: 3
+}
+
+func ExampleDataFrame_DropColumnsByName() {
+	df, _ := NewDataFrame(
+		[][]string{
+			{"a", "b", "c", "d", "e"},
+			{"1", "2", "3", "4", "5"},
+			{"3", "5", "2", "2", "4"},
+			{"7", "6", "1", "3", "3"},
+			{"4", "2", "4", "7", "6"},
+		})
+	df.DropColumnsByName("b", "d", "e")
+	fmt.Printf("%v\n", df.Columns())
+	// Output: 2
+}
+
+func ExampleDataFrame_DropColumnsWhere() {
+	df, _ := NewDataFrame(
+		[][]string{
+			{"a", "b", "c", "d", "e"},
+			{"1", "2", "3", "4", "5"},
+			{"3", "5", "2", "2", "4"},
+			{"7", "6", "1", "3", "3"},
+			{"4", "2", "4", "7", "6"},
+		})
+	df.DropColumnsWhere(func(s *Series) bool {
+		return s.Name == "c"
+	})
+	fmt.Printf("%v\n", df.Columns())
+	// Output: 4
+}
diff --git a/gander.go b/gander.go
@@ -2,16 +2,34 @@ package gander
 
 import (
 	"encoding/csv"
-	"github.com/tophatsteve/urlreader"
 	"io"
 	"os"
+
+	"github.com/tophatsteve/urlreader"
 )
 
+// A Summary describes the statistical properties of a Series.
+type Summary struct {
+	Name     string
+	Mean     float64
+	Median   float64
+	Mode     []float64
+	Min      float64
+	Max      float64
+	StdDev   float64
+	Variance float64
+}
+
+// LoadCSVFromURL creates a DataFrame by loading a csv file
+// from a specific url. Note: at the moment this does not
+// support https.
 func LoadCSVFromURL(url string) (*DataFrame, error) {
 	u := urlreader.NewReader(url)
-	return LoadCSVFromReader(u)
+	return loadCSVFromReader(u)
 }
 
+// LoadCSVFromPath creates a DataFrame by loading a csv file
+// from a specific file system path.
 func LoadCSVFromPath(path string) (*DataFrame, error) {
 	f, err := os.Open(path)
 	if err != nil {
@@ -20,12 +38,13 @@ func LoadCSVFromPath(path string) (*DataFrame, error) {
 
 	defer f.Close()
 
-	return LoadCSVFromReader(f)
+	return loadCSVFromReader(f)
 }
 
-func LoadCSVFromReader(reader io.Reader) (*DataFrame, error) {
+func loadCSVFromReader(reader io.Reader) (*DataFrame, error) {
 	r := csv.NewReader(reader)
 	data, err := r.ReadAll()
+
 	if err != nil {
 		return nil, err
 	}

diff --git a/parser.go b/parser.go
@@ -1,7 +1,5 @@
 package gander
 
-// Load data and convert to a DataFrame
-
 import (
 	"strconv"
 )

diff --git a/series.go b/series.go
@@ -6,13 +6,16 @@ import (
 	"sort"
 )
 
+// A Series represents a column of data in a DataFrame.
 type Series struct {
 	Name              string
 	Values            []float64
-	CategoricalLabels map[float64]string
-	CategoricalValues map[string]float64
+	categoricalLabels map[float64]string
+	categoricalValues map[string]float64
 }
 
+// NewSeries creates a new Series with the specified name
+// and values.
 func NewSeries(name string, values []float64) *Series {
 	s := Series{}
 	s.Name = name
@@ -25,22 +28,27 @@ func NewSeries(name string, values []float64) *Series {
 	return &s
 }
 
+// NewCategoricalSeries creates a new Series to contain categorical
+// data. The data is passed in as a slice of strings. Internally
+// the string values are converted to float64 and a map is maintained
+// so that the original values can always be retrieved. No statistical
+// operations can be carried out on a categorical series.
 func NewCategoricalSeries(name string, values []string) *Series {
 	categoryNumber := 0.0
 	s := Series{}
-	s.CategoricalLabels = make(map[float64]string)
-	s.CategoricalValues = make(map[string]float64)
+	s.categoricalLabels = make(map[float64]string)
+	s.categoricalValues = make(map[string]float64)
 	s.Name = name
 
 	s.Values = []float64{}
 
 	for _, v := range values {
-		if i, ok := s.CategoricalValues[v]; ok == true {
+		if i, ok := s.categoricalValues[v]; ok == true {
 			s.Values = append(s.Values, i)
 		} else {
 			s.Values = append(s.Values, categoryNumber)
-			s.CategoricalValues[v] = categoryNumber
-			s.CategoricalLabels[categoryNumber] = v
+			s.categoricalValues[v] = categoryNumber
+			s.categoricalLabels[categoryNumber] = v
 			categoryNumber += 1
 		}
 	}
@@ -80,7 +88,8 @@ func (s *Series) Median() float64 {
 	return v[(len(v) / 2)]
 }
 
-// Mode finds the mode of all the values in the Series.
+// Mode finds the mode of all the values in the Series. This returns
+// a slice of float64 because a Series could have more than one mode.
 func (s *Series) Mode() []float64 {
 	m := []float64{}
 	c := count(s.Values)
@@ -120,7 +129,7 @@ func (s *Series) StdDev() float64 {
 }
 
 func (s *Series) IsCategorical() bool {
-	return s.CategoricalLabels != nil
+	return s.categoricalLabels != nil
 }
 
 // Max returns the maximum value in the Series.
@@ -179,7 +188,7 @@ func (s *Series) Hist() (map[string]int, error) {
 	r := make(map[string]int)
 
 	for _, v := range s.Values {
-		c := s.CategoricalLabels[v]
+		c := s.categoricalLabels[v]
 		if _, ok := r[c]; ok {
 			r[c] += 1
 		} else {
@@ -190,6 +199,21 @@ func (s *Series) Hist() (map[string]int, error) {
 	return r, nil
 }
 
+// Describe returns a summary of the statistical properties
+// of the Series.
+func (s *Series) Describe() Summary {
+	r := Summary{Name: s.Name}
+	r.Mean = s.Mean()
+	r.Median = s.Median()
+	r.Mode = s.Mode()
+	r.Min = s.Min()
+	r.Max = s.Max()
+	r.StdDev = s.StdDev()
+	r.Variance = s.Variance()
+
+	return r
+}
+
 func sum(r []float64) float64 {
 	t := 0.0
 

diff --git a/series_test.go b/series_test.go
@@ -60,8 +60,8 @@ func TestNewCategoricalSeries(t *testing.T) {
 	assert.Equal(t, "MySeries", s.Name, "column name is not correct")
 	assert.Equal(t, true, s.IsCategorical(), "column is not categorical")
 	assert.Equal(t, 10, len(s.Values), "wrong number of values")
-	assert.Equal(t, 4, len(s.CategoricalLabels), "wrong number of category labels")
-	assert.Equal(t, 4, len(s.CategoricalValues), "wrong number of category values")
+	assert.Equal(t, 4, len(s.categoricalLabels), "wrong number of category labels")
+	assert.Equal(t, 4, len(s.categoricalValues), "wrong number of category values")
 }
 
 func TestSeriesSum(t *testing.T) {