## Notes from [Python Data Analytics](http://www.amazon.com/Python-Data-Analytics-Fabio-Nelli/dp/1484209591/ref=sr_1_1?ie=UTF8&qid=1450048533&sr=8-1&keywords=Python+Data+Analytics)

### 1. Python Functions
* map(function, list)
* filter(function, list)
* reduce(function, list)
* lambda
* list comprehension
* [other built-in functions](https://docs.python.org/2/library/functions.html)


In [8]:
items = [1, 2, 3, 4, 5]
def inc(x):
    """Return x increased by one."""
    incremented = x + 1
    return incremented
print list(map(inc, items))                # use map function
print list(map(lambda x: x+1, items))      # use map and lambda functions
print list(filter(lambda x: x < 4, items)) # use of filter
print reduce((lambda x,y: x+y), items)     # use of reduce

[2, 3, 4, 5, 6]
[2, 3, 4, 5, 6]
[1, 2, 3]
15


Pip commands:
* pip install package_name
* pip search package_name
* pip show package_name
* pip uninstall package_name

### 2. NumPy

In [14]:
import numpy as np
a = np.array([1, 2, 3])
print a.ndim  # number of dimensions
print a.size  # total number of elements
print a.shape # shape of the array
print np.zeros((3, 3))   # 3x3 array filled with zeros
print np.ones((3, 3))    # 3x3 array filled with ones

1
3
(3,)
[[ 0.  0.  0.]
 [ 0.  0.  0.]
 [ 0.  0.  0.]]
[[ 1.  1.  1.]
 [ 1.  1.  1.]
 [ 1.  1.  1.]]


In [15]:
np.arange(0, 10)     # similar to range(0, 10), but returns an array

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

In [16]:
np.arange(0, 12, 3)  # with interval, does NOT include 12

array([0, 3, 6, 9])

In [17]:
np.arange(0, 12, 3).reshape(2, 2)   # reshape the array

array([[0, 3],
       [6, 9]])

In [18]:
np.linspace(0, 10, 5)    # include 10

array([  0. ,   2.5,   5. ,   7.5,  10. ])

In [19]:
np.random.random((3, 3))

array([[ 0.0226314 ,  0.05591402,  0.30557851],
       [ 0.61526516,  0.06592523,  0.94704285],
       [ 0.98933642,  0.74626897,  0.19137706]])

In [23]:
A = np.arange(0, 9).reshape(3, 3)
B = np.ones((3, 3))
print A
print B
print A * B         # elementwise multiply
print np.dot(A, B)  # matrix multiply

[[0 1 2]
 [3 4 5]
 [6 7 8]]
[[ 1.  1.  1.]
 [ 1.  1.  1.]
 [ 1.  1.  1.]]
[[ 0.  1.  2.]
 [ 3.  4.  5.]
 [ 6.  7.  8.]]
[[  3.   3.   3.]
 [ 12.  12.  12.]
 [ 21.  21.  21.]]


In [25]:
# Indexing
a = np.arange(10, 16)
print a

[10 11 12 13 14 15]


In [29]:
print a[1:5:2]   # from index 1 up to index 5 (excluded), every 2nd element
print a[:5:2]    # from the start up to index 5 (excluded), every 2nd element
print a[:5:]     # from the start up to index 5 (excluded), every element

[11 13]
[10 12 14]
[10 11 12 13 14]


In [33]:
A = np.arange(10, 19).reshape(3, 3)
print A
print A[0, :]
print A[:, 0]
print A[0:2, 0:2]
print A[[0, 2], 0:2]

[[10 11 12]
 [13 14 15]
 [16 17 18]]
[10 11 12]
[10 13 16]
[[10 11]
 [13 14]]
[[10 11]
 [16 17]]


In [34]:
A.mean(axis=0)   # mean, std, sum, etc. can be computed along a given axis

array([ 13.,  14.,  15.])

In [36]:
np.apply_along_axis(np.mean, axis=0, arr=A)   # same result as above; np.mean can be replaced by another function

array([ 13.,  14.,  15.])

In [37]:
A[A < 13]         # selection

array([10, 11, 12])

In [38]:
A.reshape(1, 9)   # reshape the array

array([[10, 11, 12, 13, 14, 15, 16, 17, 18]])

In [39]:
A.ravel()        # turn array into one dimension

array([10, 11, 12, 13, 14, 15, 16, 17, 18])

In [40]:
A.transpose()

array([[10, 13, 16],
       [11, 14, 17],
       [12, 15, 18]])

In [42]:
# combine two arrays
A = np.ones((3, 3))
B = np.zeros((3, 3))
print np.vstack((A, B))   # vertically combine two arrays
print np.hstack((A, B))   # horizontally combine two arrays

[[ 1.  1.  1.]
 [ 1.  1.  1.]
 [ 1.  1.  1.]
 [ 0.  0.  0.]
 [ 0.  0.  0.]
 [ 0.  0.  0.]]
[[ 1.  1.  1.  0.  0.  0.]
 [ 1.  1.  1.  0.  0.  0.]
 [ 1.  1.  1.  0.  0.  0.]]


In [44]:
# combine multiple 1-d arrays
a = np.array([0, 1, 2])
b = np.array([3, 4, 5])
c = np.array([6, 7, 8])
print np.column_stack((a, b, c))   # each 1-d array becomes a column
print np.row_stack((a, b, c))      # each 1-d array becomes a row

[[0 3 6]
 [1 4 7]
 [2 5 8]]
[[0 1 2]
 [3 4 5]
 [6 7 8]]


In [46]:
# split arrays
A = np.arange(16).reshape((4, 4))
print A
[B, C] = np.hsplit(A, 2)
print B
print C
[B, C] = np.vsplit(A, 2)
print B
print C

[[ 0  1  2  3]
 [ 4  5  6  7]
 [ 8  9 10 11]
 [12 13 14 15]]
[[ 0  1]
 [ 4  5]
 [ 8  9]
 [12 13]]
[[ 2  3]
 [ 6  7]
 [10 11]
 [14 15]]
[[0 1 2 3]
 [4 5 6 7]]
[[ 8  9 10 11]
 [12 13 14 15]]


In [None]:
# A more complex way of splitting
[A1, A2, A3] = np.split(A, [1, 3], axis=1)  # split to 3 parts, 0:1, 1:3, 3:end
print A1
print A2
print A3

More on splitting arrays can be found [here](http://docs.scipy.org/doc/numpy/reference/generated/numpy.split.html).

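Two arrays can be broadcast together when all their dimensions are compatible, i.e., for each dimension (compared starting from the last one) the two lengths must be equal or one of them must be 1. Missing leading dimensions are treated as length 1, which is why a (4, 4) array can be added to a length-4 array below.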

In [47]:
A = np.arange(16).reshape(4, 4)
b = np.arange(4)
print A
print b
print A+b

[[ 0  1  2  3]
 [ 4  5  6  7]
 [ 8  9 10 11]
 [12 13 14 15]]
[0 1 2 3]
[[ 0  2  4  6]
 [ 4  6  8 10]
 [ 8 10 12 14]
 [12 14 16 18]]


In [None]:
# save and load data
np.save('saved_data', data)
np.load('saved_data.npy')
# read data in a text file
np.genfromtxt('data.csv', delimiter=',', names=True)

### 3. Pandas

Two primary data structures:
* Series
* DataFrame

In [4]:
import pandas as pd
s = pd.Series([12, -4, 7, 9], index=['a', 'b', 'c', 'd'])
print s
print s.values
print s.index
print s['b']
print s[1]
print s[['b', 'c']]

a    12
b    -4
c     7
d     9
dtype: int64
[12 -4  7  9]
Index([u'a', u'b', u'c', u'd'], dtype='object')
-4
-4
b   -4
c    7
dtype: int64


In [7]:
print s.unique()
print s.value_counts()
print s.isin([0, 7])

[12 -4  7  9]
 7     1
 12    1
-4     1
 9     1
dtype: int64
a    False
b    False
c     True
d    False
dtype: bool


In [8]:
print s.isnull()   # True where the value is NaN
print s.notnull()  # True where the value is not NaN

a    False
b    False
c    False
d    False
dtype: bool
a    True
b    True
c    True
d    True
dtype: bool


In [9]:
# Series as dictionaries
mydict = {'red': 2000, 'blue': 1000, 'yellow': 500, 'orange': 1000}
myseries = pd.Series(mydict)
myseries

blue      1000
orange    1000
red       2000
yellow     500
dtype: int64

In [11]:
# DataFrame
data = {'color' : ['blue','green','yellow','red','white'],
'object' : ['ball','pen','pencil','paper','mug'],
'price' : [1.2,1.0,0.6,0.9,1.7]}
frame = pd.DataFrame(data)
frame

Unnamed: 0,color,object,price
0,blue,ball,1.2
1,green,pen,1.0
2,yellow,pencil,0.6
3,red,paper,0.9
4,white,mug,1.7


In [12]:
frame2 = pd.DataFrame(data, columns=['object','price'])
frame2

Unnamed: 0,object,price
0,ball,1.2
1,pen,1.0
2,pencil,0.6
3,paper,0.9
4,mug,1.7


In [13]:
frame2 = pd.DataFrame(data, index=['one','two','three','four','five'])
frame2

Unnamed: 0,color,object,price
one,blue,ball,1.2
two,green,pen,1.0
three,yellow,pencil,0.6
four,red,paper,0.9
five,white,mug,1.7


In [15]:
print frame2.index       # index names
print frame2.columns     # column names
print frame2.values      # values

Index([u'one', u'two', u'three', u'four', u'five'], dtype='object')
Index([u'color', u'object', u'price'], dtype='object')
[['blue' 'ball' 1.2]
 ['green' 'pen' 1.0]
 ['yellow' 'pencil' 0.6]
 ['red' 'paper' 0.9]
 ['white' 'mug' 1.7]]


In [18]:
# select elements
print frame.price
print frame['price']
print frame.ix[:, 2]

0    1.2
1    1.0
2    0.6
3    0.9
4    1.7
Name: price, dtype: float64
0    1.2
1    1.0
2    0.6
3    0.9
4    1.7
Name: price, dtype: float64
0    1.2
1    1.0
2    0.6
3    0.9
4    1.7
Name: price, dtype: float64


In [19]:
frame.index.name = 'id'
frame.columns.name = 'item'
frame

item,color,object,price
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,blue,ball,1.2
1,green,pen,1.0
2,yellow,pencil,0.6
3,red,paper,0.9
4,white,mug,1.7


In [21]:
import numpy as np
frame['new'] = np.arange(0, 5)     # add new column
print frame
del frame['new']    # delete column
print frame

item   color  object  price  new
id                              
0       blue    ball    1.2    0
1      green     pen    1.0    1
2     yellow  pencil    0.6    2
3        red   paper    0.9    3
4      white     mug    1.7    4
item   color  object  price
id                         
0       blue    ball    1.2
1      green     pen    1.0
2     yellow  pencil    0.6
3        red   paper    0.9
4      white     mug    1.7


In [22]:
frame.T   # transpose

id,0,1,2,3,4
item,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
color,blue,green,yellow,red,white
object,ball,pen,pencil,paper,mug
price,1.2,1,0.6,0.9,1.7


In [28]:
ser = pd.Series([5,0,3,8,4], index=['red','blue','yellow','white','green'])
print ser
print ser.index
print ser.idxmin()    # index with the lowest value
print ser.idxmax()    # index with the largest value

red       5
blue      0
yellow    3
white     8
green     4
dtype: int64
Index([u'red', u'blue', u'yellow', u'white', u'green'], dtype='object')
blue
white


In [30]:
ser.index.is_unique        # check if the index is unique or not

True

In [31]:
# Reindexing
ser = pd.Series([2,5,7,4], index=['one','two','three','four'])
print ser
ser.reindex(['three','four','five','one'])

one      2
two      5
three    7
four     4
dtype: int64


three     7
four      4
five    NaN
one       2
dtype: float64

In [35]:
# fill the series
ser3 = pd.Series([1,5,6,3],index=[0,3,5,6])
print ser3
print ser3.reindex(range(6), method='ffill')   # fill forward
print ser3.reindex(range(6), method='bfill')   # fill backward

0    1
3    5
5    6
6    3
dtype: int64
0    1
1    1
2    1
3    5
4    5
5    6
dtype: int64
0    1
1    5
2    5
3    5
4    6
5    6
dtype: int64


In [38]:
# drop 
ser = pd.Series(np.arange(4.), index=['red','blue','yellow','white'])
print ser
print ser.drop(['blue', 'yellow'])

red       0
blue      1
yellow    2
white     3
dtype: float64
red      0
white    3
dtype: float64


In [40]:
# Two frames whose row and column labels only partially overlap.
# (The "..." continuation prompts copied from an interactive session were
# removed: IPython strips them silently, but plain Python rejects them.)
frame1 = pd.DataFrame(np.arange(16).reshape((4, 4)),
                      index=['red', 'blue', 'yellow', 'white'],
                      columns=['ball', 'pen', 'pencil', 'paper'])

frame2 = pd.DataFrame(np.arange(12).reshape((4, 3)),
                      index=['blue', 'green', 'white', 'yellow'],
                      columns=['mug', 'pen', 'ball'])
print frame1
print frame2
print frame1 + frame2
print frame1.add(frame2)   # same as +: values are aligned on row and column labels; unmatched labels give NaN

        ball  pen  pencil  paper
red        0    1       2      3
blue       4    5       6      7
yellow     8    9      10     11
white     12   13      14     15
        mug  pen  ball
blue      0    1     2
green     3    4     5
white     6    7     8
yellow    9   10    11
        ball  mug  paper  pen  pencil
blue       6  NaN    NaN    6     NaN
green    NaN  NaN    NaN  NaN     NaN
red      NaN  NaN    NaN  NaN     NaN
white     20  NaN    NaN   20     NaN
yellow    19  NaN    NaN   19     NaN
        ball  mug  paper  pen  pencil
blue       6  NaN    NaN    6     NaN
green    NaN  NaN    NaN  NaN     NaN
red      NaN  NaN    NaN  NaN     NaN
white     20  NaN    NaN   20     NaN
yellow    19  NaN    NaN   19     NaN


In [42]:
frame1.sort_index(ascending=False)

Unnamed: 0,ball,pen,pencil,paper
yellow,8,9,10,11
white,12,13,14,15
red,0,1,2,3
blue,4,5,6,7


In [43]:
frame1.sort_index(by=['pen', 'pencil'])

Unnamed: 0,ball,pen,pencil,paper
red,0,1,2,3
blue,4,5,6,7
yellow,8,9,10,11
white,12,13,14,15


In [46]:
# frame.corr()
# frame.cov()
# frame.corrwith(frame2)
# frame.dropna()
# frame.fillna(0)
# frame.stack()
# frame.unstack()

**Reading and writing data using Pandas**

* Readers: read_csv, read_excel, read_hdf, read_sql, read_json, read_html, read_stata, read_clipboard, read_pickle, read_msgpack, read_gbq
* Writers: to_csv, to_excel, to_hdf, to_sql, to_json, to_html, to_stata, to_clipboard, to_pickle, to_msgpack, to_gbq

**Data Manipulation**

In [51]:
import numpy as np
import pandas as pd

frame1 = pd.DataFrame( {'id':['ball','pencil','pen','mug','ashtray'],\
        'price': [12.33,11.44,33.21,13.23,33.62]})
frame2 = pd.DataFrame( {'id':['pencil','pencil','ball','pen'],\
        'color': ['white','red','red','black']})
print frame1
print frame2
print pd.merge(frame1, frame2, on='id')  # merge performs an inner join by default
print pd.merge(frame1, frame2, on='id', how='outer')  # outer join
# There are also right and left joins (how='right', how='left').
# To merge on multiple keys, pass a list of column names to the 'on' option.
# You can also merge on the index (set right_index=True or left_index=True).
# You can also do df1.join(df2).

        id  price
0     ball  12.33
1   pencil  11.44
2      pen  33.21
3      mug  13.23
4  ashtray  33.62
   color      id
0  white  pencil
1    red  pencil
2    red    ball
3  black     pen
       id  price  color
0    ball  12.33    red
1  pencil  11.44  white
2  pencil  11.44    red
3     pen  33.21  black
        id  price  color
0     ball  12.33    red
1   pencil  11.44  white
2   pencil  11.44    red
3      pen  33.21  black
4      mug  13.23    NaN
5  ashtray  33.62    NaN


In [None]:
# combine data frames
pd.concat([df1, df2], axis=1, join='inner')
pd.concat([df1, df2], keys=[1, 2])  # set keys for df1(1) and df2(2)

In [52]:
# if values are different for same index
ser1 = pd.Series(np.random.rand(5),index=[1,2,3,4,5])
ser2 = pd.Series(np.random.rand(4),index=[2,4,5,6])
print ser1
print ser2
print ser1.combine_first(ser2)
print ser2.combine_first(ser1)

1    0.611084
2    0.707381
3    0.603744
4    0.487561
5    0.799834
dtype: float64
2    0.367192
4    0.429333
5    0.884948
6    0.998197
dtype: float64
1    0.611084
2    0.707381
3    0.603744
4    0.487561
5    0.799834
6    0.998197
dtype: float64
1    0.611084
2    0.367192
3    0.603744
4    0.429333
5    0.884948
6    0.998197
dtype: float64


In [53]:
# pivoting
frame1 = pd.DataFrame(np.arange(9).reshape(3,3),\
                      index=['white','black','red'],\
                      columns=['ball','pen','pencil'])
frame1

Unnamed: 0,ball,pen,pencil
white,0,1,2
black,3,4,5
red,6,7,8


In [54]:
frame1.stack()

white  ball      0
       pen       1
       pencil    2
black  ball      3
       pen       4
       pencil    5
red    ball      6
       pen       7
       pencil    8
dtype: int64

In [56]:
(frame1.stack()).unstack()

Unnamed: 0,ball,pen,pencil
white,0,1,2
black,3,4,5
red,6,7,8


In [57]:
longframe = pd.DataFrame({ 'color':['white','white','white',\
                                    'red','red','red',\
                                    'black','black','black'],\
                          'item':['ball','pen','mug',\
                                  'ball','pen','mug',\
                                  'ball','pen','mug'],\
                          'value': np.random.rand(9)})
longframe

Unnamed: 0,color,item,value
0,white,ball,0.182524
1,white,pen,0.34858
2,white,mug,0.087091
3,red,ball,0.180152
4,red,pen,0.674948
5,red,mug,0.263541
6,black,ball,0.838841
7,black,pen,0.395436
8,black,mug,0.01937


In [58]:
wideframe = longframe.pivot('color', 'item')
wideframe

Unnamed: 0_level_0,value,value,value
item,ball,mug,pen
color,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
black,0.838841,0.01937,0.395436
red,0.180152,0.263541,0.674948
white,0.182524,0.087091,0.34858


In [62]:
# remove values from data frame
frame1 = pd.DataFrame(np.arange(9).reshape(3,3),\
                      index=['white','black','red'],\
                      columns=['ball','pen','pencil'])
print frame1
del frame1['ball']
print frame1
print frame1.drop('white')

       ball  pen  pencil
white     0    1       2
black     3    4       5
red       6    7       8
       pen  pencil
white    1       2
black    4       5
red      7       8
       pen  pencil
black    4       5
red      7       8


In [65]:
# data transformation
dframe = pd.DataFrame({ 'color': ['white','white','red','red','white'],\
                       'value': [2,1,3,3,2]})
print dframe
print dframe.duplicated()  # check if there's duplicated data
dframe.drop_duplicates()   # drop duplicated data

   color  value
0  white      2
1  white      1
2    red      3
3    red      3
4  white      2
0    False
1    False
2    False
3     True
4     True
dtype: bool


Unnamed: 0,color,value
0,white,2
1,white,1
2,red,3


In [66]:
# replace 
frame = pd.DataFrame({ 'item':['ball','mug','pen','pencil','ashtray'],\
                      'color':['white','rosso','verde','black','yellow'], \
                      'price':[5.56,4.20,1.30,0.56,2.75]})
frame

Unnamed: 0,color,item,price
0,white,ball,5.56
1,rosso,mug,4.2
2,verde,pen,1.3
3,black,pencil,0.56
4,yellow,ashtray,2.75


In [67]:
newcolors = {'rosso': 'red',
             'verde': 'green'}
frame.replace(newcolors)

Unnamed: 0,color,item,price
0,white,ball,5.56
1,red,mug,4.2
2,green,pen,1.3
3,black,pencil,0.56
4,yellow,ashtray,2.75


In [76]:
# map
frame = pd.DataFrame({ 'item':['ball','mug','pen','pencil','ashtray'],
                      'color':['white','red','green','black','yellow']})
frame

Unnamed: 0,color,item
0,white,ball
1,red,mug
2,green,pen
3,black,pencil
4,yellow,ashtray


In [77]:
prices = {'ball' : 5.56, 'mug' : 4.20, 'bottle' : 1.30,
         'scissors' : 3.41, 'pen' : 1.30, 'pencil' : 0.56,
         'ashtray' : 2.75}
frame['price'] = frame['item'].map(prices)
frame

Unnamed: 0,color,item,price
0,white,ball,5.56
1,red,mug,4.2
2,green,pen,1.3
3,black,pencil,0.56
4,yellow,ashtray,2.75


In [78]:
# rename index or columns
reindex = {0: 'first',
           1: 'second',
           2: 'third',
           3: 'fourth',
           4: 'fifth'}
recolumn = {'item':'object',
            'price': 'value'}
print frame.rename(index=reindex, columns=recolumn)
print frame
frame.rename(index=reindex, columns=recolumn, inplace=True) # inplace makes sure the changes happen to the original dataframe
print frame

         color   object  value
first    white     ball   5.56
second     red      mug   4.20
third    green      pen   1.30
fourth   black   pencil   0.56
fifth   yellow  ashtray   2.75
    color     item  price
0   white     ball   5.56
1     red      mug   4.20
2   green      pen   1.30
3   black   pencil   0.56
4  yellow  ashtray   2.75
         color   object  value
first    white     ball   5.56
second     red      mug   4.20
third    green      pen   1.30
fourth   black   pencil   0.56
fifth   yellow  ashtray   2.75


In [79]:
# discretization and binning

# change to different categories
results = [12,34,67,55,28,90,99]
bins = [0,25,50,75,100]
cat = pd.cut(results, bins)
cat

[(0, 25], (25, 50], (50, 75], (50, 75], (25, 50], (75, 100], (75, 100]]
Categories (4, object): [(0, 25] < (25, 50] < (50, 75] < (75, 100]]

In [85]:
cat.categories

Index([u'(0, 25]', u'(25, 50]', u'(50, 75]', u'(75, 100]'], dtype='object')

In [83]:
cat.value_counts()

(0, 25]      1
(25, 50]     2
(50, 75]     2
(75, 100]    2
dtype: int64

Another method is `qcut()`, which can directly divide the sample into quantiles.

In [87]:
# filter outliers
randframe = pd.DataFrame(np.random.randn(1000,3))
print randframe.describe()
# select the rows in which at least one column has an absolute value
# greater than 3 times that column's standard deviation
randframe[(np.abs(randframe) > (3*randframe.std())).any(1)]

                 0            1            2
count  1000.000000  1000.000000  1000.000000
mean     -0.009871     0.004314    -0.002460
std       1.031753     0.964007     0.961792
min      -3.266127    -2.750076    -2.929541
25%      -0.714287    -0.671256    -0.705985
50%      -0.022217     0.027222    -0.027829
75%       0.694838     0.648539     0.674543
max       3.395532     3.655741     3.336637


Unnamed: 0,0,1,2
381,0.301428,2.912618,-0.495989
419,0.992485,3.655741,1.672764
549,-0.904884,2.820288,3.336637
655,3.395532,-0.35316,-0.007438
778,-3.266127,1.645688,1.344644
832,0.715832,3.406925,0.116812
917,0.386274,0.301172,-2.929541


In [89]:
# permutation
nframe = pd.DataFrame(np.arange(25).reshape(5,5))
# NOTE: permutation(3) shuffles only [0, 1, 2], so take() below returns just
# the first three rows in random order; use permutation(5) to reorder all rows
new_order = np.random.permutation(3)
print nframe
print new_order
print nframe.take(new_order)   # rows selected by position, in the new order

    0   1   2   3   4
0   0   1   2   3   4
1   5   6   7   8   9
2  10  11  12  13  14
3  15  16  17  18  19
4  20  21  22  23  24
[0 2 1]
    0   1   2   3   4
0   0   1   2   3   4
2  10  11  12  13  14
1   5   6   7   8   9


In [95]:
# string manipulation
text = '16 Bolton Avenue, Boston'
tokens = [s.strip() for s in text.split(',')]
print tokens
','.join(tokens)

['16 Bolton Avenue', 'Boston']


'16 Bolton Avenue,Boston'

In [96]:
text.index('Boston')

18

In [97]:
text.find('Boston')

18

In [98]:
text.count('e')

2

In [99]:
text.replace('Avenue', 'Street')

'16 Bolton Street, Boston'

In [100]:
# Regular Expressions (pattern matching, substitution, splitting)
import re
text = "This is an\t odd \n text!"
# Use a raw string so the backslash reaches the regex engine unchanged.
# \s+ matches one or more whitespace characters (spaces, tabs, newlines).
re.split(r'\s+', text)

['This', 'is', 'an', 'odd', 'text!']

More details can be found [here](https://docs.python.org/2/library/re.html)

In [103]:
# Data Aggregation
frame = pd.DataFrame({ 'color': ['white','red','green','red','green'],
                      'object': ['pen','pencil','pencil','ashtray','pen'],
                      'price1' : [5.56,4.20,1.30,0.56,2.75],
                      'price2' : [4.75,4.12,1.60,0.75,3.15]})
frame

Unnamed: 0,color,object,price1,price2
0,white,pen,5.56,4.75
1,red,pencil,4.2,4.12
2,green,pencil,1.3,1.6
3,red,ashtray,0.56,0.75
4,green,pen,2.75,3.15


In [106]:
group = frame['price1'].groupby(frame['color'])
print group.groups
print group.mean()

{'white': [0], 'green': [2, 4], 'red': [1, 3]}
color
green    2.025
red      2.380
white    5.560
Name: price1, dtype: float64


More advanced methods can be found [here](http://pandas.pydata.org/pandas-docs/stable/groupby.html).