In [1]:
import numpy as np
import matplotlib.pyplot as plt
import os


In [2]:
%qtconsole

## Strings

Why it was a bad idea to have strings as iterables

In [None]:
#cannot wrap. Searcb for finding if iterable but not string, no clean solution and even 
# broken in different versions of python

def view_ls(s):
    """plot shortened version of long strings. works with lists."""
    if not isinstance(s,str):
        return [view_ls(ss) for ss in s]
        
    l=7
    return s[:l]+'...'+s[-l:] if len(s)>(l*2+4) else s

def view_fn(s):
    """wrapper around view_ls, plot shortened version of file names by removing path."""
    if not isinstance(s,str):
        return [view_fn(ss) for ss in s]
    return view_ls(os.path.splitext(os.path.basename(s))[0])



print(view_ls(v[0]))
print(view_ls(v))

print(view_fn(v[0]))
print(view_fn(v))



## How arrays are stored in memory and why it's important

Surprisingly (or not), the last component is modified in the operation. 
Many cases in which this can happen. The effect is due to the fact that slices are views, not copies, also in assignment (? questo e' sbagliato e contraddetto dopo. in realta' ha forse a che fare con element-wise assignment).

This is a consequence of mutable types being references. When they are modified inplace inside a routine, the change is reflected on the original data.

In [3]:
import numpy as np

def divide_last_component(a):
    """   divide third component by three."""
    a[-1]=a[-1]/3 
    result = a**2
    return result

def divide_number(a):
    """   divide third component by three."""
    a=a/3 
    result = a**2
    return result

a=np.array((24,3,9.))
print (a)
print (divide_last_component(a))
print (a)

b=5
print (b)
print (divide_number(b))
print (b)

[24.  3.  9.]
[576.   9.   9.]
[24.  3.  3.]
5
2.777777777777778
5


A slice is a mutable type and works in assignment same way as element-wise assignment. This doesn't mean slice being a view rather than a copy.

In [4]:
def stest(a,b):
    a[:]=b
    return a

a=np.array((24,3,9.))
b=np.ones(3)

print (a)
print (stest(a,b))
print (a)


[24.  3.  9.]
[1. 1. 1.]
[1. 1. 1.]


Same test would fail if b is not the right shape

In [5]:
print (stest(a,b[:2]))

ValueError: could not broadcast input array from shape (2) into shape (3)

It is indeed a copy, not a view

In [6]:
#slices in assignment make a copy ?
a=np.arange(5)
b=np.ones(2)
print (a,b)
a[1:3]=b  #b[:] doesn't make any difference
print (a)

b[1] = -5
print (a,b)

[0 1 2 3 4] [1. 1.]
[0 1 1 3 4]
[0 1 1 3 4] [ 1. -5.]


In [7]:
#this is same thing just in a routine, a doesn't change even here
def repwcopy(a,b):
    a=a.copy()
    a[1:(1+len(b))]=b
    return a

a=np.arange(20)
b=np.arange(4)*-1
print ("a,b: ",a,b)
c=repwcopy(a[1:20],b)
print ("a,c: ",a,c)

a,b:  [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19] [ 0 -1 -2 -3]
a,c:  [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19] [ 1  0 -1 -2 -3  6  7  8  9 10 11 12 13 14 15 16 17 18 19]


In [8]:
print ("a,b: ",a,b)
%timeit repwcopy(a[1:100],b)
print (a)

a,b:  [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19] [ 0 -1 -2 -3]
1.21 µs ± 17.2 ns per loop (mean ± std. dev. of 7 runs, 1000000 loops each)
[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19]


In [9]:
def repwstack(a,b):
    a=np.hstack([a[:1],b,a[(1+len(b)):]])
    return a

(repwcopy(a,b)==repwstack(a,b)).all()


True

In [10]:
print ("a,b: ",a,b)
%timeit repwstack(a[1:100],b)
print (a)

a,b:  [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19] [ 0 -1 -2 -3]
5.04 µs ± 76.7 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)
[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19]


In [12]:
# This doesn't work as expected, fancy indexing works same way 
#    in assignment
def changenoa(a,b):
    a[[1+np.arange(len(b))]]=b
    print (a)
    a[[0,1]]=50
    b[1]=500
    return a

a=np.arange(10)
b=-np.arange(3)-100
print ("a,b: ",a,b)
print ("result: ",changenoa(a,b))
print ("a,b: ",a,b)

a,b:  [0 1 2 3 4 5 6 7 8 9] [-100 -101 -102]
[   0 -100 -101 -102    4    5    6    7    8    9]
result:  [  50   50 -101 -102    4    5    6    7    8    9]
a,b:  [  50   50 -101 -102    4    5    6    7    8    9] [-100  500 -102]


In [15]:
# Note also
def changeonlya(a,b):
    """replace elements of `a` after first element with `b` 
        using slice, then assign 500 to second element 
        of `b`. The change doesn't reflect in `a`."""
    a[1:(1+len(b))]=b   #this gives same result: a[1:(1+len(b))]=b[:]
    print (a)
    b[1]=500 
    return a

a=np.arange(10)
b=-np.arange(3)-100
print ("a,b: ",a,b)
print ("result: ",changeonlya(a,b))
print ("a,b: ",a,b)

a,b:  [0 1 2 3 4 5 6 7 8 9] [-100 -101 -102]
[   0 -100 -101 -102    4    5    6    7    8    9]
result:  [   0 -100 -101 -102    4    5    6    7    8    9]
a,b:  [   0 -100 -101 -102    4    5    6    7    8    9] [-100  500 -102]


## Operations on array and inplace modifications

A method acting on internal data (e.g. level) can modify (o reassign to) self and return self or return a copy.
    
An example of this are methods that return a view of original data, but also reassigning data with self.data=..
    will give a link to original data. 
    
If inplace changes (e.g. element-wise assignment) are then performed on the property, the change is reflected in the original data.

In [16]:
def replace_2nd(ar,newval):
    #ar=ar.copy()  #this avoids changes in ar
    #ar=ar         #this and the following have no effect
    #ar=ar[:]
    # ar=ar[np.arange(ar.shape[0])] #this works in creating a copy also.
    ar[2]=newval
    
def replace_2nd_slice(ar,newval):
    ar=ar[1:-1]

a=np.arange(12).reshape(3,4)
replace_2nd(a[:,1],-5)
print(a)

a=np.arange(12).reshape(3,4)
b=a[:,1]
replace_2nd(a,-5)
print(a)
    
a=np.arange(12).reshape(3,4)
replace_2nd_slice(a,-5)
print(a)

[[ 0  1  2  3]
 [ 4  5  6  7]
 [ 8 -5 10 11]]
[[ 0  1  2  3]
 [ 4  5  6  7]
 [-5 -5 -5 -5]]
[[ 0  1  2  3]
 [ 4  5  6  7]
 [ 8  9 10 11]]


In [17]:

def sl(ar):
        res=ar[1:-1]
        return res


sl(np.arange(5))
Out[415]: array([1, 2, 3])

a=np.arange(5)

sl(a)
Out[417]: array([1, 2, 3])

a
Out[418]: array([0, 1, 2, 3, 4])

NameError: name 'array' is not defined

This is a simple class that assign some data to a `data` property. An array is used as data. Note first of all that the id is the same as original data, meaning they are linked. 
instance `d` has of course a separate id.

In [18]:
class test2dd(object):
    def __init__(self,data):
        self.data=data
        
data=np.arange(5)
d=test2dd(data)

print('initial (data,d.data):\n',data,d.data,
      '\n ids:',id(data),id(d.data))
print ('id(d)',id(d))

initial (data,d.data):
 [0 1 2 3 4] [0 1 2 3 4] 
 ids: 1586594082208 1586594082208
id(d) 1586594046696


Note that different id doesn't necessary mean that data are not linked, an example is slice.

Slice syntax is ..

This is different from lists

In [19]:
#this is different from lists

a=list(range(6))
b=a[3]
c=a[3:4]
print(a,b,c)
c[0]=-5
print(a,b,c)

print("-")

a=np.arange(6)
b=a[3]
c=a[3:4]
print(a,b,c)
c[0]=-5
print(a,b,c)

[0, 1, 2, 3, 4, 5] 3 [3]
[0, 1, 2, 3, 4, 5] 3 [-5]
-
[0 1 2 3 4 5] 3 [3]
[ 0  1  2 -5  4  5] 3 [-5]




This class will test how property data behave with respect to operation of slicing, assignment and in place modification:

In [20]:
class test2dd(object):
    def __init__(self,data):
        self.data=data
    
    def rem(self):      #removes first element (slice)
        self.data=self.data[1:]
    
    def rep(self,val):  #replace last el with one value
        self.data[-1]=val
        
data=np.arange(5)
d=test2dd(data)

This is how a slice works: initially `data` and `d.data` are linked and have same id. Once the rem method assign a slice of `self.data` to self.data. This is a new object with a new id, however the old and the new array share data in the slice. 
When the method `rep` replace one element in place, the change is reflected in the original array.

In [21]:
#surprise!?
print('initial (data,d.data):\n',data,d.data,
      '\n ids:',id(data),id(d.data))

d.rem()

print('\nafter removal (data,d.data):\n',data,d.data,
      '\n ids:',id(data),id(d.data))

d.rep(-1)

print('\nafter replacement (data,d.data):\n',data,d.data,
      '\n ids:',id(data),id(d.data))

initial (data,d.data):
 [0 1 2 3 4] [0 1 2 3 4] 
 ids: 1586593805856 1586593805856

after removal (data,d.data):
 [0 1 2 3 4] [1 2 3 4] 
 ids: 1586593805856 1586593805216

after replacement (data,d.data):
 [ 0  1  2  3 -1] [ 1  2  3 -1] 
 ids: 1586593805856 1586593805216


As a conclusion, when we design a class that has properties assigned from data that are reused somewhere else, we want to carefully consider which operations create copies and which ones link references.

In following example a class assign data property from property of another object.

In [22]:
class test2dd(object):
    #add method that copies data from other instance.
    #replace print with __call__ to save a few keystrokes
    def __init__(self,data):
        self.data=data
    
    def rem(self):
        self.data=self.data[1:]
    
    def rep(self,val):
        self.data[-1]=val
        
    def dcopy(self,other):
        self.data=other.data
        
    def __call__(self):  #just to save a few keystrokes
        print(self.data)
        
data=np.arange(5)
d=test2dd(data)

d(),data

[0 1 2 3 4]


(None, array([0, 1, 2, 3, 4]))

In [23]:
data2=np.arange(5)*-1-1
d2=test2dd(data2)
d2(),data2

[-1 -2 -3 -4 -5]


(None, array([-1, -2, -3, -4, -5]))

In [24]:
print("before copy:")
print(id(data),id(d.data),id(d))
print(id(data2),id(d2.data),id(d2))
print("data: %s, d.data%s"%(data,d.data))
print("data2: %s, d2.data%s"%(data2,d2.data))

d.dcopy(d2)

#this shows how data property of 
#  both object points to same variable
print("#---\nafter copy:")
print(id(data),id(d.data),id(d))
print(id(data2),id(d2.data),id(d2))
print("data: %s, d.data%s"%(data,d.data))
print("data2: %s, d2.data%s"%(data2,d2.data))

d.rep(11)

#this replaces one element in place, change is
# reflected on all linked variables
print("#---\nafter rep:")
print(id(data),id(d.data),id(d))
print(id(data2),id(d2.data),id(d2))
print("data: %s, d.data%s"%(data,d.data))
print("data2: %s, d2.data%s"%(data2,d2.data))

before copy:
1586594009456 1586594009456 1586594048936
1586594130080 1586594130080 1586594048432
data: [0 1 2 3 4], d.data[0 1 2 3 4]
data2: [-1 -2 -3 -4 -5], d2.data[-1 -2 -3 -4 -5]
#---
after copy:
1586594009456 1586594130080 1586594048936
1586594130080 1586594130080 1586594048432
data: [0 1 2 3 4], d.data[-1 -2 -3 -4 -5]
data2: [-1 -2 -3 -4 -5], d2.data[-1 -2 -3 -4 -5]
#---
after rep:
1586594009456 1586594130080 1586594048936
1586594130080 1586594130080 1586594048432
data: [0 1 2 3 4], d.data[-1 -2 -3 -4 11]
data2: [-1 -2 -3 -4 11], d2.data[-1 -2 -3 -4 11]


## Pandas

In [3]:
import pandas as pd

Pandas doc describes it as "a powerful data analysis and manipulation library for Python"

offering "fast, flexible, and expressive data structures designed to make working with "relational" or "labeled" data both
easy and intuitive."




In [4]:
#beginning of docstring

print("".join(pd.__doc__.split('\n\n')[:2])+'\n...\n...')


pandas - a powerful data analysis and manipulation library for Python
structures designed to make working with "relational" or "labeled" data both
easy and intuitive. It aims to be the fundamental high-level building block for
doing practical, **real world** data analysis in Python. Additionally, it has
the broader goal of becoming **the most powerful and flexible open source data
analysis / manipulation tool available in any language**. It is already well on
its way toward this goal.
...
...


It is something like a crossover between arrays and (improved) dictionaries. One of the main concept is the introduction of "labels" associated to axis and coordinates of an array, allowing data access and manipulation..

The following code from https://pandas.pydata.org/pandas-docs/stable/cookbook.html#cookbook-pivot can be a good introductory example of the type of data manipulation that is possible with `pandas`. We want to focus now on the meaning of data. Pandas syntax and commands employed  will be detailed in the following. 

One of the most general forms of data structure is a list of records, also called “stacked” or “record” format(https://pandas.pydata.org/pandas-docs/stable/reshaping.html#reshaping-pivot).
In this format, data are stored as a list of `records` each one containing the values for several `fields`. Each record constitute a `database entry` [VC, check this!].
If records are listed on different lines, the result is a table on which each record is a line and fields are in columns.
This is also used to store data in CSV files or databases.

In the example a number of sales per each of a number of cities is reported. The fields are `'Province', 'City', 'Sales'`.


In [5]:
df = pd.DataFrame(data={'Province' : 
                        ['ON','QC','BC','AL','AL','MN','ON'],
                        'City' :['Toronto','Montreal','Vancouver',
                                 'Calgary','Edmonton','Winnipeg',
                                 'Windsor'],
                        'Sales' : [13,6,16,8,4,3,1]})

df

Unnamed: 0,City,Province,Sales
0,Toronto,ON,13
1,Montreal,QC,6
2,Vancouver,BC,16
3,Calgary,AL,8
4,Edmonton,AL,4
5,Winnipeg,MN,3
6,Windsor,ON,1


It is important to observe that, while each field has a value, not all values are equal, some act as identifier for the record (in this case `'Province'` and `'City'`), others constitues the real data (`'Sales'`). Following `pandas` consuetudine, we will call from now on the first 'Indices' and the second 'values'.
The first can be considered as independent variable (x), the second as dependant (y), in general the value that we want to plot/display/analyze. While in some cases (e.g. extract the state with more sales) the result is a index, this is always the result of some analysis made on "values" (data, e.g find the maximum of a sales vs City and Province, 2D data).

Another way of seeing the difference between indices and values is to think at the first as (extended) keys of a dictionary. Advanced function as `hierarchical indexing` will be discussed later. For now, notice how a province can have more than one city, while the fact that each city has only one state, is just incidental from point of view of data, even if not surprising from logical point of view (it would be different in the US).

In a general case, both city and province are needed to univoquously identify a record. The example employs a pd.DataFrame object with a single value and two indices. It could be as well a multi-index Series. 

In [6]:
display(df.to_latex())

'\\begin{tabular}{lllr}\n\\toprule\n{} &       City & Province &  Sales \\\\\n\\midrule\n0 &    Toronto &       ON &     13 \\\\\n1 &   Montreal &       QC &      6 \\\\\n2 &  Vancouver &       BC &     16 \\\\\n3 &    Calgary &       AL &      8 \\\\\n4 &   Edmonton &       AL &      4 \\\\\n5 &   Winnipeg &       MN &      3 \\\\\n6 &    Windsor &       ON &      1 \\\\\n\\bottomrule\n\\end{tabular}\n'

The data above can be presented as well as a grid on all possible combinations of indices (we are excluding the case in which there are multiple records for same position in table), missing values are filled with np.nan as default). 

In [7]:
table = pd.pivot_table(df,values=['Sales'],
        index=['Province'],columns=['City'])

table


Unnamed: 0_level_0,Sales,Sales,Sales,Sales,Sales,Sales,Sales
City,Calgary,Edmonton,Montreal,Toronto,Vancouver,Windsor,Winnipeg
Province,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
AL,8.0,4.0,,,,,
BC,,,,,16.0,,
MN,,,,,,,3.0
ON,,,,13.0,,1.0,
QC,,,6.0,,,,


Note that, while the first representation is (apart from column and row orders) univoque, there is some arbitrariety in the second: we could as well decide to put province in row columns and cities in rows.

While all representations contain an equivalent amount of information, the records format is the only one the doesn't contain any arbitrarity, while every other becomes a (often more useful) representation of the data and the underlying structure. 

To this purpose, the ability of sorting and handling axis is essential for both data visualization and processing.

Pandas allows easy handling of (labeled) axis to select and process data and to extract information. The same operation could be done (and are indeed done under the scene) by operations on arrays. An example is calculating sum by columns and rows. 



In [8]:
table = pd.pivot_table(df,values=['Sales'],
        index=['Province'],columns=['City'],aggfunc=np.sum,margins=True)

table


Unnamed: 0_level_0,Sales,Sales,Sales,Sales,Sales,Sales,Sales,Sales
City,Calgary,Edmonton,Montreal,Toronto,Vancouver,Windsor,Winnipeg,All
Province,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
AL,8.0,4.0,,,,,,12
BC,,,,,16.0,,,16
MN,,,,,,,3.0,3
ON,,,,13.0,,1.0,,14
QC,,,6.0,,,,,6
All,8.0,4.0,6.0,13.0,16.0,1.0,3.0,51


The arbitrarity of tabular representation, becomes even more evident adding additional dimensions. In this case, the simple tabular form of previous example can be extended to a three-dimensional grid and this can be extended to any dimension. Each index has one axis. However, up today, multi-dimensional structures are impractical for visualization and printing and the multiple indices are collapsed to a 2D structure. This is done placing more than index for an axis, introducing another arbitrarity, the `level`.

Each index can be placed on the horizontal or vertical axis, with no more limitation of one index per axis. In this case it is necessary to specify index priority.

The following example is a rewriting of the previous table with both indices in vertical orientation.

In [9]:
table.stack('City')

Unnamed: 0_level_0,Unnamed: 1_level_0,Sales
Province,City,Unnamed: 2_level_1
AL,All,12.0
AL,Calgary,8.0
AL,Edmonton,4.0
BC,All,16.0
BC,Vancouver,16.0
MN,All,3.0
MN,Winnipeg,3.0
ON,All,14.0
ON,Toronto,13.0
ON,Windsor,1.0


questo e' un esempio di come aggiungere una colonna come identificativo incasina tutto quando e' interpretata come valore. Per esempio qui aggiungiamo un altro campo `data` e visualizziamo come record 

    diffdf['date'] = dates
    diffdf.sort_values('date')
    
    b=diffdf.reset_index().melt(id_vars=['index']).set_index(['index',
    'terms removed','files#'])

### Data Structures in pandas

Now, we jave seen some examples of data structures and clarified the composing elements, we are going to see how these are implemented in pandas.

The principal data structures are `Series` and `DataFrame`

Let's go back to the cities example and make some observation on how data are presented.

In [43]:
df = pd.DataFrame(data={'Province' : 
                        ['ON','QC','BC','AL','AL','MN','ON'],
                        'City' :['Toronto','Montreal','Vancouver',
                                 'Calgary','Edmonton','Winnipeg',
                                 'Windsor'],
                        'Sales' : [13,6,16,8,4,3,1]})

df

Unnamed: 0,City,Province,Sales
0,Toronto,ON,13
1,Montreal,QC,6
2,Vancouver,BC,16
3,Calgary,AL,8
4,Edmonton,AL,4
5,Winnipeg,MN,3
6,Windsor,ON,1


Columns can have a natural name (fields have a name), for record in general an index is necessary, we can assume for now that first column contains and unique sequential ID number. 

The example here is based on a single non-index value. What happens if a second value is introduced? 

We introduce here pandas data structures Series and DataFrame and show how they are equivalent when multi index is considered. It is possible to revert back to record format and single index (DataFrame) from multiindex Series by using `.reset_index()` method of DataFrame.


Dataseries and dictionaries. 
Dataseries can be interlaced, meaning a dataframe can always be represented as a multi index data series.

DataFrame
A table is a representation of the dataset, in which indices are sorted and grouped. It is a visualization (2D) with arbitrary ordering of indices (rows or columns, levels,..).
This visulization is often more useful than a mere list of records, and equally often is the final product of the data processing or of the entire analysis. Is it then of extreme importance understand and handle these data structures.

Unfortunately, there is an almost infinite numbers of methods to handle indices and representations, and as many documentation pages: pivot and pivot_table, melt, groupby, stack and unstack, in a complexive confuse picture:

In [83]:
#groupby doesn't seem to have a docstring ??
import pydoc

for f in [pd.melt,pd.pivot,pd.pivot_table,pd.DataFrame.stack,pd.DataFrame.unstack]:
    print(f.__name__,':')
    print (pydoc.render_doc(f).splitlines()[2])
    print (':\n',"".join(f.__doc__.split('\n\n')[:1])+'\n...\n\n=====')
    #print(f.__doc__)

melt :
mmeelltt(frame, id_vars=None, value_vars=None, var_name=None, value_name='value', col_level=None)
:
 
"Unpivots" a DataFrame from wide format to long format, optionally
leaving identifier variables set.
...

=====
pivot_simple :
ppiivvoott__ssiimmppllee(index, columns, values)
:
 
    Produce 'pivot' table based on 3 columns of this DataFrame.
    Uses unique values from index / columns and fills with values.
...

=====
pivot_table :
ppiivvoott__ttaabbllee(data, values=None, index=None, columns=None, aggfunc='mean', fill_value=None, margins=False, dropna=True, margins_name='All')
:
 
Create a spreadsheet-style pivot table as a DataFrame. The levels in
the pivot table will be stored in MultiIndex objects (hierarchical
indexes) on the index and columns of the result DataFrame
...

=====
stack :
ssttaacckk(self, level=-1, dropna=True)
:
 
        Pivot a level of the (possibly hierarchical) column labels, returning a
        DataFrame (or Series in t

A very illustrative example (with code) and clear explanation of how multiindex and pivot work, with examples of all methods:
http://nikgrozev.com/2015/07/01/reshaping-in-pandas-pivot-pivot-table-stack-and-unstack-explained-with-pictures/


A good comparison of all methods:
https://pandas.pydata.org/pandas-docs/stable/reshaping.html#reshaping-pivot
An example of pandas plotting capabilities using groupby
http://jonathansoma.com/lede/algorithms-2017/classes/fuzziness-matplotlib/understand-df-plot-in-pandas/


Documentation for:
Multiindex (Hierarchical index): https://pandas.pydata.org/pandas-docs/stable/advanced.html#advanced-hierarchical 
Groupby (split-apply-combine): https://pandas.pydata.org/pandas-docs/stable/groupby.html#groupby
Some details about melt and advanced use  to handle indices (as in notebook WFS08_all) are discussed here:
https://github.com/pandas-dev/pandas/issues/17440 


http://www.swegler.com/becky/blog/2014/08/06/useful-pandas-snippets/

text data in pandas http://pandas.pydata.org/pandas-docs/stable/text.html#text-string-methods 

In [64]:
d = {'one' : pd.Series([1., 2., 3.], index=['a', 'b', 'c']),
    'two' : pd.Series([1., 2., 3., 4.], index=['a', 'b', 'c', 'd'])}


df = pd.DataFrame(d)

df

Unnamed: 0,one,two
a,1.0,1.0
b,2.0,2.0
c,3.0,3.0
d,,4.0


In [65]:
df['three'] = df['one'] * df['two']
df

Unnamed: 0,one,two,three
a,1.0,1.0,1.0
b,2.0,2.0,4.0
c,3.0,3.0,9.0
d,,4.0,


Note that data conventional indexing (as a dictionary) accesses data by column (df['one'] gives a Series, but df['a'] gives error. This also in assignment. The opposite is true when data are addressed by rows using iloc property of the structure.

This returns a Series, index of which can be then accessed: 

In [80]:
df['one']['a']

1.0

Note however that this explcit syntax is the only one allowed, is it not possible in this format to access data with a real multidimensional index in analogy with array (or with an hypotetic dictionary with multi-dimensional keys).
Any other combination fails, e.g. df['loc','a']

In [70]:
df.loc['a']

one      1.0
two      1.0
three    1.0
Name: a, dtype: float64

http://pandas-docs.github.io/pandas-docs-travis/dsintro.html#panel
    
The basics of indexing are as follows:
Operation 	Syntax 	Result
Select column 	df[col] 	Series
Select row by label 	df.loc[label] 	Series
Select row by integer location 	df.iloc[loc] 	Series
Slice rows 	df[5:10] 	DataFrame
Select rows by boolean vector 	df[bool_vec] 	DataFrame

In [95]:
dates = pd.date_range('1/1/2000', periods=8)
df = pd.DataFrame(np.random.randn(8, 4), index=dates, columns=['A', 'B', 'C', 'D'])
df

Unnamed: 0,A,B,C,D
2000-01-01,0.708203,0.391373,-0.459411,-0.813855
2000-01-02,1.886631,-1.911481,0.463731,-0.239855
2000-01-03,-1.701735,-1.209308,-1.597629,0.800465
2000-01-04,0.592675,1.136078,-0.409733,-0.380906
2000-01-05,-1.471722,-0.707108,-0.147781,0.121781
2000-01-06,-0.662373,-0.207777,2.129353,-0.404103
2000-01-07,-0.040132,0.669778,-0.787987,0.84402
2000-01-08,-0.304281,-1.187803,0.763857,-1.008201


In [96]:
mi=pd.MultiIndex.from_product([['0','1']]*3, 
                              names=['first', 'second','3rd digit'])
s = pd.Series([int("".join((str(d1),str(d2),str(d3))),2)
               for d1,d2,d3 in zip(*(mi.labels))], 
              index=mi,name='decimal')
print(s)

first  second  3rd digit
0      0       0            0
               1            1
       1       0            2
               1            3
1      0       0            4
               1            5
       1       0            6
               1            7
Name: decimal, dtype: int64


In [101]:
print (pd.DataFrame(s).index)
pd.DataFrame(s)

MultiIndex(levels=[['0', '1'], ['0', '1'], ['0', '1']],
           labels=[[0, 0, 0, 0, 1, 1, 1, 1], [0, 0, 1, 1, 0, 0, 1, 1], [0, 1, 0, 1, 0, 1, 0, 1]],
           names=['first', 'second', '3rd digit'])


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,decimal
first,second,3rd digit,Unnamed: 3_level_1
0,0,0,0
0,0,1,1
0,1,0,2
0,1,1,3
1,0,0,4
1,0,1,5
1,1,0,6
1,1,1,7


In [102]:
print(pd.DataFrame(s).reset_index().index)
pd.DataFrame(s).reset_index()

RangeIndex(start=0, stop=8, step=1)


Unnamed: 0,first,second,3rd digit,decimal
0,0,0,0,0
1,0,0,1,1
2,0,1,0,2
3,0,1,1,3
4,1,0,0,4
5,1,0,1,5
6,1,1,0,6
7,1,1,1,7


MultiIndex avoids the need to access data by sequential indices and allows access in a similar way as a ndarray (or an hypotetical multi-dimensional dictionary key).

In [106]:
s['0','1']

3rd digit
0    2
1    3
Name: decimal, dtype: int64

In [109]:
s.loc['0']

second  3rd digit
0       0            0
        1            1
1       0            2
        1            3
Name: decimal, dtype: int64

In [107]:
s['0',:,'1']

TypeError: ('0', slice(None, None, None), '1')

In [48]:
arrays = [['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux', 'qux'],
   ...:           ['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two']]


tuples = list(zip(*arrays))
index = pd.MultiIndex.from_tuples(tuples, names=['first', 'second'])

df = pd.DataFrame(np.random.randn(3, 8), index=['A', 'B', 'C'], columns=index)
df

first,bar,bar,baz,baz,foo,foo,qux,qux
second,one,two,one,two,one,two,one,two
A,0.409903,0.670529,-1.025956,-0.491069,1.872768,-1.341155,0.196594,-0.357962
B,-1.883635,0.272524,-0.383152,-0.31701,1.737341,0.856306,1.007251,2.301049
C,0.230527,-0.460999,-0.140931,1.095217,0.351266,1.061561,0.319736,-0.482182


In [49]:
#this works only accessing ALL indices IN THIS ORDER
print(df['bar'])
print('--')
print(df['bar']['one'])
print('--')
print(df['bar']['one']['A'])

second       one       two
A       0.409903  0.670529
B      -1.883635  0.272524
C       0.230527 -0.460999
--
A    0.409903
B   -1.883635
C    0.230527
Name: one, dtype: float64
--
0.40990258739958774


Note however what is said at http://pandas-docs.github.io/pandas-docs-travis/indexing.html#indexing-view-versus-copy about `chained indexing`

In [53]:
try:
    print(df['bar'][:]['A'])
except KeyError:
    print ("df['bar'][:]['A'] gives KeyError")

df.loc[:,['bar','one','A']]

df['bar'][:]['A'] gives KeyError


first,bar,bar
second,one,two
A,0.409903,0.670529
B,-1.883635,0.272524
C,0.230527,-0.460999


In [61]:
df.loc[:,('bar')]

second,one,two
A,0.409903,0.670529
B,-1.883635,0.272524
C,0.230527,-0.460999


In [62]:
%qtconsole

TODO: assign method, selection by callable http://pandas-docs.github.io/pandas-docs-travis/dsintro.html#panel


Difference between addressing multiindex with tuple or list



## Comparison operators and Truth values

In [None]:
a=1
if a:print a

In [None]:
if None:
    print 'None is' 
else:
    print 'None is not'

In [None]:
#None is neither True or False, but not(None) is True
if None==True:
    print ('None is True')
else:
    print ('None is not True')

In [None]:
if None==False:
    print ('None is False')
else:
    print ('None is not False')

In [None]:
#want a proof?
not(None)

In [None]:
if not(None): print "None isn't" 

In [None]:
if not(None)==True: print "None isn't" 

In [None]:

not([]),not(None)

The code below tests how different values behave in comparison. A visible contraddiction is that None (or an empty list) is not either False or True, but not(None) (or not([])) is True. It is consistent if comparison with False are NEVER used.
The code illustrates the use of *args and the passing functions as arguments to other functions.

In [None]:
def IS(*args):
    for arg in args:
         if arg: print arg,' is' 
         else: print arg, "isn't"
                    
def isTrue(*args):
    for arg in args:
         if arg==True: print arg,' is True' 
         else: print arg, "isn't True"
                    
def isFalse(*args):
    for arg in args:
         if arg==False: print arg,' is False' 
         else: print arg, "isn't False, is it then True?",arg==True
                    
def test(testFuncs,testValues):
    print 'testValues: ',testValues
    print 'Note: the above values are the values calculated at value access, for value definition see code.\n'
    for func in testFuncs:
        print func
        func(*testValues)
        print'\n----\n',
                    
testValues=[False,None,b,not(None)] #list of calues to test, they are passed to the testFunctions                    
testFuncs=[IS,isTrue,isFalse] #list of the functions to test the Values
test(testFuncs,testValues)

In [None]:
#The empty list behaves in the same way as None:
test(testFuncs,[None,[],[[]],not([])])

0 is False and 1 is True, but 3 is neither one

In [None]:
test(testFuncs,[0,1,3,-1])

In [None]:
isTrue(bgaa)

In both cases, the execution stopped at the function call and it cannot continue. Number of arguments is checked then. What else is checked at function call, if anything?

In [None]:
%debug

In [None]:
isTrue() #This evaluates to *args=[], the loop over args is skipped. 

In [None]:
test(testFuncs,[[1,2,3],[0],[[],[]]])

In [None]:
import numpy as np
IS(np.arange(5))

This is due to how comparison operators operate on arrays

In [None]:
a==True

In [None]:
a>=3

In [None]:
def funcwrong(x):
     if x.any(): print 'x' 
     else: print 'no x'

In [None]:
funcwrong(a),funcwrong(np.zeros(5))

In [None]:
funcwrong([1,2,3]),funcwrong([0]),funcwrong([]),funcwrong([[],[]])

How to check for truth a fubction argument that can be also a np array? where truth is defined as usual for other objects and works as for lists in arrays. Is there a conflict with boolean operations?

# Function optional return values

The example below illustrates the limitations of Python lacking optional return parameters (as opposite to IDL).
In this case IDL return value management and argument checking mechanism (keyword_present, arg_present, etc., see Coyote's posts on the topic) would make it easy to include all in a general function able to return the intermediate values, allowing: 1) to calculate only the needed values; 2) to do only once calculations that are needed for more return values if they are requested.
Is there a way to do this in python and avoid the explosion of routine names?
Can modules be an alternative?

In [None]:
def merge_to_string(*args):
    """From a set of same length iteratables returns a list of string resulting 
    from joining the input vectors element-wise.
    ex.:
    >> merge_to_string([1,2,3],[4,5,6]) #['1 4', '2 5', '3 6']
    """
    return [" ".join(map(str,vals)) for vals in zip(*args)]

def first_of_group_index(vector):
    """given an iterable that can contain repeated values, return a list of indices
    for the first istance of each value present in the list.
    ex.: 
    >> first_of_group_index([1,2,3,2,2,1,5,-1]) #[0,1,2,6,7]
    """
    indices=[] #indices for which to plot labels
    labels=[]
    for i,val in enumerate(vector):
        if not(val in labels):
            labels.append(val)
            indices.append(i)
    return indices

def dict_by_key(keys,values):
    """Create a dictionary grouping as lists values correponding to same key.
    ID and values must be same number of elements. """
    
    dic_by_key={}
    for k,v in zip(keys,values):   # 0, ('L23a', 710.0, 24.0)
        try:
            if not(v in dic_by_key[k]):
                dic_by_key[k].append(v)
        except KeyError:
            dic_by_key[k]=[v]
    return dic_by_key

def compress_values(dictionary,string_func="\n".join):
    """Given a dictionary, generate a string value for each dict value. 
    string_func is the function that transform each item in a string,
    type checking is left to the user. Default merges list of strings as lines."""
    for k,v in dictionary.items():
        #dictionary[k]=string_func(v)
        dictionary[k]=string_func(v) #"\n".join(v)
    return dictionary

def groupLabels(group,*args):
    return compress_values(dict_by_key(group,merge_to_string(*args))),first_of_group_index(group)

iPosLabel=first_of_group_index(group) #one element per group for label position.
labels=merge_to_string(plotTemp,plotTime) #create one label for each group (not necessarily unique)
txtLabels=compress_values(dict_by_key(group,labels)) #create one string with the label per group, txtLabels is a dictionary {group:label} 
txtLabels,iPosLabel  #all above lines are equivalent to groupLabels(group,plotTemp,plotTime)


Now, assume I want to pass the string_func argument to groupLabels, how is the default argument mandaged? Is assigning None default argument the solution? IF it is, is there another way that doesn't assume the programmer followed the best practice (e.g. I have an unchangeable library where the programmer set string_func as in the above code, in this case passing None as string_function gives error)?

# Attribute handling

This is an example from:
http://stackoverflow.com/questions/11145501/getattr-going-recursive-in-python
Probably the author didn't understand what he wanted to do, but it has some interesting points.

In [None]:
class A:
    def __init__(self, list_1, list_2):
        self.list1 = list_1
        self.list2 = list_2

    def __getattr__(self, item):
        print 'Looking for %s'%(item)
        if item in self.list1: return "It is in list 1"
        elif item in self.list2: return "It is in list 2"
        else: return "It is in neither list 1 nor list 2"

In [None]:
a=A([1,2,3],[43,44,45])

In [None]:
a.list1

In [None]:
a.cane

In [None]:
a.1

In [None]:
b=A(['cane', 'gatto', 'topo'],['uomo','donna',[1,2]])

In [None]:
b.cane

In [None]:
b.donna

In [None]:
c=[1,2]
b.c

Note that the above code never generates infinite recursion. Can it happen if __setattr__ is also defined?

## Yield

In [None]:
def print_a(n=None):
    try:
        s
    except NameError:
        s='a'
        
    while n is None:
        s=s+'a'
        yield(s)
    
g=print_a()

In [None]:
g.next(1)

In [None]:
def print_a(l,n=None):
    """this uses a workaround to make the generator dependant on
    an external variable. a list l is linked on generator creation.
    In place operations allow to share values between the generator 
    and the external namespace.
    
    l is a string of characters, on each iteration a 
    character 'a' is added to l, unless the 
    last character of l is not 'a', in that case iteration is terminated 
    """
    
    while len(l)==0 or l[-1] == 'a':
        l.append('a')
        yield(''.join(l))

l=[]
g=print_a(l)

In [None]:
for i in range(3): print(g.next(),'\n')
print ("---\n",''.join(l))

In [None]:
l.append('a')
print(''.join(l))


In [None]:
g.next()


In [None]:
l.append('b')

In [None]:
g.next()


In [None]:
plt.close('all')