# Pandas

In [24]:
import numpy as np
import pandas as pd

## Series

In [3]:
# A Series is similar to a NumPy array,
# but its elements can also be accessed by label.

In [4]:
# Sample inputs reused by the Series examples below:
# index labels, the matching values, and the same values as array and dict.
my_data = [10, 20, 30]
labels = ['a', 'b', 'c']
d = dict(zip(labels, my_data))  # label -> value mapping
arr = np.array(my_data)

In [5]:
# A Series pairs an index (left column of the output) with data values (right column).
# No index is given here, so pandas falls back to the default integer index 0, 1, 2.
pd.Series(data=my_data)

0    10
1    20
2    30
dtype: int64

In [18]:
# The index labels can be supplied explicitly via the `index` argument.
mySeries = pd.Series(data=my_data, index=labels)
# Positional shorthand for the same call (data first, index second):
pd.Series(my_data, labels)

a    10
b    20
c    30
dtype: int64

In [19]:
# Look up a single value by its index label.
mySeries['b']

20

In [11]:
# pd.Series also accepts a NumPy array as its data,
# just as np.array accepted a list:
pd.Series(arr)

0    10
1    20
2    30
dtype: int64

In [12]:
# Given a dictionary, pandas uses the keys as the index and the values as the data.
pd.Series(d)

a    10
b    20
c    30
dtype: int64

In [15]:
# A Series can hold arbitrary Python objects as data, even functions;
# the dtype is then the generic `object` dtype.
pd.Series(data=[sum, print, len, np.sqrt, np.pi])

0      <built-in function sum>
1    <built-in function print>
2      <built-in function len>
3               <ufunc 'sqrt'>
4                      3.14159
dtype: object

## Grabbing information from a Series

In [16]:
# Example Series: fruit names as index labels, integers as data.
ser1 = pd.Series(
    data=[1, 2, 3, 4],
    index=['Apple', 'Banana', 'Orange', 'Blueberry'],
)

In [17]:
# Display the Series: the fruit names form the index.
ser1

Apple        1
Banana       2
Orange       3
Blueberry    4
dtype: int64

In [21]:
# Label-based lookup of a single value.
ser1['Banana']

2

In [22]:
# Second example Series; it shares only some labels with ser1
# ('Kiwi' is new, 'Apple' is missing).
ser2 = pd.Series(
    data=[3, 2, 1, 5],
    index=['Banana', 'Kiwi', 'Orange', 'Blueberry'],
)

In [23]:
# Arithmetic between Series aligns on index labels:
# - labels present in both operands are added,
# - labels present in only one operand yield NaN.
# The result is float64 because NaN is a float and cannot be stored in an int64 Series.
ser1 + ser2

Apple        NaN
Banana       5.0
Blueberry    9.0
Kiwi         NaN
Orange       4.0
dtype: float64

## DataFrames

In [1]:
import numpy as np
import pandas as pd
from numpy.random import randn

In [2]:
# Seed NumPy's global random generator so the random values below are reproducible.
np.random.seed(101)

In [3]:
# 5x4 frame of random draws: rows labelled A-E, columns labelled W-Z.
df = pd.DataFrame(
    data=randn(5, 4),
    index=['A', 'B', 'C', 'D', 'E'],
    columns=['W', 'X', 'Y', 'Z'],
)

In [4]:
# Display the frame of random data.
# Each column is a Series, and all columns share the same row index.
df

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [10]:
# The type of the object is pandas' DataFrame class:
type(df)

pandas.core.frame.DataFrame

In [6]:
# Grab data:
df['W']  # select column W with bracket notation

A    2.706850
B    0.651118
C   -2.018168
D    0.188695
E    0.190794
Name: W, dtype: float64

In [7]:
# A single column is indeed a Series:
type(df['W'])

pandas.core.series.Series

In [11]:
# Columns can also be accessed as attributes, similar to SQL's table.column syntax:
df.W  # dot notation
# Advice: prefer bracket notation; attribute access is easily confused with
# DataFrame methods and attributes.

A    2.706850
B    0.651118
C   -2.018168
D    0.188695
E    0.190794
Name: W, dtype: float64

In [12]:
# Grab multiple columns by passing a list of column names (the result is a DataFrame):
df[['X', 'Z']]

Unnamed: 0,X,Z
A,0.628133,0.503826
B,-0.319318,0.605965
C,0.740122,-0.589001
D,-0.758872,0.955057
E,1.978757,0.683509


In [13]:
# Assigning to a column name that does not exist yet creates that column;
# here it holds the element-wise sum of W and Y.
df['new'] = df['W'] + df['Y']

In [14]:
# Show df: the frame now has the additional column 'new'.
df

Unnamed: 0,W,X,Y,Z,new
A,2.70685,0.628133,0.907969,0.503826,3.614819
B,0.651118,-0.319318,-0.848077,0.605965,-0.196959
C,-2.018168,0.740122,0.528813,-0.589001,-1.489355
D,0.188695,-0.758872,-0.933237,0.955057,-0.744542
E,0.190794,1.978757,2.605967,0.683509,2.796762


In [17]:
# Delete a column with the .drop method.
# The axis argument selects what the label refers to:
#   axis=0: row labels (the index)
#   axis=1: column labels
#
# The numbering follows NumPy-style indexing of the frame's shape:
# df.shape is the tuple (5, 4) here (after the deletion), i.e. a 2-D matrix, where
#   position 0 holds the number of rows,
#   position 1 holds the number of columns.
#
# inplace=True modifies df itself instead of returning a new DataFrame.
# NOTE: because the change is in place, re-running this cell raises a KeyError
# ('new' no longer exists).

df.drop('new', axis=1, inplace=True)

In [18]:
# Show df again: column 'new' is gone.
df

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [21]:
# Delete a row with the same method (axis=0).
# Without inplace=True this returns a new DataFrame; df itself keeps row E.
df.drop('E', axis=0)

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057


In [22]:
# Still 5 rows: the row drop above was not applied in place.
df.shape

(5, 4)

In [23]:
# Columns are selected with df['X'].
# Rows can be selected in two different ways, by label or by integer position:

df.loc['A']  # .loc: pass the row's label

W    2.706850
X    0.628133
Y    0.907969
Z    0.503826
Name: A, dtype: float64

In [24]:
# .iloc selects by integer position (the "i" stands for integer/index),
# whereas .loc ("location") selects by label.
# A row, like a column, is returned as a Series.
df.iloc[0]

W    2.706850
X    0.628133
Y    0.907969
Z    0.503826
Name: A, dtype: float64

In [26]:
# Select subsets of a DataFrame with [rows, columns], just as with matrices in NumPy.
# NOTE: a cell only displays its last expression, so the scalar lookup below is
# evaluated but not shown.
df.loc['B', 'Y']  # single value at row B, column Y
df.loc[['A', 'B'], ['W','Y']]  # sub-frame: rows A and B, columns W and Y

Unnamed: 0,W,Y
A,2.70685,0.907969
B,0.651118,-0.848077


In [8]:
# Conditional selection
booldf = df > 0  # boolean mask: True where the value is positive
booldf  # show the mask

Unnamed: 0,W,X,Y,Z
A,True,True,True,True
B,True,False,False,True
C,False,True,True,False
D,True,False,False,True
E,True,True,True,True


In [9]:
# Now select conditionally: values where the mask is False become NaN.
df[booldf]

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,,,0.605965
C,,0.740122,0.528813,
D,0.188695,,,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [10]:
# The same selection in one step, without a named mask:
df[df>0]

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,,,0.605965
C,,0.740122,0.528813,
D,0.188695,,,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [13]:
# Apply the condition to a single column:
# the result is a boolean Series with one entry per row.
df['W']>0

A     True
B     True
C    False
D     True
E     True
Name: W, dtype: bool

### The following notation is used the most:

In [12]:
# Indexing with a boolean Series returns only the rows where the condition on W is True
# (row C is dropped here).
# The result is itself a DataFrame, so further operations can be chained onto it.
df[df['W']>0]

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [15]:
# Return column X for the rows where W is positive.
# .loc takes the row condition and the column label in a single indexing step;
# this gives the same Series as the chained form df[df['W']>0]['X'] without
# building an intermediate DataFrame first.
df.loc[df['W']>0, 'X']

A    0.628133
B   -0.319318
D   -0.758872
E    1.978757
Name: X, dtype: float64

In [17]:
# Multiple conditions: combine the boolean Series element-wise with
#   &  (ampersand) for 'and'
#   |  (pipe)      for 'or'
# and wrap each condition in parentheses.
# Python's `and` / `or` keywords only work on single truth values; applied to a
# whole Series they raise a ValueError (ambiguous truth value).
# The selection is kept as the last expression so that its result is what the
# cell displays.
df[(df['W']>0) & (df['X']>1)]

Unnamed: 0,W,X,Y,Z
E,0.190794,1.978757,2.605967,0.683509


In [18]:
# df itself is unchanged by the selections above.
df

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [20]:
# Reset the index: the old index becomes a new column named 'index',
# and the rows get a default integer index.
# This returns a new DataFrame; df itself only changes if inplace=True is passed.
df.reset_index()

Unnamed: 0,index,W,X,Y,Z
0,A,2.70685,0.628133,0.907969,0.503826
1,B,0.651118,-0.319318,-0.848077,0.605965
2,C,-2.018168,0.740122,0.528813,-0.589001
3,D,0.188695,-0.758872,-0.933237,0.955057
4,E,0.190794,1.978757,2.605967,0.683509


In [22]:
# Create the labels for a new index: splitting on whitespace yields one
# label per row of df.
newind = 'CA NY WY OR CO'.split()
newind  # an assignment alone displays nothing; end with the name to show the list

['CA', 'NY', 'WY', 'OR', 'CO']

In [25]:
# Attach the list to df as a new column.
# Its length matches the number of rows, so a plain assignment works:
df['States'] = newind
df

Unnamed: 0,W,X,Y,Z,States
A,2.70685,0.628133,0.907969,0.503826,CA
B,0.651118,-0.319318,-0.848077,0.605965,NY
C,-2.018168,0.740122,0.528813,-0.589001,WY
D,0.188695,-0.758872,-0.933237,0.955057,OR
E,0.190794,1.978757,2.605967,0.683509,CO


In [26]:
# Make the new column the index with .set_index('<column name>').
# Caution: the previous index (A-E) is discarded, not kept as a column.
# Like reset_index, this is not in place by default and returns a new DataFrame.
df.set_index('States')

Unnamed: 0_level_0,W,X,Y,Z
States,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
CA,2.70685,0.628133,0.907969,0.503826
NY,0.651118,-0.319318,-0.848077,0.605965
WY,-2.018168,0.740122,0.528813,-0.589001
OR,0.188695,-0.758872,-0.933237,0.955057
CO,0.190794,1.978757,2.605967,0.683509
