# Section 6: Python for Data Analysis - Pandas

# 02 - DataFrames Part I

In [2]:
## DataFrames -> the main tool when working with pandas

import numpy as np 
import pandas as pd 

from numpy.random import randn 

## Seed NumPy's global random generator so that randn() produces the same values on every run
np.random.seed(101)

In [3]:
### Building a DataFrame

## Like a Series, a DataFrame takes data and index arguments; its third argument, columns, names the columns
df = pd.DataFrame(
    data=randn(5, 4),        # 5 rows x 4 columns of random values
    index=list('ABCDE'),     # row labels
    columns=list('WXYZ'),    # column labels
)

In [4]:
## Each column is a pandas Series, and all of the columns share a common index
df

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [5]:
##DataFrame indexing and selection

##Bracket notation with a column label -> the recommended way to select a column
df['W']

A    2.706850
B    0.651118
C   -2.018168
D    0.188695
E    0.190794
Name: W, dtype: float64

In [6]:
##Let's confirm that df['W'] is a pandas Series by calling the built-in type() function
type(df['W'])

pandas.core.series.Series

In [7]:
###Checking the type of the DataFrame itself
type(df)

pandas.core.frame.DataFrame

In [8]:
### A column can also be read with attribute (dot) notation -> This is not recommended because a column
### name can be confused with, or clash with, the DataFrame's own attributes and methods

df.W  ##If a column shares its name with a DataFrame method, dot access gives the method, not the column. So we should avoid this indexing notation

A    2.706850
B    0.651118
C   -2.018168
D    0.188695
E    0.190794
Name: W, dtype: float64

In [9]:
##To select several columns, pass a list of column names -> Note that this returns a DataFrame, not a Series
df[['Y','Z']] ##Notice that Jupyter displays the result as a table

Unnamed: 0,Y,Z
A,0.907969,0.503826
B,-0.848077,0.605965
C,0.528813,-0.589001
D,-0.933237,0.955057
E,2.605967,0.683509


In [24]:
##Creating a new column -> assign to a column label that does not exist yet, using the assignment operator =

## 'new' is the element-wise sum of columns W and Y
df['new'] = df['W'] + df['Y']

In [35]:
## A column can also be created from a Series; passing df.index gives the Series the same row labels as df
## NOTE(review): the outputs shown for the cells further down (In [12] onward) do not contain 'oneMore', so
## this cell was evidently executed after them; on Restart & Run All the column stays in df until it is dropped
df['oneMore'] = pd.Series(data = [1,2,3,4,7], index = df.index)

In [36]:
## Any single column selected with brackets is a pandas Series
type(df['Z'])

pandas.core.series.Series

In [37]:
## The frame now also contains the two added columns, 'new' and 'oneMore'
df

Unnamed: 0,W,X,Y,Z,new,oneMore
A,2.70685,0.628133,0.907969,0.503826,3.614819,1
B,0.651118,-0.319318,-0.848077,0.605965,-0.196959,2
C,-2.018168,0.740122,0.528813,-0.589001,-1.489355,3
D,0.188695,-0.758872,-0.933237,0.955057,-0.744542,4
E,0.190794,1.978757,2.605967,0.683509,2.796762,7


In [12]:
##df.drop() removes a column

## Say which axis the label belongs to: 'columns' (equivalent to axis = 1) because we are dropping a column
## inplace = False is the default -> a new DataFrame is returned and df itself is not affected

df.drop(labels='new', axis='columns', inplace=False)

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [13]:
##Notice that the 'new' column is still in df
df

Unnamed: 0,W,X,Y,Z,new
A,2.70685,0.628133,0.907969,0.503826,3.614819
B,0.651118,-0.319318,-0.848077,0.605965,-0.196959
C,-2.018168,0.740122,0.528813,-0.589001,-1.489355
D,0.188695,-0.758872,-0.933237,0.955057,-0.744542
E,0.190794,1.978757,2.605967,0.683509,2.796762


In [14]:
## To remove columns from df itself, inplace must be set to True. Many pandas methods work this way: they
## return a modified copy unless asked to mutate, which guards against accidental loss of information

## 'oneMore' is dropped together with 'new': both columns were added to df above, and every output shown
## from here on contains only W, X, Y, Z. Dropping only 'new' would leave 'oneMore' in df on a fresh
## top-to-bottom run

df.drop(['new', 'oneMore'], axis = 1, inplace = True)

In [15]:
##The 'new' column is now permanently removed from df
df

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [16]:
##df.drop() can drop rows as well

## axis='index' (equivalent to axis = 0) is the default, so it could be left out
## inplace is not set, so a new DataFrame is returned and df itself keeps row 'E'
df.drop(labels='E', axis='index')

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057


In [17]:
##Row selection -> works differently from column selection

##There are two ways to select rows in our DataFrame

## 1 - df.loc[] -> label-based index location
## This returns a Series too! A row is a Series whose index is the column labels

df.loc['A']

W    2.706850
X    0.628133
Y    0.907969
Z    0.503826
Name: A, dtype: float64

In [18]:
## 2 - df.iloc[] -> integer-based index location
## Pass a numerical position, even if the axes are labeled with strings

df.iloc[0] ##Returns the row at position 0, i.e. the row labeled 'A' -> Equivalent to df.loc['A']

W    2.706850
X    0.628133
Y    0.907969
Z    0.503826
Name: A, dtype: float64

In [19]:
##Selecting subsets of rows and columns

##To do so, use the df.loc[] indexer with NumPy-style [row, column] notation

df.loc['B','Y']

-0.8480769834036315

In [20]:
## Show the full frame again for reference
df

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [21]:
##We can also pass a list of row labels and a list of column labels to select a larger subset
df.loc[['A','B'], ['W','Y']]

Unnamed: 0,W,Y
A,2.70685,0.907969
B,0.651118,-0.848077
