## Pandas - DataFrames

In [1]:
import numpy as np
import pandas as pd

## numpy seed

In [2]:
np.random.seed(0)
# Seeding NumPy's global random number generator makes the "random" numbers reproducible:
# with the same seed, the same sequence of values is produced every time the notebook is re-run.

In [3]:
# 5 rows x 4 columns of random values; no labels are passed,
# so rows and columns both get default integer labels (0, 1, 2, ...).
df = pd.DataFrame(data=np.random.randn(5, 4))

In [4]:
df

Unnamed: 0,0,1,2,3
0,1.764052,0.400157,0.978738,2.240893
1,1.867558,-0.977278,0.950088,-0.151357
2,-0.103219,0.410599,0.144044,1.454274
3,0.761038,0.121675,0.443863,0.333674
4,1.494079,-0.205158,0.313068,-0.854096


In [5]:
# index   = row labels    ['A','B','C','D','E']
# columns = column labels ['W','X','Y','Z']
df = pd.DataFrame(
    data=np.random.randn(5, 4),
    index=['A', 'B', 'C', 'D', 'E'],
    columns=['W', 'X', 'Y', 'Z'],
)

In [6]:
df
# Each column of the DataFrame is a pandas Series.
# So W is a Series, as are X, Y and Z, and they all share the same index (A..E).
# A DataFrame is essentially a collection of Series that share one index.

Unnamed: 0,W,X,Y,Z
A,-2.55299,0.653619,0.864436,-0.742165
B,2.269755,-1.454366,0.045759,-0.187184
C,1.532779,1.469359,0.154947,0.378163
D,-0.887786,-1.980796,-0.347912,0.156349
E,1.230291,1.20238,-0.387327,-0.302303


### DataFrame indexing and slicing

In [7]:
df['W']

A   -2.552990
B    2.269755
C    1.532779
D   -0.887786
E    1.230291
Name: W, dtype: float64

In [8]:
type(df['W'])

pandas.core.series.Series

In [9]:
type(df)

pandas.core.frame.DataFrame

In [10]:
df.W
# Alternative (attribute-style) way to select column W; it returns the same Series as df['W'].
# NOTE: attribute access only works when the column name is a valid Python identifier
# and does not clash with an existing DataFrame attribute or method, so df['W'] is the safer form.

A   -2.552990
B    2.269755
C    1.532779
D   -0.887786
E    1.230291
Name: W, dtype: float64

In [11]:
df[['W','Z']]
# To select several columns, pass a list of column names inside the brackets
# (here: W and Z).
# The result is a DataFrame containing just those two Series.

Unnamed: 0,W,Z
A,-2.55299,-0.742165
B,2.269755,-0.187184
C,1.532779,0.378163
D,-0.887786,0.156349
E,1.230291,-0.302303


## creating a new column

In [12]:
# Assigning to a column name that does not exist yet creates a new column.
# Here 'new' is the element-wise sum of columns W and Y.
df['new']= df['W']+df['Y']

In [13]:
df

Unnamed: 0,W,X,Y,Z,new
A,-2.55299,0.653619,0.864436,-0.742165,-1.688554
B,2.269755,-1.454366,0.045759,-0.187184,2.315513
C,1.532779,1.469359,0.154947,0.378163,1.687727
D,-0.887786,-1.980796,-0.347912,0.156349,-1.235698
E,1.230291,1.20238,-0.387327,-0.302303,0.842964


## deleting a column from a dataframe

In [14]:
df.drop('new',axis=1)
# axis=0 (the default) means the label is looked up in the index, i.e. a row is dropped;
# axis=1 means the label is looked up in the columns, i.e. a column is dropped.
# Same axis numbering as in the NumPy exercises: summing along axis=0 gives one sum per column,
# summing along axis=1 gives one sum per row.
# drop() returns a new DataFrame; df itself is not modified by this call.

Unnamed: 0,W,X,Y,Z
A,-2.55299,0.653619,0.864436,-0.742165
B,2.269755,-1.454366,0.045759,-0.187184
C,1.532779,1.469359,0.154947,0.378163
D,-0.887786,-1.980796,-0.347912,0.156349
E,1.230291,1.20238,-0.387327,-0.302303


In [15]:
df
# Still unchanged: the previous drop() call returned a new DataFrame and its result was not stored,
# so the 'new' column is still present in df.

Unnamed: 0,W,X,Y,Z,new
A,-2.55299,0.653619,0.864436,-0.742165,-1.688554
B,2.269755,-1.454366,0.045759,-0.187184,2.315513
C,1.532779,1.469359,0.154947,0.378163,1.687727
D,-0.887786,-1.980796,-0.347912,0.156349,-1.235698
E,1.230291,1.20238,-0.387327,-0.302303,0.842964


In [16]:
# drop() returns a new DataFrame, so to actually remove the column from df
# we rebind df to the result.
# (Passing inplace=True would also remove it, but it brings no performance benefit and
# prevents method chaining, so reassignment is the preferred style.)
df = df.drop('new', axis=1)

In [17]:
df
# new column is completely removed

Unnamed: 0,W,X,Y,Z
A,-2.55299,0.653619,0.864436,-0.742165
B,2.269755,-1.454366,0.045759,-0.187184
C,1.532779,1.469359,0.154947,0.378163
D,-0.887786,-1.980796,-0.347912,0.156349
E,1.230291,1.20238,-0.387327,-0.302303


## removing rows

In [18]:
# Rows are dropped by index label along axis=0 (the default axis, spelled out here for clarity).
# The result is assigned back to df instead of using inplace=True.
df = df.drop('E', axis=0)

In [19]:
df

Unnamed: 0,W,X,Y,Z
A,-2.55299,0.653619,0.864436,-0.742165
B,2.269755,-1.454366,0.045759,-0.187184
C,1.532779,1.469359,0.154947,0.378163
D,-0.887786,-1.980796,-0.347912,0.156349


In [21]:
df.shape
# df.shape is a tuple: (number of rows, number of columns).
# Position 0 of the tuple holds the row count, which is why rows are referred to as axis 0;
# position 1 holds the column count, which is why columns are referred to as axis 1.
# The axis numbers used by pandas come directly from the positions in this tuple.

(4, 4)

## selecting rows

In [23]:
df.loc['A']
# .loc selects by label: here it returns row 'A' as a Series whose index is the column names.
# So not only the columns of a DataFrame are Series -- each row is a Series as well.

W   -2.552990
X    0.653619
Y    0.864436
Z   -0.742165
Name: A, dtype: float64

In [25]:
df.iloc[0]
# .iloc selects by integer position (0, 1, 2, 3, ...) regardless of the index labels,
# so position 0 returns the same row as the label 'A'.

W   -2.552990
X    0.653619
Y    0.864436
Z   -0.742165
Name: A, dtype: float64

## accessing a value by row and column label

In [26]:
df.loc['B','Y']
# .loc[row_label, column_label] returns the single value at that intersection:
# row label    : 'B'
# column label : 'Y'

0.04575851730144607

In [29]:
df.loc[['A','B'],['W','Y']]
# Passing a list of row labels and a list of column labels
# returns the sub-DataFrame made of exactly those rows and columns.

Unnamed: 0,W,Y
A,-2.55299,0.864436
B,2.269755,0.045759
