# Pandas - DATAFRAME

In [1]:
import pandas as pd

In [3]:
import numpy as np

### DataFrame is a tabular data structure that contain rows and columns, rows will have index and columns can have meaningful data

#### let's look at some examples of creating a dataframe, using numpy arrays, lists, dictionaries etc and directly.

In [6]:
temp = np.random.randint(low = 20, high = 100, size = [20,])

In [7]:
name = np.random.choice(['Mumbai', 'Delhi', 'Chennai', "Kolkata", 'Bangaluru'], 20)
random = np.random.choice([10,20,30,40,50], 20)

#### Now we are going to zip all the above variables into a list

In [8]:
a_zp = list(zip(temp, name, random))

In [9]:
a_zp

[(38, 'Mumbai', 40),
 (36, 'Bangaluru', 50),
 (89, 'Kolkata', 50),
 (89, 'Bangaluru', 50),
 (68, 'Chennai', 10),
 (57, 'Kolkata', 10),
 (28, 'Bangaluru', 40),
 (96, 'Chennai', 10),
 (87, 'Mumbai', 10),
 (86, 'Kolkata', 30),
 (47, 'Mumbai', 30),
 (71, 'Chennai', 50),
 (61, 'Mumbai', 10),
 (62, 'Delhi', 20),
 (38, 'Mumbai', 30),
 (93, 'Chennai', 20),
 (51, 'Kolkata', 50),
 (60, 'Delhi', 40),
 (58, 'Bangaluru', 30),
 (97, 'Chennai', 50)]

#### Creating a Dataframe from above list

In [10]:
a_df = pd.DataFrame(data = a_zp, columns = ['temp', 'name', 'random'])

In [11]:
a_df

Unnamed: 0,temp,name,random
0,38,Mumbai,40
1,36,Bangaluru,50
2,89,Kolkata,50
3,89,Bangaluru,50
4,68,Chennai,10
5,57,Kolkata,10
6,28,Bangaluru,40
7,96,Chennai,10
8,87,Mumbai,10
9,86,Kolkata,30


In [12]:
type(a_df)

pandas.core.frame.DataFrame

#### how to create Dataframe from dictionaries

In [13]:
a_df = pd.DataFrame({'random':random, 'name':name, 'temp':temp})

In [14]:
a_df

Unnamed: 0,random,name,temp
0,40,Mumbai,38
1,50,Bangaluru,36
2,50,Kolkata,89
3,50,Bangaluru,89
4,10,Chennai,68
5,10,Kolkata,57
6,40,Bangaluru,28
7,10,Chennai,96
8,10,Mumbai,87
9,30,Kolkata,86


##### There are another ways to create a dataframes from various sources such as CSV, text files, Excel sheets etc

#### first let's perform some minor operations over dataframe

In [15]:
a_df.head()  # Outputs first 5 rows of the dataframe

Unnamed: 0,random,name,temp
0,40,Mumbai,38
1,50,Bangaluru,36
2,50,Kolkata,89
3,50,Bangaluru,89
4,10,Chennai,68


In [16]:
a_df.tail()  # Outputs last 5 rows of the dataframe

Unnamed: 0,random,name,temp
15,20,Chennai,93
16,50,Kolkata,51
17,40,Delhi,60
18,30,Bangaluru,58
19,50,Chennai,97


In [17]:
a_df.shape # check dimensions of the dataframe

(20, 3)

In [18]:
a_df.columns # Checking columns present in the Dataframe, 

Index(['random', 'name', 'temp'], dtype='object')

#### Retrieving all values of a particular column present in a dataframe

In [19]:
a_df.name # just mention the column-name we desire to retrieve values from

0        Mumbai
1     Bangaluru
2       Kolkata
3     Bangaluru
4       Chennai
5       Kolkata
6     Bangaluru
7       Chennai
8        Mumbai
9       Kolkata
10       Mumbai
11      Chennai
12       Mumbai
13        Delhi
14       Mumbai
15      Chennai
16      Kolkata
17        Delhi
18    Bangaluru
19      Chennai
Name: name, dtype: object

In [20]:
a_df['random'] # more common way for getting values of a particular column

0     40
1     50
2     50
3     50
4     10
5     10
6     40
7     10
8     10
9     30
10    30
11    50
12    10
13    20
14    30
15    20
16    50
17    40
18    30
19    50
Name: random, dtype: int64

## Statistical Information (summary statistics like in R)

In [23]:
a_df.describe()

Unnamed: 0,random,temp
count,20.0,20.0
mean,31.5,65.6
std,15.985191,22.046363
min,10.0,28.0
25%,17.5,50.0
50%,30.0,61.5
75%,50.0,87.5
max,50.0,97.0


#### statistical information of a particular column

In [27]:
a_df.temp.describe() # or a_df['temp'].describe()

count    20.000000
mean     65.600000
std      22.046363
min      28.000000
25%      50.000000
50%      61.500000
75%      87.500000
max      97.000000
Name: temp, dtype: float64

#### General information about a dataframe using .info() function

In [28]:
a_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20 entries, 0 to 19
Data columns (total 3 columns):
random    20 non-null int64
name      20 non-null object
temp      20 non-null int64
dtypes: int64(2), object(1)
memory usage: 560.0+ bytes


#### Outputting a dataframe in a different format / way

In [29]:
a_df.values

array([[40, 'Mumbai', 38],
       [50, 'Bangaluru', 36],
       [50, 'Kolkata', 89],
       [50, 'Bangaluru', 89],
       [10, 'Chennai', 68],
       [10, 'Kolkata', 57],
       [40, 'Bangaluru', 28],
       [10, 'Chennai', 96],
       [10, 'Mumbai', 87],
       [30, 'Kolkata', 86],
       [30, 'Mumbai', 47],
       [50, 'Chennai', 71],
       [10, 'Mumbai', 61],
       [20, 'Delhi', 62],
       [30, 'Mumbai', 38],
       [20, 'Chennai', 93],
       [50, 'Kolkata', 51],
       [40, 'Delhi', 60],
       [30, 'Bangaluru', 58],
       [50, 'Chennai', 97]], dtype=object)

### IF we look at the dataframe above we see the index is generated automatically for us, but if want to change / or convert a column to be as an index for our dataframe, this is possible using set_index() function

In [31]:
a_df.set_index('temp')

Unnamed: 0_level_0,random,name
temp,Unnamed: 1_level_1,Unnamed: 2_level_1
38,40,Mumbai
36,50,Bangaluru
89,50,Kolkata
89,50,Bangaluru
68,10,Chennai
57,10,Kolkata
28,40,Bangaluru
96,10,Chennai
87,10,Mumbai
86,30,Kolkata


##### The above change is Temporary, let's output a_df, we see

In [32]:
a_df

Unnamed: 0,random,name,temp
0,40,Mumbai,38
1,50,Bangaluru,36
2,50,Kolkata,89
3,50,Bangaluru,89
4,10,Chennai,68
5,10,Kolkata,57
6,40,Bangaluru,28
7,10,Chennai,96
8,10,Mumbai,87
9,30,Kolkata,86


### To make a change Permanent to a dataframe, as in the case with .set_index() above, we need to supply / add inplace parameter to the code, by default inplace = False

In [33]:
a_df.set_index('temp', inplace = True)

In [34]:
a_df

Unnamed: 0_level_0,random,name
temp,Unnamed: 1_level_1,Unnamed: 2_level_1
38,40,Mumbai
36,50,Bangaluru
89,50,Kolkata
89,50,Bangaluru
68,10,Chennai
57,10,Kolkata
28,40,Bangaluru
96,10,Chennai
87,10,Mumbai
86,30,Kolkata


#### Sort the dataframe by index values

In [35]:
a_df.sort_index(axis = 0, ascending = True)

Unnamed: 0_level_0,random,name
temp,Unnamed: 1_level_1,Unnamed: 2_level_1
28,40,Bangaluru
36,50,Bangaluru
38,40,Mumbai
38,30,Mumbai
47,30,Mumbai
51,50,Kolkata
57,10,Kolkata
58,30,Bangaluru
60,40,Delhi
61,10,Mumbai


#### Sort the dataframe with respect to column / by column values

In [36]:
a_df.sort_values(by = 'random', ascending = False)

Unnamed: 0_level_0,random,name
temp,Unnamed: 1_level_1,Unnamed: 2_level_1
97,50,Chennai
36,50,Bangaluru
89,50,Kolkata
89,50,Bangaluru
51,50,Kolkata
71,50,Chennai
60,40,Delhi
38,40,Mumbai
28,40,Bangaluru
86,30,Kolkata


In [39]:
a_df.drop(['random'])

KeyError: "['random'] not found in axis"

#### note to self: we get this error because random is a column not a row, imp to know that axis by default is 0, to drop the column we must change the axis to 1 (axis = 0 is row and when axis = 1 it is column)

In [40]:
a_df.drop('random', axis=1) # or we can write this like a_df.drop(['random'], axis=1)

Unnamed: 0_level_0,name
temp,Unnamed: 1_level_1
38,Mumbai
36,Bangaluru
89,Kolkata
89,Bangaluru
68,Chennai
57,Kolkata
28,Bangaluru
96,Chennai
87,Mumbai
86,Kolkata


In [43]:
#remember still not a permanent change
a_df.head()

Unnamed: 0_level_0,random,name
temp,Unnamed: 1_level_1,Unnamed: 2_level_1
38,40,Mumbai
36,50,Bangaluru
89,50,Kolkata
89,50,Bangaluru
68,10,Chennai


## iloc[ ] and loc[ ]  (Selecting data)

#### we have seen how to access element by providing indexes, similarly we can access elements of a dataframe. But a more cleaner way for achieving this is using iloc[] and loc[]

In [48]:
a_df.iloc[[0,1]]  # Row indexes

Unnamed: 0_level_0,random,name
temp,Unnamed: 1_level_1,Unnamed: 2_level_1
38,40,Mumbai
36,50,Bangaluru


In [53]:
a_df.iloc[1:3,1]  # 1 to 3 row indexes from column index 1

temp
36    Bangaluru
89      Kolkata
Name: name, dtype: object

#### using Boolean, True will select the value at the corresponding index and False will omit it, till we have mentioned, but after that every index for which we have not supplied any bool will be considered False automatically till the end of the dataframe and omitted

In [54]:
a_df.iloc[[True, False, True, True, False]]

Unnamed: 0_level_0,random,name
temp,Unnamed: 1_level_1,Unnamed: 2_level_1
38,40,Mumbai
89,50,Kolkata
89,50,Bangaluru


### loc[ ] , here we SELECT using the VALUES of the Index itself

In [59]:
a_df.loc[38]

Unnamed: 0_level_0,random,name
temp,Unnamed: 1_level_1,Unnamed: 2_level_1
38,40,Mumbai
38,30,Mumbai


In [60]:
a_df.loc[[38,89,57,61]]

Unnamed: 0_level_0,random,name
temp,Unnamed: 1_level_1,Unnamed: 2_level_1
38,40,Mumbai
38,30,Mumbai
89,50,Kolkata
89,50,Bangaluru
57,10,Kolkata
61,10,Mumbai


#### If we want to select values from particular columns, we can specify column names and or column range

In [72]:
a_df.loc[[38,89,61],['name']]

Unnamed: 0_level_0,name
temp,Unnamed: 1_level_1
38,Mumbai
38,Mumbai
89,Kolkata
89,Bangaluru
61,Mumbai


In [75]:
a_df.loc[[39,89,61], 'random':'name'] # index value 39 is not present in data so NaN

Unnamed: 0_level_0,random,name
temp,Unnamed: 1_level_1,Unnamed: 2_level_1
39,,
89,50.0,Kolkata
89,50.0,Bangaluru
61,10.0,Mumbai


In [76]:
a_df.loc[[38,89,61], 'random':'name']

Unnamed: 0_level_0,random,name
temp,Unnamed: 1_level_1,Unnamed: 2_level_1
38,40,Mumbai
38,30,Mumbai
89,50,Kolkata
89,50,Bangaluru
61,10,Mumbai


In [78]:
a_df.loc[[True, False, True, True, False, True, True], ['random']]

Unnamed: 0_level_0,random
temp,Unnamed: 1_level_1
38,40
89,50
89,50
57,10
28,40


### Within loc[], we can also define some conditions

In [86]:
a_df.loc[a_df.name == "Mumbai"]

Unnamed: 0_level_0,random,name
temp,Unnamed: 1_level_1,Unnamed: 2_level_1
38,40,Mumbai
87,10,Mumbai
47,30,Mumbai
61,10,Mumbai
38,30,Mumbai


#### combine condtional logic

In [88]:
a_df.loc[(a_df.random > 15) & (a_df.name == "Mumbai"), : ]

Unnamed: 0_level_0,random,name
temp,Unnamed: 1_level_1,Unnamed: 2_level_1
38,40,Mumbai
47,30,Mumbai
38,30,Mumbai
