## <font color = 'red'>Dataframes in Pandas </font>

> ### _A DataFrame is like a table: rows plus named columns_

In [31]:
import numpy as np
import pandas as pd

In [32]:
# Build the demo frame from a dict of equal-length columns (one entry per country row).
# `columns=` spells out the column order explicitly.
df = pd.DataFrame(
    {
        'Population': [23.445, 44.445, 33.656, 66.565, 67.445],
        'GDP': [123456, 233443, 234234, 324324, 766767],
        'Surface Area': [342343, 23423424, 657567, 565464, 879878],
        'HDI': [0.556, 0.784, 0.556, 0.895, 0.578],
        'Continent': ['America', 'Europe', 'Europe', 'Europe', 'Asia'],
    },
    columns=['Population', 'GDP', 'Surface Area', 'HDI', 'Continent'],
)

#### _The `columns` argument is passed so that the columns keep the given order._

In [33]:
# Display the freshly built frame; the rows carry the default integer index.
df

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent
0,23.445,123456,342343,0.556,America
1,44.445,233443,23423424,0.784,Europe
2,33.656,234234,657567,0.556,Europe
3,66.565,324324,565464,0.895,Europe
4,67.445,766767,879878,0.578,Asia


In [38]:
# Replace the row labels with country names, one per row in row order.
df.index = ['Canada', 'France', 'Germany', 'Italy', 'China']

In [39]:
# Display the frame again: the rows are now labelled by country.
df

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent
Canada,23.445,123456,342343,0.556,America
France,44.445,233443,23423424,0.784,Europe
Germany,33.656,234234,657567,0.556,Europe
Italy,66.565,324324,565464,0.895,Europe
China,67.445,766767,879878,0.578,Asia


In [6]:
# Column labels of the frame, returned as a pandas Index.
df.columns

Index(['Population', 'GDP', 'Surface Area', 'HDI', 'Continent'], dtype='object')

In [7]:
 df.index  # row labels of the frame, returned as a pandas Index

Index(['Canada', 'France', 'Germany', 'Italy', 'China'], dtype='object')

In [8]:
# Print a concise summary: index entries, per-column non-null counts and dtypes, memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5 entries, Canada to China
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Population    5 non-null      float64
 1   GDP           5 non-null      int64  
 2   Surface Area  5 non-null      int64  
 3   HDI           5 non-null      float64
 4   Continent     5 non-null      object 
dtypes: float64(2), int64(2), object(1)
memory usage: 240.0+ bytes


In [11]:
 df.shape  # (number of rows, number of columns)

(5, 5)

In [10]:
# Display the whole frame for reference.
df

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent
Canada,23.445,123456,342343,0.556,America
France,44.445,233443,23423424,0.784,Europe
Germany,33.656,234234,657567,0.556,Europe
Italy,66.565,324324,565464,0.895,Europe
China,67.445,766767,879878,0.578,Asia


In [11]:
df.describe()  # summary statistics (count, mean, std, min, quartiles, max); only the numeric columns are included

Unnamed: 0,Population,GDP,Surface Area,HDI
count,5.0,5.0,5.0,5.0
mean,47.1112,336444.8,5173735.0,0.6738
std,19.622397,250870.861685,10203710.0,0.156529
min,23.445,123456.0,342343.0,0.556
25%,33.656,233443.0,565464.0,0.556
50%,44.445,234234.0,657567.0,0.578
75%,66.565,324324.0,879878.0,0.784
max,67.445,766767.0,23423420.0,0.895


In [12]:
df.dtypes  # data type of each column

Population      float64
GDP               int64
Surface Area      int64
HDI             float64
Continent        object
dtype: object

In [13]:
df.size  # total number of cells: rows * columns (5 * 5 = 25)

25

In [14]:
df.dtypes.value_counts()  # how many columns there are of each dtype

int64      2
float64    2
object     1
dtype: int64

![green-divider](https://user-images.githubusercontent.com/7065401/52071924-c003ad80-2562-11e9-8297-1c6595f8a7ff.png)

## Indexing, Slicing and Selection

In [40]:
# Display the frame that the selection examples operate on.
df

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent
Canada,23.445,123456,342343,0.556,America
France,44.445,233443,23423424,0.784,Europe
Germany,33.656,234234,657567,0.556,Europe
Italy,66.565,324324,565464,0.895,Europe
China,67.445,766767,879878,0.578,Asia


In [41]:
df.loc['Canada']  # .loc selects by label: the 'Canada' row is returned as a Series indexed by the column names

Population       23.445
GDP              123456
Surface Area     342343
HDI               0.556
Continent       America
Name: Canada, dtype: object

In [42]:
df.loc['Italy']  # label-based row selection (a horizontal slice of the table)

Population      66.565
GDP             324324
Surface Area    565464
HDI              0.895
Continent       Europe
Name: Italy, dtype: object

In [35]:
df.iloc[1]  # .iloc selects by integer position: this is the second row (horizontal slice)

Population        44.445
GDP               233443
Surface Area    23423424
HDI                0.784
Continent         Europe
Name: 1, dtype: object

In [36]:
df['Population']  # select one column by name: a Series with that column's value for every row (vertical slice)

0    23.445
1    44.445
2    33.656
3    66.565
4    67.445
Name: Population, dtype: float64

In [20]:
# Display the whole frame for reference.
df

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent
Canada,23.445,123456,342343,0.556,America
France,44.445,233443,23423424,0.784,Europe
Germany,33.656,234234,657567,0.556,Europe
Italy,66.565,324324,565464,0.895,Europe
China,67.445,766767,879878,0.578,Asia


In [21]:
# A single selected column is a Series; .to_frame() converts it to a one-column DataFrame.
df['Population'].to_frame()

Unnamed: 0,Population
Canada,23.445
France,44.445
Germany,33.656
Italy,66.565
China,67.445


In [22]:
df[['Population','GDP']]  # a list of column names selects several columns and returns a DataFrame

Unnamed: 0,Population,GDP
Canada,23.445,123456
France,44.445,233443
Germany,33.656,234234
Italy,66.565,324324
China,67.445,766767


In [23]:
df.iloc[0]  # first row, selected by position

Population       23.445
GDP              123456
Surface Area     342343
HDI               0.556
Continent       America
Name: Canada, dtype: object

In [43]:
df.loc['China']  # the row labelled 'China', selected by label

Population      67.445
GDP             766767
Surface Area    879878
HDI              0.578
Continent         Asia
Name: China, dtype: object

In [44]:
# Two labels separated by a comma form the single tuple key ('Population', 'Continent'),
# which is not a column of df, so the lookup raises KeyError. The error is the point of
# this cell, so it is caught and printed instead of halting a top-to-bottom run.
# To select several columns pass a list -- df[['Population', 'Continent']] -- which
# already returns a DataFrame, so no .to_frame() is needed.
try:
    df['Population','Continent'].to_frame()
except KeyError as err:
    print(f"KeyError: {err}")

KeyError: ('Population', 'Continent')

In [31]:
# Display the whole frame before the slicing examples.
df

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent
Canada,23.445,123456,342343,0.556,America
France,44.445,233443,23423424,0.784,Europe
Germany,33.656,234234,657567,0.556,Europe
Italy,66.565,324324,565464,0.895,Europe
China,67.445,766767,879878,0.578,Asia


In [32]:
df[1:3]  # an integer slice on [] selects rows by position; the upper limit is not included

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent
France,44.445,233443,23423424,0.784,Europe
Germany,33.656,234234,657567,0.556,Europe


In [33]:
 df['Canada' : 'Italy']  # a label slice on [] selects rows; unlike a positional slice, both endpoints are included

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent
Canada,23.445,123456,342343,0.556,America
France,44.445,233443,23423424,0.784,Europe
Germany,33.656,234234,657567,0.556,Europe
Italy,66.565,324324,565464,0.895,Europe


In [34]:
df.loc['Canada' : 'Italy' , 'Population'].to_frame()  # rows Canada..Italy (inclusive), one column; the Series is converted to a DataFrame

Unnamed: 0,Population
Canada,23.445
France,44.445
Germany,33.656
Italy,66.565


In [35]:
df.loc['Canada':'Italy', ['Population','GDP']]  # label row slice plus a list of columns: the result is a DataFrame

Unnamed: 0,Population,GDP
Canada,23.445,123456
France,44.445,233443
Germany,33.656,234234
Italy,66.565,324324


In [36]:
df.iloc[0].to_frame()  # first row as a one-column DataFrame; the column is named after the row label

Unnamed: 0,Canada
Population,23.445
GDP,123456
Surface Area,342343
HDI,0.556
Continent,America


In [37]:
df.iloc[-1]  # negative positions count from the end: this is the last row

Population      67.445
GDP             766767
Surface Area    879878
HDI              0.578
Continent         Asia
Name: China, dtype: object

In [38]:
df.iloc[[0,1, 2, -1]]  # a list of positions selects several rows at once (the first three and the last)

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent
Canada,23.445,123456,342343,0.556,America
France,44.445,233443,23423424,0.784,Europe
Germany,33.656,234234,657567,0.556,Europe
China,67.445,766767,879878,0.578,Asia


In [39]:
# Display the whole frame for reference.
df

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent
Canada,23.445,123456,342343,0.556,America
France,44.445,233443,23423424,0.784,Europe
Germany,33.656,234234,657567,0.556,Europe
Italy,66.565,324324,565464,0.895,Europe
China,67.445,766767,879878,0.578,Asia


In [40]:
df.iloc[0:-1]  # every row except the last one (the stop position is excluded)

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent
Canada,23.445,123456,342343,0.556,America
France,44.445,233443,23423424,0.784,Europe
Germany,33.656,234234,657567,0.556,Europe
Italy,66.565,324324,565464,0.895,Europe


In [41]:
df.iloc[0:3, [0,2,3,4]]  # first three rows; columns at positions 0, 2, 3 and 4 (GDP at position 1 is skipped)

Unnamed: 0,Population,Surface Area,HDI,Continent
Canada,23.445,342343,0.556,America
France,44.445,23423424,0.784,Europe
Germany,33.656,657567,0.556,Europe


In [42]:
df.iloc[0:3, 1:3]  # with iloc the upper limit is excluded in both the row slice and the column slice

Unnamed: 0,GDP,Surface Area
Canada,123456,342343
France,233443,23423424
Germany,234234,657567


> #### <font color ='green'>**RECOMMENDATION: Always use `iloc` and `loc` for selection, especially when the DataFrame has a numerical index.**</font>

In [44]:
df.iloc[0].to_frame()  # first row by position, converted from a Series to a one-column DataFrame

Unnamed: 0,Canada
Population,23.445
GDP,123456
Surface Area,342343
HDI,0.556
Continent,America


In [45]:
df.iloc[0:2,[0,1,4]]  # first two rows; columns at positions 0, 1 and 4

Unnamed: 0,Population,GDP,Continent
Canada,23.445,123456,America
France,44.445,233443,Europe


![purple-divider](https://user-images.githubusercontent.com/7065401/52071927-c1cd7100-2562-11e9-908a-dde91ba14e59.png)

## Conditional Selection (boolean Arrays)

> #### <font color='Gray'>As with Series in pandas, boolean selection also works with DataFrames</font>

In [47]:
# Display the frame that the boolean-selection examples operate on.
df

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent
Canada,23.445,123456,342343,0.556,America
France,44.445,233443,23423424,0.784,Europe
Germany,33.656,234234,657567,0.556,Europe
Italy,66.565,324324,565464,0.895,Europe
China,67.445,766767,879878,0.578,Asia


In [53]:
df['Population'] > 50  # element-wise comparison: a boolean Series (True where the condition holds) that shares df's index

Canada     False
France     False
Germany    False
Italy       True
China       True
Name: Population, dtype: bool

#### **The index of the boolean result above matches the index of the main DataFrame, although they are two different objects.**

In [19]:
df.loc[df['Population'] > 50 , ['Population','GDP','Continent']]  # the boolean mask keeps the matching rows; the list picks the columns

Unnamed: 0,Population,GDP,Continent
3,66.565,324324,Europe
4,67.445,766767,Asia


In [23]:
# Display the whole frame for reference.
df

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent
0,23.445,123456,342343,0.556,America
1,44.445,233443,23423424,0.784,Europe
2,33.656,234234,657567,0.556,Europe
3,66.565,324324,565464,0.895,Europe
4,67.445,766767,879878,0.578,Asia


![purple-divider](https://user-images.githubusercontent.com/7065401/52071927-c1cd7100-2562-11e9-908a-dde91ba14e59.png)

## Conditional Selection (boolean Arrays)

> #### <font color='Gray'>As with Series in pandas, boolean selection also works with DataFrames</font>

In [25]:
# Display the whole frame for reference.
df

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent
0,23.445,123456,342343,0.556,America
1,44.445,233443,23423424,0.784,Europe
2,33.656,234234,657567,0.556,Europe
3,66.565,324324,565464,0.895,Europe
4,67.445,766767,879878,0.578,Asia


In [26]:
df.iloc[-1]  # last row, selected by position

Population      67.445
GDP             766767
Surface Area    879878
HDI              0.578
Continent         Asia
Name: 4, dtype: object

In [46]:
df.loc['Canada']  # the row labelled 'Canada', selected by label

Population       23.445
GDP              123456
Surface Area     342343
HDI               0.556
Continent       America
Name: Canada, dtype: object

In [47]:
# Display the whole frame for reference.
df

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent
Canada,23.445,123456,342343,0.556,America
France,44.445,233443,23423424,0.784,Europe
Germany,33.656,234234,657567,0.556,Europe
Italy,66.565,324324,565464,0.895,Europe
China,67.445,766767,879878,0.578,Asia


In [49]:
df.iloc[0]  # first row, selected by position

Population       23.445
GDP              123456
Surface Area     342343
HDI               0.556
Continent       America
Name: Canada, dtype: object

![purple-divider](https://user-images.githubusercontent.com/7065401/52071927-c1cd7100-2562-11e9-908a-dde91ba14e59.png)

## Dropping Rows and Columns

> #### <font color='Gray'>Instead of selecting the values to keep, we can point to the ones to drop (`drop` returns a new DataFrame; it does not remove the elements from the original DataFrame)</font>

In [50]:
# Display the frame before any drop() call.
df

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent
Canada,23.445,123456,342343,0.556,America
France,44.445,233443,23423424,0.784,Europe
Germany,33.656,234234,657567,0.556,Europe
Italy,66.565,324324,565464,0.895,Europe
China,67.445,766767,879878,0.578,Asia


In [58]:
# drop() returns a new DataFrame; the result is deliberately discarded here ...
df.drop(['Canada'])
# ... so displaying df shows that the 'Canada' row is still present in the original.
df

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent
Canada,23.445,123456,342343,0.556,America
France,44.445,233443,23423424,0.784,Europe
Germany,33.656,234234,657567,0.556,Europe
Italy,66.565,324324,565464,0.895,Europe
China,67.445,766767,879878,0.578,Asia


In [60]:
df.drop(['Canada','Italy'])  # a list of row labels drops several rows; the returned frame is displayed

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent
France,44.445,233443,23423424,0.784,Europe
Germany,33.656,234234,657567,0.556,Europe
China,67.445,766767,879878,0.578,Asia


In [63]:
df.drop(columns=['HDI'])  # columns= drops by column label instead of by row label

Unnamed: 0,Population,GDP,Surface Area,Continent
Canada,23.445,123456,342343,America
France,44.445,233443,23423424,Europe
Germany,33.656,234234,657567,Europe
Italy,66.565,324324,565464,Europe
China,67.445,766767,879878,Asia


In [65]:
# axis=0 states explicitly that the labels refer to rows.
df.drop(labels=['Canada', 'Italy'], axis=0)

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent
France,44.445,233443,23423424,0.784,Europe
Germany,33.656,234234,657567,0.556,Europe
China,67.445,766767,879878,0.578,Asia


In [67]:
# axis=1 makes the labels refer to columns instead of rows.
df.drop(labels=['HDI', 'Continent'], axis=1)

Unnamed: 0,Population,GDP,Surface Area
Canada,23.445,123456,342343
France,44.445,233443,23423424
Germany,33.656,234234,657567
Italy,66.565,324324,565464
China,67.445,766767,879878


In [69]:
df.drop(['Canada','Italy'], axis='rows')  # axis can also be given by name; 'rows' drops by row label, same result as axis=0

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent
France,44.445,233443,23423424,0.784,Europe
Germany,33.656,234234,657567,0.556,Europe
China,67.445,766767,879878,0.578,Asia
