In [None]:
import numpy as np;
import pandas as pd;

### Note : 
Think of a Series as a Java Map whose values all share one data type, e.g. ```Map<String,Integer>, Map<String,String>```,
and whose keys are what pandas calls the index.
A DataFrame is a collection of Series (each Series with its own value data type) that all share the same (or a similar) index.
In the Java analogy, a DataFrame is a group of Maps that have the same keys but different value data types; each column in the DataFrame represents one Map (more specifically, the values in that Map).

__Java representation__

```java
Map<String,Float> population = new HashMap<String, Float>(){
		{
			put("Mumbai",1.84f);
			put("Delhi",1.9f);
			put("Pune",0.31f);
		}
	};
	
	Map<String,Integer> area = new HashMap<String, Integer>(){
		{
			put("Mumbai",603);
			put("Delhi",1484);
			put("Pune",331);
		}
	};
	
	Map<String,String> state = new HashMap<String, String>(){
		{
			put("Mumbai","Maharashtra");
			put("Delhi","Central");
			put("Pune","Maharashtra");
		}
	};
```

__Python representation:__

```python
cities = {
    'population':{'Delhi':1.9,'Pune':0.31,'Mumbai':1.84},
    'area':{'Delhi':1484,'Pune':331,'Mumbai':603},
    'state':{'Delhi':'Central','Pune':'Maharashtra','Mumbai':'Maharashtra'}
}
```

In the MS Excel analogy, a DataFrame is a worksheet, the index is the index column (the left-most bold column ;)) and the remaining columns are Excel columns, each holding values of a single data type.


__Excel representation:__



city|area|	population	|state
-|-|-|-
Delhi|1484|1.90|Central
Mumbai|603|1.84|Maharashtra
Pune|331|0.31|Maharashtra


__Database representation:__

<pre>
CREATE TABLE CITIES(
city VARCHAR,
area NUMBER,
population NUMBER,
state VARCHAR
);
</pre>

## DDL

### DataFrame creation

In [None]:
# Naming a series: the name can be supplied directly when the series is built
pune = pd.Series({'apple': 30, 'mango': 45, 'banana': 67}, name="Pune")
print(pune)
print(pune.name)

####  concatenating multiple series

In [None]:

# One series of fruit prices per city, all keyed by the same fruit names
mumbai = pd.Series({'apple': 30, 'mango': 45, 'banana': 67})
kolkata = pd.Series({'apple': 32, 'mango': 90, 'banana': 34})
delhi = pd.Series({'apple': 20, 'mango': 94, 'banana': 45})

# Concatenate along the column axis so every series becomes one column;
# along the default row axis the contents would be merged into a single series
fruits_df = pd.concat([mumbai, kolkata, delhi], axis='columns')

print(type(fruits_df))

fruits_df

In [None]:
# Assign column names on the already-built frame (one label per existing column, in order)
fruits_df.columns = ['Mumbai','Kolkata','Delhi']

mumbai.name = "MUMBAI" # Renaming the source series now has no effect: the dataFrame has already been created

fruits_df

In [None]:
# Another way to assign column names : assign names to series BEFORE concatenation
# (each series' name is then used as its column label)


mumbai.name = "Mumbai"
kolkata.name = "Kolkata"
delhi.name = "Delhi"

pd.concat([mumbai,delhi,kolkata],axis=1) # columns follow the list order: Mumbai, Delhi, Kolkata

#### From list of series : when it (outer) is a list, every element in the list is a row

In [None]:
pd.DataFrame([mumbai,kolkata]) # Each series becomes a row; the series' index labels become the columns

#### From list of lists : when it (outer) is a list, every element in the list is a row

In [None]:
# Each inner list is one row: with the punctuation (, [ ]) removed, the literal already looks like the DataFrame.
# No labels are supplied here, so pandas falls back to default positional labels for rows and columns.

temperatures = [
    [20,32,32],# Row1
    [35,40,40],# Row2
    [10,10,30],# Row3
]

pd.DataFrame(temperatures)

In [None]:
# From random array: 28 distinct numbers drawn from 1..31, laid out as 7 weekdays x 4 weeks.
# A seeded generator is used so the table is identical on every "Restart & Run All".
random_days_frame = pd.DataFrame(
    np.random.default_rng(42).choice(np.arange(1, 32), size=(7, 4), replace=False),
    index=['S','M','T','W','T','F','S'],  # weekday initials; note that index labels may repeat
    columns=['week ' + str(i) for i in range(4)]  # a concrete list rather than a one-shot generator
)

random_days_frame

#### From list of dictionaries : when it (outer) is a list, every element in the list is a row

In [None]:
# This is a little strange: the outer container is a list, so every dictionary is still one ROW
# The array would look like the DataFrame when punctuations(,[,]) are removed
# Column names are derived from the keys in the dictionaries

temperatures = [# Columns are named
    {'Bengaluru':20,'Delhi':32,'Mumbai':32}, # Row1
    {'Bengaluru':35,'Delhi':40,'Mumbai':40}, # Row2
    {'Bengaluru':10,'Delhi':10,'Mumbai':30}, # Row3
]

pd.DataFrame(temperatures)

#### From dictionary of lists : when it (outer) is a dictionary, every key is a column

In [None]:
# Column names come from the keys of the dictionary; each list holds that column's values, top to bottom

temperatures = {
    "Bengaluru":[20,35,10],# Column1
    "Delhi":[32,40,10], # Column2
    "Mumbai":[32,40,30], # Column3
}

pd.DataFrame(temperatures)

#### From dictionary of dictionaries : when it (outer) is a dictionary, every key is a column

In [None]:
# Column names come from the keys of the outer dictionary and the index from the keys of the inner dictionaries
# Logically most dictionaries have same data-type elements, so analogous to an excel/db column

temperatures = {
    "Bengaluru" : {'rain':20,'summer':35,'winter':10}, # Column1
    "Delhi" : {'rain':32,'summer':40,'winter':10}, # Column2
    "Mumbai" : {'rain':32,'summer':40,'winter':30}, # Column3
}

pd.DataFrame(temperatures)

### Index and Column arrangement

#### Reorder the columns/index while frame creation

In [None]:
temperatures = {
    "Bengaluru" : {'rain':20,'summer':35,'winter':10}, # Column1
    "Delhi" : {'rain':32,'summer':40,'winter':10}, # Column2
    "Mumbai" : {'rain':32,'summer':40,'winter':30}, # Column3
}

# Columns and index are already named in the data, so index=/columns= only select and re-order them;
# a label that does not exist in the data produces an all-NaN column (or row)
temperatures_frame = pd.DataFrame(temperatures
             ,index=['winter','rain','summer']
             ,columns=['Mumbai','Delhi','Bengaluru']
)

temperatures_frame

#### Name the columns/index while frame creation

In [None]:
# One inner list per city (row), twelve monthly values each
temperatures = [
    [27,33,35,42,41,43,34,35,37,41,26,23],
    [9,10,35,42,41,43,34,35,37,38,12,8],
    [9,10,30,34,36,35,30,28,22,27,12,8]
]


# Note : The data here is an unlabeled list of lists, so index= and columns= NAME the rows and columns.
# (For list-of-dict, dict-of-list and dict-of-dict inputs the columns/index may already be named;
# in those cases the same arguments only select/re-order during creation.)
temperatures_frame = pd.DataFrame(temperatures
             ,columns=['Jan','Feb','Mar','Apr','May','Jun','Jul','Aug','Sep','Oct','Nov','Dec']
             ,index=['Mumbai','Delhi','Bengaluru']
            
)

temperatures_frame




#### Reindexing : select, drop and/or re-order the columns/index

In [None]:
# Re-indexing : Change/Drop the index/columns and/or change their order

# This gives out a new dataFrame; temperatures_frame itself is left untouched
temperatures_frame.reindex(columns=['Dec','May','Jul','Oct'],index=['Bengaluru','Mumbai'])



In [None]:
# Same result as the reindex call: building a DataFrame from an existing frame with
# index=/columns= selects and re-orders the labels and also returns a new dataFrame

pd.DataFrame(temperatures_frame,columns=['Dec','May','Jul','Oct'],index=['Bengaluru','Mumbai'])

#### Renaming column/index : provide maps (old=>new) for renaming

In [None]:
# Each map is old label => new label; inplace=True changes temperatures_frame itself instead of returning a copy
temperatures_frame.rename(index={'Mumbai':'Mum'},columns={'Jan':'January','Oct':'October'},inplace=True)
temperatures_frame

#### Existing column as index 

In [None]:

# Dictionary of lists: every key is a column, and 'name' is (for now) just another column
cities = {
    'name':['Delhi','Pune','Mumbai'],
    'population':[1.9,0.31,1.84],
    'area':[1484,331,603],
    'state':['Central','Maharashtra','Maharashtra']
}


In [None]:
pd.DataFrame(cities) # no index given: rows get default positional labels and 'name' stays an ordinary column

In [None]:
# A nice idea to use an existing column as index
# The remaining columns are listed in their original order. (A set difference such as
# cities.keys()-['name'] has no defined order, and newer pandas versions refuse a set for `columns`.)

pd.DataFrame(cities,
             index=cities['name'], # This gives all the city names
             columns=[key for key in cities if key != 'name'] # Exclude the 'name' column as it is already the index
            )

In [None]:
# A simpler way to promote an existing column to the index: build the frame, then set_index on it

cities_frame = pd.DataFrame(cities).set_index('name')
cities_frame

#### Insert new columns 

In [None]:
# alter table add column region 
# Work on a deep copy so cities_frame keeps its original columns; insert() modifies the copy in place
cities_with_regions = cities_frame.copy(deep=True)
cities_with_regions.insert(1,'region',None) # position 1 = second column; every row gets None
cities_with_regions

In [None]:
# alter table add column it default True
cities_with_it = cities_frame.copy(deep=True)
cities_with_it.insert(2,'it',True) # the scalar True is broadcast to every row
cities_with_it

In [None]:
# new column value derived from an existing column
# alter table add column "readable population" default (population || ' lacs')
cities_with_readable_population = cities_frame.copy(deep=True)
cities_with_readable_population.insert(
    loc=3,
    column='readable population',
    value = cities_with_readable_population.population.apply(lambda p : str(p)+' lacs')
)
cities_with_readable_population

## DQL

In [None]:
# Data set used by all the queries in this section.
# The frame is bound to cities_frame so the queries below run against exactly the table
# displayed here, rather than against whatever cities_frame an earlier section left behind.
cities = {
    'population':[1.9,0.31,1.84],
    'area':[1484,331,603],
    'state':['Delhi','Maharashtra','Maharashtra']
}

cities_frame = pd.DataFrame(cities,index=['Delhi','Pune','Mumbai'])
cities_frame



In [None]:
# Each column is a series (indexed by the frame's row labels)

print(type(cities_frame.area))
cities_frame.area


In [None]:
# Each row (index label) is also a series, whose own index is the frame's column names

print(type(cities_frame.loc['Delhi']))
print(cities_frame.loc['Delhi'].index)
cities_frame.loc['Delhi']

### Note :  
It is important to understand that operations on a dataFrame give back another dataFrame or Series, so further operations can be applied to the result (chaining).

#### Projection

In [None]:
cities_frame['area'] # select area from cities

In [None]:
cities_frame.area # same as above, using attribute access instead of brackets

In [None]:
cities_frame[['area','population']] # select area,population from cities (a list of columns gives a dataFrame)

In [None]:
# Apply a function to every value of one column : here apply is called on the series, not on the dataFrame
cities_frame.population.apply(np.log10) # select log10(population) from cities

In [None]:
# population density
# select population*100000/area from cities where index in ('Delhi','Pune')
# The arithmetic is element-wise on two series sharing the same index; the trailing [[...]] picks rows by label
(cities_frame.population*100000/cities_frame.area)[['Delhi','Pune']]

### Note : 
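apply : with <code>axis=1</code> (as used in the examples below) the provided function works on each index (row) of the dataFrame; with the default <code>axis=0</code> it works on each column instead.
<ul>
<li>if the result of the operation on each index is a scalar, 'apply' outputs a series</li>
<li>if the result of the operation on each index is a series, 'apply' outputs a dataFrame in which each returned series becomes one row</li>
<li>if the result of the operation on each index is a list, the outcome depends on the pandas version: older releases turned a list of the same cardinality as the row (i.e. the number of columns) into a dataFrame, whereas pandas 0.23 and later keep the lists as the values of a series unless <code>result_type='expand'</code> is passed</li>
</ul>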

In [None]:
# Upper-case every string cell, one row at a time.
# The outer lambda returns the modified row (a series), so the final output of apply is a dataFrame.
# isinstance() is the idiomatic type test; unlike type(value)==str it also accepts str subclasses.
cities_frame.apply(
    lambda row : row.apply(lambda value : value.upper() if isinstance(value, str) else value),
    axis=1
)

In [None]:
# With axis=1 the lambda is called once per row (Delhi, Pune, Mumbai) and returns a 3-element list
# NOTE(review): the length that matters is presumably the number of COLUMNS in the row, not the
# number of cities, and recent pandas versions keep such lists as series values unless
# result_type='expand' is passed -- confirm against the installed pandas version
cities_frame.apply(lambda series : [0,0,None],axis=1)

In [None]:
# Here the returned list has 4 elements, so its length differs from the one returned in the 3-element example
# NOTE(review): whether this yields a series of lists or an error/expansion depends on the
# pandas version and on result_type -- confirm against the installed pandas version
cities_frame.apply(lambda series : [0,0,0,0],axis=1)

In [None]:
# Population density again, this time computed row by row through apply
# select population*100000/area from cities where index in ('Delhi','Pune')
# The lambda returns a scalar per row, so apply yields a series that is then filtered by label
cities_frame.apply(lambda series : series.population*100000/series.area,axis=1)[['Delhi','Pune']]

In [None]:
# select area||' sq.kms',population||' lakhs' from cities
# NOTE(review): each row here mixes an integer column (area) with a float column (population),
# so the row series is presumably upcast to float and area may print as e.g. '1484.0' -- confirm

cities_frame[['area','population']] \
.apply(lambda series : [str(series.area)+' sq.kms',str(series.population)+' lakhs'],axis=1)

In [None]:
# The lambda returns one scalar per row, so the result of apply is a series
cities_frame[['area','population']]  \
.apply(lambda series : series.area,axis=1)

#### Selection (where clause) 

In [None]:
cities_frame.loc['Delhi']  # Where index='Delhi' (a single label returns that row as a series)

In [None]:
cities_frame.loc[['Delhi','Pune']] # Where index in ('Delhi','Pune') (a list of labels returns a dataFrame)

In [None]:
cities_frame.loc[cities_frame.area>600] # Where area>600. Note : cities_frame.area>600 is a bool series used as a row mask

#### Projection + Selection

In [None]:
# Projection first, then selection
cities_frame[['state','population']].loc[cities_frame.area>600] # select state,population from cities where area>600

In [None]:
# Selection first, then projection
cities_frame.loc[cities_frame.area>600][['state','population']] # select state,population from cities where area>600

In [None]:
# select area from cities where name = 'Delhi'
cities_frame.loc['Delhi','area'] # Note : the first param is the index (row) label and the second is the column label

In [None]:
# select area from cities where name = 'Delhi'
# 'at' takes exactly one row label and one column label and returns the single value
cities_frame.at['Delhi','area']

#### Note : Use 'at' if you only need to get or set a single value in a DataFrame or Series.
#### 'loc', on the other hand, can be used to access a single value but also to access a group 
#### of rows and columns by a label or labels.
#### When it comes to speed for single-value access, the answer is clear: prefer 'at'.

In [None]:
# select area from cities where name in ('Delhi','Mumbai')
cities_frame.loc[['Delhi','Mumbai'],'area']

In [None]:
# select state, population from cities where area > 600
# Row mask and column list in a single .loc call
cities_frame.loc[cities_frame.area>600,['state','population']]

In [None]:
# Nested query
# select * from (select * from cities where area>600) where state='Maharashtra'
# The inner filter runs first. The callable given to the second .loc receives that filtered
# frame, so the outer mask is built from the inner result itself instead of from the full,
# unfiltered frame (which would only work through implicit index alignment).
cities_frame.loc[cities_frame.area>600].loc[lambda big_cities : big_cities.state=='Maharashtra']

#### Order by

In [None]:
# select state,population from cities order by state desc, population asc
# 'ascending' is given per sort key, in the same order as 'by'

cities_frame.sort_values(by=['state','population'],ascending=[False,True])[['state','population']]

#### Aggregation

In [None]:
cities_frame.area.sum() # select sum(area) from cities (aggregating a series gives a scalar)

In [None]:
cities_frame.population.mean() # select avg(population) from cities

In [None]:
# Two independent aggregates; the cell displays them together as a tuple
cities_frame.area.max(),cities_frame.population.min() # select max(area),min(population) from cities

## DML

#### Updates

In [None]:
# Rebuild cities_frame for the update examples.
# 'type', 'tier' and 'old_name' are requested as columns although the update cells fill them in
# only later; columns with no matching key in `cities` start out empty (NaN)
cities_frame = pd.DataFrame(
    cities,
    index=['Delhi','Pune','Mumbai'],
    columns=['area','population','state','type','tier','old_name']
)
cities_frame

In [None]:
# update cities set type = 'Unassigned'
# A single scalar on the right-hand side is written to every row of the column
cities_frame['type'] = 'Unassigned'
cities_frame

In [None]:
# Update a column from a list: one entry per row, taken in row order
cities_frame['type'] = ['UT','Normal','State Capital']
cities_frame

In [None]:
# Update columns from labelled data: first the frame's own index, then a boolean series
cities_frame['old_name'] = cities_frame.index
cities_frame['type'] = cities_frame['area'] > 500
cities_frame

In [None]:
# Update a column from a series derived value-by-value from another column
cities_frame['type'] = cities_frame['area'].apply(lambda area : 'Big' if area > 500 else 'Small')
cities_frame

In [None]:
# Update a column from a series that covers only a subset of the index labels;
# assignment aligns on labels, so rows missing from the series end up without a value
cities_frame['type'] = pd.Series({'Delhi':'UT','Mumbai':'Capital'})
cities_frame

In [None]:
# update cities set type = 'Big' where area > 600
# .loc[row mask, column] on the left-hand side writes only to the matching rows
cities_frame.loc[cities_frame.area>600,'type']='Big'
cities_frame

In [None]:
# update cities set type = 'Big',tier = 1 where area > 600
# The tuple supplies one value per listed column, in the same order
cities_frame.loc[cities_frame.area>600,['type','tier']]=('Big',1)
cities_frame

In [None]:
# update cities set type = 'Big',tier = 1 where index in ('Delhi','Mumbai')
# Same as the mask form, but the rows are chosen by an explicit list of labels
cities_frame.loc[['Delhi','Mumbai'],['type','tier']]=('Big',1)
cities_frame

In [None]:
# Set every value in all rows having area>1000 to None (no column given, so whole rows are overwritten)

cities_frame.loc[cities_frame.area>1000] = None
cities_frame


In [None]:
# Overwrite the whole 'Delhi' row: the tuple supplies one value per column, in column order
cities_frame.loc['Delhi'] = (1484,1.90,'Delhi','Big',1,'Delhi')
cities_frame

#### Update using Replace

In [None]:
# Small numeric series used by the replace examples (the value 5 occurs twice)
series = pd.Series([2,3,5,1,4,5])

In [None]:
# Replace every 5 with 0; this returns a new series and leaves `series` unchanged
series.replace(5,0)

In [None]:
# Same replacement, but inplace=True modifies `series` itself (the call then returns nothing to display)
series.replace(5,0,inplace=True)
series

In [None]:
# update series set n = -1 where n in (1,2,3)
series.replace([1,2,3],-1)

In [None]:
# Pairwise replacement: 1 => -1, 2 => -2, 3 => -3
series.replace([1,2,3],[-1,-2,-3])

In [None]:
# Replace a value wherever it occurs as a cell value, in any column and any row
# (returns a new dataFrame; cities_frame is not modified)
cities_frame.replace('Delhi','Dilli')

In [None]:
# replace multiple values with multiple values (pairwise: 'Delhi' => 'Dilli', 'Mumbai' => 'Bombay')
cities_frame.replace(['Delhi','Mumbai'],['Dilli','Bombay'])

In [None]:
# replace with regex: a string ending in 'i' keeps everything before that final 'i' (group 1) and gets a 'y' instead
cities_frame.replace(r'(.+)i$',r'\1y',regex=True)

In [None]:
# replace with dictionary: keys are the values to look for, dictionary values are their replacements
cities_frame.replace(to_replace={'Delhi':'Dilli','Mumbai':'Bombay'})

In [None]:
# replace with dictionary, mentioning the specific columns:
# outer keys are column names, inner dictionaries map old value => new value for that column only
cities_frame.replace(to_replace={'state':{'Delhi':'Dilli'},'old_name':{'Mumbai':'Bombay'}})

In [None]:
# replace inplace : mutate original values
# A deep copy is modified here so that cities_frame itself stays intact for the other examples
mutable_cities_frame = cities_frame.copy(deep=True)
mutable_cities_frame.replace(to_replace={'state':{'Delhi':'Dilli'},'old_name':{'Mumbai':'Bombay'}},inplace=True)
mutable_cities_frame

In [None]:
# Replace every 'Maharashtra' cell with the value found above it in the same column (forward fill).
# The `method` keyword of replace() is deprecated in recent pandas, so the same effect is built
# from mask/ffill/where: blank out the matches, forward-fill the blanks, then take the filled
# values ONLY at the matched positions so that cells which were already empty stay untouched.
# A match in the first row has nothing above it and therefore becomes NaN.
is_maharashtra = cities_frame == 'Maharashtra'
cities_frame.where(~is_maharashtra, cities_frame.mask(is_maharashtra).ffill())