In [0]:
import pandas as pd
import numpy as np

## 1. DataFrame Notes
it is a two-dimensional labeled data structure with columns of potentially different types. 

### (1) How to create a dataframe?

In [57]:
# Among the many things that can serve as input to make a ‘DataFrame’, a NumPy ndarray is one of them

data = np.array([['','Col1','Col2'],
                ['Row1',1,2],
                ['Row2',3,4]])
                
print(pd.DataFrame(data=data[1:,1:],
                  index=data[1:,0],
                  columns=data[0,1:]))

     Col1 Col2
Row1    1    2
Row2    3    4


In [58]:
# comparison

my_2darray = np.array([[1, 2, 3], [4, 5, 6]])
print(my_2darray)
print()

my_dict = {1: ['1', '3'], 2:['1', '2'], 3:['2', '4']}
print(my_dict)
print()

my_df = pd.DataFrame(data=[4, 5, 6, 7], index=range(0, 4), columns=['col'])
print(my_df)
print()

my_series = pd.Series({"Belgium":"Brussels", "India":"New Delhi", "China":"Beijing"})
print(my_series)

[[1 2 3]
 [4 5 6]]

{1: ['1', '3'], 2: ['1', '2'], 3: ['2', '4']}

   col
0    4
1    5
2    6
3    7

Belgium     Brussels
India      New Delhi
China        Beijing
dtype: object


In [59]:
# use shape property or the len() function to know more about the dataframe

df = pd.DataFrame(np.array([[1, 2, 3], [4, 5, 6]]))
print(df.shape)
print(len(df.index))

(2, 3)
2


### (2) DataFrame Operations: Select an Index or Column 

In [47]:
# selection of index or cloumn from a dataframe, df.iloc[], df.loc, df.ix[]

df = pd.DataFrame({"A": [1, 2, 3],
                   "B": [4, 5, 6],
                   "C": [7, 8, 9]})
print(df)
print()
# print(df[1]) incorrect
print(df[1:])
print()
print(df.iloc[0:1, :])
print()
print(df.loc[2])
print()
print(df.loc[:, 'A'])

   A  B  C
0  1  4  7
1  2  5  8
2  3  6  9

   A  B  C
1  2  5  8
2  3  6  9

   A  B  C
0  1  4  7

A    3
B    6
C    9
Name: 2, dtype: int64

0    1
1    2
2    3
Name: A, dtype: int64


In [63]:
# more practice on .loc[], iloc[], ix[]
# .loc[] works on labels of your index. 
# .iloc[] works on the positions in your index. 
# .ix[] is a more complex case: when the index is integer-based, you pass a label to .ix[]. This is just like .loc[]
# .ix[] will work with positions if your index is not solely integer-based, just like .iloc[]

df = pd.DataFrame(data=np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]]), index= [2, 'A', 4], columns=[48, 49, 50])
print(df)
print()
print(df.loc[2])
print()
print(df.iloc[2])
print()
print(df.ix[2])

   48  49  50
2   1   2   3
A   4   5   6
4   7   8   9

48    1
49    2
50    3
Name: 2, dtype: int64

48    7
49    8
50    9
Name: 4, dtype: int64

48    7
49    8
50    9
Name: 4, dtype: int64


.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#ix-indexer-is-deprecated
  if __name__ == '__main__':


### (3) DataFrame Operations: Add an Index, Row or Column

In [61]:
# reset index
# by default, a numerically valued index that starts with 0 and continues until the last row of your DataFrame.
# we can re-use one of the columns and make it the new index

df = pd.DataFrame({"A": [1, 2, 3],
                   "B": [4, 5, 6],
                   "C": [7, 8, 9]})
print(df)

df.set_index('C')

   A  B  C
0  1  4  7
1  2  5  8
2  3  6  9


Unnamed: 0_level_0,A,B
C,Unnamed: 1_level_1,Unnamed: 2_level_1
7,1,4
8,2,5
9,3,6


In [52]:
# adding rows to a DataFrame

df = pd.DataFrame(data=np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]]), index= [2.5, 12.6, 4.8], columns=[48, 49, 50])
print(df)
print()
df.ix[2] = [60, 50, 40] # There's no index labeled `2`, so you will change the index at position `2`
print(df)
print()
df.loc[2] = [11, 12, 13] # This will make an index labeled `2` and add the new values
print(df)

      48  49  50
2.5    1   2   3
12.6   4   5   6
4.8    7   8   9

      48  49  50
2.5    1   2   3
12.6   4   5   6
4.8   60  50  40

      48  49  50
2.5    1   2   3
12.6   4   5   6
4.8   60  50  40
2.0   11  12  13


.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#ix-indexer-is-deprecated
  """


In [64]:
# adding columns to a DataFrame

df = pd.DataFrame(data=np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]]), columns=['A', 'B', 'C'])
df['D'] = df.index
print(df)
print()
df.loc[:, 'E'] = pd.Series([1, 2, 3], index=df.index)
print(df)

   A  B  C  D
0  1  2  3  0
1  4  5  6  1
2  7  8  9  2

   A  B  C  D  E
0  1  2  3  0  1
1  4  5  6  1  2
2  7  8  9  2  3


## 2. Pandas apply
* "apply" is used when you want to apply a function along the axis of a dataframe.
* it accepts a Series whose index is either column (axis=0) or row (axis=1). 
* For example: df.apply(np.square), it will give a dataframe with number squared.
* [Reference: Pandas apply, map and applymap](https://kanoki.org/2019/11/25/pandas-apply-map-and-applymap/)



In [16]:
import pandas as pd
import numpy as np
df = pd.DataFrame({'a' : [ 10,20,30], 'b' : [5,10,15], 'c' : [10,100,1000]}, index=['r1','r2','r3'])
df

Unnamed: 0,a,b,c
r1,10,5,10
r2,20,10,100
r3,30,15,1000


In [0]:
def multiply_by_2(col):
    return col * 2

def multiply_col1_col2(col):
    return col['a'] * col['b']

In [18]:
df.apply(multiply_by_2)

Unnamed: 0,a,b,c
r1,20,10,20
r2,40,20,200
r3,60,30,2000


In [19]:
df.apply(multiply_col1_col2, axis=1)

r1     50
r2    200
r3    450
dtype: int64

In [20]:
df['col1Xcol2'] = df.apply(multiply_col1_col2, axis=1)
df

Unnamed: 0,a,b,c,col1Xcol2
r1,10,5,10,50
r2,20,10,100,200
r3,30,15,1000,450


In [21]:
df.apply(lambda x: x['a'] * x['b'], axis=1)

r1     50
r2    200
r3    450
dtype: int64