## FUNDAMENTAL PANDAS NOTES
### Created by Ugur URESIN - AI Engineer, Data Scientist

#### LIBRARY IMPORT

In [4]:
import pandas as pd

There are two pandas data structures:
1. Pandas Series (1D)
2. Pandas DataFrame (2D) 

### PANDAS SERIES

#### NumPy vs. PANDAS

|                                       | NumPy Arrays | Pandas Series |
|---------------------------------------|:------------:|:-------------:|
| Can hold mixed data types             |       X      |       ✔️      |
| Assign an index label to each element |       X      |       ✔️      |


#### PANDAS SERIES CREATION

In [5]:
## CREATING an EMPTY PANDAS SERIES
# Pass dtype explicitly: a bare pd.Series() raises a DeprecationWarning in pandas 1.x
# and defaults to dtype=object (not float64) from pandas 2.0 onwards.
my_serie = pd.Series(dtype='float64')
my_serie

Series([], dtype: float64)

In [31]:
## CREATING a PANDAS SERIES
# A dict maps each index label to its value; mixing ints and strings yields dtype 'object'.
otel_data = pd.Series({'Double Rooms': 50,
                       'Single Rooms': 10,
                       'Free Wifi': 'Yes',
                       'Free SPA': 'No'})
otel_data

Double Rooms     50
Single Rooms     10
Free Wifi       Yes
Free SPA         No
dtype: object

In [11]:
# Dimensions as a tuple; a Series is 1-D, so the tuple has a single entry.
otel_data.shape

(4,)

In [12]:
# Number of dimensions (1 for a Series).
otel_data.ndim

1

In [13]:
# Total number of elements.
otel_data.size

4

In [14]:
# Index labels of the Series.
otel_data.index

Index(['Double Rooms', 'Single Rooms', 'Free Wifi', 'Free SPA'], dtype='object')

In [15]:
# Underlying values as a NumPy array.
otel_data.values

array([50, 10, 'Yes', 'No'], dtype=object)

In [19]:
## LABEL CHECK: `in` tests the index labels of a Series, not its values
'Free Breakfast' in otel_data

False

In [20]:
# 'Free SPA' is one of the index labels, so the membership test succeeds.
'Free SPA' in otel_data

True

#### ACCESSING & DELETING ELEMENTS 

In [22]:
# Use .iloc for positional access: integer keys passed to [] on a label-indexed
# Series are deprecated as positions (pandas >= 2.1) and will be treated as labels.
otel_data.iloc[0] #first value of otel_data

50

In [23]:
# Use .iloc for positional access: integer keys passed to [] on a label-indexed
# Series are deprecated as positions (pandas >= 2.1) and will be treated as labels.
otel_data.iloc[-1] #last value of otel_data

'No'

#### LOC & ILOC
loc: label-based location (select by index label)<br>
iloc: integer location (select by position)

In [24]:
# .loc selects by index label; passing a list of labels returns a Series.
otel_data.loc[['Double Rooms', 'Single Rooms']]

Double Rooms    50
Single Rooms    10
dtype: object

In [25]:
# .iloc selects by integer position (0-based); passing a list of positions returns a Series.
otel_data.iloc[[2,3]]

Free Wifi    Yes
Free SPA      No
dtype: object

#### RE-ASSIGNING VALUES

In [32]:
# Overwrite the value stored under an existing label.
otel_data.loc['Double Rooms'] = 30
otel_data

Double Rooms     30
Single Rooms     10
Free Wifi       Yes
Free SPA         No
dtype: object

#### DELETING by DROP

In [33]:
# drop() returns a new Series; otel_data itself keeps the 'Free SPA' entry.
otel_data.drop(labels='Free SPA') #original Series is NOT changed

Double Rooms     30
Single Rooms     10
Free Wifi       Yes
dtype: object

In [34]:
# inplace=True mutates otel_data itself, so this cell is not re-runnable:
# a second run raises KeyError because 'Free SPA' is already gone.
otel_data.drop('Free SPA', inplace=True) #original Series is changed
otel_data

Double Rooms     30
Single Rooms     10
Free Wifi       Yes
dtype: object

#### ARITHMETIC OPERATIONS IN SERIES

In [36]:
# The positional arguments of pd.Series are (data, index); spell them out by keyword.
fruit_names = ['apples', 'bananas', 'pineapples']
fruits = pd.Series(data=[40, 50, 10], index=fruit_names)
fruits

apples        40
bananas       50
pineapples    10
dtype: int64

In [37]:
# The scalar is added to every element.
fruits + 30

apples        70
bananas       80
pineapples    40
dtype: int64

In [38]:
# Element-wise multiplication by a scalar.
fruits * 2

apples         80
bananas       100
pineapples     20
dtype: int64

In [39]:
# True division returns float64 even though the Series holds integers.
fruits / 2

apples        20.0
bananas       25.0
pineapples     5.0
dtype: float64

It's also possible to apply NumPy functions element-wise to a numeric Pandas Series (this requires `import numpy as np`):<br>
np.power(my_serie, 2)<br>
np.sqrt(my_serie)<br>
np.exp(my_serie)<br>


In [40]:
## ARITHMETIC OPERATION on SPECIFIC VALUES
# Select one element by label, then operate on the resulting scalar.
fruits['apples']*2

80

In [41]:
# Select one element by position (1 is the second entry), then operate on the scalar.
fruits.iloc[1]+200

250

In [43]:
## ARITHMETIC OPERATION on a SERIES that CONTAINS OBJECTS!
otel_data = pd.Series({'Double Rooms': 50,
                       'Single Rooms': 10,
                       'Free Wifi': 'Yes',
                       'Free SPA': 'No'})
# '*' is applied per element: the ints are doubled, the strings are repeated.
otel_data*2 #produce YesYes and NoNo

Double Rooms       100
Single Rooms        20
Free Wifi       YesYes
Free SPA          NoNo
dtype: object

### PANDAS DATAFRAME

#### PANDAS DATAFRAME CREATION

In [45]:
## CREATING an EMPTY PANDAS DATAFRAME
# With no arguments the frame has no rows and no columns.
my_df = pd.DataFrame()
my_df

In [49]:
## CONVERTING a DICT to a PANDAS DATAFRAME
# One Series per customer; each Series carries its own item labels.
customer_001 = pd.Series([1, 6], index=['chips', 'beer'])
customer_002 = pd.Series([2, 4, 1], index=['sausage', 'beer', 'magazine'])
items = {'Customer_001': customer_001, 'Customer_002': customer_002}
type(items)

dict

In [50]:
# Dict keys become columns; rows are the union of the Series' labels,
# and items a customer did not buy show up as NaN.
shopping_cart = pd.DataFrame(items)
shopping_cart

Unnamed: 0,Customer_001,Customer_002
beer,6.0,4.0
chips,1.0,
magazine,,1.0
sausage,,2.0


**Note:** If the Series had no index labels, the rows would be labeled with default integers (0, 1, 2, ...)!

In [51]:
# Raw contents as a 2-D NumPy array; missing entries appear as nan.
shopping_cart.values

array([[ 6.,  4.],
       [ 1., nan],
       [nan,  1.],
       [nan,  2.]])

In [52]:
# (number of rows, number of columns)
shopping_cart.shape

(4, 2)

In [53]:
# A DataFrame has two dimensions.
shopping_cart.ndim

2

In [54]:
# Total number of cells (rows * columns), NaN entries included.
shopping_cart.size

8

#### ADDING LABELS & ACCESSING ELEMENTS

In [69]:
# No index is given, so every Series gets default integer labels 0, 1, 2, ...
purchases = {'Customer_001': [2, 6],
             'Customer_002': [2, 4, 2],
             'Customer_003': [3, 1],
             'Customer_004': [1, 1, 4, 1]}
items = {customer: pd.Series(quantities) for customer, quantities in purchases.items()}
shopping_cart = pd.DataFrame(items)
shopping_cart

Unnamed: 0,Customer_001,Customer_002,Customer_003,Customer_004
0,2.0,2.0,3.0,1
1,6.0,4.0,1.0,1
2,,2.0,,4
3,,,,1


In [76]:
## ADDING LABELS
# The list must contain exactly one label per existing row (4 here), in row order.
shopping_cart.index = ['chips','beer','sausage','magazine']
shopping_cart

Unnamed: 0,Customer_001,Customer_002,Customer_003,Customer_004
chips,2.0,2.0,3.0,1
beer,6.0,4.0,1.0,1
sausage,,2.0,,4
magazine,,,,1


In [79]:
## ACCESSING ELEMENTS
# Double brackets (a list of column names) return a DataFrame, not a Series.
shopping_cart[['Customer_001']]

Unnamed: 0,Customer_001
chips,2.0
beer,6.0
sausage,
magazine,


In [80]:
# Select several columns at once by listing their names.
shopping_cart[['Customer_001','Customer_002']]

Unnamed: 0,Customer_001,Customer_002
chips,2.0,2.0
beer,6.0,4.0
sausage,,2.0
magazine,,


In [81]:
# .loc with a list of row labels returns the matching rows as a DataFrame.
shopping_cart.loc[['chips']]

Unnamed: 0,Customer_001,Customer_002,Customer_003,Customer_004
chips,2.0,2.0,3.0,1


In [84]:
# A single .loc[row, column] lookup replaces chained [column][row] indexing,
# which builds an intermediate Series and is unreliable when used for assignment.
shopping_cart.loc['beer', 'Customer_001']

6.0

#### ADDING ROWS/COLUMNS

In [97]:
# Rebuild the two-customer cart from scratch so the next cells start from a known state.
items = {
    'Customer_001': pd.Series([1, 6], index=['chips', 'beer']),
    'Customer_002': pd.Series([2, 4, 1], index=['sausage', 'beer', 'magazine']),
}
shopping_cart = pd.DataFrame(data=items)

In [98]:
## ADDING a NEW COLUMN
# A plain list is assigned by position, so it needs one value per row, in row order.
shopping_cart['Customer_003'] = [5,2,1,0]
shopping_cart

Unnamed: 0,Customer_001,Customer_002,Customer_003
beer,6.0,4.0,5
chips,1.0,,2
magazine,,1.0,1
sausage,,2.0,0


In [99]:
## ADDING a NEW COLUMN from an EXISTING COLUMN
# The arithmetic is element-wise; NaN entries of Customer_001 stay NaN in the new column.
shopping_cart['Customer_004'] = shopping_cart['Customer_001']*2
shopping_cart

Unnamed: 0,Customer_001,Customer_002,Customer_003,Customer_004
beer,6.0,4.0,5,12.0
chips,1.0,,2,2.0
magazine,,1.0,1,
sausage,,2.0,0,


In [102]:
## ADDING a NEW ROW
# One dict per new row: keys are column names, values are the cell contents.
coke_sold = [{'Customer_001':5,'Customer_002':2,'Customer_003':10,'Customer_004':0}]
coke = pd.DataFrame(coke_sold, index=['coke'])
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; pd.concat is its replacement.
shopping_cart = pd.concat([shopping_cart, coke])
shopping_cart

Unnamed: 0,Customer_001,Customer_002,Customer_003,Customer_004
beer,6.0,4.0,5,12.0
chips,1.0,,2,2.0
magazine,,1.0,1,
sausage,,2.0,0,
coke,5.0,2.0,10,0.0


#### INSERTING

In [110]:
# Customer_002 is left out on purpose; it is inserted between the others afterwards.
baskets = {
    'Customer_001': ([1, 6], ['chips', 'beer']),
    'Customer_003': ([2, 4, 1], ['sausage', 'beer', 'magazine']),
    'Customer_004': ([5, 2], ['beer', 'chips']),
}
items = {customer: pd.Series(quantities, index=labels)
         for customer, (quantities, labels) in baskets.items()}
shopping_cart = pd.DataFrame(items)
shopping_cart

Unnamed: 0,Customer_001,Customer_003,Customer_004
beer,6.0,4.0,5.0
chips,1.0,,2.0
magazine,,1.0,
sausage,,2.0,


In [108]:
# insert() modifies shopping_cart in place; re-running this cell raises ValueError
# because the column 'Customer_002' then already exists.
shopping_cart.insert(1, 'Customer_002', [8,2,0,4]) #'1' for 2nd column
shopping_cart

Unnamed: 0,Customer_001,Customer_002,Customer_003,Customer_004
beer,6.0,8,4.0,5.0
chips,1.0,2,,2.0
magazine,,0,1.0,
sausage,,4,2.0,


#### DROPPING COLUMNS/ROWS

In [137]:
# Give the empty Series an explicit dtype: without it pandas 1.x emits a
# DeprecationWarning and pandas >= 2.0 defaults to dtype=object instead of float64.
items = {'Customer_001': pd.Series([1,6], index=['chips','beer']),
         'Customer_002': pd.Series([2,4,1], index=['chips','beer','magazine']),
         # no purchases at all -> an all-NaN column
         'Customer_003': pd.Series(dtype='float64'),
         'Customer_004': pd.Series([2,4,1], index=['sausage','beer','magazine']),
         # a label without a value -> contributes an all-NaN 'diaper' row
         'Customer_005': pd.Series(index=['diaper'], dtype='float64')}
shopping_cart = pd.DataFrame(items)
shopping_cart

Unnamed: 0,Customer_001,Customer_002,Customer_003,Customer_004,Customer_005
beer,6.0,4.0,,4.0,
chips,1.0,2.0,,,
diaper,,,,,
magazine,,1.0,,1.0,
sausage,,,,2.0,


In [138]:
## DROPPING COLUMNS
# Returns a new frame without the two columns; shopping_cart itself is unchanged.
shopping_cart.drop(columns=['Customer_003', 'Customer_005']) #add 'inplace=True' to change the original data

Unnamed: 0,Customer_001,Customer_002,Customer_004
beer,6.0,4.0,4.0
chips,1.0,2.0,
diaper,,,
magazine,,1.0,1.0
sausage,,,2.0


In [139]:
# pop() removes the column from shopping_cart in place and returns it as a Series.
shopping_cart.pop('Customer_005') #drops a column!
shopping_cart

Unnamed: 0,Customer_001,Customer_002,Customer_003,Customer_004
beer,6.0,4.0,,4.0
chips,1.0,2.0,,
diaper,,,,
magazine,,1.0,,1.0
sausage,,,,2.0


In [126]:
## DROPPING ROWS
# Returns a new frame without the 'diaper' row; shopping_cart itself is unchanged.
shopping_cart.drop(index=['diaper']) #add 'inplace=True' to change the original data

Unnamed: 0,Customer_001,Customer_002,Customer_003,Customer_004
beer,6.0,4.0,,4.0
chips,1.0,2.0,,
magazine,,1.0,,1.0
sausage,,,,2.0


#### RENAMING COLUMNS/ROWS

In [147]:
# Small two-customer cart used to demonstrate renaming.
first_basket = pd.Series([1, 6], index=['chips', 'beer'])
second_basket = pd.Series([2, 4, 1], index=['chips', 'beer', 'magazine'])
items = {'Customer_001': first_basket, 'Customer_002': second_basket}
shopping_cart = pd.DataFrame(items)
shopping_cart

Unnamed: 0,Customer_001,Customer_002
beer,6.0,4
chips,1.0,2
magazine,,1


In [148]:
## RENAMING COLUMNS
# Maps old name -> new name; returns a renamed copy and leaves shopping_cart as it was.
shopping_cart.rename({'Customer_001': 'id_001', 'Customer_002': 'id_002'}, axis='columns')

Unnamed: 0,id_001,id_002
beer,6.0,4
chips,1.0,2
magazine,,1


In [149]:
## RENAMING ROWS
# Maps old label -> new label; returns a renamed copy and leaves shopping_cart as it was.
shopping_cart.rename({'beer': 'Beer', 'chips': 'Chips', 'magazine': 'Magazine'}, axis='index')

Unnamed: 0,Customer_001,Customer_002
Beer,6.0,4
Chips,1.0,2
Magazine,,1


#### DEALING with NaN VALUES

In [165]:
# Five customers with different baskets; every item a customer did not buy becomes NaN.
baskets = {
    'Customer_001': ([1, 6], ['chips', 'beer']),
    'Customer_002': ([1, 4, 2], ['chips', 'beer', 'sausage']),
    'Customer_003': ([1, 1], ['chips', 'magazine']),
    'Customer_004': ([2, 4, 1, 2], ['chips', 'beer', 'magazine', 'sausage']),
    'Customer_005': ([5, 10], ['chips', 'beer']),
}
items = {customer: pd.Series(quantities, index=labels)
         for customer, (quantities, labels) in baskets.items()}
shopping_cart = pd.DataFrame(items)
shopping_cart

Unnamed: 0,Customer_001,Customer_002,Customer_003,Customer_004,Customer_005
beer,6.0,4.0,,4,10.0
chips,1.0,1.0,1.0,2,5.0
magazine,,,1.0,1,
sausage,,2.0,,2,


In [166]:
## CHECK WHETHER VALUES are NaN
# Boolean frame of the same shape: True wherever a value is missing.
shopping_cart.isna()

Unnamed: 0,Customer_001,Customer_002,Customer_003,Customer_004,Customer_005
beer,False,False,True,False,False
chips,False,False,False,False,False
magazine,True,True,False,False,True
sausage,True,False,True,False,True


In [167]:
## NUMBER of NaN in EACH COLUMN
# Summing the boolean mask down the rows counts the True entries per column.
shopping_cart.isna().sum(axis='index')

Customer_001    2
Customer_002    1
Customer_003    2
Customer_004    0
Customer_005    2
dtype: int64

In [168]:
## TOTAL NUMBER of NaN in DATAFRAME
# First sum: NaN count per column; second sum: total over all columns.
nan_per_column = shopping_cart.isna().sum()
nan_per_column.sum()

7

In [169]:
## DROPPING ROWS with NaN
# Keeps only the rows in which no value is missing.
shopping_cart.dropna(axis='index')

Unnamed: 0,Customer_001,Customer_002,Customer_003,Customer_004,Customer_005
chips,1.0,1.0,1.0,2,5.0


In [170]:
## DROPPING COLUMNS with NaN
# Keeps only the columns in which no value is missing.
shopping_cart.dropna(axis='columns')

Unnamed: 0,Customer_004
beer,4
chips,2
magazine,1
sausage,2


**Note that:** Original dataframe is NOT modified. Add 'inplace=True' to modify the original dataframe.

In [173]:
## REPLACING NaN VALUES with a VALUE
# Every missing entry becomes 0; the result is a new frame.
shopping_cart_replaced_wzero = shopping_cart.fillna(value=0)
shopping_cart_replaced_wzero

Unnamed: 0,Customer_001,Customer_002,Customer_003,Customer_004,Customer_005
beer,6.0,4.0,0.0,4,10.0
chips,1.0,1.0,1.0,2,5.0
magazine,0.0,0.0,1.0,1,0.0
sausage,0.0,2.0,0.0,2,0.0


In [174]:
## REPLACING NaN VALUES with the 'PREVIOUS VALUE' along the GIVEN AXIS!
# fillna(method='ffill') is deprecated since pandas 2.1; DataFrame.ffill() is the
# direct equivalent. axis=0 copies the last valid value downwards within each column.
shopping_cart_replaced_wpre = shopping_cart.ffill(axis=0)
shopping_cart_replaced_wpre

Unnamed: 0,Customer_001,Customer_002,Customer_003,Customer_004,Customer_005
beer,6.0,4.0,,4,10.0
chips,1.0,1.0,1.0,2,5.0
magazine,1.0,1.0,1.0,1,5.0
sausage,1.0,2.0,1.0,2,5.0


As seen in the dataframe above, the Customer_003 column still has a NaN in the first row, because there is no previous value to fill it from!

In [175]:
## REPLACING NaN VALUES with the 'NEXT VALUE' along the GIVEN AXIS!
# fillna(method='backfill') is deprecated since pandas 2.1; DataFrame.bfill() is the
# direct equivalent. axis=0 copies the next valid value upwards within each column.
shopping_cart_replaced_wnext = shopping_cart.bfill(axis=0)
shopping_cart_replaced_wnext

Unnamed: 0,Customer_001,Customer_002,Customer_003,Customer_004,Customer_005
beer,6.0,4.0,1.0,4,10.0
chips,1.0,1.0,1.0,2,5.0
magazine,,2.0,1.0,1,
sausage,,2.0,,2,


As seen in the dataframe above, some of the last rows still have NaN, because there is no next value after the last rows (the values that follow those NaNs are missing or are NaN as well). Best practice is to apply both methods, forward fill ('ffill') and backward fill ('backfill'/'bfill'), one after another!

In [177]:
## INTERPOLATION METHOD
# Linear interpolation down each column (axis=0): a gap between two known values is
# filled proportionally, trailing NaNs repeat the last known value, and a leading NaN
# stays because filling only runs forward by default.
shopping_cart_replaced_winterpolate = shopping_cart.interpolate(method='linear', axis=0)
shopping_cart_replaced_winterpolate 

Unnamed: 0,Customer_001,Customer_002,Customer_003,Customer_004,Customer_005
beer,6.0,4.0,,4,10.0
chips,1.0,1.0,1.0,2,5.0
magazine,1.0,1.5,1.0,1,5.0
sausage,1.0,2.0,1.0,2,5.0
