## Pandas

In [1]:
# Definition: pandas is a library for data manipulation and analysis.
# 1. pandas is an open-source library.
# 2. pandas is built on top of NumPy: by default its data structures are backed by NumPy arrays, and operations are executed on those arrays.
# 3. A Series corresponds to a 1D NumPy array and a DataFrame to a 2D NumPy array.

## Important list of operations in pandas

In [2]:
# 1. Data structures (Series, DataFrame)
# 2. Data input and output (Excel, CSV, SQL, JSON)
# 3. Understand the dataset (head(), tail(), dtype,...)
# 4. Data filtering and indexing (loc, iloc, where(),...)
# 5. Handling missing values (NAN/NULL)
# 6. Remove the duplicate values
# 7. Handle inconsistent data (Male, M, Ma,...)
# 8. Data transformation (apply(),...)
# 9. Aggregation and grouping functions
# 10. Time series related functions
# 11. Data visualization

In [3]:
pip install pandas

Note: you may need to restart the kernel to use updated packages.


In [4]:
import numpy as np
import pandas as pd

In [5]:
# Display the version string of the installed pandas package.
pd.__version__

'2.0.3'

## 1. Data structures (Series, DataFrame)

### Series Data Structure

In [6]:
# A Series is a one-dimensional labeled array.
# In machine learning, the "target" column of a dataset is typically stored as a Series.

In [7]:
# Empty Series.
# The dtype is given explicitly: without it, older pandas releases emit a warning
# and fall back to float64, while newer ones default to object.
ser = pd.Series(dtype=object)
print(ser)

Series([], dtype: object)


In [8]:
# Build a Series from a plain Python list; the default index is 0..n-1.
list1 = [1, 2, 3, 4, 5, 6]
ser1 = pd.Series(data=list1)
print(ser1)

0    1
1    2
2    3
3    4
4    5
5    6
dtype: int64


In [9]:
# Build a Series from a Python tuple; the result is the same as for a list.
tuple1 = (1, 2, 3, 4, 5, 6)
ser2 = pd.Series(data=tuple1)
print(ser2)

0    1
1    2
2    3
3    4
4    5
5    6
dtype: int64


In [10]:
# Create a Series from a NumPy array (numpy is imported as np in the import cell at the top).
data = np.array(["Apple", "Mango", "Orange", "Grape"])
ser = pd.Series(data)
# Show the source array and the resulting Series, each followed by its type.
print(data)
print(type(data))
print(ser)
print(type(ser))

['Apple' 'Mango' 'Orange' 'Grape']
<class 'numpy.ndarray'>
0     Apple
1     Mango
2    Orange
3     Grape
dtype: object
<class 'pandas.core.series.Series'>


In [11]:
# Access a Series element by its index label.
# `ser` has the default integer index 0..3, so the label 3 selects the last element.
print(ser.loc[3])
# print(ser[-3]) is not valid here: it raises KeyError: -3, because on an
# integer index `[]` looks up the *label* -3, which does not exist.
# Negative (positional) indexing is available through .iloc,
# e.g. ser.iloc[-3] returns 'Mango'.

Grape


In [12]:
# Build a Series with explicit string labels instead of the default integer index.
data = np.array(["Apple", "Mango", "Orange", "Grape"])
ser3 = pd.Series(data=data, index=list("ABCD"))
ser3

A     Apple
B     Mango
C    Orange
D     Grape
dtype: object

In [13]:
# Look up a single value by its label.
ser3.loc['D']

'Grape'

In [14]:
# A dict becomes a Series: the keys form the index, the values form the data.
data = {1: "Bangalore", 2: "Chennai", 3: "Mumbai", 4: "Kochi"}
ser4 = pd.Series(data=data)
ser4

1    Bangalore
2      Chennai
3       Mumbai
4        Kochi
dtype: object

In [15]:
# A set cannot be passed to pd.Series directly: pandas raises
# TypeError: 'set' type is unordered.
sett = {'A', 'B', 'C', 'D', 'E'}
try:
    ser5 = pd.Series(sett)
except TypeError as err:
    # Show the error instead of aborting the cell, so the conversion below still runs.
    print(f"TypeError: {err}")
# Convert the set to an ordered sequence first. Sorting makes the element order
# reproducible, since plain set iteration order can differ between runs.
ser5 = pd.Series(sorted(sett))
ser5

TypeError: 'set' type is unordered

In [16]:
# Convert a scalar to a Series.
# Without an index the result has one element; with an index the scalar is
# repeated once per label.
val = 101
ser5 = pd.Series(data=val)
print(ser5)
ser6 = pd.Series(data=val, index=list("ABC"))
print(ser6)

0    101
dtype: int64
A    101
B    101
C    101
dtype: int64


### DataFrame Data Structure

In [17]:
# Empty DataFrame: no rows and no columns.
df = pd.DataFrame(data=None)
print(df)

Empty DataFrame
Columns: []
Index: []


In [18]:
# A flat list becomes a DataFrame with a single column (labelled 0).
l1 = ["cricket", "hockey", "football", "socker"]
df1 = pd.DataFrame(data=l1)
print(df1)
print(type(df1))

          0
0   cricket
1    hockey
2  football
3    socker
<class 'pandas.core.frame.DataFrame'>


In [19]:
# A list of lists becomes a 2D DataFrame: each inner list is one row.
l2 = [
    ["cricket", 9],
    ["hockey", 8],
    ["football", 7],
    ["socker", 6],
]
df2 = pd.DataFrame(data=l2)
print(df2)

          0  1
0   cricket  9
1    hockey  8
2  football  7
3    socker  6


In [20]:
# A tuple of tuples becomes a 2D DataFrame: each inner tuple is one row.
l3 = (
    ("cricket", 9),
    ("hockey", 8),
    ("football", 7),
    ("socker", 6),
)
df3 = pd.DataFrame(data=l3)
print(df3)

          0  1
0   cricket  9
1    hockey  8
2  football  7
3    socker  6


In [21]:
# A list of sets becomes a 2D DataFrame with one row per set.
# Sets are unordered, so which column each of a row's two values lands in is
# not guaranteed and can differ from row to row.
l4 = [
    {"cricket", 9},
    {"hockey", 8},
    {"football", 7},
    {"socker", 6},
]
df4 = pd.DataFrame(data=l4)
print(df4)

          0       1
0   cricket       9
1         8  hockey
2  football       7
3    socker       6
