# Pandas

Pandas is a tool fopr data processing which helps in data analysis

It provides functions and methods to efficiently manipulate large datasets

-  `Series` is a one-dimensional array with labels it can contain nay data type including integers, strings, floats, Python objects, etc.
-  `DataFrame` is a two-dimensional array with labels for rows and columns. We can use labels to locate data

In [1]:
import pandas as pd

In [2]:
# check pandas version
pd.__version__

'2.2.1'

# Series create, manipulate, querry, delete

In [3]:
# creating a series from a list
arr = [1, 2, 3, 4, 5]
s1 = pd.Series(arr)
s1

0    1
1    2
2    3
3    4
4    5
dtype: int64

In [4]:
order = [1, 2, 3, 4, 5]
s2 = pd.Series(arr, index=order)
s2

1    1
2    2
3    3
4    4
5    5
dtype: int64

In [5]:
import numpy as np
n = np.random.randn(5)  # Create a random Ndarray
index = ['a', 'b', 'c', 'd', 'e']
s3 = pd.Series(n, index=index)
s3

a   -0.511093
b   -0.116558
c   -1.035657
d    0.572139
e   -0.901659
dtype: float64

In [6]:
# create Serues from dictionary
d = {'a': 1, 'b': 2, 'c': 3, 'd': 4, 'e': 5}
s4 = pd.Series(d)
s4

a    1
b    2
c    3
d    4
e    5
dtype: int64

In [7]:
# modify the index of series
print(s1)
s1.index = ['A', 'B', 'C', 'D', 'E']
s1

0    1
1    2
2    3
3    4
4    5
dtype: int64


A    1
B    2
C    3
D    4
E    5
dtype: int64

In [8]:
# slicing
s1[-2:]

D    4
E    5
dtype: int64

In [9]:
s5 = pd.concat([s1, s3])  # instead of concatenate, we can use append
s5

A    1.000000
B    2.000000
C    3.000000
D    4.000000
E    5.000000
a   -0.511093
b   -0.116558
c   -1.035657
d    0.572139
e   -0.901659
dtype: float64

In [10]:
s5.drop('e')  # doesn't change the original series

A    1.000000
B    2.000000
C    3.000000
D    4.000000
E    5.000000
a   -0.511093
b   -0.116558
c   -1.035657
d    0.572139
dtype: float64

In [11]:
s5

A    1.000000
B    2.000000
C    3.000000
D    4.000000
E    5.000000
a   -0.511093
b   -0.116558
c   -1.035657
d    0.572139
e   -0.901659
dtype: float64

# Series operations


In [12]:
arr1 = [0, 1, 2, 3, 4, 5, 7]
arr2 = [6, 7, 8, 9, 5]


In [13]:
s6  = pd.Series(arr2)
s6

0    6
1    7
2    8
3    9
4    5
dtype: int64

In [14]:
s7 = pd.Series(arr1)
s7

0    0
1    1
2    2
3    3
4    4
5    5
6    7
dtype: int64

In [15]:
s6.add(s7)

0     6.0
1     8.0
2    10.0
3    12.0
4     9.0
5     NaN
6     NaN
dtype: float64

In [16]:
s6.sub(s7)

0    6.0
1    6.0
2    6.0
3    6.0
4    1.0
5    NaN
6    NaN
dtype: float64

In [17]:
s6.mul(s7)

0     0.0
1     7.0
2    16.0
3    27.0
4    20.0
5     NaN
6     NaN
dtype: float64

In [18]:
s6.div(s7)

0     inf
1    7.00
2    4.00
3    3.00
4    1.25
5     NaN
6     NaN
dtype: float64

In [26]:
print('median', s7.median())
print('max', s7.max())
print('min', s7.min())
print('mean', s7.mean())
print('sum', s7.sum())
print('std', s7.std())
print('count', s7.count())

median 3.0
max 7
min 0
mean 3.142857142857143
sum 22
std 2.410295378065479
count 7


In [28]:
# invalid values are ignored
s8 = s6.div(s7)
print('median', s8.median())
print('max', s8.max())
print('min', s8.min())
print('mean', s8.mean())
print('sum', s8.sum())
# print('std', s8.std())
print('count', s8.count())

median 4.0
max inf
min 1.25
mean inf
sum inf
count 5


# Create DataFrame


In [29]:
dates = pd.date_range('today', periods=6)  # Define time sequence as index
dates

DatetimeIndex(['2024-04-20 06:40:19.868932', '2024-04-21 06:40:19.868932',
               '2024-04-22 06:40:19.868932', '2024-04-23 06:40:19.868932',
               '2024-04-24 06:40:19.868932', '2024-04-25 06:40:19.868932'],
              dtype='datetime64[ns]', freq='D')

In [30]:
num_arr = np.random.randn(6,4)  # Create a numpy random array
num_arr

array([[ 0.99523335,  0.48277907, -0.05362755, -0.54969105],
       [-0.69030938,  0.0045381 , -1.63149667,  0.5723772 ],
       [ 1.36996662,  1.14574714,  2.04886973,  0.06078359],
       [ 0.31165753,  1.22907519, -0.50032263, -0.05087997],
       [-0.18641692,  0.54586975, -1.50688579, -0.3095489 ],
       [ 0.22080241,  0.17418277, -0.45479516, -0.00365886]])

In [32]:
columns = ['A', 'B', 'C', 'D']  # Create a list of column names
columns

['A', 'B', 'C', 'D']

In [38]:
df1 = pd.DataFrame(num_arr, index=dates.map(lambda x: x.strftime('%Y-%m-%d'))
, columns=columns)
df1

Unnamed: 0,A,B,C,D
2024-04-20,0.995233,0.482779,-0.053628,-0.549691
2024-04-21,-0.690309,0.004538,-1.631497,0.572377
2024-04-22,1.369967,1.145747,2.04887,0.060784
2024-04-23,0.311658,1.229075,-0.500323,-0.05088
2024-04-24,-0.186417,0.54587,-1.506886,-0.309549
2024-04-25,0.220802,0.174183,-0.454795,-0.003659


In [39]:
# create dataframes with dictionary array

data = {
    'animal': ['cat', 'cat', 'snake', 'dog', 'dog', 'cat', 'snake', 'cat', 'dog', 'dog'],
    'age': [2.5, 3, 0.5, np.nan, 5, 2, 4.5, np.nan, 7, 3],
    'visits': [1, 3, 2, 3, 2, 3, 1, 1, 2, 1],
    'priority': ['yes', 'yes', 'no', 'yes', 'no', 'no', 'no', 'yes', 'no', 'no']
}

labels = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j']
df2 = pd.DataFrame(data, index=labels)
df2

Unnamed: 0,animal,age,visits,priority
a,cat,2.5,1,yes
b,cat,3.0,3,yes
c,snake,0.5,2,no
d,dog,,3,yes
e,dog,5.0,2,no
f,cat,2.0,3,no
g,snake,4.5,1,no
h,cat,,1,yes
i,dog,7.0,2,no
j,dog,3.0,1,no


In [40]:
# see datatypes
df2.dtypes

animal       object
age         float64
visits        int64
priority     object
dtype: object

In [44]:
df3 = df2.head(10)
df3

Unnamed: 0,animal,age,visits,priority
a,cat,2.5,1,yes
b,cat,3.0,3,yes
c,snake,0.5,2,no
d,dog,,3,yes
e,dog,5.0,2,no
f,cat,2.0,3,no
g,snake,4.5,1,no
h,cat,,1,yes
i,dog,7.0,2,no
j,dog,3.0,1,no


In [46]:
df2.tail(3)

Unnamed: 0,animal,age,visits,priority
h,cat,,1,yes
i,dog,7.0,2,no
j,dog,3.0,1,no


In [49]:
print(df2.index)
df.columns

Index(['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j'], dtype='object')


Index(['A', 'B', 'C', 'D'], dtype='object')

In [50]:
df2.values

array([['cat', 2.5, 1, 'yes'],
       ['cat', 3.0, 3, 'yes'],
       ['snake', 0.5, 2, 'no'],
       ['dog', nan, 3, 'yes'],
       ['dog', 5.0, 2, 'no'],
       ['cat', 2.0, 3, 'no'],
       ['snake', 4.5, 1, 'no'],
       ['cat', nan, 1, 'yes'],
       ['dog', 7.0, 2, 'no'],
       ['dog', 3.0, 1, 'no']], dtype=object)

In [51]:
df2.describe()

Unnamed: 0,age,visits
count,8.0,10.0
mean,3.4375,1.9
std,2.007797,0.875595
min,0.5,1.0
25%,2.375,1.0
50%,3.0,2.0
75%,4.625,2.75
max,7.0,3.0
