# Pandas & Pandera
## Pandas core components
* series types
* dataframe types

In [1]:
%pip install pandera

Collecting pandera
  Downloading pandera-0.20.3-py3-none-any.whl.metadata (15 kB)
Collecting multimethod<=1.10.0 (from pandera)
  Downloading multimethod-1.10-py3-none-any.whl.metadata (8.2 kB)
Collecting typeguard (from pandera)
  Downloading typeguard-4.3.0-py3-none-any.whl.metadata (3.7 kB)
Collecting typing-inspect>=0.6.0 (from pandera)
  Downloading typing_inspect-0.9.0-py3-none-any.whl.metadata (1.5 kB)
Collecting wrapt (from pandera)
  Downloading wrapt-1.16.0-cp312-cp312-macosx_10_9_x86_64.whl.metadata (6.6 kB)
Downloading pandera-0.20.3-py3-none-any.whl (255 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m255.6/255.6 kB[0m [31m727.5 kB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hDownloading multimethod-1.10-py3-none-any.whl (9.9 kB)
Downloading typing_inspect-0.9.0-py3-none-any.whl (8.8 kB)
Downloading typeguard-4.3.0-py3-none-any.whl (35 kB)
Downloading wrapt-1.16.0-cp312-cp312-macosx_10_9_x86_64.whl (37 kB)
Installing collected packages: wrapt, typi

In [2]:
import pandas as pd
import pandera as pa

In [3]:
s1 : pd.Series = pd.Series([1, 2, 3, 4, 5])
s1

0    1
1    2
2    3
3    4
4    5
dtype: int64

In [4]:
s1 : pd.Series = pd.Series({1, 2, 3, 4, 5})
s1

TypeError: 'set' type is unordered

In [5]:
s1 : pd.Series = pd.Series((1, 2, 3, 4, 5))
s1

0    1
1    2
2    3
3    4
4    5
dtype: int64

In [6]:
s1 : pd.Series = pd.Series({"a" : 1, "b" : 2, "c" : 3})
s1

a    1
b    2
c    3
dtype: int64

In [9]:
values : list[int] = [1, 2, 3, 4, 5]
myKeys : list[str] = ["a", "b", "c", "d", "e"]
s1 : pd.Series = pd.Series(values, index=myKeys)
s1

a    1
b    2
c    3
d    4
e    5
dtype: int64

In [11]:
values : list[int] = [1, 2, 3, 4, 5]
myKeys : list[list[str]] = [["a1", "b1", "a1", "a1", "b1"],["a", "b", "c", "de", "e"]]
s1 : pd.Series = pd.Series(values, index=myKeys, name='student_data')
s1

a1  a     1
b1  b     2
a1  c     3
    de    4
b1  e     5
Name: student_data, dtype: int64

In [12]:
# we can also change the data type
import numpy as np
values : list[int] = [1, 2, 3, 4, 5]
myKeys : list[list[str]] = [["a1", "b1", "a1", "a1", "b1"],["a", "b", "c", "de", "e"]]
s1 : pd.Series = pd.Series(values, 
                           index=myKeys, 
                           name='student_data',
                           dtype=np.int32)
s1

a1  a     1
b1  b     2
a1  c     3
    de    4
b1  e     5
Name: student_data, dtype: int32

# Pandera

In [15]:

# data to validate
df = pd.DataFrame({
    "column1": [1, 4, 0, 10, 9],
    "column2": [-1.3, -1.4, -2.9, -10.1, -20.4],
    "column3": ["value_1", "value_2", "value_3", "value_2", "value_1"],
})

# define schema
schema = pa.DataFrameSchema({
    "column1": pa.Column(int, checks=pa.Check.le(10)),
    "column2": pa.Column(float, checks=pa.Check.lt(-1.2)),
    "column3": pa.Column(str, checks=[
        pa.Check.str_startswith("value_"),
        # define custom checks as functions that take a series as input and
        # outputs a boolean or boolean Series
        pa.Check(lambda s: s.str.split("_", expand=True).shape[1] == 2)
    ]),
})

validated_df = schema(df)
print(validated_df)

   column1  column2  column3
0        1     -1.3  value_1
1        4     -1.4  value_2
2        0     -2.9  value_3
3       10    -10.1  value_2
4        9    -20.4  value_1


# DataFrames

In [19]:
s1 : pd.Series = pd.Series([1, 2, 3, 4, 5], name='ID')
s2 : pd.Series = pd.Series(['01', '02', '03', '04', '05'], name='Code')
s3 : pd.Series = pd.Series(['Wasim', 'Usman', 'Ali', 'Qasim', 'Nasir'], name='Student Name')
result = pd.DataFrame({'ID' : s1, 'Code' : s2, 'Name' : s3})
result

Unnamed: 0,ID,Code,Name
0,1,1,Wasim
1,2,2,Usman
2,3,3,Ali
3,4,4,Qasim
4,5,5,Nasir


In [21]:
s1 : pd.Series = pd.Series([1, 2, 3, 4, 5], name='ID')
s2 : pd.Series = pd.Series(['01', '02', '03', '04', '05'], name='Code')
s3 : pd.Series = pd.Series(['Wasim', 'Usman', 'Ali', 'Qasim', 'Nasir'], name='Student Name')
result = pd.concat([s1, s2, s3], axis=1)
result

Unnamed: 0,ID,Code,Student Name
0,1,1,Wasim
1,2,2,Usman
2,3,3,Ali
3,4,4,Qasim
4,5,5,Nasir


In [22]:
s1 : pd.Series = pd.Series([1, 2, 3, 4, 5], name='ID')
s2 : pd.Series = pd.Series(['01', '02', '03', '04', '05'], name='Code')
s3 : pd.Series = pd.Series(['Wasim', 'Usman', 'Ali', 'Qasim', 'Nasir'], name='Student Name')
result = pd.DataFrame([s1, s2, s3])
result

Unnamed: 0,0,1,2,3,4
ID,1,2,3,4,5
Code,01,02,03,04,05
Student Name,Wasim,Usman,Ali,Qasim,Nasir


In [27]:
data : list[list[int]] = [[1, 2, 3], 
                          [4, 5, 6], 
                          [7, 8, 9]]
df : pd.DataFrame = pd.DataFrame(data, columns=['A', 'B', 'C'], index=['X', 'Y', 'Z'])
df

Unnamed: 0,A,B,C
X,1,2,3
Y,4,5,6
Z,7,8,9


In [28]:
df.columns

Index(['A', 'B', 'C'], dtype='object')

In [29]:
df.index

Index(['X', 'Y', 'Z'], dtype='object')

In [30]:
df.values

array([[1, 2, 3],
       [4, 5, 6],
       [7, 8, 9]])

In [34]:
from nptyping import NDArray, Shape
from typing import Any

data : NDArray[Shape["10, 10"], Any] = np.arange(10*10).reshape(10, 10)
data

array([[ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9],
       [10, 11, 12, 13, 14, 15, 16, 17, 18, 19],
       [20, 21, 22, 23, 24, 25, 26, 27, 28, 29],
       [30, 31, 32, 33, 34, 35, 36, 37, 38, 39],
       [40, 41, 42, 43, 44, 45, 46, 47, 48, 49],
       [50, 51, 52, 53, 54, 55, 56, 57, 58, 59],
       [60, 61, 62, 63, 64, 65, 66, 67, 68, 69],
       [70, 71, 72, 73, 74, 75, 76, 77, 78, 79],
       [80, 81, 82, 83, 84, 85, 86, 87, 88, 89],
       [90, 91, 92, 93, 94, 95, 96, 97, 98, 99]])

In [36]:
df : pd.DataFrame = pd.DataFrame(data)
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,0,1,2,3,4,5,6,7,8,9
1,10,11,12,13,14,15,16,17,18,19
2,20,21,22,23,24,25,26,27,28,29
3,30,31,32,33,34,35,36,37,38,39
4,40,41,42,43,44,45,46,47,48,49
5,50,51,52,53,54,55,56,57,58,59
6,60,61,62,63,64,65,66,67,68,69
7,70,71,72,73,74,75,76,77,78,79
8,80,81,82,83,84,85,86,87,88,89
9,90,91,92,93,94,95,96,97,98,99


In [37]:
df : pd.DataFrame = pd.DataFrame(data, columns=list('ABCDEFGHIJ'))
df

Unnamed: 0,A,B,C,D,E,F,G,H,I,J
0,0,1,2,3,4,5,6,7,8,9
1,10,11,12,13,14,15,16,17,18,19
2,20,21,22,23,24,25,26,27,28,29
3,30,31,32,33,34,35,36,37,38,39
4,40,41,42,43,44,45,46,47,48,49
5,50,51,52,53,54,55,56,57,58,59
6,60,61,62,63,64,65,66,67,68,69
7,70,71,72,73,74,75,76,77,78,79
8,80,81,82,83,84,85,86,87,88,89
9,90,91,92,93,94,95,96,97,98,99


In [41]:
ndf : list[pd.DataFrame] = pd.read_html('https://www.w3schools.com/python/python_datatypes.asp')
ndf[0]

Unnamed: 0,0,1
0,Text Type:,str
1,Numeric Types:,"int, float, complex"
2,Sequence Types:,"list, tuple, range"
3,Mapping Type:,dict
4,Set Types:,"set, frozenset"
5,Boolean Type:,bool
6,Binary Types:,"bytes, bytearray, memoryview"
7,None Type:,NoneType


In [43]:
# it can also read json
json_data : pd.DataFrame = pd.read_json('https://microsoftedge.github.io/Demos/json-dummy-data/64KB-min.json')
json_data

Unnamed: 0,name,language,id,bio,version
0,Adeel Solangi,Sindhi,V59OF92YF627HFY0,Donec lobortis eleifend condimentum. Cras dict...,6.10
1,Afzal Ghaffar,Sindhi,ENTOCR13RSCLZ6KU,"Aliquam sollicitudin ante ligula, eget malesua...",1.88
2,Aamir Solangi,Sindhi,IAKPO3R4761JDRVG,Vestibulum pharetra libero et velit gravida eu...,7.27
3,Abla Dilmurat,Uyghur,5ZVOEPMJUI4MB4EN,Donec lobortis eleifend condimentum. Morbi ac ...,2.53
4,Adil Eli,Uyghur,6VTI8X6LL0MMPJCC,"Vivamus id faucibus velit, id posuere leo. Mor...",6.49
...,...,...,...,...,...
192,Kristín Sigurðardóttir,Icelandic,ZP5TBBYX6RI2UJ31,Cras dictum dolor lacinia lectus vehicula rutr...,2.80
193,Rohini Vasav,Hindi,UEFML43TCGS04KWM,"Ut accumsan, est vel fringilla varius, purus a...",9.30
194,Sunil Kapoor,Hindi,VY2A0APGVHK5NAW2,"Proin tempus eu risus nec mattis. Ut dictum, l...",8.04
195,Zamokuhle Zulu,isiZulu,XU7BX2F8M5PVZ1EF,Etiam congue dignissim volutpat. Phasellus tin...,8.39


In [44]:
pd.read_csv?

[0;31mSignature:[0m
[0mpd[0m[0;34m.[0m[0mread_csv[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0mfilepath_or_buffer[0m[0;34m:[0m [0;34m'FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str]'[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0;34m*[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0msep[0m[0;34m:[0m [0;34m'str | None | lib.NoDefault'[0m [0;34m=[0m [0;34m<[0m[0mno_default[0m[0;34m>[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mdelimiter[0m[0;34m:[0m [0;34m'str | None | lib.NoDefault'[0m [0;34m=[0m [0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mheader[0m[0;34m:[0m [0;34m"int | Sequence[int] | None | Literal['infer']"[0m [0;34m=[0m [0;34m'infer'[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mnames[0m[0;34m:[0m [0;34m'Sequence[Hashable] | None | lib.NoDefault'[0m [0;34m=[0m [0;34m<[0m[0mno_default[0m[0;34m>[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mindex_col[0m[0;34m:[0m [0;34m'IndexLabel | Literal[False] | None'[0m [0

In [46]:
dir(pa.check)

['__annotations__',
 '__builtins__',
 '__call__',
 '__class__',
 '__closure__',
 '__code__',
 '__defaults__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__get__',
 '__getattribute__',
 '__getstate__',
 '__globals__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__kwdefaults__',
 '__le__',
 '__lt__',
 '__module__',
 '__name__',
 '__ne__',
 '__new__',
 '__qualname__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__type_params__']

# Slicing & Indexing
* series_variable[index]
* dataframe
    * loc
    * iloc
    * at
    * iat

In [47]:
my_series : pd.Series = pd.Series([1, 2, 3, 4, 5])
my_series

0    1
1    2
2    3
3    4
4    5
dtype: int64

In [49]:
my_series[1:4]

1    2
2    3
3    4
dtype: int64

In [50]:
my_series : pd.Series = pd.Series([1, 2, 3, 4, 5], index=['a', 'b', 'c', 'd', 'e'])
my_series

a    1
b    2
c    3
d    4
e    5
dtype: int64

In [51]:
# iloc always used for and consider numberic indexes
my_series : pd.Series = pd.Series([1, 2, 3, 4, 5], index=['a', 'b', 'c', 'd', 'e'])
my_series.iloc[1:3]

b    2
c    3
dtype: int64

In [52]:
# loc always works against keys and include the last value
my_series : pd.Series = pd.Series([1, 2, 3, 4, 5], index=['a', 'b', 'c', 'd', 'e'])
my_series.loc['b':'d']

b    2
c    3
d    4
dtype: int64

In [53]:
my_series : pd.Series = pd.Series([1, 2, 3, 4, 5], index=['a', 'b', 'c', 'd', 'e'])
my_series.at['b']

2

# Regexp pattern

In [62]:
my_string : str = '''
21:22:28        wasim     -1.3  value_1
12:22:22        usman     -1.4  value_2
23:22:28        kashif    -2.9  value_3
21:12:28        ali       -10.1  value_2
14:42:25        nasir     -20.4  value_1
'''

print(my_string)


21:22:28        wasim     -1.3  value_1
12:22:22        usman     -1.4  value_2
23:22:28        kashif    -2.9  value_3
21:12:28        ali       -10.1  value_2
14:42:25        nasir     -20.4  value_1



# Regex not understand. Need practice

In [64]:
import re

patterns: str = r'''
\d{2}:\d{2}:\d{2}       # Time format like 12:34:56
\s+(.*)                 # Any characters after the time, captured in a group
\s+-10\.1               # Exact string '-10.1'
\s+value_?\s?\d{5,6}    # 'value_' possibly followed by a space and then 5 or 6 digits
'''
data : list[list[str]] = re.findall(patterns, my_string)
data


[]