## Working with DataFrames

In [16]:
import pandas as pd
import numpy as np

In [17]:
# version check
pd.__version__

'2.2.2'

## What is a DataFrame?

In [18]:
# some python list
names = ['a', 'b', 'c', 'd']
ages = [20, 25, 19, 26]
married = [False, False, False, True]

In [19]:
# pandas Series
# one-dimensional labelled array
# NOTE: a cell only displays its last expression, so the first two Series
# below are built and discarded; only the one indexed by `names` is shown
pd.Series(names, name='names')
pd.Series(ages, name='ages')
pd.Series(ages, index=names)

a    20
b    25
c    19
d    26
dtype: int64

In [20]:
# pandas DataFrame
# two-dimensional: each dict key becomes a column, each list supplies its values
df = pd.DataFrame({'name': names, 'age': ages, 'married': married})
df

Unnamed: 0,name,age,married
0,a,20,False
1,b,25,False
2,c,19,False
3,d,26,True


In [21]:
# access a single cell by position: row 1, column 0 (the 'name' column)
df.iloc[1, 0]

'b'

In [22]:
# ndim
df.ndim

2

In [23]:
df.shape

(4, 3)

In [24]:
# df is collection of series
df.name

0    a
1    b
2    c
3    d
Name: name, dtype: object

In [25]:
df.dtypes

name       object
age         int64
married      bool
dtype: object

In [26]:
# add new name in name list
names.append('e')

In [27]:
names

['a', 'b', 'c', 'd', 'e']

In [28]:
len(names)

5

In [29]:
len(ages)

4

In [30]:
# create dataframe: `names` now has 5 items while `ages` and `married` have 4,
# so the constructor rejects the mismatched column lengths.
# The error is the point of this cell, so catch and show it instead of letting
# it abort a Restart & Run All.
try:
    pd.DataFrame({'name': names, 'age': ages, 'married': married})
except ValueError as err:
    print(f'ValueError: {err}')

ValueError: All arrays must be of the same length

ValueError: All arrays must be of the same length

In [32]:
names.remove('e')

In [33]:
names

['a', 'b', 'c', 'd']

In [34]:
pd.DataFrame({'name': names, 'age': ages, 'married': married})

Unnamed: 0,name,age,married
0,a,20,False
1,b,25,False
2,c,19,False
3,d,26,True


In [35]:
# from a dict
dictionary = {
    'e': 30,
    'f': 40,
    'g': 50
}

In [39]:
pd.DataFrame({'name': dictionary})

Unnamed: 0,name
e,30
f,40
g,50


In [43]:
# from tuples
tuple_name = tuple(names)
tuple_name

('a', 'b', 'c', 'd')

In [41]:
tuple_age = tuple(ages)

In [42]:
pd.DataFrame({'name': tuple_name, 
              'ages': tuple_age})

Unnamed: 0,name,ages
0,a,20
1,b,25
2,c,19
3,d,26


In [45]:
# from series
series_name = pd.Series(names)
series_name

0    a
1    b
2    c
3    d
dtype: object

In [46]:
series_ages = pd.Series(ages)

In [47]:
pd.DataFrame({
    'name': series_name,
    'ages': series_ages
})

Unnamed: 0,name,ages
0,a,20
1,b,25
2,c,19
3,d,26


In [48]:
# from a nested dictionary
enumerate(names)

<enumerate at 0x1fd44625710>

In [49]:
list(enumerate(names))

[(0, 'a'), (1, 'b'), (2, 'c'), (3, 'd')]

In [52]:
# position -> name mapping, built directly from the enumerated pairs
dict_name = dict(enumerate(names))
dict_name

{0: 'a', 1: 'b', 2: 'c', 3: 'd'}

In [53]:
# position -> age mapping, built directly from the enumerated pairs
dict_age = dict(enumerate(ages))
dict_age

{0: 20, 1: 25, 2: 19, 3: 26}

In [54]:
# position -> married flag mapping, built directly from the enumerated pairs
dict_married = dict(enumerate(married))
dict_married

{0: False, 1: False, 2: False, 3: True}

In [55]:
def convertToDict(input_list):
    """Map each 0-based position in ``input_list`` to the element stored there.

    Parameters
    ----------
    input_list : iterable
        The values to index, in order.

    Returns
    -------
    dict
        ``{0: first, 1: second, ...}`` in the same order as the input.
    """
    return dict(enumerate(input_list))

In [56]:
convertToDict(married)

{0: False, 1: False, 2: False, 3: True}

In [57]:
convertToDict(ages)

{0: 20, 1: 25, 2: 19, 3: 26}

In [58]:
# dataframe
pd.DataFrame({'name': dict_name, 'age': dict_age, 'married': dict_married})

Unnamed: 0,name,age,married
0,a,20,False
1,b,25,False
2,c,19,False
3,d,26,True


In [60]:
# row level: each dict in the list becomes one row, its keys the columns
pd.DataFrame([{
    'name': 'A',
    'age': 30,
    'married': True
}])

Unnamed: 0,name,age,married
0,A,30,True


In [61]:
# zip method
list(zip(names, ages, married))

[('a', 20, False), ('b', 25, False), ('c', 19, False), ('d', 26, True)]

In [67]:
# one dict per row, built by walking the three lists in parallel;
# the loop variables use singular names so they do not shadow the
# `names` / `ages` / `married` lists being zipped
row = [
    {'name': name, 'age': age, 'married': is_married}
    for name, age, is_married in zip(names, ages, married)
]
row

[{'name': 'a', 'age': 20, 'married': False},
 {'name': 'b', 'age': 25, 'married': False},
 {'name': 'c', 'age': 19, 'married': False},
 {'name': 'd', 'age': 26, 'married': True}]

In [66]:
pd.DataFrame(row)

Unnamed: 0,name,age,married
0,a,20,False
1,b,25,False
2,c,19,False
3,d,26,True


In [68]:
# info() method
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4 entries, 0 to 3
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   name     4 non-null      object
 1   age      4 non-null      int64 
 2   married  4 non-null      bool  
dtypes: bool(1), int64(1), object(1)
memory usage: 200.0+ bytes


In [69]:
# verbose parameter
df.info(verbose=False)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4 entries, 0 to 3
Columns: 3 entries, name to married
dtypes: bool(1), int64(1), object(1)
memory usage: 200.0+ bytes


In [72]:
# max_cols parameter: the frame has 3 columns, more than max_cols=1,
# so info() prints the truncated summary instead of the per-column table
df.info(max_cols=1)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4 entries, 0 to 3
Columns: 3 entries, name to married
dtypes: bool(1), int64(1), object(1)
memory usage: 200.0+ bytes


In [73]:
# memory usage
df.info(memory_usage=False)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4 entries, 0 to 3
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   name     4 non-null      object
 1   age      4 non-null      int64 
 2   married  4 non-null      bool  
dtypes: bool(1), int64(1), object(1)

In [74]:
df.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4 entries, 0 to 3
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   name     4 non-null      object
 1   age      4 non-null      int64 
 2   married  4 non-null      bool  
dtypes: bool(1), int64(1), object(1)
memory usage: 368.0 bytes


In [None]:
# raw sample records kept as one multi-line string; each line carries
# pin / name / balance fields written as "key: value" pairs separated by commas
# (note the inconsistent casing of the name on the second record)
# NOTE(review): presumably parsed into a DataFrame in a later cell — confirm
data = """
pin: 76032, name: Tasnim, balance: 800,
pin: 36794, name: tasnim, balance: 1000,
pin: 83429, name: Tasnim, balance: 10000,
pin: 34068, name: Munni, balance: 50000
"""