In [66]:
import pandas as pd
import numpy as np
from numpy import nan as NA

ser = pd.Series([1, NA, 3.5, None, 7])
ser


0    1.0
1    NaN
2    3.5
3    NaN
4    7.0
dtype: float64

In [67]:
ser.isna().any()


True

In [68]:
ser.dropna()


0    1.0
2    3.5
4    7.0
dtype: float64

In [69]:
df = pd.DataFrame(np.random.randn(4, 3))
df.loc[2] = None
df.iloc[0, 2] = None
df


Unnamed: 0,0,1,2
0,0.158635,-1.317386,
1,1.234348,0.329145,0.307045
2,,,
3,0.623923,-0.666902,-0.436461


In [70]:
df.isna()


Unnamed: 0,0,1,2
0,False,False,True
1,False,False,False
2,True,True,True
3,False,False,False


In [71]:
df.isna().any()  # columns


0    True
1    True
2    True
dtype: bool

In [72]:
df.T.isna().any()  # rows


0     True
1    False
2     True
3    False
dtype: bool

In [73]:
naRows = df.T.isna().any()
len(naRows[naRows == True])


2

In [74]:
df, df.dropna(), df.dropna(how='all'), df.dropna(thresh=1), df.dropna(
    axis=1, thresh=2)  # thresh doesnt work on cols?


(          0         1         2
 0  0.158635 -1.317386       NaN
 1  1.234348  0.329145  0.307045
 2       NaN       NaN       NaN
 3  0.623923 -0.666902 -0.436461,
           0         1         2
 1  1.234348  0.329145  0.307045
 3  0.623923 -0.666902 -0.436461,
           0         1         2
 0  0.158635 -1.317386       NaN
 1  1.234348  0.329145  0.307045
 3  0.623923 -0.666902 -0.436461,
           0         1         2
 0  0.158635 -1.317386       NaN
 1  1.234348  0.329145  0.307045
 3  0.623923 -0.666902 -0.436461,
           0         1         2
 0  0.158635 -1.317386       NaN
 1  1.234348  0.329145  0.307045
 2       NaN       NaN       NaN
 3  0.623923 -0.666902 -0.436461)

In [75]:
df.fillna(0)


Unnamed: 0,0,1,2
0,0.158635,-1.317386,0.0
1,1.234348,0.329145,0.307045
2,0.0,0.0,0.0
3,0.623923,-0.666902,-0.436461


In [76]:
df.fillna({0: 1, 2: 3})


Unnamed: 0,0,1,2
0,0.158635,-1.317386,3.0
1,1.234348,0.329145,0.307045
2,1.0,,3.0
3,0.623923,-0.666902,-0.436461


In [77]:
df.fillna(method='ffill'), df.fillna(method='ffill', axis=1)


(          0         1         2
 0  0.158635 -1.317386       NaN
 1  1.234348  0.329145  0.307045
 2  1.234348  0.329145  0.307045
 3  0.623923 -0.666902 -0.436461,
           0         1         2
 0  0.158635 -1.317386 -1.317386
 1  1.234348  0.329145  0.307045
 2       NaN       NaN       NaN
 3  0.623923 -0.666902 -0.436461)

In [78]:
df.fillna(method='bfill')


Unnamed: 0,0,1,2
0,0.158635,-1.317386,0.307045
1,1.234348,0.329145,0.307045
2,0.623923,-0.666902,-0.436461
3,0.623923,-0.666902,-0.436461


In [79]:
ser2 = pd.Series([None, 2, None, 1, 5])
ser2, ser2.fillna(ser2.mean())


(0    NaN
 1    2.0
 2    NaN
 3    1.0
 4    5.0
 dtype: float64,
 0    2.666667
 1    2.000000
 2    2.666667
 3    1.000000
 4    5.000000
 dtype: float64)

In [80]:
df, df.fillna({0: df[0].mean()})


(          0         1         2
 0  0.158635 -1.317386       NaN
 1  1.234348  0.329145  0.307045
 2       NaN       NaN       NaN
 3  0.623923 -0.666902 -0.436461,
           0         1         2
 0  0.158635 -1.317386       NaN
 1  1.234348  0.329145  0.307045
 2  0.672302       NaN       NaN
 3  0.623923 -0.666902 -0.436461)

In [81]:
df1 = pd.DataFrame({'v1': ['a', 'b', 'c']*3, 'v2': [1, 2]*4 + [1]})
df1, df1.duplicated()


(  v1  v2
 0  a   1
 1  b   2
 2  c   1
 3  a   2
 4  b   1
 5  c   2
 6  a   1
 7  b   2
 8  c   1,
 0    False
 1    False
 2    False
 3    False
 4    False
 5    False
 6     True
 7     True
 8     True
 dtype: bool)

In [82]:
df1.drop_duplicates()


Unnamed: 0,v1,v2
0,a,1
1,b,2
2,c,1
3,a,2
4,b,1
5,c,2


In [83]:
df2 = pd.DataFrame({'city': ['Warszawa', 'Lublin', 'Grodzisk Maz.', 'Krakow', 'Poznan', 'Lodz',
                   'Pruszkow', 'Gdansk', 'Wroclaw'], 'pop (thousand)': [1790, 337, 32, 780, 543, 667, 62, +470, 642]})
df2


Unnamed: 0,city,pop (thousand)
0,Warszawa,1790
1,Lublin,337
2,Grodzisk Maz.,32
3,Krakow,780
4,Poznan,543
5,Lodz,667
6,Pruszkow,62
7,Gdansk,470
8,Wroclaw,642


In [84]:
cityToVoivodship = {'Warszawa': 'Mazowieckie', 'Krakow': 'Malopolskie', 'Poznan': 'Wielkopolskie', 'Lublin': 'Lubelskie', 'Lodz': 'Lodzkie', 'Gdansk': 'Pomorskie', 'Wroclaw': 'Dolnoslaskie', 'Grodzisk Maz.':'Mazowieckie', 'Pruszkow':'Mazowieckie'}
df2['voivodship'] = df2['city'].map(cityToVoivodship)
df2

Unnamed: 0,city,pop (thousand),voivodship
0,Warszawa,1790,Mazowieckie
1,Lublin,337,Lubelskie
2,Grodzisk Maz.,32,Mazowieckie
3,Krakow,780,Malopolskie
4,Poznan,543,Wielkopolskie
5,Lodz,667,Lodzkie
6,Pruszkow,62,Mazowieckie
7,Gdansk,470,Pomorskie
8,Wroclaw,642,Dolnoslaskie


In [85]:
df2['pop (million)'] = df2['pop (thousand)'].map(lambda x: x/1000)
df2

Unnamed: 0,city,pop (thousand),voivodship,pop (million)
0,Warszawa,1790,Mazowieckie,1.79
1,Lublin,337,Lubelskie,0.337
2,Grodzisk Maz.,32,Mazowieckie,0.032
3,Krakow,780,Malopolskie,0.78
4,Poznan,543,Wielkopolskie,0.543
5,Lodz,667,Lodzkie,0.667
6,Pruszkow,62,Mazowieckie,0.062
7,Gdansk,470,Pomorskie,0.47
8,Wroclaw,642,Dolnoslaskie,0.642


In [86]:
df2['voivodship'] = df2['voivodship'].str.lower()
df2

Unnamed: 0,city,pop (thousand),voivodship,pop (million)
0,Warszawa,1790,mazowieckie,1.79
1,Lublin,337,lubelskie,0.337
2,Grodzisk Maz.,32,mazowieckie,0.032
3,Krakow,780,malopolskie,0.78
4,Poznan,543,wielkopolskie,0.543
5,Lodz,667,lodzkie,0.667
6,Pruszkow,62,mazowieckie,0.062
7,Gdansk,470,pomorskie,0.47
8,Wroclaw,642,dolnoslaskie,0.642


In [87]:
df2.replace({'mazowieckie':'stoleczne', 'wielkoposkie':'wielkopolskie'})
df2

Unnamed: 0,city,pop (thousand),voivodship,pop (million)
0,Warszawa,1790,mazowieckie,1.79
1,Lublin,337,lubelskie,0.337
2,Grodzisk Maz.,32,mazowieckie,0.032
3,Krakow,780,malopolskie,0.78
4,Poznan,543,wielkopolskie,0.543
5,Lodz,667,lodzkie,0.667
6,Pruszkow,62,mazowieckie,0.062
7,Gdansk,470,pomorskie,0.47
8,Wroclaw,642,dolnoslaskie,0.642


In [88]:
df2.replace(['mazowieckie', 'wielkopolskie'], ['stoleczne', 'poznanskie'])

Unnamed: 0,city,pop (thousand),voivodship,pop (million)
0,Warszawa,1790,stoleczne,1.79
1,Lublin,337,lubelskie,0.337
2,Grodzisk Maz.,32,stoleczne,0.032
3,Krakow,780,malopolskie,0.78
4,Poznan,543,poznanskie,0.543
5,Lodz,667,lodzkie,0.667
6,Pruszkow,62,stoleczne,0.062
7,Gdansk,470,pomorskie,0.47
8,Wroclaw,642,dolnoslaskie,0.642


In [89]:
df2.rename(index=lambda x: x*10, columns=str.title)

Unnamed: 0,City,Pop (Thousand),Voivodship,Pop (Million)
0,Warszawa,1790,mazowieckie,1.79
10,Lublin,337,lubelskie,0.337
20,Grodzisk Maz.,32,mazowieckie,0.032
30,Krakow,780,malopolskie,0.78
40,Poznan,543,wielkopolskie,0.543
50,Lodz,667,lodzkie,0.667
60,Pruszkow,62,mazowieckie,0.062
70,Gdansk,470,pomorskie,0.47
80,Wroclaw,642,dolnoslaskie,0.642


In [90]:
populationBins = [50, 100, 500, 1000]
cats = pd.cut(list(df2['pop (thousand)']), populationBins)
cats, cats.codes

([NaN, (100.0, 500.0], NaN, (500.0, 1000.0], (500.0, 1000.0], (500.0, 1000.0], (50.0, 100.0], (100.0, 500.0], (500.0, 1000.0]]
 Categories (3, interval[int64, right]): [(50, 100] < (100, 500] < (500, 1000]],
 array([-1,  1, -1,  2,  2,  2,  0,  1,  2], dtype=int8))

In [100]:
correctedBins = [0, 50, 100, 500, 1000, float("inf")]
popCats = ['<50', '<100', '<500', '<1000', 'over 1000']
cats = pd.cut(list(df2['pop (thousand)']), correctedBins)
cats, cats.codes

([(1000.0, inf], (100.0, 500.0], (0.0, 50.0], (500.0, 1000.0], (500.0, 1000.0], (500.0, 1000.0], (50.0, 100.0], (100.0, 500.0], (500.0, 1000.0]]
 Categories (5, interval[float64, right]): [(0.0, 50.0] < (50.0, 100.0] < (100.0, 500.0] < (500.0, 1000.0] < (1000.0, inf]],
 array([4, 2, 0, 3, 3, 3, 1, 2, 3], dtype=int8))

In [103]:
df2['pop category'] = pd.Series(cats.codes)
# df2.replace(df2['pop category'], pd.Series(cats.codes))
df2['pop category'] = df2['pop category'].replace(range(5), list(popCats))
df2

Unnamed: 0,city,pop (thousand),voivodship,pop (million),pop category
0,Warszawa,1790,mazowieckie,1.79,over 1000
1,Lublin,337,lubelskie,0.337,<500
2,Grodzisk Maz.,32,mazowieckie,0.032,<50
3,Krakow,780,malopolskie,0.78,<1000
4,Poznan,543,wielkopolskie,0.543,<1000
5,Lodz,667,lodzkie,0.667,<1000
6,Pruszkow,62,mazowieckie,0.062,<100
7,Gdansk,470,pomorskie,0.47,<500
8,Wroclaw,642,dolnoslaskie,0.642,<1000


In [104]:
pd.value_counts(cats)

(500.0, 1000.0]    4
(100.0, 500.0]     2
(0.0, 50.0]        1
(50.0, 100.0]      1
(1000.0, inf]      1
dtype: int64