# Deduplicating

In [1]:
import pandas as pd

# Small demo table written row by row so the repeated (color, value)
# pairs are easy to spot: rows 3 and 4 repeat rows 2 and 0.
dframe = pd.DataFrame(
    [
        ('white', 2),
        ('white', 1),
        ('red', 3),
        ('red', 3),
        ('white', 2),
    ],
    columns=['color', 'value'],
)

dframe


Unnamed: 0,color,value
0,white,2
1,white,1
2,red,3
3,red,3
4,white,2


In [2]:
# Flag which rows are duplicates: the first occurrence stays False,
# the second and every later occurrence of the same row is True.
dframe.duplicated(keep='first')

0    False
1    False
2    False
3     True
4     True
dtype: bool

In [3]:
# Inspect the duplicated rows themselves by using the boolean flags as a row filter.
dframe.loc[dframe.duplicated(keep='first')]

Unnamed: 0,color,value
3,red,3
4,white,2


In [4]:
# Remove duplicated rows from the table, keeping the first occurrence of each;
# the result is a new frame and `dframe` is left as it was.
clean = dframe.drop_duplicates(keep='first')

In [5]:
# Display the deduplicated table produced by drop_duplicates().
clean

Unnamed: 0,color,value
0,white,2
1,white,1
2,red,3


# Mapping

In [6]:
# Product table, one tuple per row. Two colours ('rosso', 'verde') are not
# in English; the following cells translate them with replace().
frame = pd.DataFrame(
    [
        ('ball', 'white', 5.56),
        ('mug', 'rosso', 4.20),
        ('pen', 'verde', 1.30),
        ('pencil', 'black', 0.56),
        ('ashtray', 'yellow', 2.75),
    ],
    columns=['item', 'color', 'price'],
)
frame

Unnamed: 0,item,color,price
0,ball,white,5.56
1,mug,rosso,4.2
2,pen,verde,1.3
3,pencil,black,0.56
4,ashtray,yellow,2.75


In [7]:
# Translation table: value to look for -> value to put in its place.
newcolors = dict(rosso='red', verde='green')
newcolors

{'rosso': 'red', 'verde': 'green'}

In [8]:
# Replace the colours according to the dictionary: every cell equal to one of
# its keys is swapped for the matching value; all other cells are untouched.
frame = frame.replace(to_replace=newcolors)
frame

Unnamed: 0,item,color,price
0,ball,white,5.56
1,mug,red,4.2
2,pen,green,1.3
3,pencil,black,0.56
4,ashtray,yellow,2.75


In [9]:
import numpy as np

# Numeric series with two missing entries (positions 2 and 5).
ser = pd.Series(
    data=[1, 3, np.nan, 4, 6, np.nan, 3],
)
ser


0    1.0
1    3.0
2    NaN
3    4.0
4    6.0
5    NaN
6    3.0
dtype: float64

In [10]:
# replace() also works on missing values: every NaN becomes 0.
# The result is a new series; `ser` keeps its NaNs.
ser_filled = ser.replace(to_replace=np.nan, value=0)
ser_filled

0    1.0
1    3.0
2    0.0
3    4.0
4    6.0
5    0.0
6    3.0
dtype: float64

## Adding values with mapping

In [11]:
# Items and colours only; the price column is added from a lookup dict below.
frame = pd.DataFrame(dict(
    item=['ball', 'mug', 'pen', 'pencil', 'ashtray'],
    color=['white', 'red', 'green', 'black', 'yellow'],
))
frame

Unnamed: 0,item,color
0,ball,white
1,mug,red
2,pen,green
3,pencil,black
4,ashtray,yellow


In [12]:
# Price list keyed by item name. It covers more items ('bottle', 'scissors')
# than the table above contains.
prices = dict(
    ball=5.56,
    mug=4.20,
    bottle=1.30,
    scissors=3.41,
    pen=1.30,
    pencil=0.56,
    ashtray=2.75,
)

In [13]:
# Look up each row's item in `prices` and store the result as a new 'price' column.
# 'bottle' and 'scissors' have no matching row in `frame`, so those entries go unused.
# NOTE: this assigns into `frame` itself rather than building a new frame.
frame['price'] = frame['item'].map(prices)
frame

Unnamed: 0,item,color,price
0,ball,white,5.56
1,mug,red,4.2
2,pen,green,1.3
3,pencil,black,0.56
4,ashtray,yellow,2.75


## Rename indexes and columns

In [14]:
# Fresh copy of the product table (default 0..4 index) for the renaming examples.
frame = pd.DataFrame(
    [
        ('ball', 'white', 5.56),
        ('mug', 'red', 4.20),
        ('pen', 'green', 1.30),
        ('pencil', 'black', 0.56),
        ('ashtray', 'yellow', 2.75),
    ],
    columns=['item', 'color', 'price'],
)
frame

Unnamed: 0,item,color,price
0,ball,white,5.56
1,mug,red,4.2
2,pen,green,1.3
3,pencil,black,0.56
4,ashtray,yellow,2.75


In [15]:
# Rename the index values: position 0 -> 'first', 1 -> 'second', and so on.
# rename() returns a new frame; `frame` keeps its numeric index.
reindex = dict(enumerate(['first', 'second', 'third', 'fourth', 'fifth']))
renamed_idx = frame.rename(index=reindex)
renamed_idx

Unnamed: 0,item,color,price
first,ball,white,5.56
second,mug,red,4.2
third,pen,green,1.3
fourth,pencil,black,0.56
fifth,ashtray,yellow,2.75


In [16]:
# Rename the columns as well: apply the index mapping from the previous cell,
# then the column mapping ('color' is not listed, so it keeps its name).
recolumn = dict(item='object', price='value')
renamed_both = frame.rename(index=reindex).rename(columns=recolumn)
renamed_both

Unnamed: 0,object,color,value
first,ball,white,5.56
second,mug,red,4.2
third,pen,green,1.3
fourth,pencil,black,0.56
fifth,ashtray,yellow,2.75


In [18]:
# Shorthand for one-off renames, applied in place: with inplace=True the call
# changes `frame` itself, so no assignment is needed.
# Only index label 1 and column 'item' are renamed; every other label is kept.
frame.rename(index={1: 'first'}, columns={'item': 'object'}, inplace=True)
frame

Unnamed: 0,object,color,price
0,ball,white,5.56
first,mug,red,4.2
2,pen,green,1.3
3,pencil,black,0.56
4,ashtray,yellow,2.75


# Discretization and binning

In [19]:
# Raw scores to be sorted into bins in the cells below.
results = [
    12, 34, 67, 55, 28, 90, 99, 12, 3,
    56, 74, 44, 87, 23, 49, 89, 87,
]

In [20]:
# Bin edges every 25 points from 0 to 100 inclusive: [0, 25, 50, 75, 100].
bins = list(range(0, 101, 25))

In [23]:
# Assign every score to the interval between two consecutive edges in `bins`;
# wrapping the result in a Series gives one labelled row per score.
cat = pd.Series(pd.cut(x=results, bins=bins))
cat

0       (0, 25]
1      (25, 50]
2      (50, 75]
3      (50, 75]
4      (25, 50]
5     (75, 100]
6     (75, 100]
7       (0, 25]
8       (0, 25]
9      (50, 75]
10     (50, 75]
11     (25, 50]
12    (75, 100]
13      (0, 25]
14     (25, 50]
15    (75, 100]
16    (75, 100]
dtype: category
Categories (4, interval[int64, right]): [(0, 25] < (25, 50] < (50, 75] < (75, 100]]

In [26]:
# How many scores landed in each of the fixed-edge bins.
cat.value_counts()

(75, 100]    5
(0, 25]      4
(25, 50]     4
(50, 75]     4
Name: count, dtype: int64

In [28]:
# Same binning, but each interval is shown under a readable name
# (one name per interval, in ascending order of the edges).
bin_names = [
    'unlikely',
    'less likely',
    'likely',
    'highly likely',
]
pd.Series(pd.cut(x=results, bins=bins, labels=bin_names))

0          unlikely
1       less likely
2            likely
3            likely
4       less likely
5     highly likely
6     highly likely
7          unlikely
8          unlikely
9            likely
10           likely
11      less likely
12    highly likely
13         unlikely
14      less likely
15    highly likely
16    highly likely
dtype: category
Categories (4, object): ['unlikely' < 'less likely' < 'likely' < 'highly likely']

In [34]:
# Passing a number instead of edges asks cut() for that many bins
# spanning the range of the data (here 5).
uniform = pd.Series(pd.cut(x=results, bins=5))
uniform

0     (2.904, 22.2]
1      (22.2, 41.4]
2      (60.6, 79.8]
3      (41.4, 60.6]
4      (22.2, 41.4]
5      (79.8, 99.0]
6      (79.8, 99.0]
7     (2.904, 22.2]
8     (2.904, 22.2]
9      (41.4, 60.6]
10     (60.6, 79.8]
11     (41.4, 60.6]
12     (79.8, 99.0]
13     (22.2, 41.4]
14     (41.4, 60.6]
15     (79.8, 99.0]
16     (79.8, 99.0]
dtype: category
Categories (5, interval[float64, right]): [(2.904, 22.2] < (22.2, 41.4] < (41.4, 60.6] < (60.6, 79.8] < (79.8, 99.0]]

In [35]:
# Bin occupancy for the 5 automatically chosen bins; the counts are uneven.
uniform.value_counts()

(79.8, 99.0]     5
(41.4, 60.6]     4
(2.904, 22.2]    3
(22.2, 41.4]     3
(60.6, 79.8]     2
Name: count, dtype: int64

In [31]:
# 5 quantile-based intervals: qcut() picks the edges from the data's quantiles
# rather than from fixed or equally spaced values.
quintiles = pd.Series(pd.qcut(x=results, q=5))
quintiles

0     (2.999, 24.0]
1      (24.0, 46.0]
2      (62.6, 87.0]
3      (46.0, 62.6]
4      (24.0, 46.0]
5      (87.0, 99.0]
6      (87.0, 99.0]
7     (2.999, 24.0]
8     (2.999, 24.0]
9      (46.0, 62.6]
10     (62.6, 87.0]
11     (24.0, 46.0]
12     (62.6, 87.0]
13    (2.999, 24.0]
14     (46.0, 62.6]
15     (87.0, 99.0]
16     (62.6, 87.0]
dtype: category
Categories (5, interval[float64, right]): [(2.999, 24.0] < (24.0, 46.0] < (46.0, 62.6] < (62.6, 87.0] < (87.0, 99.0]]

In [32]:
# Bin occupancy for the quantile-based intervals.
quintiles.value_counts()

(2.999, 24.0]    4
(62.6, 87.0]     4
(24.0, 46.0]     3
(46.0, 62.6]     3
(87.0, 99.0]     3
Name: count, dtype: int64

# Detecting and Filtering Outliers

In [36]:
# 1000 rows x 3 columns of standard-normal samples, then summary statistics.
# NOTE(review): the global NumPy RNG is not seeded here, so the numbers (and the
# outlier rows found below) differ on every run.
randframe = pd.DataFrame(np.random.standard_normal(size=(1000, 3)))
randframe.describe()

Unnamed: 0,0,1,2
count,1000.0,1000.0,1000.0
mean,0.044372,-0.008285,0.008627
std,0.9693,0.999979,0.969368
min,-2.740964,-3.103112,-2.951697
25%,-0.594573,-0.704836,-0.667392
50%,0.035276,0.007687,-0.008043
75%,0.685671,0.66775,0.661739
max,2.965759,2.878379,2.81236


In [38]:
# Standard deviation of each column (one value per column label).
stds = randframe.std(axis=0)

stds

0    0.969300
1    0.999979
2    0.969368
dtype: float64

In [39]:
# Rows holding at least one outlier: a cell counts when its absolute value
# exceeds 3 * std of its own column (`stds` is matched to columns by label).
# NOTE(review): values are compared as-is, i.e. measured from 0, not from the column mean.
outliers = randframe.loc[
    randframe.abs().gt(3 * stds, axis='columns').any(axis='columns')
]
outliers

Unnamed: 0,0,1,2
183,2.965759,0.574156,0.419192
313,1.04667,-3.103112,0.975345
573,0.636324,-0.883087,-2.951697
953,-2.598517,-3.082887,-1.367202


# Permutation

In [40]:
# 5x5 table filled with 0..24 row by row, so every row is easy to recognise
# after it has been moved.
nframe = pd.DataFrame(np.arange(5 * 5).reshape((5, 5)))

nframe


Unnamed: 0,0,1,2,3,4
0,0,1,2,3,4
1,5,6,7,8,9
2,10,11,12,13,14
3,15,16,17,18,19
4,20,21,22,23,24


In [41]:
# Random ordering of the row positions 0..4, e.g. array([3, 4, 0, 1, 2]);
# take() then returns the rows in that order as a new frame.
new_order = np.random.permutation(5)
nframe_shuffled = nframe.take(indices=new_order, axis=0)
nframe_shuffled

Unnamed: 0,0,1,2,3,4
3,15,16,17,18,19
4,20,21,22,23,24
0,0,1,2,3,4
1,5,6,7,8,9
2,10,11,12,13,14


In [43]:
# Partial permutation with an explicit order: only the listed row positions
# are returned, in exactly the order given.
new_order = [3, 4, 2]
nframe_partial = nframe.take(indices=new_order, axis=0)
nframe_partial

Unnamed: 0,0,1,2,3,4
3,15,16,17,18,19
4,20,21,22,23,24
2,10,11,12,13,14


In [44]:
# Pick 3 random row positions with repetition allowed (each drawn independently
# from 0 .. len(nframe) - 1), then fetch those rows.
sample_idx = np.random.randint(low=0, high=len(nframe), size=3)
sample_rows = nframe.take(indices=sample_idx, axis=0)
sample_rows


Unnamed: 0,0,1,2,3,4
0,0,1,2,3,4
1,5,6,7,8,9
0,0,1,2,3,4
