In [1]:
import pandas as pd
import numpy as np

#  Removing Duplicates

In [2]:
# Demo frame for duplicate handling: the final row repeats the (k1, k2)
# pair of the row just before it.
data = pd.DataFrame(
    {
        'k1': ['one', 'two', 'one', 'two', 'one', 'two', 'two'],
        'k2': [1, 1, 2, 3, 3, 4, 4],
    }
)


In [3]:
data

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4
6,two,4


In [4]:
data.duplicated()

0    False
1    False
2    False
3    False
4    False
5    False
6     True
dtype: bool

In [5]:
data.drop_duplicates(['k1'])

Unnamed: 0,k1,k2
0,one,1
1,two,1


In [6]:
data.drop_duplicates(['k1'], keep='last')

Unnamed: 0,k1,k2
4,one,3
6,two,4


# Transforming Data Using a Function or Mapping

In [7]:
# Meat names (with inconsistent capitalisation) and their weights in ounces.
data = pd.DataFrame(
    {
        'food': [
            'bacon',
            'pulled pork',
            'bacon',
            'Pastrami',
            'Corned beef',
            'Bacon',
            'pastrami',
            'honey ham',
            'novalox',
        ],
        'ounces': [4, 3, 12, 6, 7.5, 8, 3, 5, 6],
    }
)

In [8]:
data

Unnamed: 0,food,ounces
0,bacon,4.0
1,pulled pork,3.0
2,bacon,12.0
3,Pastrami,6.0
4,Corned beef,7.5
5,Bacon,8.0
6,pastrami,3.0
7,honey ham,5.0
8,novalox,6.0


In [13]:
# Lookup table from a (lowercase) meat name to the animal it comes from.
meat_to_animal = dict([
    ('bacon', 'pig'),
    ('pulled pork', 'pig'),
    ('pastrami', 'cow'),
    ('corned beef', 'cow'),
    ('honey ham', 'pig'),
    ('novalox', 'salmon'),
])

In [14]:
lowerdcased = data['food'].str.lower()
lowerdcased

0          bacon
1    pulled pork
2          bacon
3       pastrami
4    corned beef
5          bacon
6       pastrami
7      honey ham
8        novalox
Name: food, dtype: object

1. The `map` method on a Series accepts a function or a dict-like object containing a mapping.
2. `map` is a convenient way to perform element-wise transformations and other data-cleaning operations.

In [15]:
data['animal'] = lowerdcased.map(meat_to_animal)
data

Unnamed: 0,food,ounces,animal
0,bacon,4.0,pig
1,pulled pork,3.0,pig
2,bacon,12.0,pig
3,Pastrami,6.0,cow
4,Corned beef,7.5,cow
5,Bacon,8.0,pig
6,pastrami,3.0,cow
7,honey ham,5.0,pig
8,novalox,6.0,salmon


In [16]:
data['food'].map(lambda x: meat_to_animal[x.lower()])

0       pig
1       pig
2       pig
3       cow
4       cow
5       pig
6       cow
7       pig
8    salmon
Name: food, dtype: object

# Replacing Values

In [17]:
# Float series containing the sentinel values (-999, -1000) replaced below.
data = pd.Series([1.0, -999.0, 2.0, -999.0, -1000.0, 3.0])

# Each call returns a new Series and leaves `data` untouched; only the last
# expression in the cell is displayed.
data.replace(to_replace=-999, value=np.nan)               # one value -> NaN
data.replace(to_replace=[-999, -1000], value=np.nan)      # several values -> NaN
data.replace(to_replace=[-999, -1000], value=[1, 2])      # pairwise substitution
data.replace(to_replace={-999: np.nan, -1000: 0})         # dict form of the same idea

0    1.0
1    NaN
2    2.0
3    NaN
4    0.0
5    3.0
dtype: float64

# Renaming Axis Indexes  

In [18]:

# 3x4 integer frame with state names as row labels.
data = pd.DataFrame(
    np.arange(12).reshape(3, 4),
    index=['Ohio', 'Colorado', 'New York'],
    columns=['one', 'two', 'three', 'four'],
)
data


def transform(x):
    """Return the first four characters of a label, upper-cased."""
    return x[:4].upper()


# Index.map returns a new Index; assigning it back modifies `data` in place.
data.index.map(transform)
data.index = data.index.map(transform)

# rename returns a modified copy and leaves `data` itself unchanged.
data.rename(index=str.title, columns=str.upper)
data.rename(
    index={'OHIO': 'INDIANA'},
    columns={'three': 'peekaboo'},
)

Unnamed: 0,one,two,peekaboo,four
INDIANA,0,1,2,3
COLO,4,5,6,7
NEW,8,9,10,11


# Discretization and Binning 

Suppose you have data about a group of people in a study, and you want to group them into discrete age buckets:

In [19]:
ages = [20, 22, 25, 27, 21, 23, 37, 31, 61, 45, 41, 32]

Let's divide these into bins of 18 to 25, 26 to 35, 36 to 60 and finally 61 and older.

In [20]:
bins = [18, 25, 35, 60, 100]
cats = pd.cut(ages, bins)
cats

[(18, 25], (18, 25], (18, 25], (25, 35], (18, 25], ..., (25, 35], (60, 100], (35, 60], (35, 60], (25, 35]]
Length: 12
Categories (4, interval[int64]): [(18, 25] < (25, 35] < (35, 60] < (60, 100]]

In [21]:
cats.codes

array([0, 0, 0, 1, 0, 0, 2, 1, 3, 2, 2, 1], dtype=int8)

In [22]:
cats.categories

IntervalIndex([(18, 25], (25, 35], (35, 60], (60, 100]],
              closed='right',
              dtype='interval[int64]')

In [23]:
# Count how many values fall into each bin. The top-level pd.value_counts()
# helper is deprecated (pandas >= 2.1), so wrap the Categorical in a Series
# and use the method form, which likewise sorts by descending count.
pd.Series(cats).value_counts()

(18, 25]     5
(35, 60]     3
(25, 35]     3
(60, 100]    1
dtype: int64

In [24]:
group_names = ['Youth', 'YoungAdult', 'MiddleAged', 'Senior']
pd.cut(ages, bins, labels = group_names)
data = np.random.rand(20)
data

array([0.67276583, 0.00256335, 0.9937732 , 0.82498912, 0.96935856,
       0.10337314, 0.17138526, 0.66342495, 0.33697757, 0.26072271,
       0.11610387, 0.88444854, 0.60770183, 0.21689021, 0.03497774,
       0.78484552, 0.56872356, 0.57699134, 0.36314453, 0.36254942])

In [25]:
pd.cut(data, 4, precision =2)

[(0.5, 0.75], (0.0016, 0.25], (0.75, 0.99], (0.75, 0.99], (0.75, 0.99], ..., (0.75, 0.99], (0.5, 0.75], (0.5, 0.75], (0.25, 0.5], (0.25, 0.5]]
Length: 20
Categories (4, interval[float64]): [(0.0016, 0.25] < (0.25, 0.5] < (0.5, 0.75] < (0.75, 0.99]]

A closely related function, qcut, bins the data based on sample quantiles. Depending on the distribution of the data, using cut will not usually result in each bin having the same number of data points. Since qcut uses sample quantiles instead, by definition you will obtain roughly equal-size bins:

In [26]:
data = np.random.randn(1000)  # Normally distributed
cats = pd.qcut(data, 4)  # Cut into quartiles
# The top-level pd.value_counts() helper is deprecated (pandas >= 2.1);
# wrap the Categorical in a Series and call the method instead.
pd.Series(cats).value_counts()

(0.747, 3.451]      250
(0.0355, 0.747]     250
(-0.639, 0.0355]    250
(-3.516, -0.639]    250
dtype: int64

In [27]:
pd.qcut(data, [0, 0.1, 0.5, 0.9, 1])

[(0.0355, 1.295], (1.295, 3.451], (0.0355, 1.295], (1.295, 3.451], (1.295, 3.451], ..., (1.295, 3.451], (0.0355, 1.295], (-1.239, 0.0355], (-1.239, 0.0355], (-1.239, 0.0355]]
Length: 1000
Categories (4, interval[float64]): [(-3.516, -1.239] < (-1.239, 0.0355] < (0.0355, 1.295] < (1.295, 3.451]]

# Detecting and Filtering Outliers

In [28]:
data = pd.DataFrame(np.random.randn(1000,4))

In [30]:
data.describe()

Unnamed: 0,0,1,2,3
count,1000.0,1000.0,1000.0,1000.0
mean,-0.019399,0.006437,0.008188,-0.026709
std,0.985229,1.034343,1.015705,1.040389
min,-3.204201,-3.026197,-3.060892,-3.540787
25%,-0.684025,-0.669534,-0.704891,-0.79221
50%,-0.008338,0.021598,0.040555,-0.069232
75%,0.634816,0.688058,0.719048,0.725262
max,3.188895,3.359174,3.799926,2.874496


In [31]:
col = data[2]

In [33]:
col[np.abs(col) > 3]

414   -3.060892
898    3.799926
904    3.635030
Name: 2, dtype: float64

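To select all rows having a value exceeding 3 or -3, you can use the any method on a boolean DataFrame. Values can also be set based on these criteria; the cell after the selection caps values outside the interval -3 to 3: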

In [53]:
# Select all rows having a value exceeding 3 or -3: `any` along axis=1
# reduces each row of the boolean frame to a single True/False flag.
data[(data.abs() > 3).any(axis=1)]

Unnamed: 0,0,1,2,3
135,1.900922,-3.026197,-1.033015,1.93345
414,-0.812429,-1.370275,-3.060892,0.10836
435,-0.41616,3.100117,-0.462754,-0.660778
617,-0.170495,3.359174,-1.849303,0.722903
714,3.188895,1.277111,0.420808,0.410674
742,-3.204201,0.352732,0.87709,1.480112
764,0.46134,3.272718,0.437307,0.346496
852,-0.282202,0.273615,-0.465713,-3.540787
898,-0.060777,-0.404156,3.799926,-0.740795
904,0.236378,-0.285083,3.63503,-0.737637


In [61]:
# Wherever the absolute value exceeds 3, overwrite the entry with 3 carrying
# the sign of the original value. np.sign(data) produces 1 and -1 values
# based on whether the values in data are positive or negative.
data[data.abs() > 3] = 3 * np.sign(data)

In [62]:
data.describe() ## see the min and max values all are capped between -3 and 3

Unnamed: 0,0,1,2,3
count,1000.0,1000.0,1000.0,1000.0
mean,-0.019384,0.005732,0.006814,-0.026169
std,0.983991,1.03204,1.010765,1.0387
min,-3.0,-3.0,-3.0,-3.0
25%,-0.684025,-0.669534,-0.704891,-0.79221
50%,-0.008338,0.021598,0.040555,-0.069232
75%,0.634816,0.688058,0.719048,0.725262
max,3.0,3.0,3.0,2.874496


# Permutation and Random Sampling

Permuting (Randomly Reordering)

In [3]:
# 5x4 frame holding the integers 0..19 in row-major order.
df = pd.DataFrame(np.arange(20).reshape(5, 4))

In [4]:
sampler = np.random.permutation(5)
sampler

array([1, 4, 2, 0, 3])

In [5]:
df

Unnamed: 0,0,1,2,3
0,0,1,2,3
1,4,5,6,7
2,8,9,10,11
3,12,13,14,15
4,16,17,18,19


In [6]:
df.take(sampler)

Unnamed: 0,0,1,2,3
1,4,5,6,7
4,16,17,18,19
2,8,9,10,11
0,0,1,2,3
3,12,13,14,15


In [12]:
df.sample(n=3, replace=True)

Unnamed: 0,0,1,2,3
1,4,5,6,7
0,0,1,2,3
0,0,1,2,3


In [13]:
df

Unnamed: 0,0,1,2,3
0,0,1,2,3
1,4,5,6,7
2,8,9,10,11
3,12,13,14,15
4,16,17,18,19


In [15]:
df.sample?

In [16]:
choices = pd.Series([5,7,-1,6,4])

In [21]:
draws = choices.sample(n=10, replace=True)

In [18]:
draws

1    7
0    5
0    5
4    4
1    7
2   -1
0    5
2   -1
4    4
0    5
dtype: int64

# Computing Indicator/Dummy Variables

In [22]:
# Categorical 'key' column to be expanded into indicator columns, plus a
# simple counter column.
df = pd.DataFrame(
    {
        'key': list('bbacab'),
        'data1': range(6),
    }
)

In [24]:
pd.get_dummies(df['key'])

Unnamed: 0,a,b,c
0,0,1,0
1,0,1,0
2,1,0,0
3,0,0,1
4,1,0,0
5,0,1,0


In [26]:
dummies = pd.get_dummies(df['key'], prefix='key')

In [28]:
dummies

Unnamed: 0,key_a,key_b,key_c
0,0,1,0
1,0,1,0
2,1,0,0
3,0,0,1
4,1,0,0
5,0,1,0


In [27]:
df

Unnamed: 0,key,data1
0,b,0
1,b,1
2,a,2
3,c,3
4,a,4
5,b,5


In [29]:
df_with_dummy  =  df[['data1']].join(dummies)

In [30]:
df_with_dummy

Unnamed: 0,data1,key_a,key_b,key_c
0,0,0,1,0
1,1,0,1,0
2,2,1,0,0
3,3,0,0,1
4,4,1,0,0
5,5,0,1,0


In [2]:
mnames = ['movie_id', 'title', 'genres']

In [13]:
# Load the movies file, replacing its header row with the names in `mnames`.
# The path is a raw string so the Windows backslashes are not parsed as
# escape sequences (the non-raw form relies on invalid escapes such as '\R'
# and '\m', which emit warnings and are slated to become errors).
# NOTE(review): this is a machine-specific absolute path; point it at your
# local copy of the dataset before running.
movies = pd.read_table(
    r'D:\Ryzen1700\Study\ML\Dataset\ml-25m\ml-25m\movies.csv',
    sep=',',
    header=0,
    names=mnames,
)

In [14]:
movies[:10]

Unnamed: 0,movie_id,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
5,6,Heat (1995),Action|Crime|Thriller
6,7,Sabrina (1995),Comedy|Romance
7,8,Tom and Huck (1995),Adventure|Children
8,9,Sudden Death (1995),Action
9,10,GoldenEye (1995),Action|Adventure|Thriller


In [21]:
all_genres = []

In [22]:
# Flatten every movie's '|'-separated genre string into one running list
# (duplicates included).
for x in movies.genres:
    all_genres += x.split('|')

In [24]:

# Distinct genre labels in order of first appearance. Passing a plain Python
# list to pd.unique is deprecated (pandas >= 2.1), so hand it an object
# ndarray; the result is still an object-dtype ndarray.
genres = pd.unique(np.asarray(all_genres, dtype=object))

In [25]:
genres

array(['Adventure', 'Animation', 'Children', 'Comedy', 'Fantasy',
       'Romance', 'Drama', 'Action', 'Crime', 'Thriller', 'Horror',
       'Mystery', 'Sci-Fi', 'IMAX', 'Documentary', 'War', 'Musical',
       'Western', 'Film-Noir', '(no genres listed)'], dtype=object)

In [27]:
len(movies)
movies.shape

(62423, 3)

In [29]:

zero_matrix = np.zeros((len(movies), len(genres)))

In [30]:
zero_matrix

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [31]:
dummies = pd.DataFrame(zero_matrix, columns=genres)

In [32]:
dummies

Unnamed: 0,Adventure,Animation,Children,Comedy,Fantasy,Romance,Drama,Action,Crime,Thriller,Horror,Mystery,Sci-Fi,IMAX,Documentary,War,Musical,Western,Film-Noir,(no genres listed)
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
62418,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
62419,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
62420,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
62421,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [33]:
gen = movies.genres[0]
gen

'Adventure|Animation|Children|Comedy|Fantasy'

In [34]:
dummies.columns.get_indexer(gen.split('|'))

array([0, 1, 2, 3, 4], dtype=int64)

In [35]:
# Fill the indicator frame one movie (row) at a time.
for i, gen in enumerate(movies.genres):
    # Column positions in `dummies` for each genre named in this movie's
    # '|'-separated genre string.
    # NOTE(review): get_indexer yields -1 for a label that is not a column,
    # which iloc would treat as the last column -- presumably never hit here
    # because the columns were built from these same genre strings; confirm.
    indices = dummies.columns.get_indexer(gen.split('|'))
    # Set those genre columns to 1 for row i.
    dummies.iloc[i, indices] = 1

In [36]:
movies_windic = movies.join(dummies.add_prefix('Genre_'))

In [38]:
movies_windic

Unnamed: 0,movie_id,title,genres,Genre_Adventure,Genre_Animation,Genre_Children,Genre_Comedy,Genre_Fantasy,Genre_Romance,Genre_Drama,...,Genre_Horror,Genre_Mystery,Genre_Sci-Fi,Genre_IMAX,Genre_Documentary,Genre_War,Genre_Musical,Genre_Western,Genre_Film-Noir,Genre_(no genres listed)
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1.0,1.0,1.0,1.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2,Jumanji (1995),Adventure|Children|Fantasy,1.0,0.0,1.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3,Grumpier Old Men (1995),Comedy|Romance,0.0,0.0,0.0,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,0.0,0.0,0.0,1.0,0.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,5,Father of the Bride Part II (1995),Comedy,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
62418,209157,We (2018),Drama,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
62419,209159,Window of the Soul (2001),Documentary,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
62420,209163,Bad Poems (2018),Comedy|Drama,0.0,0.0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
62421,209169,A Girl Thing (2001),(no genres listed),0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


A useful recipe for statistical applications is to combine get_dummies with a discretization function like cut:

In [39]:
np.random.seed(12345)

In [40]:
values = np.random.rand(10)

In [41]:
values

array([0.92961609, 0.31637555, 0.18391881, 0.20456028, 0.56772503,
       0.5955447 , 0.96451452, 0.6531771 , 0.74890664, 0.65356987])

In [42]:
bins = [0, 0.2, 0.4, 0.6, 0.8, 1.]

In [43]:
pd.get_dummies(pd.cut(values, bins))

Unnamed: 0,"(0.0, 0.2]","(0.2, 0.4]","(0.4, 0.6]","(0.6, 0.8]","(0.8, 1.0]"
0,0,0,0,0,1
1,0,1,0,0,0
2,1,0,0,0,0
3,0,1,0,0,0
4,0,0,1,0,0
5,0,0,1,0,0
6,0,0,0,0,1
7,0,0,0,1,0
8,0,0,0,1,0
9,0,0,0,1,0
