# Missing data
- __we can use the 'dropna' function to filter out missing data__
- __example__

# Filtering missing data

In [14]:
import pandas as pd
import numpy as np

# Six-element Series; the entries at positions 1 and 5 are NaN.
data = pd.Series([1, np.nan, 1.3, 5, 7, np.nan])
# dropna() gives back a Series holding only the non-missing entries.
data.dropna()

0    1.0
2    1.3
3    5.0
4    7.0
dtype: float64

- __'dropna' function with DataFrame__
- __'dropna' with the (how='all') argument removes only the rows in which every value is missing__

In [15]:
# 4x3 frame: row 0 is complete, rows 1 and 3 are partly missing,
# and row 2 is entirely NaN.
data = pd.DataFrame(
    [
        [1.0, 6.5, 3.0],
        [1.0, np.nan, np.nan],
        [np.nan, np.nan, np.nan],
        [np.nan, 6.5, 3.0],
    ]
)
# Without arguments, dropna() discards every row that contains any NaN.
data.dropna()

Unnamed: 0,0,1,2
0,1.0,6.5,3.0


In [16]:
data.dropna(how='all')

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
3,,6.5,3.0



# Filling missing data
- __fill missing values using the 'fillna' function__
- __example__

In [17]:
data.fillna(0)

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,0.0,0.0
2,0.0,0.0,0.0
3,0.0,6.5,3.0


- __'fillna' with 'inplace=True' modifies the original data itself instead of returning a new object__

In [21]:
# Fill the NaNs in `data` itself. The call's return value is of no use here,
# so it is not bound to a name; only the final `data` expression is displayed.
data.fillna(0, inplace=True)
data

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,0.0,0.0
2,0.0,0.0,0.0
3,0.0,6.5,3.0


- __we can also pass a dict to 'fillna' to give each column its own fill value__

In [22]:
# The dict maps column label -> fill value (column 0 -> 11, column 2 -> 22).
# `data` was already filled in place with 0 in the previous cell, so no NaN is
# left and the frame comes back unchanged.
data.fillna({0:11, 2:22})

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,0.0,0.0
2,0.0,0.0,0.0
3,0.0,6.5,3.0


- __method = 'ffill' with 'fillna' function__

In [26]:
# 6x3 frame of standard-normal random draws.
df = pd.DataFrame(np.random.randn(6, 3))
# Blank out the tail of column 2 (rows 4-5) and of column 1 (rows 2-5) so
# there are runs of consecutive missing values to fill.
df.iloc[4:, 2] = np.nan
df.iloc[2:, 1] = np.nan
df

Unnamed: 0,0,1,2
0,-0.250212,0.361213,0.355333
1,-0.505847,0.29108,-1.354529
2,-1.319543,,-1.199914
3,-0.779005,,0.858671
4,0.445762,,
5,-0.07187,,


- __method='ffill' (forward fill) fills each missing value with the last valid value above it in the same column__

In [27]:
# DataFrame.ffill() is the dedicated forward-fill method; the equivalent
# fillna(method='ffill') spelling is deprecated in recent pandas releases.
df.ffill()

Unnamed: 0,0,1,2
0,-0.250212,0.361213,0.355333
1,-0.505847,0.29108,-1.354529
2,-1.319543,0.29108,-1.199914
3,-0.779005,0.29108,0.858671
4,0.445762,0.29108,0.858671
5,-0.07187,0.29108,0.858671


- __'limit' caps how many consecutive missing values are filled (here at most 2 per gap)__

In [29]:
# Forward-fill at most two consecutive NaNs per gap. ffill() replaces the
# deprecated fillna(method='ffill', ...) spelling and takes the same `limit`.
df.ffill(limit=2)

Unnamed: 0,0,1,2
0,-0.250212,0.361213,0.355333
1,-0.505847,0.29108,-1.354529
2,-1.319543,0.29108,-1.199914
3,-0.779005,0.29108,0.858671
4,0.445762,,0.858671
5,-0.07187,,0.858671


# Removing duplicate data
- __in this data the last row ('two', 4) is an exact repeat of the row before it__
- __example__

In [37]:
# Seven rows; the last one repeats the (k1, k2) pair of the row before it.
data = pd.DataFrame(
    {
        "k1": ["one", "two", "one", "two", "one", "two", "two"],
        "k2": [1, 1, 2, 3, 3, 4, 4],
    }
)
data

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4
6,two,4


In [38]:
data.duplicated()

0    False
1    False
2    False
3    False
4    False
5    False
6     True
dtype: bool

- __'drop_duplicates' is used to remove duplicate rows__

In [39]:
data.drop_duplicates()

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4


# Transforming Data Using a Function or Mapping
- __example__

In [40]:
# Food names (with inconsistent capitalisation) and their weights in ounces.
data = pd.DataFrame(
    {
        "food": [
            "bacon",
            "pulled pork",
            "bacon",
            "Pastrami",
            "corned beef",
            "Bacon",
            "pastrami",
            "honey ham",
            "nova lox",
        ],
        "ounces": [4, 3, 12, 6, 7.5, 8, 3, 5, 6],
    }
)
data

Unnamed: 0,food,ounces
0,bacon,4.0
1,pulled pork,3.0
2,bacon,12.0
3,Pastrami,6.0
4,corned beef,7.5
5,Bacon,8.0
6,pastrami,3.0
7,honey ham,5.0
8,nova lox,6.0


- __let's add a column that indicates the type of animal each food comes from__
- __example__

In [43]:
# Lookup table from (lower-case) food name to the animal it comes from.
meat_to_animal = {
    "bacon": "pig",
    "pulled pork": "pig",
    "pastrami": "cow",
    "corned beef": "cow",
    "honey ham": "pig",
    "nova lox": "salmon",
}

In [48]:
food = data['food']

In [49]:
# Series.map looks each food name up in meat_to_animal. The lookup is
# case-sensitive and the dict keys are all lower case, so 'Pastrami' and
# 'Bacon' find no key and come out as missing values in the new column.
# NOTE(review): lowercasing first (food.str.lower()) would let them match.
data['animal'] = food.map(meat_to_animal)

In [50]:
data

Unnamed: 0,food,ounces,animal
0,bacon,4.0,pig
1,pulled pork,3.0,pig
2,bacon,12.0,pig
3,Pastrami,6.0,
4,corned beef,7.5,cow
5,Bacon,8.0,
6,pastrami,3.0,cow
7,honey ham,5.0,pig
8,nova lox,6.0,salmon


# Replacing values
- __example__

In [59]:
data = pd.Series([1., -999., 2., -99., -1000., 3.])

In [60]:
data.replace(-999., 999)

0       1.0
1     999.0
2       2.0
3     -99.0
4   -1000.0
5       3.0
dtype: float64

In [64]:
data.replace([1,2],[11,22])

0      11.0
1    -999.0
2      22.0
3     -99.0
4   -1000.0
5       3.0
dtype: float64

# Renaming axis indexes
- __example__

In [65]:
# 3x4 frame holding 0..11, labelled by state (rows) and ordinal words (columns).
data = pd.DataFrame(
    np.arange(12).reshape(3, 4),
    index=["Ohio", "Colorado", "New York"],
    columns=["one", "two", "three", "four"],
)
data

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
New York,8,9,10,11


- __Let's transform the index of the above data__

In [70]:
def transform(x):
    """Return ``x`` converted to upper case via its ``upper()`` method."""
    return x.upper()

In [71]:
data.index.map(transform)

Index(['OHIO', 'COLORADO', 'NEW YORK'], dtype='object')

- __assigning the mapped index back to 'data.index' modifies the original data in place__

In [74]:
data.index = data.index.map(transform)
data

Unnamed: 0,one,two,three,four
OHIO,0,1,2,3
COLORADO,4,5,6,7
NEW YORK,8,9,10,11


- __'rename' transforms the labels and returns a new object, leaving the original data unmodified__

In [81]:
data.rename(index = str.upper, columns = str.upper)

Unnamed: 0,ONE,TWO,THREE,FOUR
OHIO,0,1,2,3
COLORADO,4,5,6,7
NEW YORK,8,9,10,11


- __rename a single index (row) label by passing a dict to 'rename'__

In [83]:
data.rename(index = {'OHIO':'california'})

Unnamed: 0,one,two,three,four
california,0,1,2,3
COLORADO,4,5,6,7
NEW YORK,8,9,10,11


# Discretization and Binning
- __Let's divide the ages into groups such as 18-25, 26-35, 36-60, 61-100__
- __example__

In [85]:
# Sample ages to be sorted into groups.
ages = [20, 22, 25, 27, 21, 23, 37, 31, 61, 45, 41, 32]

# Bin edges: each consecutive pair of edges delimits one age group.
bins = [18, 25, 35, 60, 100]

In [87]:
data = pd.cut(ages,bins)
data

[(18, 25], (18, 25], (18, 25], (25, 35], (18, 25], ..., (25, 35], (60, 100], (35, 60], (35, 60], (25, 35]]
Length: 12
Categories (4, interval[int64]): [(18, 25] < (25, 35] < (35, 60] < (60, 100]]

- __Let's look at the categories (the intervals) the ages were grouped into__

In [89]:
data.categories

IntervalIndex([(18, 25], (25, 35], (35, 60], (60, 100]],
              closed='right',
              dtype='interval[int64]')

- __now count the values in each of those groups__

In [90]:
data.value_counts()

(18, 25]     5
(25, 35]     3
(35, 60]     3
(60, 100]    1
dtype: int64

- __we can also pass the number of bins as an integer; 'cut' then computes equal-width bins from the range of the data__
- __example__

In [93]:
data = np.random.randn(1000)

In [95]:
pd.cut(data,4)

[(-1.59, 0.042], (0.042, 1.674], (-1.59, 0.042], (0.042, 1.674], (0.042, 1.674], ..., (0.042, 1.674], (0.042, 1.674], (-1.59, 0.042], (-1.59, 0.042], (0.042, 1.674]]
Length: 1000
Categories (4, interval[float64]): [(-3.228, -1.59] < (-1.59, 0.042] < (0.042, 1.674] < (1.674, 3.305]]

# Detecting and Filtering Outliers

# Permutation and Random Sampling

- __Permuting (or randomly reordering) a Series or the rows in a DataFrame is easy to do using the 'numpy.random.permutation' function__
- __example__

In [96]:
# 5x4 frame holding the integers 0..19 in row-major order.
df = pd.DataFrame(np.arange(20).reshape(5, 4))
df

Unnamed: 0,0,1,2,3
0,0,1,2,3
1,4,5,6,7
2,8,9,10,11
3,12,13,14,15
4,16,17,18,19


In [101]:
sampler = np.random.permutation(5)
sampler

array([0, 3, 1, 2, 4])

In [102]:
df.take(sampler)

Unnamed: 0,0,1,2,3
0,0,1,2,3
3,12,13,14,15
1,4,5,6,7
2,8,9,10,11
4,16,17,18,19


In [103]:
df.sample(n=3)

Unnamed: 0,0,1,2,3
3,12,13,14,15
1,4,5,6,7
2,8,9,10,11


# Computing Indicator/Dummy Variables

In [104]:
mnames = ['movie_id', 'title', 'genres']

In [106]:
# '::' is a multi-character separator, which only the Python parsing engine
# supports. Naming the engine explicitly avoids the ParserWarning pandas emits
# when it has to fall back from the C engine on its own.
movies = pd.read_table('ml-1m/movies.dat', sep='::', header=None,
                       names=mnames, engine='python')
movies[:7]

  movies = pd.read_table('ml-1m/movies.dat', sep='::', header=None, names=mnames)


Unnamed: 0,movie_id,title,genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy
5,6,Heat (1995),Action|Crime|Thriller
6,7,Sabrina (1995),Comedy|Romance


In [107]:
# Collect every genre label; each row stores its genres as a single
# '|'-separated string, so split it and append the pieces.
all_genre = []
for genre_field in movies.genres:
    all_genre.extend(genre_field.split('|'))

# pd.unique expects array-like input (ndarray / Series / Index); passing a
# plain list is deprecated in recent pandas, so hand it an object ndarray.
# Unique values are returned in order of first appearance.
pd.unique(np.array(all_genre, dtype=object))

array(['Animation', "Children's", 'Comedy', 'Adventure', 'Fantasy',
       'Romance', 'Drama', 'Action', 'Crime', 'Thriller', 'Horror',
       'Sci-Fi', 'Documentary', 'War', 'Musical', 'Mystery', 'Film-Noir',
       'Western'], dtype=object)