# Module 1- Pandas_Data_Wrangling

![data_cleaning.jpeg](attachment:data_cleaning.jpeg)

## Learning Objective.

- Get introduced to data cleaning using Python and pandas.

In [None]:
# Python 2 compatibility: make `/` perform true division (this is already
# the default behaviour on Python 3).
from __future__ import division
from numpy.random import randn
import numpy as np
import os
import matplotlib.pyplot as plt
# Fix NumPy's global random seed so any random output is reproducible.
np.random.seed(12345)
# Default (width, height) for every matplotlib figure created afterwards.
plt.rc('figure', figsize=(10, 6))
from pandas import Series, DataFrame
# NOTE(review): pandas is imported twice, as `pandas` and as `pd`.
import pandas
import pandas as pd

### Data transformation


#### Remove duplicates.

Duplicate rows may be found in a DataFrame for any number of reasons. Here is an example:

In [None]:
# Small frame that deliberately contains repeated (k1, k2) rows.
data = DataFrame({
    'k1': ['one', 'one', 'one', 'two', 'two', 'two', 'two'],
    'k2': [1, 1, 2, 3, 3, 4, 4],
})
data

In [None]:
# duplicated() flags each row that repeats an earlier row (the first
# occurrence is not flagged); the boolean mask keeps only those repeats.
data[data.duplicated()]

In [None]:
# Returns a new frame without the repeated rows; `data` itself is unchanged.
data.drop_duplicates()

In [None]:
# inplace=True modifies `data` directly (and returns None) instead of
# producing a new frame.
data.drop_duplicates(inplace=True)
data

In [None]:
# Rebuild the frame with its duplicate rows present again.
data = DataFrame({
    'k1': ['one', 'one', 'one', 'two', 'two', 'two', 'two'],
    'k2': [1, 1, 2, 3, 3, 4, 4],
})
data

In [None]:
# Treat rows as duplicates when they share the same 'k1' value; the first
# row of each group is kept.
data.drop_duplicates(subset=['k1'])

In [None]:
# Compare rows on both columns and keep the final occurrence of each
# duplicate instead of the first.
data.drop_duplicates(subset=['k1', 'k2'], keep='last')

### Replacing values


Filling in missing data with the `fillna` method can be thought of as a special case of more general value replacement.

`replace` provides a simpler and more flexible way to do so.

In [None]:
# Float series in which -999 and -1000 act as sentinel values.
data = Series([1.0, -999.0, 2.0, -999.0, -1000.0, 3.0])
data

In [None]:
# `to_replace` is the value we are looking for;
# `value` is what we want to substitute for it.
data_1 = data.replace(to_replace=-999, value=np.nan)
data_1

In [None]:
# Flag the missing entries: True wherever the value is NaN.
data_1.isnull()

In [None]:
# Several values can be replaced by the same substitute in one call.
data.replace(to_replace=[-999, -1000], value=100)

In [None]:
# With two lists, each value is paired with the replacement at the same
# position.
data.replace(to_replace=[-999, -1000], value=[np.nan, 0])

In [None]:
# A dict maps each value to be replaced to its own replacement.
data.replace({-999: np.nan, -1000: 0})

### Renaming axis indexes


In [None]:
# 3x4 frame holding 0..11, labelled by state (rows) and number words (columns).
data = DataFrame(
    data=np.arange(12).reshape(3, 4),
    index=['Ohio', 'Colorado', 'New York'],
    columns=['one', 'two', 'three', 'four'],
)
data

In [None]:
# Apply str.upper to every index label; this returns a new Index and
# leaves `data` unchanged.
data.index.map(str.upper)

In [None]:
# Assigning the mapped labels back changes the frame's index in place.
data.index = data.index.map(str.upper)
data

In [None]:
# rename() returns a transformed copy: title-case the row labels and
# upper-case the column labels, leaving `data` untouched.
data.rename(index=str.title, columns=str.upper)

### String manipulation

- String object methods

In [None]:
# Sample string; cut it into pieces at every comma.
val = 'a,b:,  lassie'
val.split(sep=',')

In [None]:
# What data type is returned? split() gives back a list of strings.
type(val.split(','))

In [None]:
# Want to remove colon: strip ':' from both ends of every comma-separated piece.
pieces = [x.strip(':') for x in val.split(',')]
# A bare expression is echoed only when it is the last statement of a cell,
# so the type has to be printed explicitly to be seen.
print(type(pieces))
pieces

In [None]:
# Remove leading and trailing whitespace from every piece.
pieces = [x.strip() for x in pieces]
pieces

In [None]:
# Unpack the three pieces and glue them together with a '::' separator.
first, second, third = pieces
'::'.join((first, second, third))

In [None]:
# str.join() returns one string in which the elements of the sequence
# are joined by the separator string it is called on (here a single space).
' '.join(pieces)

In [None]:
# Membership test against the list: True only if some element equals 'K'.
'K' in pieces

In [None]:
# index() returns the position of the first match and raises ValueError
# when the substring is absent.
val.index('b')

In [None]:
# Notice the difference in what is returned: unlike index(), find()
# returns -1 instead of raising when the substring is not found.
print(val.find('Z'))
print(val.find('b'))

In [None]:
# Counts the non-overlapping occurrences of a substring (here a comma).
val.count(',')


In [None]:
# Substitute every comma with '!'; strings are immutable, so `val` is unchanged.
val.replace(',','!')

In [None]:
# Replacing with the empty string deletes every comma.
val.replace(',', '')

In [None]:
# Map each person to an e-mail address; the entry for 'Wes' is missing (NaN).
data = Series({
    'Dave': 'dave@google.com',
    'Steve': 'steve@gmail.com',
    'Rob': 'rob@gmail.com',
    'Wes': np.nan,
})

In [None]:
# Display the Series of e-mail addresses.
data

In [None]:
# True marks the entries whose value is missing (NaN).
data.isnull()


In [None]:
# Element-wise substring test through the .str accessor. Missing entries
# (here 'Wes') stay missing in the result rather than becoming False;
# pass na=False to treat them as non-matches.
data.str.contains('gmail')

### What Else Can We Do?

- Let's work on some stuff together!