# Pandas for Data Analysis: Data Wrangling

**Outline:**

* Dealing with Columns
* [Dealing with Categorical Data](#Dealing-with-Categorical-Data)
* Dealing with String Data (Using str Functions)
* Mapping or Applying Function Along Axis
* Scaling and Normalization
* Parsing Dates
* [Handling Missing Data](#Handling-Missing-Data)


In [1]:
import pandas as pd

In [2]:
# Column labels to assign to the downloaded file, in left-to-right order.
columns = [
    'age', 'Work Class', 'fnlwgt', 'education', 'education-num',
    'marital-status', 'occupation', 'relationship', 'race', 'sex',
    'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
    'Money Per Year',
]
# Fetch the data over HTTP and label its columns through `names=`.
adult = pd.read_csv(
    'https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data',
    names=columns,
)

In [3]:
# Save a local copy; the index is written as the first CSV column.
adult.to_csv('adult.csv')

## Dealing with Columns

### Renaming Columns

In [None]:
# Reload the local copy, using the first CSV column as the index.
adult = pd.read_csv('adult.csv', index_col=0)

In [None]:
# Preview the first five rows.
adult.head()

In [None]:
# rename() returns a new frame; `adult` keeps its original 'Work Class' label.
adult_new = adult.rename(columns={'Work Class': 'workclass'})

In [None]:
# Check that the renamed column shows up in the new frame.
adult_new.head()

In [None]:
# Column labels before normalising them.
adult_new.columns

In [None]:
# Normalise every label at once: lower-case it and turn spaces into hyphens.
adult_new.columns = adult_new.columns.str.lower().str.replace(' ', '-')

In [None]:
# Column labels after normalising them.
adult_new.columns

In [None]:
# Summarise the dtype and non-null count of each column.
adult_new.info()

### Adding New Columns

In [None]:
# Add a z-score column: (age - mean age) / standard deviation of age.
adult['normalized-age'] = (
    adult['age']
    .sub(adult['age'].mean())
    .div(adult['age'].std())
)

In [None]:
# The new 'normalized-age' column is appended as the last column.
adult.head()

In [None]:
# Boolean Series: True where age is more than one standard deviation above the mean.
adult['normalized-age'].gt(1)

In [None]:
# Keep only the rows where the boolean mask is True.
adult.loc[adult['normalized-age'].gt(1)]

In [None]:
# Rows for people older than 80.
adult.query('age > 80')

In [None]:
# Summarise the dtype and non-null count of each column, including the new one.
adult.info()

### Removing Existing Columns

In [None]:
# Without `axis`, drop() looks for a *row* labelled 'normalized-age'. No such
# row exists, so pandas raises. Catch and print the error so the failure is
# shown without stopping "Restart & Run All".
# (Newer pandas raises KeyError here; older releases raised ValueError.)
try:
    adult.drop('normalized-age')
except (KeyError, ValueError) as err:
    print(type(err).__name__ + ':', err)

By default `drop` looks for the label among the rows (`axis=0`), so the call above fails. To drop a column we need to pass `axis=1`.

In [None]:
# axis=1 targets columns. The result is a new frame; `adult` is not modified.
adult.drop('normalized-age', axis=1)

In [None]:
# 'normalized-age' is still here because the drop() result was not assigned.
adult.head()

In [None]:
# Rebind `adult` to the frame returned by drop() so the removal sticks.
adult = adult.drop(labels='normalized-age', axis='columns')

In [None]:
# Confirm that 'normalized-age' is gone.
adult.head()

To remove rows instead, pass their index labels with `axis=0`:

In [None]:
# Drop the rows whose index labels are 0 and 1 (axis=0 targets rows).
# The result is a new frame; `adult` is not modified.
adult.drop([0, 1], axis=0)

## Dealing with Categorical Data

In [39]:
# Distinct education labels, in order of first appearance.
adult.education.unique()

array(['Bachelors', 'HS-grad', '11th', 'Masters', '9th', 'Some-college',
       'Assoc-acdm', 'Assoc-voc', '7th-8th', 'Doctorate', 'Prof-school',
       '5th-6th', '10th', '1st-4th', 'Preschool', '12th'], dtype=object)

In [40]:
# Dtypes and memory usage before converting `education` to a categorical.
adult.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 32561 entries, 0 to 32560
Data columns (total 15 columns):
age               32561 non-null int64
work-class        32561 non-null object
fnlwgt            32561 non-null int64
education         32561 non-null object
education-num     32561 non-null int64
marital-status    32561 non-null object
occupation        32561 non-null object
relationship      32561 non-null object
race              32561 non-null object
sex               32561 non-null object
capital-gain      32561 non-null int64
capital-loss      32561 non-null int64
hours-per-week    32561 non-null int64
native-country    32561 non-null object
money-per-year    32561 non-null object
dtypes: int64(6), object(9)
memory usage: 4.0+ MB


In [41]:
# Convert the education column to the categorical dtype.
adult['education'] = adult['education'].astype('category')

In [42]:
# The displayed values look the same after the dtype change.
adult.head()

Unnamed: 0,age,work-class,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,money-per-year
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [43]:
# Compare with the earlier info(): `education` is now reported as category.
adult.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 32561 entries, 0 to 32560
Data columns (total 15 columns):
age               32561 non-null int64
work-class        32561 non-null object
fnlwgt            32561 non-null int64
education         32561 non-null category
education-num     32561 non-null int64
marital-status    32561 non-null object
occupation        32561 non-null object
relationship      32561 non-null object
race              32561 non-null object
sex               32561 non-null object
capital-gain      32561 non-null int64
capital-loss      32561 non-null int64
hours-per-week    32561 non-null int64
native-country    32561 non-null object
money-per-year    32561 non-null object
dtypes: category(1), int64(6), object(8)
memory usage: 3.8+ MB


In [44]:
# The Series repr also lists the known categories.
adult.education.head()

0    Bachelors
1    Bachelors
2      HS-grad
3         11th
4    Bachelors
Name: education, dtype: category
Categories (16, object): [10th, 11th, 12th, 1st-4th, ..., Masters, Preschool, Prof-school, Some-college]

In [45]:
# Integer codes stored for each row in place of the category label.
adult.education.cat.codes.head()

0     9
1     9
2    11
3     1
4     9
dtype: int8

In [46]:
# Print every distinct education label on its own line.
for level in adult['education'].unique():
    print(level)

Bachelors
HS-grad
11th
Masters
9th
Some-college
Assoc-acdm
Assoc-voc
7th-8th
Doctorate
Prof-school
5th-6th
10th
1st-4th
Preschool
12th


In [47]:
# Start again from the saved CSV so `education` is a plain string column.
adult = pd.read_csv('adult.csv', index_col=0)
# Remove all spaces from the labels so they can match the names listed below.
adult.education = adult.education.str.replace(' ', '')
# Category levels, lowest to highest, as used for sorting and comparisons.
# NOTE(review): this order ranks Assoc-acdm / Assoc-voc above Doctorate and
# Bachelors below Some-college, which does not follow `education-num`.
# Confirm the intended ranking before relying on comparisons such as >= 'Masters'.
categories = ['Preschool', '1st-4th', '5th-6th', '7th-8th', '9th', '10th', '11th', '12th', 'HS-grad', 'Bachelors', 'Some-college', 'Masters', 'Doctorate', 'Prof-school', 'Assoc-acdm', 'Assoc-voc']
# astype('category', categories=..., ordered=...) was deprecated in pandas 0.21
# and later removed. An explicit ordered CategoricalDtype is the supported form.
education_dtype = pd.api.types.CategoricalDtype(categories=categories, ordered=True)
adult.education = adult.education.astype(education_dtype)

In [49]:
# Sorting an ordered categorical follows the declared category order, so the lowest level comes first.
adult.sort_values('education').head(2)

Unnamed: 0,age,Work Class,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,Money Per Year
22940,25,Private,266820,Preschool,1,Never-married,Farming-fishing,Not-in-family,White,Male,0,0,35,Mexico,<=50K
13248,68,Private,168794,Preschool,1,Never-married,Machine-op-inspct,Not-in-family,White,Male,0,0,10,United-States,<=50K


In [50]:
# The last rows hold the highest level in the declared category order.
adult.sort_values('education').tail(2)

Unnamed: 0,age,Work Class,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,Money Per Year
15870,36,Private,272944,Assoc-voc,11,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,60,United-States,>50K
25920,24,Private,278130,Assoc-voc,11,Never-married,Machine-op-inspct,Not-in-family,White,Male,0,0,40,United-States,<=50K


In [51]:
# Ordered categoricals can be compared against a category label.
# Select every row ranked at or above 'Masters' in the declared order.
adult[adult['education'] >= 'Masters']

Unnamed: 0,age,Work Class,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,Money Per Year
5,37,Private,284582,Masters,14,Married-civ-spouse,Exec-managerial,Wife,White,Female,0,0,40,United-States,<=50K
8,31,Private,45781,Masters,14,Never-married,Prof-specialty,Not-in-family,White,Female,14084,0,50,United-States,>50K
13,32,Private,205019,Assoc-acdm,12,Never-married,Sales,Not-in-family,Black,Male,0,0,50,United-States,<=50K
14,40,Private,121772,Assoc-voc,11,Married-civ-spouse,Craft-repair,Husband,Asian-Pac-Islander,Male,0,0,40,?,>50K
19,43,Self-emp-not-inc,292175,Masters,14,Divorced,Exec-managerial,Unmarried,White,Female,0,0,45,United-States,>50K
20,40,Private,193524,Doctorate,16,Married-civ-spouse,Prof-specialty,Husband,White,Male,0,0,60,United-States,>50K
30,23,Local-gov,190709,Assoc-acdm,12,Never-married,Protective-serv,Not-in-family,White,Male,0,0,52,United-States,<=50K
39,48,Self-emp-not-inc,265477,Assoc-acdm,12,Married-civ-spouse,Prof-specialty,Husband,White,Male,0,0,40,United-States,<=50K
47,44,Private,128354,Masters,14,Divorced,Exec-managerial,Unmarried,White,Female,0,0,40,United-States,<=50K
48,41,State-gov,101603,Assoc-voc,11,Married-civ-spouse,Craft-repair,Husband,White,Male,0,0,40,United-States,<=50K


## Map, Apply, and Filter

## Handling Missing Data

In [52]:
# Download the Titanic passenger data over HTTP into a DataFrame.
titanic = pd.read_csv('http://biostat.mc.vanderbilt.edu/wiki/pub/Main/DataSets/titanic3.csv')

In [60]:
# Overview of the column dtypes and the frame's memory usage.
titanic.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 14 columns):
pclass       int64
survived     int64
name         object
sex          object
age          float64
sibsp        int64
parch        int64
ticket       object
fare         float64
cabin        object
embarked     object
boat         object
body         float64
home.dest    object
dtypes: float64(3), int64(4), object(7)
memory usage: 143.2+ KB


In [54]:
# (rows, columns): the baseline for the dropna() comparisons further down.
titanic.shape

(1309, 14)

In [55]:
# Preview the first five passengers.
titanic.head()

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,1,1,"Allen, Miss. Elisabeth Walton",female,29.0,0,0,24160,211.3375,B5,S,2.0,,"St Louis, MO"
1,1,1,"Allison, Master. Hudson Trevor",male,0.92,1,2,113781,151.55,C22 C26,S,11.0,,"Montreal, PQ / Chesterville, ON"
2,1,0,"Allison, Miss. Helen Loraine",female,2.0,1,2,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"
3,1,0,"Allison, Mr. Hudson Joshua Creighton",male,30.0,1,2,113781,151.55,C22 C26,S,,135.0,"Montreal, PQ / Chesterville, ON"
4,1,0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0,1,2,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"


In [56]:
# Element-wise mask: True where a value is missing.
titanic.isnull().head()

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,False,False,False,False,False,False,False,False,False,False,False,False,True,False
1,False,False,False,False,False,False,False,False,False,False,False,False,True,False
2,False,False,False,False,False,False,False,False,False,False,False,True,True,False
3,False,False,False,False,False,False,False,False,False,False,False,True,False,False
4,False,False,False,False,False,False,False,False,False,False,False,True,True,False


In [57]:
# The inverse mask: True where a value is present.
titanic.notnull().head()

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,True,True,True,True,True,True,True,True,True,True,True,True,False,True
1,True,True,True,True,True,True,True,True,True,True,True,True,False,True
2,True,True,True,True,True,True,True,True,True,True,True,False,False,True
3,True,True,True,True,True,True,True,True,True,True,True,False,True,True
4,True,True,True,True,True,True,True,True,True,True,True,False,False,True


In [58]:
# Summing the boolean mask column-wise gives the number of missing values per column.
titanic.isnull().sum()

pclass          0
survived        0
name            0
sex             0
age           263
sibsp           0
parch           0
ticket          0
fare            1
cabin        1014
embarked        2
boat          823
body         1188
home.dest     564
dtype: int64

In [61]:
# Ignore the `body` column, then count the rows with no missing value in any
# of the remaining columns.
titanic.drop(columns='body').dropna().shape

(159, 13)

In [62]:
# how='any': drop a row when either `age` or `body` is missing.
titanic.dropna(subset=['age', 'body'], how='any').shape

(120, 14)

In [63]:
# how='all': drop a row only when both `age` and `body` are missing.
titanic.dropna(subset=['age', 'body'], how='all').shape

(1047, 14)

In [64]:
# Mean of the `body` values that are present (mean() skips NaN by default).
body_mean = titanic['body'].mean()

In [65]:
# Replace missing `body` values with the column mean for every row.
# Do not chain .head() onto the right-hand side: that would assign only the
# first five values, and index alignment would set all other rows to NaN,
# discarding the real data.
titanic.body = titanic.body.fillna(body_mean)

In [66]:
# Missing `body` entries in the preview now show the mean value.
titanic.head()

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,1,1,"Allen, Miss. Elisabeth Walton",female,29.0,0,0,24160,211.3375,B5,S,2.0,160.809917,"St Louis, MO"
1,1,1,"Allison, Master. Hudson Trevor",male,0.92,1,2,113781,151.55,C22 C26,S,11.0,160.809917,"Montreal, PQ / Chesterville, ON"
2,1,0,"Allison, Miss. Helen Loraine",female,2.0,1,2,113781,151.55,C22 C26,S,,160.809917,"Montreal, PQ / Chesterville, ON"
3,1,0,"Allison, Mr. Hudson Joshua Creighton",male,30.0,1,2,113781,151.55,C22 C26,S,,135.0,"Montreal, PQ / Chesterville, ON"
4,1,0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0,1,2,113781,151.55,C22 C26,S,,160.809917,"Montreal, PQ / Chesterville, ON"


In [69]:
# dropna=False keeps NaN in the tally, so missing cabins are counted too.
titanic.cabin.value_counts(dropna=False).head()

NaN                1014
C23 C25 C27           6
B57 B59 B63 B66       5
G6                    5
D                     4
Name: cabin, dtype: int64

In [70]:
# Fill missing cabins with 'C23 C25 C27' (the most frequent non-null value in
# the tally above) and recount. The result is not assigned back to `titanic`.
titanic.cabin.fillna('C23 C25 C27').value_counts().head()

C23 C25 C27        1020
B57 B59 B63 B66       5
G6                    5
D                     4
F2                    4
Name: cabin, dtype: int64