# Tutorial 3.3b: Pandas Column Manipulation
Python for Data Analytics | Module 3  
Professor James Ng

In [None]:
# SETUP: DO NOT CHANGE
import numpy as np
import pandas as pd

In [None]:
# Download the College Scorecard dataset from OSF
!curl -L https://osf.io/cz253/download --create-dirs -o data-sets/college-scorecard-data-scrubbed.csv

# Load the downloaded CSV into a DataFrame, decoding the file as Latin-1
# (the encoding passed to read_csv below).
college_scorecard = pd.read_csv(
    'data-sets/college-scorecard-data-scrubbed.csv', 
    encoding='latin-1')
# Preview the first rows to confirm the load worked.
college_scorecard.head()

In [None]:
# Print a summary of the DataFrame: column names, non-null counts and dtypes.
college_scorecard.info()

In [None]:
# Inspect the dtype of the raw column before it is converted to a categorical.
college_scorecard['predominant_degree_desc'].dtype

## Creating Categorical Variables

In [None]:
# Store the degree description in a new column with the pandas `category` dtype.
degree_desc = college_scorecard['predominant_degree_desc']
college_scorecard['schooltype'] = degree_desc.astype('category')

In [None]:
# Confirm the dtype of the newly created column.
college_scorecard['schooltype'].dtype

### Ordered categories

In [None]:
from pandas.api.types import CategoricalDtype

In [None]:
# Give `schooltype` an explicit, ordered set of categories (lowest to highest).
# `set_categories` returns a new Series, so the result is assigned back to the
# column. The `inplace=True` keyword was deprecated in pandas 1.3 and removed
# in pandas 2.0, where passing it raises a TypeError.
# Any existing value that is not in this list becomes NaN.
college_scorecard['schooltype'] = college_scorecard['schooltype'].cat.set_categories(
    ['Certificate', 'Associates', 'Bachelors', 'Graduate'],
    ordered=True,
)

In [None]:
# Re-check the dtype after setting the ordered categories.
college_scorecard['schooltype'].dtype

### Binning data into categories

Bin the `pell_grant_receipents` column into four bins: 0-0.25, 0.25-0.50, 0.50-0.75, 0.75-1.00

In [None]:
# Bin the `pell_grant_receipents` column into four equal-width categories.
# pd.cut builds right-closed intervals by default, so without
# include_lowest=True a value of exactly 0 would fall outside the first bin
# and become NaN. With it, the first interval includes 0 (pandas labels it
# as (-0.001, 0.25]).
college_scorecard['pell_grant_cat'] = pd.cut(
    college_scorecard['pell_grant_receipents'],
    [0, 0.25, 0.5, 0.75, 1],
    include_lowest=True,
)

In [None]:
# Check the dtype of the binned column produced by pd.cut.
college_scorecard['pell_grant_cat'].dtype

In [None]:
# Show the original values next to their assigned bins to check the binning.
college_scorecard[['pell_grant_receipents', 'pell_grant_cat']]

## Arranging columns

In [None]:
# There is really no good way of doing this; it is a limitation of pandas.
# The most straightforward way is to select a list of columns in the order you want,
# but with many columns this becomes impractical.
# There are many other ways to do it, but all of them are hack-ish.

### Example 1: Select desired columns in desired order

In [None]:
# Columns to keep, listed in the order they should appear in the result.
wanted_columns = [
    'UNITID',
    'OPEID',
    'OPEID6',
    'predominant_degree_desc',
    'schooltype',
    'pell_grant_receipents',
    'pell_grant_cat',
]
# Selecting with a list returns the columns in the list's order.
dfwanted = college_scorecard[wanted_columns]
dfwanted.head()

### Example 2: Bring last column to front

In [None]:
# A hack-ish way to move the last (rightmost) column to the front (left):
# start from the current column labels as a plain Python list.
cols = list(college_scorecard.columns)

In [None]:
# Display the current column order.
cols

In [None]:
# cols[-1:] is a one-element list holding the last label and cols[:-1] is
# everything before it, so concatenating them moves the last label to the front.
cols = cols[-1:] + cols[:-1] # what is this doing?

In [None]:
# Select the columns in the new order. The result is only displayed, not
# assigned, so college_scorecard keeps its original column order.
college_scorecard[cols]

## Converting categorical variable into dummy/indicator variables

In [None]:
# Look at 20 randomly chosen rows instead of the first rows that head() would show.
dfwanted['predominant_degree_desc'].sample(20)

In [None]:
# Build one indicator column per category of `schooltype`. The result is only
# displayed here; dfwanted is not changed.
pd.get_dummies(dfwanted['schooltype'])

In [None]:
# get_dummies does not modify its input: it returns a new DataFrame, so the
# result is assigned back to dfwanted. With columns=[...], only the listed
# column is replaced by indicator columns; the other columns are kept.

dfwanted = pd.get_dummies( dfwanted, columns=['predominant_degree_desc'])

In [None]:
# Preview the first rows of the DataFrame with the indicator columns added.
dfwanted.head()