In [1]:
import pandas as pd

In [2]:
df=pd.DataFrame(['A+', 'A', 'A-', 'B+', 'B', 'B-', 'C+', 'C', 'C-', 'D+', 'D'],
                index=['excellent', 'excellent', 'excellent', 'good', 'good', 'good', 
                       'ok', 'ok', 'ok', 'poor', 'poor'],
               columns=["Grades"])
df

Unnamed: 0,Grades
excellent,A+
excellent,A
excellent,A-
good,B+
good,B
good,B-
ok,C+
ok,C
ok,C-
poor,D+


In [3]:
# Now, if we check the datatype of this column, we see that it's just an object, since we set string values
df.dtypes

Grades    object
dtype: object

In [4]:
# We can, however, tell pandas that we want to change the type to category, using the astype() function
df["Grades"].astype("category")

excellent    A+
excellent     A
excellent    A-
good         B+
good          B
good         B-
ok           C+
ok            C
ok           C-
poor         D+
poor          D
Name: Grades, dtype: category
Categories (11, object): ['A', 'A+', 'A-', 'B', ..., 'C+', 'C-', 'D', 'D+']

In [5]:
# We see now that there are eleven categories, and pandas is aware of what those categories are. More
# interesting though is that our data isn't just categorical, but that it's ordered. That is, an A- comes
# after a B+, and B comes before a B+. We can tell pandas that the data is ordered by first creating a new
# categorical data type with the list of the categories (in order) and the ordered=True flag
my_categories=pd.CategoricalDtype(categories=['D', 'D+', 'C-', 'C', 'C+', 'B-', 'B', 'B+', 'A-', 'A', 'A+'], 
                           ordered=True)
# then we can just pass this to the astype() function
grades=df["Grades"].astype(my_categories)
grades

excellent    A+
excellent     A
excellent    A-
good         B+
good          B
good         B-
ok           C+
ok            C
ok           C-
poor         D+
poor          D
Name: Grades, dtype: category
Categories (11, object): ['D' < 'D+' < 'C-' < 'C' ... 'B+' < 'A-' < 'A' < 'A+']

In [6]:
# Now we see that pandas is not only aware that there are 11 categories, but it is also aware of the order of
# those categoreies. So, what can you do with this? Well because there is an ordering this can help with
# comparisons and boolean masking. For instance, if we have a list of our grades and we compare them to a “C”
# we see that the lexicographical comparison returns results we were not intending. 

df[df["Grades"]>"C"]

Unnamed: 0,Grades
ok,C+
ok,C-
poor,D+
poor,D


In [7]:
grades[grades > "C"]

excellent    A+
excellent     A
excellent    A-
good         B+
good          B
good         B-
ok           C+
Name: Grades, dtype: category
Categories (11, object): ['D' < 'D+' < 'C-' < 'C' ... 'B+' < 'A-' < 'A' < 'A+']

In [8]:
# So a C+ is great than a C, but a C- and D certainly are not. However, if we broadcast over the dataframe
# which has the type set to an ordered categorical

In [9]:
# Sometimes it is useful to represent categorical values as each being a column with a true or a false as to
# whether the category applies. This is especially common in feature extraction, which is a topic in the data
# mining course. Variables with a boolean value are typically called dummy variables, and pandas has a built
# in function called get_dummies which will convert the values of a single column into multiple columns of
# zeros and ones indicating the presence of the dummy variable.

In [10]:
# There’s one more common scale-based operation I’d like to talk about, and that’s on converting a scale from
# something that is on the interval or ratio scale, like a numeric grade, into one which is categorical. Now,
# this might seem a bit counter intuitive to you, since you are losing information about the value. But it’s
# commonly done in a couple of places. For instance, if you are visualizing the frequencies of categories,
# this can be an extremely useful approach, and histograms are regularly used with converted interval or ratio
# data. In addition, if you’re using a machine learning classification approach on data, you need to be using
# categorical data, so reducing dimensionality may be useful just to apply a given technique. Pandas has a
# function called cut which takes as an argument some array-like structure like a column of a dataframe or a
# series. It also takes a number of bins to be used, and all bins are kept at equal spacing.
 
# Lets go back to our census data for an example. We saw that we could group by state, then aggregate to get a
# list of the average county size by state. If we further apply cut to this with, say, ten bins, we can see
# the states listed as categoricals using the average county size.

# let's bring in numpy
import numpy as np

# Now we read in our dataset
df=pd.read_csv("census.csv")
df.where(df['STNAME']=="Alabama").dropna()

Unnamed: 0,SUMLEV,REGION,DIVISION,STATE,COUNTY,STNAME,CTYNAME,CENSUS2010POP,ESTIMATESBASE2010,POPESTIMATE2010,...,RDOMESTICMIG2011,RDOMESTICMIG2012,RDOMESTICMIG2013,RDOMESTICMIG2014,RDOMESTICMIG2015,RNETMIG2011,RNETMIG2012,RNETMIG2013,RNETMIG2014,RNETMIG2015
0,40.0,3.0,6.0,1.0,0.0,Alabama,Alabama,4779736.0,4780127.0,4785161.0,...,0.002295,-0.193196,0.381066,0.582002,-0.467369,1.030015,0.826644,1.383282,1.724718,0.712594
1,50.0,3.0,6.0,1.0,1.0,Alabama,Autauga County,54571.0,54571.0,54660.0,...,7.242091,-2.915927,-3.012349,2.265971,-2.530799,7.606016,-2.626146,-2.722002,2.592270,-2.187333
2,50.0,3.0,6.0,1.0,3.0,Alabama,Baldwin County,182265.0,182265.0,183193.0,...,14.832960,17.647293,21.845705,19.243287,17.197872,15.844176,18.559627,22.727626,20.317142,18.293499
3,50.0,3.0,6.0,1.0,5.0,Alabama,Barbour County,27457.0,27457.0,27341.0,...,-4.728132,-2.500690,-7.056824,-3.904217,-10.543299,-4.874741,-2.758113,-7.167664,-3.978583,-10.543299
4,50.0,3.0,6.0,1.0,7.0,Alabama,Bibb County,22915.0,22919.0,22861.0,...,-5.527043,-5.068871,-6.201001,-0.177537,0.177258,-5.088389,-4.363636,-5.403729,0.754533,1.107861
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
63,50.0,3.0,6.0,1.0,125.0,Alabama,Tuscaloosa County,194656.0,194653.0,194977.0,...,3.983504,4.256278,5.261075,4.787490,1.884402,5.306232,5.202679,6.333332,6.057539,3.158710
64,50.0,3.0,6.0,1.0,127.0,Alabama,Walker County,67023.0,67023.0,67004.0,...,-4.579296,-4.336533,-0.666096,-3.300481,-3.087245,-4.085450,-3.869753,-0.181663,-2.692097,-2.460626
65,50.0,3.0,6.0,1.0,129.0,Alabama,Washington County,17581.0,17583.0,17610.0,...,-14.708407,-13.062030,-11.181733,1.067236,-0.950486,-13.849940,-12.075121,-10.122411,2.193763,0.118811
66,50.0,3.0,6.0,1.0,131.0,Alabama,Wilcox County,11670.0,11665.0,11557.0,...,-7.029724,-10.239356,-12.314507,-13.835235,0.090453,-7.029724,-10.239356,-12.314507,-13.835235,0.090453


In [11]:
# And we reduce this to country data
df=df[df['SUMLEV']==50]

# And for a few groups
df=df.set_index('STNAME').groupby(level=0)['CENSUS2010POP'].agg(np.average)

df.head()

STNAME
Alabama        71339.343284
Alaska         24490.724138
Arizona       426134.466667
Arkansas       38878.906667
California    642309.586207
Name: CENSUS2010POP, dtype: float64

In [12]:
pd.cut(df,10)

STNAME
Alabama                   (11706.087, 75333.413]
Alaska                    (11706.087, 75333.413]
Arizona                 (390320.176, 453317.529]
Arkansas                  (11706.087, 75333.413]
California              (579312.234, 642309.586]
Colorado                 (75333.413, 138330.766]
Connecticut             (390320.176, 453317.529]
Delaware                (264325.471, 327322.823]
District of Columbia    (579312.234, 642309.586]
Florida                 (264325.471, 327322.823]
Georgia                   (11706.087, 75333.413]
Hawaii                  (264325.471, 327322.823]
Idaho                     (11706.087, 75333.413]
Illinois                 (75333.413, 138330.766]
Indiana                   (11706.087, 75333.413]
Iowa                      (11706.087, 75333.413]
Kansas                    (11706.087, 75333.413]
Kentucky                  (11706.087, 75333.413]
Louisiana                 (11706.087, 75333.413]
Maine                    (75333.413, 138330.766]
Maryland     

In [13]:
dict = {
    "Names": ['Tanishka', 'Srashti', 'Radha', 'Rishabh', 'Ram'],
    "City": ['Panipat', 'Haridwar', 'Madhura', 'Batinda', 'Ayodhya'],
    "Age": [19,25,33,40,55],
    "Amount": [100.10,210.33,55.30,200,120.90]
}

In [14]:
df2 = pd.DataFrame(dict)

In [15]:
df2

Unnamed: 0,Names,City,Age,Amount
0,Tanishka,Panipat,19,100.1
1,Srashti,Haridwar,25,210.33
2,Radha,Madhura,33,55.3
3,Rishabh,Batinda,40,200.0
4,Ram,Ayodhya,55,120.9


In [16]:
bins = [0,18,25,35,45,55]
labels = ['Teen','Young','Adult','Mid-Adult','Old']

In [17]:
df2['Age_group'] = pd.cut(df2.Age, bins, labels= labels)

In [18]:
df2

Unnamed: 0,Names,City,Age,Amount,Age_group
0,Tanishka,Panipat,19,100.1,Young
1,Srashti,Haridwar,25,210.33,Young
2,Radha,Madhura,33,55.3,Adult
3,Rishabh,Batinda,40,200.0,Mid-Adult
4,Ram,Ayodhya,55,120.9,Old


In [19]:
pd.cut(df2.Age, bins, labels= labels)

0        Young
1        Young
2        Adult
3    Mid-Adult
4          Old
Name: Age, dtype: category
Categories (5, object): ['Teen' < 'Young' < 'Adult' < 'Mid-Adult' < 'Old']

In [20]:
val = pd.cut(df2.Age, bins, labels= labels, retbins = True)

In [21]:
val

(0        Young
 1        Young
 2        Adult
 3    Mid-Adult
 4          Old
 Name: Age, dtype: category
 Categories (5, object): ['Teen' < 'Young' < 'Adult' < 'Mid-Adult' < 'Old'],
 array([ 0, 18, 25, 35, 45, 55]))