# Lesson 20d: Text columns

## Libraries and the data 

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import math as math

# Load only the three columns that this lesson works with.
columns_to_load = ["Agency", "Reporter Type", "Total Operating Expenses"]
frame = pd.read_csv("PublicTransitExpenses.csv", usecols=columns_to_load)
frame.head()

Unnamed: 0,Agency,Reporter Type,Total Operating Expenses
0,Washington County Commissioners,Reduced Reporter,$122524.00
1,Washington County Commissioners,Reduced Reporter,$272715.00
2,"Texoma Area Paratransit System, Inc",Full Reporter,$7295.00
3,Kalispel Tribe of Indians,Reduced Reporter,$37416.00
4,Kalispel Tribe of Indians,Reduced Reporter,$345789.00


In [4]:
# To find the rows whose text column contains a given word, build a boolean mask
# with .str.contains() and use it to filter the frame:

has_washington = frame["Agency"].str.contains("Washington")
frame[has_washington]

Unnamed: 0,Agency,Reporter Type,Total Operating Expenses
0,Washington County Commissioners,Reduced Reporter,$122524.00
1,Washington County Commissioners,Reduced Reporter,$272715.00
199,Washington State Ferries,Full Reporter,$16215777.00
213,Washington State Ferries,Full Reporter,$11132493.00
364,Washington State Ferries,Full Reporter,$11750411.00
...,...,...,...
15871,Washington Hancock Community Agency,Rural Reporter,$995395.00
16213,City of Washington,Rural Reporter,$105117.00
16313,City of Washington,Rural Reporter,$102799.00
16316,Washington Parish Council on Aging,Rural Reporter,$546300.00


In [7]:
# .str.lower() converts every uppercase letter to lowercase.
# The result overwrites the "Agency" column of the frame.

agency_lower = frame["Agency"].str.lower()
frame["Agency"] = agency_lower

frame.head()

Unnamed: 0,Agency,Reporter Type,Total Operating Expenses
0,washington county commissioners,Reduced Reporter,$122524.00
1,washington county commissioners,Reduced Reporter,$272715.00
2,"texoma area paratransit system, inc",Full Reporter,$7295.00
3,kalispel tribe of indians,Reduced Reporter,$37416.00
4,kalispel tribe of indians,Reduced Reporter,$345789.00


In [10]:
# The column is lowercase now, so the search word must be lowercase as well.
frame.loc[frame["Agency"].str.contains("washington")]

Unnamed: 0,Agency,Reporter Type,Total Operating Expenses
0,washington county commissioners,Reduced Reporter,$122524.00
1,washington county commissioners,Reduced Reporter,$272715.00
199,washington state ferries,Full Reporter,$16215777.00
213,washington state ferries,Full Reporter,$11132493.00
364,washington state ferries,Full Reporter,$11750411.00
...,...,...,...
15871,washington hancock community agency,Rural Reporter,$995395.00
16213,city of washington,Rural Reporter,$105117.00
16313,city of washington,Rural Reporter,$102799.00
16316,washington parish council on aging,Rural Reporter,$546300.00


In [11]:
# Keep only the rows whose agency name ends with "ferries".
ends_with_ferries = frame["Agency"].str.endswith("ferries")
frame[ends_with_ferries]

Unnamed: 0,Agency,Reporter Type,Total Operating Expenses
199,washington state ferries,Full Reporter,$16215777.00
213,washington state ferries,Full Reporter,$11132493.00
364,washington state ferries,Full Reporter,$11750411.00
397,washington state ferries,Full Reporter,$22400732.00
604,washington state ferries,Full Reporter,$17715787.00
634,washington state ferries,Full Reporter,$236320045.00
653,washington state ferries,Full Reporter,$184425025.00
684,washington state ferries,Full Reporter,$186344635.00
915,washington state ferries,Full Reporter,$236711555.00
1114,washington state ferries,Full Reporter,$23046740.00


In [17]:
# Sometimes I want to compare against lowercase text without changing the text stored in the data frame.
# To start again from unmodified data, I first read the original file once more:

columns_to_load = ["Agency", "Reporter Type", "Total Operating Expenses"]
frame = pd.read_csv("PublicTransitExpenses.csv", usecols=columns_to_load)
frame.head()

Unnamed: 0,Agency,Reporter Type,Total Operating Expenses
0,Washington County Commissioners,Reduced Reporter,$122524.00
1,Washington County Commissioners,Reduced Reporter,$272715.00
2,"Texoma Area Paratransit System, Inc",Full Reporter,$7295.00
3,Kalispel Tribe of Indians,Reduced Reporter,$37416.00
4,Kalispel Tribe of Indians,Reduced Reporter,$345789.00


In [18]:
# Build a boolean mask without modifying `frame`: every .str method returns a new
# Series, so the "Agency" column keeps its original text.
#   .str.lower() - converts the text to lowercase
#   .str.strip() - removes whitespace at the beginning and at the end of each string
endsWith = (
    frame["Agency"]
    .str.lower()
    .str.strip()
    .str.endswith("ferries")
)

In [19]:
# Apply the mask to the frame; the selected rows keep their original capitalization.
frame.loc[endsWith]

Unnamed: 0,Agency,Reporter Type,Total Operating Expenses
199,Washington State Ferries,Full Reporter,$16215777.00
213,Washington State Ferries,Full Reporter,$11132493.00
364,Washington State Ferries,Full Reporter,$11750411.00
397,Washington State Ferries,Full Reporter,$22400732.00
604,Washington State Ferries,Full Reporter,$17715787.00
634,Washington State Ferries,Full Reporter,$236320045.00
653,Washington State Ferries,Full Reporter,$184425025.00
684,Washington State Ferries,Full Reporter,$186344635.00
915,Washington State Ferries,Full Reporter,$236711555.00
1114,Washington State Ferries,Full Reporter,$23046740.00


In [20]:
# Changing the index (as in the previous lesson): the "Agency" column becomes the row index.

frame = frame.set_index("Agency")
frame.head()

Unnamed: 0_level_0,Reporter Type,Total Operating Expenses
Agency,Unnamed: 1_level_1,Unnamed: 2_level_1
Washington County Commissioners,Reduced Reporter,$122524.00
Washington County Commissioners,Reduced Reporter,$272715.00
"Texoma Area Paratransit System, Inc",Full Reporter,$7295.00
Kalispel Tribe of Indians,Reduced Reporter,$37416.00
Kalispel Tribe of Indians,Reduced Reporter,$345789.00


In [23]:
# The .str methods also work on the index (if the index holds strings):
# strip surrounding whitespace, then convert every label to uppercase.

cleaned_index = frame.index.str.strip().str.upper()
frame.index = cleaned_index
frame.head()

Unnamed: 0_level_0,Reporter Type,Total Operating Expenses
Agency,Unnamed: 1_level_1,Unnamed: 2_level_1
WASHINGTON COUNTY COMMISSIONERS,Reduced Reporter,$122524.00
WASHINGTON COUNTY COMMISSIONERS,Reduced Reporter,$272715.00
"TEXOMA AREA PARATRANSIT SYSTEM, INC",Full Reporter,$7295.00
KALISPEL TRIBE OF INDIANS,Reduced Reporter,$37416.00
KALISPEL TRIBE OF INDIANS,Reduced Reporter,$345789.00


In [26]:
# Count how many rows there are for each reporter type.
reporter_types = frame["Reporter Type"]
reporter_types.value_counts()

Full Reporter       13345
Rural Reporter       3116
Reduced Reporter     1313
Separate Service       70
Name: Reporter Type, dtype: int64

In [27]:
# Split each reporter type on the space character: every entry becomes a list of its words.
reporter_words = frame["Reporter Type"].str.split(" ")
reporter_words.head()

Agency
WASHINGTON COUNTY COMMISSIONERS        [Reduced, Reporter]
WASHINGTON COUNTY COMMISSIONERS        [Reduced, Reporter]
TEXOMA AREA PARATRANSIT SYSTEM, INC       [Full, Reporter]
KALISPEL TRIBE OF INDIANS              [Reduced, Reporter]
KALISPEL TRIBE OF INDIANS              [Reduced, Reporter]
Name: Reporter Type, dtype: object

In [28]:
# To get only the element at position 0 of each list (the first word), I can use:
frame["Reporter Type"].str.split(" ").str.get(0).head()

Agency
WASHINGTON COUNTY COMMISSIONERS        Reduced
WASHINGTON COUNTY COMMISSIONERS        Reduced
TEXOMA AREA PARATRANSIT SYSTEM, INC       Full
KALISPEL TRIBE OF INDIANS              Reduced
KALISPEL TRIBE OF INDIANS              Reduced
Name: Reporter Type, dtype: object

In [29]:
# With expand=True the words are returned in separate columns instead of in a list:

reporter_parts = frame["Reporter Type"].str.split(" ", expand=True)
reporter_parts.head()

Unnamed: 0_level_0,0,1
Agency,Unnamed: 1_level_1,Unnamed: 2_level_1
WASHINGTON COUNTY COMMISSIONERS,Reduced,Reporter
WASHINGTON COUNTY COMMISSIONERS,Reduced,Reporter
"TEXOMA AREA PARATRANSIT SYSTEM, INC",Full,Reporter
KALISPEL TRIBE OF INDIANS,Reduced,Reporter
KALISPEL TRIBE OF INDIANS,Reduced,Reporter


In [31]:
# And now I will name these columns and add them to the original table.
# n=1 allows at most one split, so the result never has more than two columns.
# Without it, a single reporter type containing a second space would produce a
# third column and the assignment to exactly two column names would fail.

reporter_parts = frame["Reporter Type"].str.split(" ", n=1, expand=True)
frame[["ReporterType1", "ReporterType2"]] = reporter_parts
frame.head()

Unnamed: 0_level_0,Reporter Type,Total Operating Expenses,ReporterType1,ReporterType2
Agency,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
WASHINGTON COUNTY COMMISSIONERS,Reduced Reporter,$122524.00,Reduced,Reporter
WASHINGTON COUNTY COMMISSIONERS,Reduced Reporter,$272715.00,Reduced,Reporter
"TEXOMA AREA PARATRANSIT SYSTEM, INC",Full Reporter,$7295.00,Full,Reporter
KALISPEL TRIBE OF INDIANS,Reduced Reporter,$37416.00,Reduced,Reporter
KALISPEL TRIBE OF INDIANS,Reduced Reporter,$345789.00,Reduced,Reporter


In [33]:
# Now I want to see how "split()" behaves when the number of spaces differs from row to row.
# First I copy the index (the agency names) into a regular column:

agency_names = frame.index
frame["Agency2"] = agency_names
frame.head()

Unnamed: 0_level_0,Reporter Type,Total Operating Expenses,ReporterType1,ReporterType2,Agency2
Agency,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
WASHINGTON COUNTY COMMISSIONERS,Reduced Reporter,$122524.00,Reduced,Reporter,WASHINGTON COUNTY COMMISSIONERS
WASHINGTON COUNTY COMMISSIONERS,Reduced Reporter,$272715.00,Reduced,Reporter,WASHINGTON COUNTY COMMISSIONERS
"TEXOMA AREA PARATRANSIT SYSTEM, INC",Full Reporter,$7295.00,Full,Reporter,"TEXOMA AREA PARATRANSIT SYSTEM, INC"
KALISPEL TRIBE OF INDIANS,Reduced Reporter,$37416.00,Reduced,Reporter,KALISPEL TRIBE OF INDIANS
KALISPEL TRIBE OF INDIANS,Reduced Reporter,$345789.00,Reduced,Reporter,KALISPEL TRIBE OF INDIANS


In [34]:
# n=10 is the maximal number of splits, so each name gives at most 11 pieces
# (columns 0-10); names with fewer words leave the remaining columns empty.
agency_words = frame["Agency2"].str.split(" ", n=10, expand=True)
agency_words.head()

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,10
Agency,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
WASHINGTON COUNTY COMMISSIONERS,WASHINGTON,COUNTY,COMMISSIONERS,,,,,,,,
WASHINGTON COUNTY COMMISSIONERS,WASHINGTON,COUNTY,COMMISSIONERS,,,,,,,,
"TEXOMA AREA PARATRANSIT SYSTEM, INC",TEXOMA,AREA,PARATRANSIT,"SYSTEM,",INC,,,,,,
KALISPEL TRIBE OF INDIANS,KALISPEL,TRIBE,OF,INDIANS,,,,,,,
KALISPEL TRIBE OF INDIANS,KALISPEL,TRIBE,OF,INDIANS,,,,,,,


In [40]:
# The method "apply()" is a fallback for cases that the built-in column methods cannot handle.

# Below we want to build a comment for every row, depending on whether a condition holds for a given cell of that row.
# To do this, we define a function that receives one row at a time and checks the condition:

def getComment(row, threshold=200000):
    """Build a "<Reporter Type>/<class>" label for a single row.

    Parameters
    ----------
    row : mapping-like
        One row of the frame (for example the Series that
        ``frame.apply(getComment, axis=1)`` passes in). It must provide
        "Reporter Type" (text) and "Total Operating Expenses" (a number, or
        text such as "$272715.00").
    threshold : float, optional
        Costs strictly greater than this value are labelled "CLASS A",
        all other costs "CLASS B". Defaults to 200000.

    Returns
    -------
    str
        Reporter type and class joined by "/", e.g. "Full Reporter/CLASS A".

    Raises
    ------
    ValueError
        If the expenses text cannot be converted to a number.
    """
    reporterType = row["Reporter Type"]
    rawCost = row["Total Operating Expenses"]

    if isinstance(rawCost, str):
        # Remove the currency symbol and thousands separators instead of replacing
        # them with a space: "-$5.00" has to become "-5.00", not "- 5.00".
        rawCost = rawCost.replace("$", "").replace(",", "")
    cost = float(rawCost)

    comment = "CLASS A" if cost > threshold else "CLASS B"
    return reporterType + "/" + comment

In [41]:
# Call getComment once for every row; the result is a new Series and frame itself is not changed.
frame.apply(getComment, axis="columns")

Agency
WASHINGTON COUNTY COMMISSIONERS        Reduced Reporter/CLASS B
WASHINGTON COUNTY COMMISSIONERS        Reduced Reporter/CLASS A
TEXOMA AREA PARATRANSIT SYSTEM, INC       Full Reporter/CLASS B
KALISPEL TRIBE OF INDIANS              Reduced Reporter/CLASS B
KALISPEL TRIBE OF INDIANS              Reduced Reporter/CLASS A
                                                 ...           
CEDAR AREA TRANSPORTATION SERVICE        Rural Reporter/CLASS B
TEHAMA COUNTY                            Rural Reporter/CLASS B
CITY OF DIXON                            Rural Reporter/CLASS A
MORONGO BASIN TRANSIT AUTHORITY          Rural Reporter/CLASS A
CITY OF ARVIN                            Rural Reporter/CLASS B
Length: 17844, dtype: object

In [42]:
# Show the first five rows of the frame again.
frame.head(n=5)

Unnamed: 0_level_0,Reporter Type,Total Operating Expenses,ReporterType1,ReporterType2,Agency2
Agency,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
WASHINGTON COUNTY COMMISSIONERS,Reduced Reporter,$122524.00,Reduced,Reporter,WASHINGTON COUNTY COMMISSIONERS
WASHINGTON COUNTY COMMISSIONERS,Reduced Reporter,$272715.00,Reduced,Reporter,WASHINGTON COUNTY COMMISSIONERS
"TEXOMA AREA PARATRANSIT SYSTEM, INC",Full Reporter,$7295.00,Full,Reporter,"TEXOMA AREA PARATRANSIT SYSTEM, INC"
KALISPEL TRIBE OF INDIANS,Reduced Reporter,$37416.00,Reduced,Reporter,KALISPEL TRIBE OF INDIANS
KALISPEL TRIBE OF INDIANS,Reduced Reporter,$345789.00,Reduced,Reporter,KALISPEL TRIBE OF INDIANS
