# Lesson 20e: Preparing data after import

## Import libraries and load the data

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import math as math

frame = pd.read_csv("PublicTransitExpenses.csv", low_memory = False)
frame.head()

Unnamed: 0,5 digit NTD ID,4 digit NTD ID,Agency,Reporter Type,Subrecipient Type,Organization Type,2015 Total Mode Vehicles (VOMS),Mode,Type of Service,Primary UZA Code,...,Casualty and Liability Costs,Taxes,PT Funds In Report,PT Funds Reported Separately,Miscellaneous Expenses,Reduced Reporter Total OE,Total Operating Expenses,Total Operating Expenses (No Funds Reported Separately),ADA Related Expenses,Location 1
0,30098,3098,Washington County Commissioners,Reduced Reporter,,,,DR,PT,,...,$0.00,$0.00,$0.00,$0.00,$0.00,$122524.00,$122524.00,$122524.00,$0.00,
1,30098,3098,Washington County Commissioners,Reduced Reporter,,,,MB,PT,,...,$0.00,$0.00,$0.00,$0.00,$0.00,$272715.00,$272715.00,$272715.00,$0.00,
2,60107,6107,"Texoma Area Paratransit System, Inc",Full Reporter,,,,CB,PT,,...,,,$3398.00,,,,$7295.00,$7295.00,,
3,9,0T09,Kalispel Tribe of Indians,Reduced Reporter,,Tribe,4.0,DR,DO,0.0,...,,,,,,$37416.00,$37416.00,$37416.00,,"Usk, WA\n"
4,9,0T09,Kalispel Tribe of Indians,Reduced Reporter,,Tribe,4.0,MB,DO,0.0,...,,,,,,$345789.00,$345789.00,$345789.00,,"Usk, WA\n"


In [2]:
# Checking the types of data and memory:

frame.info(memory_usage = "deep")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17844 entries, 0 to 17843
Data columns (total 39 columns):
 #   Column                                                   Non-Null Count  Dtype  
---  ------                                                   --------------  -----  
 0   5 digit NTD ID                                           17844 non-null  object 
 1   4 digit NTD ID                                           17719 non-null  object 
 2   Agency                                                   17844 non-null  object 
 3   Reporter Type                                            17844 non-null  object 
 4   Subrecipient Type                                        3072 non-null   object 
 5   Organization Type                                        17759 non-null  object 
 6   2015 Total Mode Vehicles (VOMS)                          17522 non-null  float64
 7   Mode                                                     17844 non-null  object 
 8   Type of Service           

In [3]:
# I want to work on a selection of the data:

frame = pd.read_csv("PublicTransitExpenses.csv", usecols = ["Agency", "Reporter Type", "Organization Type",
                                                           "Rail (Y/N)", "Fixed Route (Y/N)", "Service Costs",
                                                           "Tires and Tubes", "Total Operating Expenses",
                                                           "Service Area Population"] ,low_memory = False)
frame.head()

Unnamed: 0,Agency,Reporter Type,Organization Type,Service Area Population,Rail (Y/N),Fixed Route (Y/N),Service Costs,Tires and Tubes,Total Operating Expenses
0,Washington County Commissioners,Reduced Reporter,,,,,$0.00,$0.00,$122524.00
1,Washington County Commissioners,Reduced Reporter,,,,,$0.00,$0.00,$272715.00
2,"Texoma Area Paratransit System, Inc",Full Reporter,,,,,,,$7295.00
3,Kalispel Tribe of Indians,Reduced Reporter,Tribe,,N,N,,,$37416.00
4,Kalispel Tribe of Indians,Reduced Reporter,Tribe,,N,Y,,,$345789.00


In [4]:
frame.info(memory_usage = "deep")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17844 entries, 0 to 17843
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Agency                    17844 non-null  object 
 1   Reporter Type             17844 non-null  object 
 2   Organization Type         17759 non-null  object 
 3   Service Area Population   14418 non-null  float64
 4   Rail (Y/N)                17318 non-null  object 
 5   Fixed Route (Y/N)         17318 non-null  object 
 6   Service Costs             10262 non-null  object 
 7   Tires and Tubes           5615 non-null   object 
 8   Total Operating Expenses  17844 non-null  object 
dtypes: float64(1), object(8)
memory usage: 9.5 MB


In [5]:
# Now I want to simplify the names of the columns, for future convenience:

newColumnNames = {"Agency": "Agency",
                 "Reporter Type": "ReporterType",
                 "Organization Type": "OrgType",
                 "Service Area Population": "Population",
                 "Rail (Y/N)": "IsRail",
                 "Fixed Route (Y/N)": "IsFixedRoute",
                 "Service Costs": "ServiceCost",
                 "Tires and Tubes": "TiresTubesCost",
                 "Total Operating Expenses": "TotalExpenses"}

In [6]:
frame.rename(columns = newColumnNames, inplace = True)
frame.head()

Unnamed: 0,Agency,ReporterType,OrgType,Population,IsRail,IsFixedRoute,ServiceCost,TiresTubesCost,TotalExpenses
0,Washington County Commissioners,Reduced Reporter,,,,,$0.00,$0.00,$122524.00
1,Washington County Commissioners,Reduced Reporter,,,,,$0.00,$0.00,$272715.00
2,"Texoma Area Paratransit System, Inc",Full Reporter,,,,,,,$7295.00
3,Kalispel Tribe of Indians,Reduced Reporter,Tribe,,N,N,,,$37416.00
4,Kalispel Tribe of Indians,Reduced Reporter,Tribe,,N,Y,,,$345789.00


In [7]:
# To see how many unique values is in a given column, we use

frame["ReporterType"].nunique()

4

In [8]:
# To count all rows:

frame["ReporterType"].count()

17844

In [9]:
# To count subgroups:

frame["ReporterType"].value_counts()

Full Reporter       13345
Rural Reporter       3116
Reduced Reporter     1313
Separate Service       70
Name: ReporterType, dtype: int64

In [11]:
# To change the type of the values in the column:

frame["ReporterType"] = frame["ReporterType"].astype("category")
# Note that we saved some memory in this way.

In [12]:
frame.info(memory_usage = "deep")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17844 entries, 0 to 17843
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype   
---  ------          --------------  -----   
 0   Agency          17844 non-null  object  
 1   ReporterType    17844 non-null  category
 2   OrgType         17759 non-null  object  
 3   Population      14418 non-null  float64 
 4   IsRail          17318 non-null  object  
 5   IsFixedRoute    17318 non-null  object  
 6   ServiceCost     10262 non-null  object  
 7   TiresTubesCost  5615 non-null   object  
 8   TotalExpenses   17844 non-null  object  
dtypes: category(1), float64(1), object(7)
memory usage: 8.3 MB


In [13]:
frame["OrgType"] = frame["OrgType"].astype("category")
frame["Agency"] = frame["Agency"].astype("category")

In [14]:
frame.info(memory_usage = "deep")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17844 entries, 0 to 17843
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype   
---  ------          --------------  -----   
 0   Agency          17844 non-null  category
 1   ReporterType    17844 non-null  category
 2   OrgType         17759 non-null  category
 3   Population      14418 non-null  float64 
 4   IsRail          17318 non-null  object  
 5   IsFixedRoute    17318 non-null  object  
 6   ServiceCost     10262 non-null  object  
 7   TiresTubesCost  5615 non-null   object  
 8   TotalExpenses   17844 non-null  object  
dtypes: category(3), float64(1), object(5)
memory usage: 5.1 MB


In [15]:
# Note that if tere is "NaN" in some column, this column will not be of int type, but it will be of float type.

frame["Population"].fillna(0, inplace =True)

In [16]:
frame.head()

Unnamed: 0,Agency,ReporterType,OrgType,Population,IsRail,IsFixedRoute,ServiceCost,TiresTubesCost,TotalExpenses
0,Washington County Commissioners,Reduced Reporter,,0.0,,,$0.00,$0.00,$122524.00
1,Washington County Commissioners,Reduced Reporter,,0.0,,,$0.00,$0.00,$272715.00
2,"Texoma Area Paratransit System, Inc",Full Reporter,,0.0,,,,,$7295.00
3,Kalispel Tribe of Indians,Reduced Reporter,Tribe,0.0,N,N,,,$37416.00
4,Kalispel Tribe of Indians,Reduced Reporter,Tribe,0.0,N,Y,,,$345789.00


In [19]:
# Now I can covert the values of this column to the int type.

frame["Population"] = frame["Population"].astype("int")

In [20]:
frame.info(memory_usage = "deep")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17844 entries, 0 to 17843
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype   
---  ------          --------------  -----   
 0   Agency          17844 non-null  category
 1   ReporterType    17844 non-null  category
 2   OrgType         17759 non-null  category
 3   Population      17844 non-null  int64   
 4   IsRail          17318 non-null  object  
 5   IsFixedRoute    17318 non-null  object  
 6   ServiceCost     10262 non-null  object  
 7   TiresTubesCost  5615 non-null   object  
 8   TotalExpenses   17844 non-null  object  
dtypes: category(3), int64(1), object(5)
memory usage: 5.1 MB


In [21]:
# If I want to see first few rows of the table with "Population" bigger than 0, I use:

nonZeroPopulation = frame["Population"] > 0

frame[nonZeroPopulation].head()


Unnamed: 0,Agency,ReporterType,OrgType,Population,IsRail,IsFixedRoute,ServiceCost,TiresTubesCost,TotalExpenses
71,Reno-Sparks Indian Colony,Reduced Reporter,Tribe,1127,N,Y,,,$174810.00
92,City of Pocatello,Reduced Reporter,"City, County or Local Government Unit or Depar...",81730,N,Y,,,$1119899.00
93,Lee-Russell Council of Governments,Reduced Reporter,"MPO, COG or Other Planning Agency",193194,N,N,,,$1540633.00
94,"Aiken Area Council on Aging, Inc.",Reduced Reporter,Area Agency on Aging,160099,N,Y,,,$525325.00
101,City of Lawrence,Full Reporter,Consolidated Reporter,87643,N,Y,,,$937407.00


In [24]:
# To replace some words:

frame["IsRail"].replace(("Y","N") ,(True,False), inplace = True)
frame["IsFixedRoute"].replace(("Y","N") ,(True,False), inplace = True)

frame.head()

Unnamed: 0,Agency,ReporterType,OrgType,Population,IsRail,IsFixedRoute,ServiceCost,TiresTubesCost,TotalExpenses
0,Washington County Commissioners,Reduced Reporter,,0,,,$0.00,$0.00,$122524.00
1,Washington County Commissioners,Reduced Reporter,,0,,,$0.00,$0.00,$272715.00
2,"Texoma Area Paratransit System, Inc",Full Reporter,,0,,,,,$7295.00
3,Kalispel Tribe of Indians,Reduced Reporter,Tribe,0,False,False,,,$37416.00
4,Kalispel Tribe of Indians,Reduced Reporter,Tribe,0,False,True,,,$345789.00


In [25]:
# Now we make the decision to replace NaNs in some columns to False (as if we did not know these values):

frame["IsRail"].fillna(False, inplace = True)
frame["IsFixedRoute"].fillna(False, inplace = True)

In [31]:
# To save even more memory, I change the type of the values:

frame["IsRail"] = frame["IsRail"].astype(bool)
frame["IsFixedRoute"] = frame["IsRail"].astype(bool)

In [27]:
frame.info(memory_usage = "deep")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17844 entries, 0 to 17843
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype   
---  ------          --------------  -----   
 0   Agency          17844 non-null  category
 1   ReporterType    17844 non-null  category
 2   OrgType         17759 non-null  category
 3   Population      17844 non-null  int64   
 4   IsRail          17844 non-null  bool    
 5   IsFixedRoute    17844 non-null  bool    
 6   ServiceCost     10262 non-null  object  
 7   TiresTubesCost  5615 non-null   object  
 8   TotalExpenses   17844 non-null  object  
dtypes: bool(2), category(3), int64(1), object(3)
memory usage: 3.2 MB


In [32]:
# Next we want to modify numerical values, so that to remove "$", which is treated as a character:

frame["ServiceCost"] = frame["ServiceCost"].str.replace("$"," ")
frame["TiresTubesCost"] = frame["TiresTubesCost"].str.replace("$"," ")
frame["TotalExpenses"] = frame["TotalExpenses"].str.replace("$"," ")

  frame["ServiceCost"] = frame["ServiceCost"].str.replace("$"," ")
  frame["TiresTubesCost"] = frame["TiresTubesCost"].str.replace("$"," ")
  frame["TotalExpenses"] = frame["TotalExpenses"].str.replace("$"," ")


In [34]:
# and we can change their type:

frame["ServiceCost"] = frame["ServiceCost"].astype("float")
frame["TiresTubesCost"] = frame["TiresTubesCost"].astype("float")
frame["TotalExpensesCost"] = frame["TotalExpenses"].astype("float")

In [35]:
frame.info(memory_usage = "deep")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17844 entries, 0 to 17843
Data columns (total 10 columns):
 #   Column             Non-Null Count  Dtype   
---  ------             --------------  -----   
 0   Agency             17844 non-null  category
 1   ReporterType       17844 non-null  category
 2   OrgType            17759 non-null  category
 3   Population         17844 non-null  int64   
 4   IsRail             17844 non-null  bool    
 5   IsFixedRoute       17844 non-null  bool    
 6   ServiceCost        10262 non-null  float64 
 7   TiresTubesCost     5615 non-null   float64 
 8   TotalExpenses      10262 non-null  object  
 9   TotalExpensesCost  10262 non-null  float64 
dtypes: bool(2), category(3), float64(3), int64(1), object(1)
memory usage: 1.8 MB


In [38]:
# We modify string values:

frame["Agency"] = frame["Agency"].str.title().astype("category")
frame["ReporterType"] = frame["ReporterType"].str.upper().astype("category")
frame["OrgType"] = frame["OrgType"].str.upper().astype("category")

frame.head()

Unnamed: 0,Agency,ReporterType,OrgType,Population,IsRail,IsFixedRoute,ServiceCost,TiresTubesCost,TotalExpenses,TotalExpensesCost
0,Washington County Commissioners,REDUCED REPORTER,,0,False,False,0.0,0.0,0.0,0.0
1,Washington County Commissioners,REDUCED REPORTER,,0,False,False,0.0,0.0,0.0,0.0
2,"Texoma Area Paratransit System, Inc",FULL REPORTER,,0,False,False,,,,
3,Kalispel Tribe Of Indians,REDUCED REPORTER,TRIBE,0,False,False,,,,
4,Kalispel Tribe Of Indians,REDUCED REPORTER,TRIBE,0,False,False,,,,
