In [27]:
"""
What's this?
In-class tutorial on using pandas and numpy with the
WPRDC CSV dataset on restaurant health code
violations

Notes:
- NumPy is implemented largely in C and is built around a core
data structure, the NumPy array (ndarray), which is optimized
for lightning-fast processing.
- It is the underlying library that pandas builds on
(NumPy provides the raw materials: array-like structures holding
sequences of quantitative values in order, to be consumed
by pandas)
- Pandas then creates and uses a mutable data-table akin
to a spreadsheet.
- These are tabular structures in which each row has a 
unique index (creates one if missing); we create columns
that act as sliceable values
- In pandas, columns are referred to as series objects
- All columns and series are managed by the pandas
DataFrame
- See documentation when all else fails (posted on
technologyrediscovery.net)

"""

import numpy as np
import pandas as pd

"""
Legend:
### Heading
## Notes/Comments
# regular comment expression to avoid code running
"""


## Load the whole CSV file into a dataframe with pandas' read_csv()

violations_dataframe = pd.read_csv(filepath_or_buffer='food_violations.csv')

### Interrogating data (seeing what's inside)

## just to take a look at what type this is
# print(type(violations_dataframe))

## print first few rows
# print(violations_dataframe.head())

## print the column headers to see what is recorded
# print(violations_dataframe.columns)

## print the data types the columns are storing
# print(violations_dataframe.dtypes)

## built-in Python functions work too
# len(violations_dataframe)

### now we're extracting a series from the column 'description_new'

## assign a variable to the series (column) containing infractions
## (single brackets with one column label select that column of the dataframe)
infractions = violations_dataframe['description_new']

## ask what data type this is
# print(type(infractions)) ## This should return a Series

## Methods on series objects
## similar to len() but for a series
# print(infractions.size) 

## seeks how many of each type (tabulation of unique values)
# print(infractions.value_counts()) 

## Changing gears into municipalities
## pull the 'municipal' column out of the dataframe, using the same
## single-bracket column selection as for 'description_new'
municipalities = violations_dataframe['municipal']

## extract only the municipality series and display unique value counts
# print(municipalities.value_counts())

"""Remember: [] in notation refers to columns not rows
To get individual rows you need to slice"""

### Extract individual rows from the dataframe based on index value
## Two major ways of accessing them are index based extraction with 'iloc' (int based)
## Or you can use 'loc' to get rows that meet certain conditions

## Need double brackets to return row with headers
## this line will pull two rows 0 and 9
# print(violations_dataframe.iloc[[0,9]])

## this version pulls a range with headers
#print(violations_dataframe.iloc[0:6])
## Anything that comes back from this method is its own dataframe

### Build a dataframe containing only the rows whose violation is in Robinson

## create a new dataframe from a boolean condition:
## isin(['Robinson']) produces a True/False series with one entry per row,
## and loc[] keeps only the rows where that entry is True.
## The boolean series is handed to loc[] directly, so no lambda (and no
## second variable named violations_dataframe) is needed.
robinson_violations = violations_dataframe.loc[
    violations_dataframe['municipal'].isin(['Robinson'])
]
# print(robinson_violations)

## for reassurance, this is a dataframe
# print(type(robinson_violations))

## This pulls the column containing violations as a series and tallies unique value counts
## (kept as a bare expression: in a notebook the last expression of a cell
## is displayed as the cell's output)
robinson_violations['description_new'].value_counts()




Fabrication, Design, Installation and Maintenance          672
Cleaning and Sanitization                                  629
Contamination Prevention - Food, Utensils and Equipment    563
Cold Holding Temperatures                                  512
Handwashing Facilities                                     397
Floors                                                     376
Pest Management                                            315
Cooling Food                                               283
Walls and ceilings                                         263
Certified Food Protection Manager                          258
Facilities to Maintain Temperature                         244
Date Marking of Food                                       230
Toxic Items                                                217
Cross-Contamination Prevention                             196
Plumbing                                                   196
Garbage and Refuse                                     