# Comma Separated Values (CSV)

In [1]:
import csv

### Warmup 1

- Use `list` type's `index` method to lookup index of "ice cream"
- Take a look at other list methods: https://www.w3schools.com/python/python_ref_list.asp

In [2]:
dairy = ["milk", "ice cream", "cheese", "yogurt" ]
print(dairy.index("ice cream"))
# print(dairy.index("paneer")) # doesn't work due to ValueError (runtime error)

1


### Warmup 2 
Use `in` operator to complete the condition to check if food_shelf contains any dairy products.

In [3]:
food_shelf = ["peanut butter", "milk", "bread", "cheese", "YOGURT"]
for item in food_shelf:
    if item.lower() in dairy:
        print(item, "is dairy")
    else:
        print(item, "is not dairy")

peanut butter is not dairy
milk is dairy
bread is not dairy
cheese is dairy
YOGURT is dairy


## Learning Objectives:

- Open an Excel file and export it to a Comma Separated Value file.
- Open a CSV file in TextEditor/Jupyter and connect the elements of the CSV file to the rows and columns in the spreadsheet.
- Use pre-written Python code to read a CSV file into a list of lists.
- Write Python statements with double list indexing to access any element of a CSV file via a list of lists.
- Write code that answers questions about CSV data by writing for loops on lists of lists.

In [4]:
# inspired by https://automatetheboringstuff.com/2e/chapter16/
def process_csv(filename):
    # open the file, its a text file utf-8
    example_file = open(filename, encoding="utf-8")
    # prepare it for reading as a CSV object
    example_reader = csv.reader(example_file)
    # use the built-in list function to convert this into a list of lists
    example_data = list(example_reader)
    # close the file to tidy up our workspace
    example_file.close()
    # return the list of lists
    
    return example_data

### Student Information Survey data

In [5]:
# TODO: call the process_csv function and store the list of lists in cs220_csv
cs220_csv = process_csv("cs220_survey_data.csv")

In [6]:
# Store the header row into cs220_header, using indexing
cs220_header = cs220_csv[0]
cs220_header

['Lecture',
 'Age',
 'Primary major',
 'Other majors',
 'Zip Code',
 'Pizza topping',
 'Pet owner',
 'Runner',
 'Sleep habit',
 'Procrastinator']

In [7]:
# TODO: Store all of the data rows into cs220_data, using slicing
cs220_data = cs220_csv[1:]

# TODO: use slicing to display top 3 rows data
cs220_data[:3]

[['LEC002',
  '19',
  'Engineering: Mechanical',
  '',
  '53711',
  'pepperoni',
  'Yes',
  'No',
  'night owl',
  'Maybe'],
 ['LEC002',
  '20',
  'Science: Physics',
  'Astronomy-Physics, History',
  '53726',
  'pineapple',
  'Yes',
  'Yes',
  'night owl',
  'Yes'],
 ['LEC001',
  '20',
  'Science: Chemistry',
  '',
  '53703',
  'pepperoni',
  'Yes',
  'No',
  'early bird',
  'No']]

### What is the Sleep habit for the 2nd student?

In [8]:
cs220_data[1][8] # bad example: we hard-coded the column index

'night owl'

What if we decided to add a new column before sleeping habit? Your code will no longer work.

Instead of hard-coding column index, you should use `index` method, to lookup column index from the header variable. This will also make your code so much readable.

In [9]:
cs220_data[1][cs220_header.index("Sleep habit")]

'night owl'

### What is the Lecture of the 4th student?

In [10]:
cs220_data[3][cs220_header.index("Lecture")]

'LEC004'

### Create a list containing Age of all students 10 years from now

In [11]:
ages_in_ten_years = []

for row in cs220_data:
    age = row[cs220_header.index("Age")]
    
    if age == '':
        continue
        
    age = int(age)
    ages_in_ten_years.append(age + 10)
    
ages_in_ten_years

[29,
 30,
 30,
 29,
 30,
 28,
 28,
 28,
 29,
 29,
 28,
 28,
 29,
 28,
 30,
 30,
 28,
 29,
 29,
 31,
 28,
 30,
 31,
 31,
 28,
 28,
 28,
 28,
 29,
 28,
 29,
 30,
 38,
 28,
 30,
 29,
 28,
 31,
 28,
 28,
 31,
 28,
 29,
 32,
 30,
 28,
 28,
 29,
 28,
 30,
 29,
 30,
 30,
 29,
 28,
 30,
 29,
 29,
 29,
 29,
 31,
 29,
 28,
 28,
 28,
 28,
 33,
 28,
 28,
 29,
 29,
 30,
 28,
 28,
 29,
 29,
 31,
 31,
 30,
 28,
 29,
 28,
 30,
 30,
 28,
 29,
 30,
 46,
 28,
 29,
 30,
 32,
 28,
 28,
 29,
 30,
 34,
 30,
 29,
 29,
 30,
 30,
 33,
 31,
 31,
 29,
 29,
 29,
 30,
 29,
 29,
 29,
 30,
 28,
 29,
 29,
 28,
 28,
 28,
 29,
 32,
 29,
 34,
 28,
 35,
 30,
 28,
 31,
 29,
 28,
 29,
 28,
 29,
 28,
 31,
 28,
 30,
 29,
 31,
 29,
 29,
 30,
 31,
 30,
 28,
 29,
 28,
 28,
 36,
 29,
 28,
 37,
 30,
 28,
 29,
 29,
 30,
 30,
 31,
 30,
 30,
 31,
 34,
 29,
 31,
 31,
 28,
 28,
 28,
 29,
 28,
 32,
 28,
 29,
 28,
 29,
 30,
 29,
 30,
 30,
 30,
 29,
 31,
 29,
 29,
 28,
 30,
 36,
 28,
 33,
 28,
 28,
 28,
 29,
 30,
 29,
 29,
 30,
 28,
 29,


### cell function

- It would be very helpful to define a cell function, which can handle missing data and type conversions

In [12]:
def cell(row_idx, col_name):
    """
    Returns the data value (cell) corresponding to the row index and 
    the column name of a CSV file.
    """
    # TODO: get the index of col_name
    col_idx = cs220_header.index(col_name) 
    
    # TODO: get the value of cs220_data at the specified cell
    val = cs220_data[row_idx][col_idx]  
    
    # TODO: handle missing values, by returning None
    if val == '':
        return None
    
    # TODO: handle type conversions
    if col_name in ["Age",]:
        return int(val)
    
    return val

### Find average age per lecture.

In [13]:
# TODO: initialize 4 lists for the 4 lectures
lec1_ages = []
lec2_ages = []
lec3_ages = []
lec4_ages = []

# Iterate over the data and populate the lists

for row_idx in range(len(cs220_data)):
    age = cell(row_idx, "Age")
    
    if age != None:
        lecture = cell(row_idx, "Lecture")
        if lecture == "LEC001":
            lec1_ages.append(age)
        elif lecture == "LEC002":
            lec2_ages.append(age)
        elif lecture == "LEC003":
            lec3_ages.append(age)
        elif lecture == "LEC004":
            lec4_ages.append(age)    
            
# TODO: compute average age of each lecture
print("LEC001 average student age:", round(sum(lec1_ages) / len(lec1_ages), 2))
print("LEC002 average student age:", round(sum(lec2_ages) / len(lec2_ages), 2))
print("LEC003 average student age:", round(sum(lec3_ages) / len(lec3_ages), 2))
print("LEC004 average student age:", round(sum(lec4_ages) / len(lec4_ages), 2))

LEC001 average student age: 19.93
LEC002 average student age: 19.8
LEC003 average student age: 19.38
LEC004 average student age: 19.27


### Find all unique zip codes.

In [14]:
# TODO: initialize list of keep track of zip codes
zip_codes = []

for row_idx in range(len(cs220_data)):
    zip_code = cell(row_idx, "Zip Code")
    
    if zip_code != None:
        zip_codes.append(zip_code)
        
list(set(zip_codes))

['53719',
 '54636',
 '53575',
 '53706-1406',
 '55416',
 '53703-1104',
 '53705',
 '52706',
 '59301',
 '53590',
 '53175',
 '53708',
 '92376',
 '53711',
 '53597',
 '53713',
 '53726',
 '53704',
 '53701',
 '53706-1127',
 '43706',
 '52816',
 '53089',
 '83001',
 '5 3706',
 '57305',
 '10306',
 '53715',
 '19002',
 '53703',
 '53706',
 '53706-1203',
 'internation student',
 '53562',
 '53076',
 '53706-1188',
 '53717']

## Self-practice

### How many students are both a procrastinator and a pet owner?

### What percentage of 18-year-olds have their major declared as "Other"?

### How old is the oldest basil/spinach-loving Business major?