## Automation using for loops and functions

In [1]:
# Need a list to be able to loop over

# Toy example of what a kid might see at a zoo
animals = ['lion', 'tiger', 'crocodile', 'vulture', 'hippo']

In [2]:
animals

['lion', 'tiger', 'crocodile', 'vulture', 'hippo']

### Challenge: instead of new-lines, have each animal separated with commas.

In [8]:
for creature in animals:
    print(creature, end=',')

lion,tiger,crocodile,vulture,hippo,

In [10]:
', '.join(animals)

sep = ', '
?sep.join(animals)

In [4]:
print('lion')
print('tiger')
print('crocodile')

lion
tiger
crocodile


In [5]:
?print

In [11]:
# Back to the real data!

import os
os.getcwd()

'/home/instructor'

In [12]:
# Let's create the directories to save our data
os.mkdir('data')

In [14]:
os.mkdir('data/yearly_files')

FileExistsError: [Errno 17] File exists: 'data/yearly_files'

In [24]:
os.listdir('data/yearly_files/')

['surveys2002.csv']

In [18]:
# Now that we have a place to store our results
# We can read in the data
import pandas as pd

surveys_df = pd.read_csv("https://ndownloader.figshare.com/files/2292172",

                             keep_default_na=False, na_values=[""])

In [22]:
surveys2002 = surveys_df[surveys_df["year"] == 2002]

In [23]:
surveys2002.to_csv('data/yearly_files/surveys2002.csv')

## We saved one year manually.  Let's create a for loop to do save all years

In [26]:
surveys_df["year"].unique()

array([1977, 1978, 1979, 1980, 1981, 1982, 1983, 1984, 1985, 1986, 1987,
       1988, 1989, 1990, 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998,
       1999, 2000, 2001, 2002])

In [27]:
# Don't save data yet, let's see what the file names look like
for year in surveys_df["year"].unique():
    filename = 'data/yearly_files/surveys' + str(year) + '.csv'
    print(filename)

data/yearly_files/surveys1977.csv
data/yearly_files/surveys1978.csv
data/yearly_files/surveys1979.csv
data/yearly_files/surveys1980.csv
data/yearly_files/surveys1981.csv
data/yearly_files/surveys1982.csv
data/yearly_files/surveys1983.csv
data/yearly_files/surveys1984.csv
data/yearly_files/surveys1985.csv
data/yearly_files/surveys1986.csv
data/yearly_files/surveys1987.csv
data/yearly_files/surveys1988.csv
data/yearly_files/surveys1989.csv
data/yearly_files/surveys1990.csv
data/yearly_files/surveys1991.csv
data/yearly_files/surveys1992.csv
data/yearly_files/surveys1993.csv
data/yearly_files/surveys1994.csv
data/yearly_files/surveys1995.csv
data/yearly_files/surveys1996.csv
data/yearly_files/surveys1997.csv
data/yearly_files/surveys1998.csv
data/yearly_files/surveys1999.csv
data/yearly_files/surveys2000.csv
data/yearly_files/surveys2001.csv
data/yearly_files/surveys2002.csv


In [28]:
# Let's improve the loop to save files
for year in surveys_df["year"].unique():
    # Subset the year's data from our full set
    surveys_year = surveys_df[surveys_df["year"] == year]
    # Create the filename to save the data
    filename = 'data/yearly_files/surveys' + str(year) + '.csv'
    # Save the data!
    surveys_year.to_csv(filename)

In [29]:
# Let's verify we created all the files
os.listdir('data/yearly_files/')

['surveys1990.csv',
 'surveys1983.csv',
 'surveys1994.csv',
 'surveys1986.csv',
 'surveys1997.csv',
 'surveys1988.csv',
 'surveys2000.csv',
 'surveys1979.csv',
 'surveys1999.csv',
 'surveys1995.csv',
 'surveys1993.csv',
 'surveys1982.csv',
 'surveys1978.csv',
 'surveys1985.csv',
 'surveys1981.csv',
 'surveys2002.csv',
 'surveys1977.csv',
 'surveys1992.csv',
 'surveys1991.csv',
 'surveys1987.csv',
 'surveys1998.csv',
 'surveys1989.csv',
 'surveys1996.csv',
 'surveys2001.csv',
 'surveys1984.csv',
 'surveys1980.csv']

In [31]:
# Save by species instead of years
for species in surveys_df["species_id"].unique():
    # Subset the year's data from our full set
    surveys_species = surveys_df[surveys_df["species_id"] == species]
    # Create the filename to save the data
    filename = 'data/species_files/surveys' + str(species) + '.csv'
    # Save the data!
    surveys_species.to_csv(filename)

In [32]:
os.listdir('data/species_files/')

['surveysDX.csv',
 'surveysAB.csv',
 'surveysRX.csv',
 'surveysDM.csv',
 'surveysDS.csv',
 'surveysSF.csv',
 'surveysPL.csv',
 'surveysCU.csv',
 'surveysCB.csv',
 'surveysPE.csv',
 'surveysCT.csv',
 'surveysZL.csv',
 'surveysPB.csv',
 'surveysPG.csv',
 'surveysPX.csv',
 'surveysSS.csv',
 'surveysUR.csv',
 'surveysPM.csv',
 'surveysUP.csv',
 'surveysUL.csv',
 'surveysPF.csv',
 'surveysST.csv',
 'surveysPU.csv',
 'surveysCM.csv',
 'surveysAS.csv',
 'surveysSC.csv',
 'surveysNL.csv',
 'surveysRM.csv',
 'surveysSU.csv',
 'surveysAH.csv',
 'surveysPP.csv',
 'surveysOL.csv',
 'surveysOX.csv',
 'surveysSA.csv',
 'surveysDO.csv',
 'surveysCQ.csv',
 'surveysSH.csv',
 'surveysCS.csv',
 'surveysCV.csv',
 'surveysnan.csv',
 'surveysPI.csv',
 'surveysPC.csv',
 'surveysUS.csv',
 'surveysSO.csv',
 'surveysOT.csv',
 'surveysRO.csv',
 'surveysPH.csv',
 'surveysRF.csv',
 'surveysBA.csv']

In [30]:
os.mkdir('data/species_files')

## Functions: make your code reusable and modular

In [38]:
# Toy example function

def multiply(input1, input2):
    # Print the 2 input arguments before doing anything.
    print("The function arguments are:", input1, input2)
    x = input1 * input2
    print(x)
    x

In [39]:
# Let's run our shiny new function!
result = multiply(2, 5)
print("The result is", result)

The function arguments are: 2 5
10
The result is None


# Functions on the real data

- One function will save a file for one year.
- The other function will loop over the years and call our previous function.

In [70]:
# Function to save a file
def one_year_csv_writer(this_year, all_data):
    """
    Writes a CSV file for data from a given year
    
    this_year --- year for which data is extracted
    all_data ---- DataFrame with multi-year data
    """
    
    # Select data for the one year
    surveys_year = all_data[all_data["year"] == this_year]
    
    if 'yearly_files' in os.listdir('data/'):
        print('Processed directory exists')
    else:
        os.mkdir('data/yearly_files')
        print('Processed directory created')
    
    # Write the new DataFrame to a CSV file
    filename = 'data/yearly_files/function_surveys' + str(this_year) + '.csv'
    surveys_year.to_csv(filename)
    print(filename)

In [69]:
os.listdir('data/')

['yearly_files', 'species_files']

In [41]:
?one_year_csv_writer

In [42]:
# Run our new function
one_year_csv_writer(2002, surveys_df)

In [43]:
# Check that our file was created
os.listdir('data/yearly_files/')

['surveys1990.csv',
 'surveys1983.csv',
 'surveys1994.csv',
 'surveys1986.csv',
 'surveys1997.csv',
 'surveys1988.csv',
 'surveys2000.csv',
 'surveys1979.csv',
 'surveys1999.csv',
 'surveys1995.csv',
 'surveys1993.csv',
 'surveys1982.csv',
 'surveys1978.csv',
 'surveys1985.csv',
 'surveys1981.csv',
 'surveys2002.csv',
 'surveys1977.csv',
 'surveys1992.csv',
 'function_surveys2002.csv',
 'surveys1991.csv',
 'surveys1987.csv',
 'surveys1998.csv',
 'surveys1989.csv',
 'surveys1996.csv',
 'surveys2001.csv',
 'surveys1984.csv',
 'surveys1980.csv']

## (Lunch Break)

In [63]:
# Let's write our second function to write all the CSV files.

def yearly_data_csv_writer(all_data, start_year = None, end_year = None):
    '''
    Writes separate CSV files for each year of data
    
    start_year --- the first year of data we want
    end_year -- the last year of data we want
    all_data --- DataFrame with multi-year data
    '''
    if not start_year:
        start_year = all_data.year.min()
    if not end_year:
        end_year = all_data.year.max()
    
    # "end_year" is the last year we want, so add +1 to include it.
    for year in range(start_year, end_year + 1):
        one_year_csv_writer(year, all_data)

In [50]:
list(range(1980, 1985 + 1, 2))

[1980, 1982, 1984]

In [51]:
# Run our final function!
yearly_data_csv_writer(1977, 1980, surveys_df)

In [56]:
# Check the output
os.listdir('data/yearly_files/')

['surveys1990.csv',
 'function_surveys1984.csv',
 'surveys1983.csv',
 'surveys1994.csv',
 'function_surveys1978.csv',
 'surveys1986.csv',
 'surveys1997.csv',
 'function_surveys1980.csv',
 'surveys1988.csv',
 'surveys2000.csv',
 'surveys1979.csv',
 'surveys1999.csv',
 'surveys1995.csv',
 'surveys1993.csv',
 'surveys1982.csv',
 'function_surveys1979.csv',
 'surveys1978.csv',
 'surveys1985.csv',
 'surveys1981.csv',
 'surveys2002.csv',
 'surveys1977.csv',
 'surveys1992.csv',
 'function_surveys2002.csv',
 'surveys1991.csv',
 'surveys1987.csv',
 'surveys1998.csv',
 'surveys1989.csv',
 'surveys1996.csv',
 'surveys2001.csv',
 'surveys1984.csv',
 'function_surveys1977.csv',
 'surveys1980.csv']

## Challenge

In [60]:
# Only 
yearly_data_csv_writer(1984, 1984, surveys_df)

data/yearly_files/function_surveys1984.csv


In [71]:
yearly_data_csv_writer(surveys_df, start_year=2000, end_year=2000)

Processed directory exists
data/yearly_files/function_surveys2000.csv


In [72]:
yearly_data_csv_writer(surveys_df, start_year=2000, end_year=2000)

Processed directory created
data/yearly_files/function_surveys2000.csv


In [66]:
yearly_data_csv_writer(surveys_df, end_year=1984)

data/yearly_files/function_surveys1977.csv
data/yearly_files/function_surveys1978.csv
data/yearly_files/function_surveys1979.csv
data/yearly_files/function_surveys1980.csv
data/yearly_files/function_surveys1981.csv
data/yearly_files/function_surveys1982.csv
data/yearly_files/function_surveys1983.csv
data/yearly_files/function_surveys1984.csv
