Before we get started, a couple of reminders to keep in mind when using iPython notebooks:

- Remember that a number inside the brackets to the left of a code cell shows that the cell has been run in the current session; the number indicates the order in which it was executed.
- When you start a new notebook session, make sure you run all of the cells up to the point where you last left off. Even if the output is still visible from when you ran the cells in your previous session, the kernel starts in a fresh state so you'll need to reload the data, etc. on a new session.
- The previous point is useful to keep in mind if your answers do not match what is expected in the lesson's quizzes. Try reloading the data and running all of the processing steps one by one, in order, to make sure that you are working with the same variables and data that are expected at each quiz stage.


## Load Data from CSVs

In [None]:
import unicodecsv

## Longer version of code (replaced with shorter, equivalent version below)

# enrollments = []
# f = open('enrollments.csv', 'rb')
# reader = unicodecsv.DictReader(f)
# for row in reader:
#     enrollments.append(row)
# f.close()

with open('enrollments.csv', 'rb') as f:
    # DictReader yields one dictionary per CSV row.
    reader = unicodecsv.DictReader(f)
    # Materialize every row while the file is still open, giving a list of dictionaries.
    enrollments = list(reader)

# Show the first row as a quick sanity check.
print(enrollments[0])

In [None]:
#####################################
#                 1                 #
#####################################

## Read in the data from daily_engagement.csv and project_submissions.csv 
## and store the results in the below variables.
## Then look at the first row of each table.

def read_data(file):
    """Load a CSV file and return its rows as a list of dictionaries.

    The file at path `file` is opened in binary mode and parsed with
    unicodecsv.DictReader; every row it yields becomes one list element.
    """
    with open(file, "rb") as csv_file:
        return [row for row in unicodecsv.DictReader(csv_file)]

# Load the two remaining tables with the helper defined above.
daily_engagement = read_data("daily_engagement.csv")
project_submissions = read_data("project_submissions.csv")

# Look at the first row of each table.
print(daily_engagement[0])
print(project_submissions[0])

## Fixing Data Types

In [None]:
from datetime import datetime as dt

def parse_date(date):
    """Convert a 'YYYY-MM-DD' string into a datetime object.

    An empty string means no date was given, in which case None is returned.
    """
    if date != '':
        return dt.strptime(date, '%Y-%m-%d')
    return None
    
def parse_maybe_int(i):
    """Convert a string holding an integer into an int.

    An empty string stands for a missing value and yields None.
    """
    return None if i == '' else int(i)

# Clean up the data types in the enrollments table.
# The listed fields are converted in place, so this loop mutates the
# dictionaries stored in `enrollments`.
for enrollment in enrollments:
    enrollment['cancel_date'] = parse_date(enrollment['cancel_date'])
    enrollment['days_to_cancel'] = parse_maybe_int(enrollment['days_to_cancel'])
    # Comparing against the string 'True' produces a real boolean: True when
    # the field is exactly 'True', and False for any other value.
    enrollment['is_canceled'] = enrollment['is_canceled'] == 'True'
    enrollment['is_udacity'] = enrollment['is_udacity'] == 'True'
    enrollment['join_date'] = parse_date(enrollment['join_date'])

# Display the first cleaned row.
enrollments[0]

In [None]:
# Clean up the data types in the engagement table (converted in place).
for engagement_record in daily_engagement:
    # int(float(...)) parses the text as a float first and then converts to int.
    # NOTE(review): presumably the CSV stores these counts with a decimal
    # point (e.g. '1.0'), which int() alone would reject -- confirm against the data.
    engagement_record['lessons_completed'] = int(float(engagement_record['lessons_completed']))
    engagement_record['num_courses_visited'] = int(float(engagement_record['num_courses_visited']))
    engagement_record['projects_completed'] = int(float(engagement_record['projects_completed']))
    # Minutes are kept as a float.
    engagement_record['total_minutes_visited'] = float(engagement_record['total_minutes_visited'])
    engagement_record['utc_date'] = parse_date(engagement_record['utc_date'])

# Display the first cleaned row.
daily_engagement[0]

In [None]:
# Clean up the data types in the submissions table (converted in place).
for submission in project_submissions:
    # parse_date returns None when the field is an empty string.
    submission['completion_date'] = parse_date(submission['completion_date'])
    submission['creation_date'] = parse_date(submission['creation_date'])

# Display the first cleaned row.
project_submissions[0]

Note that the cells above change the contents of our data variables in place. If you run them more than once in the same session, an error will occur because the values have already been converted from strings; reload the data before running them again.

## Investigating the Data

## Problems in the Data

In [None]:
#####################################
#                 3                 #
#####################################

## Rename the "acct" column in the daily_engagement table to "account_key".

# Rename the "acct" key to "account_key" on every engagement record.
# The membership check keeps this cell safe to re-run: records that have
# already been renamed are left untouched instead of raising KeyError.
for engagement_point in daily_engagement:
    if 'acct' in engagement_point:
        # pop() removes the old key and returns its value in one step.
        engagement_point['account_key'] = engagement_point.pop('acct')

daily_engagement[0]

In [None]:
#####################################
#                 2                 #
#####################################

## Find the total number of rows and the number of unique students (account keys)
## in each table.

def unique_student(file):
    """Return the set of distinct account keys found in a table.

    `file` is an iterable of row dictionaries (despite the name, not a file
    object); each row must contain an 'account_key' entry.
    """
    return {row['account_key'] for row in file}

# Total number of rows in each table.
print(len(enrollments))
print(len(daily_engagement))
print(len(project_submissions))

In [None]:
# Set of distinct account keys per table.
unique_enrollments = unique_student(enrollments)
unique_engagement = unique_student(daily_engagement)
unique_project_submissions = unique_student(project_submissions)

# Number of unique students in each table.
print(len(unique_enrollments))
print(len(unique_engagement))
print(len(unique_project_submissions))

## Missing Engagement Records

In [None]:
#####################################
#                 4                 #
#####################################

## Find any one student enrollments where the student is missing from the daily engagement table.
## Output that enrollment.


# Print the first enrollment whose account key never appears in the
# engagement table, then stop.
for student in enrollments:
    account = student["account_key"]
    if account not in unique_engagement:
        # print() with a single argument behaves the same on Python 2 and 3.
        print(student)
        break  # stop after the first match so only one student is shown

In [None]:
# Generator-expression version of the loop above. next() returns the first
# enrollment whose account key is missing from the engagement table.
# The second argument to next() is the default that is returned when nothing
# matches; without it an exhausted generator raises StopIteration. Here the
# default is the string "empty list", but any value (e.g. None) would do.

account = next((student for student in enrollments if student["account_key"] not in unique_engagement),
               "empty list")  # default returned when no enrollment matches
account

## Checking for More Problem Records

In [None]:
#####################################
#                 5                 #
#####################################

## Find the number of surprising data points (enrollments missing from
## the engagement table) that remain, if any.

# Looking closely at the data shows that some students joined and cancelled
# on the same day; those enrollments are not counted as surprising here.

# Version 1: count the remaining surprising enrollments and print each one.
count = 0
for student in enrollments:
    account = student["account_key"]
    if account not in unique_engagement and student["join_date"] != student["cancel_date"]:
        count += 1
        print(student)
print(count)

# Version 2: the same count without printing the individual rows.
count = 0
for student in enrollments:
    account = student["account_key"]
    if (account not in unique_engagement
            and student["join_date"] != student["cancel_date"]):
        count += 1

print(count)



## Tracking Down the Remaining Problems

In [None]:
# Create a set of the account keys for all Udacity test accounts.
udacity_test_accounts = set()
for student in enrollments:
    # is_udacity was converted to a boolean by the enrollments clean-up
    # cell, so it can be tested directly without comparing against True.
    if student['is_udacity']:
        udacity_test_accounts.add(student['account_key'])
len(udacity_test_accounts)

In [None]:
############## list comprehension of below
def remove_udacity_accounts(data, test_accounts=None):
    """Return the records of `data` that do not belong to a test account.

    data: iterable of dictionaries that each have an 'account_key' entry.
    test_accounts: optional collection of account keys to exclude. When
        omitted, the notebook-level `udacity_test_accounts` set is used, so
        existing one-argument calls behave exactly as before.

    Returns a new list; `data` itself is not modified.
    """
    if test_accounts is None:
        test_accounts = udacity_test_accounts
    return [data_point for data_point in data
            if data_point['account_key'] not in test_accounts]

In [None]:
def remove_udacity_accounts(data, test_accounts=None):
    """Remove any records that correspond to Udacity test accounts.

    data: iterable of dictionaries that each have an 'account_key' entry.
    test_accounts: optional collection of account keys to exclude. When
        omitted, the notebook-level `udacity_test_accounts` set is used, so
        existing one-argument calls behave exactly as before.

    Returns a new list; `data` itself is not modified.
    """
    if test_accounts is None:
        test_accounts = udacity_test_accounts
    real_udacity_data = []
    for data_point in data:
        if data_point['account_key'] not in test_accounts:
            real_udacity_data.append(data_point)
    return real_udacity_data

In [None]:
# Remove Udacity test accounts from all three tables
real_udacity_enrollments = remove_udacity_accounts(enrollments)
real_udacity_engagement = remove_udacity_accounts(daily_engagement)
real_udacity_submissions = remove_udacity_accounts(project_submissions)

# Row counts after filtering. print() with a single argument behaves the
# same on Python 2 and Python 3.
print(len(real_udacity_enrollments))
print(len(real_udacity_engagement))
print(len(real_udacity_submissions))
print(real_udacity_enrollments[0])

## Refining the Question

In [None]:
#####################################
#                 6                 #
#####################################

## Create a dictionary named paid_students containing all students who either
## haven't canceled yet or who remained enrolled for more than 7 days. The keys
## should be account keys, and the values should be the date the student enrolled.

# Dict comprehension (not a set comprehension): maps the account key of each
# qualifying enrollment to that enrollment's join date. An enrollment
# qualifies if it has not been canceled or lasted more than 7 days. If an
# account key appears in several qualifying rows, the last row iterated wins.
paid_students = {student['account_key']: student['join_date'] for student in real_udacity_enrollments 
                 if not student["is_canceled"] or student["days_to_cancel"] > 7}
paid_students

In [None]:
# Map each paid student's account key to their most recent join date.
# An enrollment counts as paid if it has not been canceled, or if the
# student stayed enrolled for more than 7 days before canceling.
paid_students = {}
for student in real_udacity_enrollments:
    if not student["is_canceled"] or student["days_to_cancel"] > 7:
        account_key = student["account_key"]
        enrollment_date = student["join_date"]
        # An account may have several qualifying enrollment rows; keep the
        # latest join date rather than whichever row happens to come last.
        if (account_key not in paid_students
                or enrollment_date > paid_students[account_key]):
            paid_students[account_key] = enrollment_date

print(paid_students)

## Getting Data from First Week

In [None]:
def within_one_week(join_date, engagement_date):
    """Tell whether an engagement record falls in a student's first week.

    Returns True when `engagement_date` is on or after `join_date` and fewer
    than 7 whole days later; otherwise False.
    """
    days_since_join = (engagement_date - join_date).days
    return 0 <= days_since_join < 7

In [None]:
# list comprehension of code below
def remove_free_trials(data, paid_accounts=None):
    """Return only the records of `data` that belong to paid students.

    data: iterable of dictionaries that each have an 'account_key' entry.
    paid_accounts: optional collection (or mapping) of paid account keys.
        When omitted, the notebook-level `paid_students` dictionary is used,
        so existing one-argument calls behave exactly as before.

    Returns a new list; `data` itself is not modified.
    """
    if paid_accounts is None:
        paid_accounts = paid_students
    return [data_point for data_point in data
            if data_point['account_key'] in paid_accounts]

# Keep only the rows that belong to paid students in all three tables.
paid_enrollments = remove_free_trials(real_udacity_enrollments)
paid_engagement = remove_free_trials(real_udacity_engagement)
paid_submissions = remove_free_trials(real_udacity_submissions)

# Row counts after filtering. print() with a single argument behaves the
# same on Python 2 and Python 3.
print(len(paid_enrollments))
print(len(paid_engagement))
print(len(paid_submissions))

print(paid_engagement[4])

In [None]:
def remove_free_trials(data, paid_accounts=None):
    """Return only the records of `data` that belong to paid students.

    data: iterable of dictionaries that each have an 'account_key' entry.
    paid_accounts: optional collection (or mapping) of paid account keys.
        When omitted, the notebook-level `paid_students` dictionary is used,
        so existing one-argument calls behave exactly as before.

    Returns a new list; `data` itself is not modified.
    """
    if paid_accounts is None:
        paid_accounts = paid_students
    new_data = []
    for data_point in data:
        if data_point["account_key"] in paid_accounts:
            new_data.append(data_point)
    return new_data
        
# Keep only the rows that belong to paid students in all three tables.
paid_enrollments = remove_free_trials(real_udacity_enrollments)
paid_engagement = remove_free_trials(real_udacity_engagement)
paid_submissions = remove_free_trials(real_udacity_submissions)

# Row counts after filtering. print() with a single argument behaves the
# same on Python 2 and Python 3.
print(len(paid_enrollments))
print(len(paid_engagement))
print(len(paid_submissions))

print(paid_engagement[4])

In [None]:
# Add a has_visited flag to every paid engagement record: 1 when the record
# shows at least one course visited, otherwise 0.
for engagement_record in paid_engagement:
    engagement_record["has_visited"] = (
        1 if engagement_record["num_courses_visited"] > 0 else 0
    )

# print() with a single argument behaves the same on Python 2 and Python 3.
print(paid_engagement[:3])

In [None]:
#####################################
#                 7                 #
#####################################

## Create a list of rows from the engagement table including only rows where
## the student is one of the paid students you just found, and the date is within
## one week of the student's join date.

paid_engagement_in_first_week = []

for paying_student in paid_engagement:
    account_key = paying_student["account_key"]
    # paid_students maps account key -> join date, so this looks up when the
    # student who owns this engagement record enrolled.
    join_date = paid_students[account_key]
    engagement_date = paying_student["utc_date"]

    if within_one_week(join_date, engagement_date):
        paid_engagement_in_first_week.append(paying_student)

# A bare expression in the middle of a cell is not displayed, so the row
# count has to be printed explicitly.
print(len(paid_engagement_in_first_week))
print(paid_engagement_in_first_week[:3])

## Exploring Student Engagement

In [None]:
from collections import defaultdict

# Create a dictionary of engagement grouped by student.
# The keys are account keys, and the values are lists of engagement records.

# defaultdict(list) creates an empty list the first time a missing key is
# looked up, so records can be appended without checking for the key first.
engagement_by_account = defaultdict(list)
for engagement_record in paid_engagement_in_first_week:
    account_key = engagement_record['account_key']  # the student this record belongs to
    # Append the record to that student's list (created on first access).
    engagement_by_account[account_key].append(engagement_record)

# Display the grouped records.
engagement_by_account

In [None]:
# Create a dictionary with the total minutes each student spent in the classroom during the first week.
# The keys are account keys, and the values are numbers (total minutes)
total_minutes_by_account = {}
for account_key, engagement_for_student in engagement_by_account.items():
    # Add up total_minutes_visited across this student's first-week records.
    total_minutes = 0
    for engagement_record in engagement_for_student:
        total_minutes += engagement_record['total_minutes_visited']
    # Store the finished total once per student.
    total_minutes_by_account[account_key] = total_minutes

# Display the resulting mapping.
total_minutes_by_account

In [None]:
import numpy as np

# Summarize the data about minutes spent in the classroom.
# list() turns the values into a real sequence: on Python 3 dict.values()
# is a view object that NumPy cannot reduce directly.
total_minutes = list(total_minutes_by_account.values())
# str.format keeps the output identical on Python 2 and Python 3.
print('Mean: {}'.format(np.mean(total_minutes)))
print('Standard deviation: {}'.format(np.std(total_minutes)))
print('Minimum: {}'.format(np.min(total_minutes)))
print('Maximum: {}'.format(np.max(total_minutes)))

In [None]:
# Find the largest first-week total and the account key it belongs to.
student_with_max_minutes = None
max_minutes = 0

for student, total_minutes in total_minutes_by_account.items():
    # Keep the running maximum together with the key that produced it.
    if total_minutes > max_minutes:
        max_minutes = total_minutes
        student_with_max_minutes = student  # `student` is the account key here

# Display the maximum number of minutes.
max_minutes     

In [None]:
# Print every first-week engagement record of the student with the most
# minutes. print() with a single argument behaves the same on Python 2 and 3.
for engagement_record in paid_engagement_in_first_week:
    if engagement_record["account_key"] == student_with_max_minutes:
        print(engagement_record)

## Do the same for lessons completed

In [None]:
from collections import defaultdict

def make_groups(data, key_name):
    """Group records by the value each one holds under `key_name`.

    data: iterable of dictionaries that all contain `key_name`.
    Returns a defaultdict(list) mapping each distinct value to the list of
    records that carry it, in their original order.
    """
    grouped = defaultdict(list)
    for record in data:
        grouped[record[key_name]].append(record)
    return grouped
        
engagement_by_account = make_groups(paid_engagement_in_first_week, "account_key")

In [None]:
def sum_grouped_items(grouped_data, field_name):
    """Sum one field across every group of records.

    grouped_data: mapping from a key to a list of record dictionaries.
    field_name: name of the numeric field to add up within each group.

    Returns a dictionary mapping every key of `grouped_data` to the sum of
    `field_name` over its records. A key whose group is empty maps to 0
    instead of being left out of the result.
    """
    return {
        key: sum(data_point[field_name] for data_point in data_points)
        for key, data_points in grouped_data.items()
    }

total_minutes_by_account = sum_grouped_items(engagement_by_account, "total_minutes_visited")

In [None]:
import numpy as np

# Summarize the data lessons completed in the classroom
def describe_data(data):
    """Print the mean, standard deviation, minimum and maximum of `data`.

    data: any iterable of numbers. It is copied into a list first so that
    dict views such as dict.values() on Python 3 work as well as lists.
    """
    values = list(data)
    # str.format keeps the output identical on Python 2 and Python 3.
    print('Mean: {}'.format(np.mean(values)))
    print('Standard deviation: {}'.format(np.std(values)))
    print('Minimum: {}'.format(np.min(values)))
    print('Maximum: {}'.format(np.max(values)))

total_minutes = total_minutes_by_account.values()
describe_data(total_minutes)

In [None]:
# The cells above confirmed that the helper functions reproduce the earlier results for total_minutes_visited.
# Now run the same functions again, this time for lessons completed.
lessons_completed_by_account = sum_grouped_items(engagement_by_account, "lessons_completed")
describe_data(lessons_completed_by_account.values()) # need to take values first

## Debugging Data Analysis Code

In [None]:
#####################################
#                 8                 #
#####################################

## Go through a similar process as before to see if there is a problem.
## Locate at least one surprising piece of data, output it, and take a look at it.

## Lessons Completed in First Week

In [None]:
#####################################
#                 9                 #
#####################################

## Adapt the code above to find the mean, standard deviation, minimum, and maximum for
## the number of lessons completed by each student during the first week. Try creating
## one or more functions to re-use the code above.



## Number of Visits in First Week

In [None]:
######################################
#                 10                 #
######################################

## Find the mean, standard deviation, minimum, and maximum for the number of
## days each student visits the classroom during the first week.

# use your sum_grouped_items function above
days_visited_by_account = sum_grouped_items(engagement_by_account, "has_visited")

# change to .values and then use your describe function above 
describe_data(days_visited_by_account.values())

## Splitting out Passing Students

In [None]:
######################################
#                 11                 #
######################################

## Create two lists of engagement data for paid students in the first week.
## The first list should contain data for students who eventually pass the
## subway project, and the second list should contain data for students
## who do not.

# Account keys of students who passed the subway project (a set, so each
# key is stored once even if the student has several passing submissions).
unique_passing_engagement = set()

subway_project_lesson_keys = ['746169184', '3176718735']

for student in paid_submissions:

    project_number = student["lesson_key"]
    grade = student["assigned_rating"]

    # Both PASSED and DISTINCTION ratings count as passing the project.
    if project_number in subway_project_lesson_keys and grade in ("PASSED", "DISTINCTION"):
        unique_passing_engagement.add(student["account_key"])

unique_passing_engagement

In [None]:
paid_engagement_in_first_week[500]

In [None]:
paid_submissions[0]

In [None]:
# THIS IS WRONG.
# The loop variable is `account`, but the body tests and appends `student`,
# which is never assigned in this cell (it holds whatever value an earlier
# cell left behind). Looping over the small set of passing account keys also
# cannot produce one entry for every row of the bigger engagement list.
# The correct approach is the opposite: loop through the engagement list and
# check whether each row's account key is in the set of passing students.

passing_students = []
failing_students = []

for account in unique_passing_engagement:
    if student in paid_engagement_in_first_week:
        passing_students.append(student)
    else:
        failing_students.append(student)

print(len(passing_students))
print(len(failing_students))

In [None]:
# Right answer: walk through every first-week engagement record and sort it
# by whether its account key belongs to a student who passed.

passing_students = []
failing_students = []

for student in paid_engagement_in_first_week:
    # `student` is one engagement record; membership in the set decides the list.
    if student["account_key"] in unique_passing_engagement:
        passing_students.append(student)
    else:
        failing_students.append(student)

# Number of engagement records (not students) in each group.
print(len(passing_students))
print(len(failing_students))

## Comparing the Two Student Groups

In [None]:
######################################
#                 12                 #
######################################

## Compute some metrics you're interested in and see how they differ for
## students who pass the subway project vs. students who don't. A good
## starting point would be the metrics we looked at earlier (minutes spent
## in the classroom, lessons completed, and days visited).

failing_students[:50]

In [None]:
# use the functions you made for lessons completed section
# def make_groups(data, key_name)

grouping_accounts_failing = make_groups(failing_students, "account_key")
grouping_accounts_passing = make_groups(passing_students, "account_key")

grouping_accounts_passing

In [None]:
# def make_groups(data, key_name)
grouping_minutes_visited_failing = sum_grouped_items(grouping_accounts_failing, "total_minutes_visited")
grouping_minutes_visited_passing = sum_grouped_items(grouping_accounts_passing, "total_minutes_visited")

grouping_minutes_visited_passing

In [None]:
# def describe_data(data):
    # print 'Mean:', np.mean(data)
    # print 'Standard deviation:', np.std(data)
    # print 'Minimum:', np.min(data)
    # print 'Maximum:', np.max(data)
# Since describe_data uses NumPy, pass it the dictionary's values (via .values()) rather than the dictionary itself.


describe_data(grouping_minutes_visited_failing.values())
print("")
describe_data(grouping_minutes_visited_passing.values())

## Making Histograms

In [None]:
######################################
#                 13                 #
######################################

## Make histograms of the three metrics we looked at earlier for both
## students who passed the subway project and students who didn't. You
## might also want to make histograms of any other metrics you examined.

%matplotlib inline
import matplotlib.pyplot as plt
converting_passing = grouping_minutes_visited_passing.values()

plt.hist(converting_passing)

#plt.hist(np.mean(converting_passing))
#plt.hist(np.std(converting_passing))
#plt.hist(np.min(converting_passing))
#plt.hist(np.max(converting_passing))

converting_failing = grouping_minutes_visited_failing.values()

#plt.hist(np.mean(converting_failing))
#plt.hist(np.std(converting_failing))
#plt.hist(np.min(converting_failing))
#plt.hist(np.max(converting_failing))

In [None]:
%matplotlib inline
plt.hist(converting_failing)

In [None]:
# comparing by number of courses visited. We already grouped them by account number above

grouping_courses_visited_failing = sum_grouped_items(grouping_accounts_failing, "num_courses_visited")
grouping_courses_visited_passing = sum_grouped_items(grouping_accounts_passing, "num_courses_visited")

In [None]:
# Histogram of courses visited in the first week for students who did not pass.

import seaborn as sns
# list() gives matplotlib a real sequence instead of a Python 3 dict view.
plt.hist(list(grouping_courses_visited_failing.values()), bins=100)
plt.xlabel("Number of courses visited")
plt.title("Courses visited in the first week by students who did not pass")

In [None]:
import seaborn as sns
plt.hist(grouping_courses_visited_passing.values(), bins = 20)

## Improving Plots and Sharing Findings

In [None]:
######################################
#                 14                 #
######################################

## Make a more polished version of at least one of your visualizations
## from earlier. Try importing the seaborn library to make the visualization
## look better, adding axis labels and a title, and changing one or more
## arguments to the hist() function.

import seaborn as sns
# Histogram of courses visited in the first week for students who did not pass.
# list() gives matplotlib a real sequence instead of a Python 3 dict view.
plt.hist(list(grouping_courses_visited_failing.values()), bins=100)
plt.xlabel("Number of courses visited")
plt.title("Courses visited in the first week by students who did not pass")