In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas
import seaborn
import csv
from datetime import datetime
from collections import defaultdict

## Functions

In [3]:
def parse_date(arg):
    if arg == "":
        return None
    else:
        return datetime.strptime(arg, "%Y-%m-%d").date()

def parse_int(arg):
    if arg == '':
        return None
    else:
        return int(arg)

def parse_float(arg):
    if arg == '' or None:
        return None
    else:
        return float(arg)

def date_sustract(date1,date2):
    if date1 == None or date2 == None:
        return -100
    else:
        return (date1-date2).days

def students_number(arr):
    students = set()
    for student in arr:
        students.add(student["account_key"])
    return len(students)

def get_students(arr):
    students = set()
    for student in arr:
        students.add(student["account_key"])
    return students

def remove_students(arr,to_delete):
    tmp = []

    for data in arr:
        if data["account_key"] not in to_delete:
            tmp.append(data)

    return tmp

def remove_free_trial(arr,students):
    non_free_trial = []

    for data in arr:
        if data['account_key'] in students:
            non_free_trial.append(data)

    return non_free_trial



In [4]:
def get_enrollments():

    enrollemnts_path = 'csv_files/enrollments.csv'

    with open(enrollemnts_path) as f:
        csv_reader = csv.DictReader(f)


        #print(csv_reader.dialect)
        #print(csv_reader.fieldnames)

        enrollments = []

        days_to_cancel = []

        for data in csv_reader:

            data["account_key"] = parse_int(data["account_key"])
            data["cancel_date"] = parse_date(data["cancel_date"])
            data["join_date"] = parse_date(data["join_date"])
            data["days_to_cancel"] = parse_int(data["days_to_cancel"])
            data["is_udacity"] = data["is_udacity"] == "True"
            data["is_canceled"] = data["is_canceled"] == "True"


            enrollments.append(data)

        for data in enrollments:
            tmp = date_sustract(data["cancel_date"], data["join_date"])
            if tmp == -100:
                continue
            days_to_cancel.append(tmp)

        tmp = date_sustract(enrollments[0]["cancel_date"],enrollments[0]["join_date"])
        #print(enrollments[0]["cancel_date"],enrollments[0]["join_date"], "=",tmp,type(tmp),'\n\n' )

        days_to_cancel.sort()
        days_to_cancel.reverse()
        #plt.hist(days_to_cancel,100,histtype='bar',ec='black')
        #plt.grid(True)
        #plt.show()
        print ("enrollments =",(len(enrollments)), students_number(enrollments))

        return enrollments


In [5]:
def get_project_submissions():
    project_submissions_path = "csv_files/project_submissions.csv"
    with open(project_submissions_path) as f:
        project_submissions_csv = csv.DictReader(f)

        #print(list(project_submissions_csv)[0])
        #print(project_submissions_csv.fieldnames)


        project_submissions = []

        for data in project_submissions_csv:
            data["account_key"] = parse_int(data["account_key"])
            data["creation_date"] = parse_date(data["creation_date"])
            data["completion_date"] = parse_date(data["completion_date"])
            project_submissions.append(data)

        plot_data = []
        for data in project_submissions:
            tmp = date_sustract(data["completion_date"],data["creation_date"])
            if tmp == -100 and tmp <40:
                continue
            plot_data.append(tmp)

        #print("project_submissions =",len(project_submissions), students_number(project_submissions))
        #plt.hist(plot_data,100, histtype="bar",ec="black")
        #plt.plot(plot_data,"r.")
        #plt.show()

        return project_submissions


In [6]:
def get_daily_engagement():
    project_daily_engagement = 'csv_files/daily_engagement.csv'

    with open(project_daily_engagement) as f:
        daily_engagement_csv = csv.DictReader(f)

        daily_engagement = []
        plot_data = []
        #print(daily_engagement_csv.fieldnames)
        for data in daily_engagement_csv:
            data['account_key'] = parse_int(data['account_key'])
            data['utc_date'] = parse_date(data['utc_date'])
            data['num_courses_visited'] = parse_float(data['num_courses_visited'])
            data['total_minutes_visited'] = parse_float(data['total_minutes_visited'])
            data['lessons_completed'] = parse_float(data['lessons_completed'] )
            data['projects_completed'] = parse_float(data['projects_completed'])
            daily_engagement.append(data)
            plot_data.append(data['projects_completed'])


        #print("daily_engagement =",len(daily_engagement), students_number(daily_engagement))
        #plt.hist(plot_data,3)
        #plt.show()
        return daily_engagement


In [8]:
"""
def get_daily_engagement_full():
    project_daily_engagement_full = '../Data/daily_engagement_full.csv'

    with open(project_daily_engagement_full) as f:
        daily_engagement_csv = csv.DictReader(f)

        daily_engagement = []
        plot_data = []
        print(daily_engagement_csv.fieldnames)

        for data in daily_engagement_csv:
            data['account_key'] = parse_int(data['account_key'])
            data['utc_date'] = parse_date(data['utc_date'])
            data['total_minutes_visited'] = parse_float(data['total_minutes_visited'])
            data['lessons_completed'] = parse_float(data['lessons_completed'] )
            data['projects_completed'] = parse_float(data['projects_completed'])
            daily_engagement.append(data)
            plot_data.append(data['projects_completed'])
            print(data)

        # acct,registration_date,subscription_start,course_key,sibling_key,course_title,has_visited
        #[ 'acct', 'registration_date', 'subscription_start', 'course_key', 'sibling_key', 'course_title', 'has_visited', 'lessons_completed', 'projects_completed', 'account_key']
        #print("daily_engagement =",len(daily_engagement), students_number(daily_engagement))
        #plt.hist(plot_data,3)
        #plt.show()
        return daily_engagement
"""
print("nothing")

nothing


In [10]:

enrollments = get_enrollments()
daily_engagement = get_daily_engagement()
#daily_engagement_full = get_daily_engagement_full()
project_submissions = get_project_submissions()
students_engagement = get_students(daily_engagement)
students_enrollments = get_students(enrollments)


to_delete = set()

for data in enrollments:
    if data["is_udacity"]:
        to_delete.add(data["account_key"])

print(to_delete)

enrollments = remove_students(enrollments,to_delete)
daily_engagement = remove_students(daily_engagement,to_delete)
project_submissions = remove_students(project_submissions,to_delete)


#print("enrollments =",len(enrollments), students_number(enrollments))
#print("daily_engagement =",len(daily_engagement), students_number(daily_engagement))
#print("project_submissions =",len(project_submissions), students_number(project_submissions))



enrollments = 1640 1302
{448, 312, 1069, 1101, 818, 1304}


In [None]:



paid_students = {}

#print(enrollments[0])


for data in enrollments:
    if data["days_to_cancel"] == None or data["days_to_cancel"] > 7:
        account_key = data["account_key"]
        join_date = data["join_date"]

        if account_key not in paid_students.keys() or join_date > paid_students[account_key]:
            paid_students[account_key] = data["join_date"]



print(len(paid_students))


paid_enrollments = remove_free_trial(enrollments,paid_students.keys())
paid_engagement =  remove_free_trial(daily_engagement,paid_students.keys())
paid_submissions = remove_free_trial(project_submissions,paid_students.keys())


print(len(paid_enrollments))
print(len(paid_engagement))
print(len(paid_submissions))


paid_engagement_in_first_week = []

for data in paid_engagement:

    account_key = data['account_key']
    join_date  = paid_students[account_key]
    engagement_record_date = data['utc_date']

    if date_sustract(engagement_record_date,join_date) < 7:
        paid_engagement_in_first_week.append(data)

print(len(paid_engagement_in_first_week))




In [11]:
#see students in first week

engagement_by_account = defaultdict(list)

for data in paid_engagement_in_first_week:
    account_key = data['account_key']
    engagement_by_account[account_key].append(data)


total_minutes_per_account = {}

for account_key,data in engagement_by_account.items():
    total_minutes = 0
    for values in data:
        total_minutes += values['total_minutes_visited']
    total_minutes_per_account[account_key] = total_minutes

total_minutes = list(total_minutes_per_account.values())

#print(type(total_minutes))

minutes_average = np.average(total_minutes)

print(minutes_average)

647.5901738262695
