In [1]:
## Representing a CSV as a list of rows

# Option 1: each row is a list (columns are identified by position)
csv = [['A1', 'A2', 'A3'],
       ['B1', 'B2', 'B3']]

# Option 2: each row is a dictionary (columns are identified by header name).
# Every column needs its own key: a repeated key in a dict literal silently
# keeps only the last value, which would drop a column from the row.
csv = [{'name': 'A1', 'name2': 'A2', 'name3': 'A3'},
       {'name': 'B1', 'name2': 'B2', 'name3': 'B3'}]

In [2]:
import unicodecsv

def read_csv(csv_file):
    """Read the CSV at path ``csv_file`` and return its rows as a list.

    The file is opened in binary mode and handed to ``unicodecsv.DictReader``,
    so each returned row is a mapping from header name to cell value.
    """
    with open(csv_file, 'rb') as handle:
        rows = [row for row in unicodecsv.DictReader(handle)]
    return rows

In [3]:
# Load every enrollment row and preview the first one.
# At this point all values are still the raw strings read from the CSV.
enrollments = read_csv('enrollments.csv')
enrollments[0]

{'account_key': '448',
 'cancel_date': '2015-01-14',
 'days_to_cancel': '65',
 'is_canceled': 'True',
 'is_udacity': 'True',
 'join_date': '2014-11-10',
 'status': 'canceled'}

In [40]:
# Load every daily-engagement row and preview the first one.
# Note that this table names its account column 'acct', not 'account_key'.
daily_engagement = read_csv('daily_engagement.csv')
daily_engagement[0]

{'acct': '0',
 'lessons_completed': '0.0',
 'num_courses_visited': '1.0',
 'projects_completed': '0.0',
 'total_minutes_visited': '11.6793745',
 'utc_date': '2015-01-09'}

In [41]:
# Rename the 'acct' column to 'account_key' so the engagement table uses the
# same key name as the enrollment and submission tables.
# The membership guard makes this cell safe to run more than once: rows that
# were already renamed no longer have an 'acct' key and are left untouched.
for row in daily_engagement:
    if 'acct' in row:
        row['account_key'] = row.pop('acct')

In [42]:
def get_unique_students(data):
    """Return the set of distinct ``'account_key'`` values found in ``data``.

    ``data`` is an iterable of rows, each supporting ``row['account_key']``.
    """
    return {record['account_key'] for record in data}

In [5]:
# Load every project-submission row and preview the first one.
# At this point all values are still the raw strings read from the CSV.
project_submissions = read_csv('project_submissions.csv')
project_submissions[0]

{'account_key': '256',
 'assigned_rating': 'UNGRADED',
 'completion_date': '2015-01-16',
 'creation_date': '2015-01-14',
 'lesson_key': '3176718735',
 'processing_state': 'EVALUATED'}

In [6]:
from datetime import datetime as dt

def parse_date(date):
    """Convert a ``'YYYY-MM-DD'`` string into a ``datetime`` object.

    An empty string (a blank CSV cell) yields ``None`` instead.
    """
    return dt.strptime(date, '%Y-%m-%d') if date != '' else None
    
def parse_maybe_int(i):
    """Convert a string into an ``int``.

    An empty string (a blank CSV cell) yields ``None`` instead.
    """
    return int(i) if i != '' else None

In [7]:
# Convert the raw string fields of each enrollment row to typed values:
# dates become datetime (or None when blank), days_to_cancel becomes int
# (or None when blank), and the two flags become real booleans.
for enrollment in enrollments:
    enrollment.update({
        'cancel_date': parse_date(enrollment['cancel_date']),
        'days_to_cancel': parse_maybe_int(enrollment['days_to_cancel']),
        'is_canceled': enrollment['is_canceled'] == 'True',
        'is_udacity': enrollment['is_udacity'] == 'True',
        'join_date': parse_date(enrollment['join_date']),
    })


In [8]:
# Spot-check: the enrollment cleaning loop does not touch account_key,
# so it is still the string read from the CSV.
enrollments[0]['account_key']

'448'

In [9]:
# Convert the raw string fields of each engagement row to typed values.
# The counters arrive as float-formatted strings such as '0.0', which int()
# cannot parse directly, so they go through float() first.
for engagement_record in daily_engagement:
    engagement_record.update({
        'lessons_completed': int(float(engagement_record['lessons_completed'])),
        'num_courses_visited': int(float(engagement_record['num_courses_visited'])),
        'projects_completed': int(float(engagement_record['projects_completed'])),
        'total_minutes_visited': float(engagement_record['total_minutes_visited']),
        'utc_date': parse_date(engagement_record['utc_date']),
    })

daily_engagement[0]

{'acct': '0',
 'lessons_completed': 0,
 'num_courses_visited': 1,
 'projects_completed': 0,
 'total_minutes_visited': 11.6793745,
 'utc_date': datetime.datetime(2015, 1, 9, 0, 0)}

In [10]:
# Convert the two date fields of each submission row from strings to
# datetime objects (blank cells become None).
for submission in project_submissions:
    submission.update({
        'completion_date': parse_date(submission['completion_date']),
        'creation_date': parse_date(submission['creation_date']),
    })

project_submissions[0]


{'account_key': '256',
 'assigned_rating': 'UNGRADED',
 'completion_date': datetime.datetime(2015, 1, 16, 0, 0),
 'creation_date': datetime.datetime(2015, 1, 14, 0, 0),
 'lesson_key': '3176718735',
 'processing_state': 'EVALUATED'}

In [11]:
def alunos_unicos(lista, coluna):
    """Count the distinct values stored under ``coluna`` in the rows of ``lista``.

    Args:
        lista: iterable of rows, each supporting ``row[coluna]``.
        coluna: key of the column whose distinct values are counted.

    Returns:
        The number of distinct values as an ``int`` (0 for an empty ``lista``).

    The values are collected in a set, so they must be hashable but do not
    need to be orderable. Columns that mix ``None`` with strings (for example
    dates after cleaning) are therefore counted without error.
    """
    return len({linha[coluna] for linha in lista})

In [45]:
# Total number of rows in each table.
enrollment_num_rows = len(enrollments)
engagement_num_rows = len(daily_engagement)
submission_num_rows = len(project_submissions)

# Set of distinct account keys appearing in each table.
enrollment_unique_students = get_unique_students(enrollments)
engagement_unique_students = get_unique_students(daily_engagement)
submission_unique_students = get_unique_students(project_submissions)

# Number of distinct students in each table.
enrollment_num_unique_students = len(enrollment_unique_students)
engagement_num_unique_students = len(engagement_unique_students)
submission_num_unique_students = len(submission_unique_students)

In [46]:
# Report, for each table, the total row count and the number of distinct
# students. All six lines share the same 'label: value' format with no
# trailing whitespace.
print('enrollment_num_rows: %d' % enrollment_num_rows)
print('enrollment_num_unique_students: %d' % enrollment_num_unique_students)

print('engagement_num_rows: %d' % engagement_num_rows)
print('engagement_num_unique_students: %d' % engagement_num_unique_students)

print('submission_num_rows: %d' % submission_num_rows)
print('submission_num_unique_students: %d' % submission_num_unique_students)

enrollment_num_rows: 1640
enrollment_num_unique_students: 1302
engagement_num_rows: 136240
engagement_num_unique_students: 1237
submission_num_rows: 3642 
submission_num_unique_students: 743


In [14]:
# Preview the first enrollment row after cleaning (dates, int and booleans are now typed).
enrollments[0]

{'account_key': '448',
 'cancel_date': datetime.datetime(2015, 1, 14, 0, 0),
 'days_to_cancel': 65,
 'is_canceled': True,
 'is_udacity': True,
 'join_date': datetime.datetime(2014, 11, 10, 0, 0),
 'status': 'canceled'}

In [33]:
# Preview the first engagement row after cleaning and after 'acct' was renamed to 'account_key'.
daily_engagement[0]

{'account_key': '0',
 'lessons_completed': 0,
 'num_courses_visited': 1,
 'projects_completed': 0,
 'total_minutes_visited': 11.6793745,
 'utc_date': datetime.datetime(2015, 1, 9, 0, 0)}