## **Reading and exploring the data files:**
The following are four variations of a simple Python script that reads CSV files using the **csv**, **unicodecsv**, and **pandas** libraries.

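Then, in the third variation, we correct the data types of the fields that were read as strings (dates, integers, floats, and booleans).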

In [4]:
import os
import csv


def get_file_path(filename):
    """Return the path of ``filename`` inside the ``../data/`` folder.

    The path is built from the current working directory, so the result
    depends on where the interpreter (or notebook kernel) was started.
    """
    return os.path.join(os.getcwd(), "../data/", filename)


def read_csv(filename):
    """Read a CSV file from the data folder into a list of row dicts.

    Args:
        filename: Name of the CSV file; resolved via ``get_file_path``.

    Returns:
        A list with one dict per data row, keyed by the header names.
        All values are left as strings.
    """
    file_path = get_file_path(filename)
    # The ``with`` block closes the file even if parsing raises part-way
    # through. ``newline=''`` lets the csv module do its own newline
    # handling, as its documentation requires for file objects.
    with open(file_path, 'r', newline='') as f:
        # DictReader is an iterator; list() drains it while the file is open.
        return list(csv.DictReader(f))


def main():
    """Load the three CSV data sets and print the first row of each."""
    file_names = (
        'enrollments.csv',
        'daily_engagement.csv',
        'project_submissions.csv',
    )
    # Read everything first, then print, so output only starts once all
    # three files have been loaded.
    tables = [read_csv(name) for name in file_names]

    for table in tables:
        print(table[0])


if __name__ == '__main__':
    main()

{'account_key': '448', 'status': 'canceled', 'join_date': '2014-11-10', 'cancel_date': '2015-01-14', 'days_to_cancel': '65', 'is_udacity': 'True', 'is_canceled': 'True'}
{'acct': '0', 'utc_date': '2015-01-09', 'num_courses_visited': '1.0', 'total_minutes_visited': '11.6793745', 'lessons_completed': '0.0', 'projects_completed': '0.0'}
{'creation_date': '2015-01-14', 'completion_date': '2015-01-16', 'assigned_rating': 'UNGRADED', 'account_key': '256', 'lesson_key': '3176718735', 'processing_state': 'EVALUATED'}


In [6]:
# import os
import unicodecsv


def get_file_path(filename):
    """Build the location of a data file under ``../data/``.

    The location is relative to the current working directory.
    """
    working_dir = os.getcwd()
    return os.path.join(working_dir, "../data/", filename)


def read_csv(filename):
    """Load a CSV file from the data folder as a list of row dicts.

    The file is opened in binary mode and handed to
    ``unicodecsv.DictReader``; every row the reader yields is collected.
    """
    with open(get_file_path(filename), 'rb') as csv_file:
        # The reader is lazy, so it must be consumed before the file closes.
        return list(unicodecsv.DictReader(csv_file))


def main():
    """Read the three CSV files and show the first record of each one."""
    enrollment_rows = read_csv('enrollments.csv')
    engagement_rows = read_csv('daily_engagement.csv')
    submission_rows = read_csv('project_submissions.csv')

    for rows in (enrollment_rows, engagement_rows, submission_rows):
        print(rows[0])


if __name__ == '__main__':
    main()


{'account_key': '448', 'status': 'canceled', 'join_date': '2014-11-10', 'cancel_date': '2015-01-14', 'days_to_cancel': '65', 'is_udacity': 'True', 'is_canceled': 'True'}
{'acct': '0', 'utc_date': '2015-01-09', 'num_courses_visited': '1.0', 'total_minutes_visited': '11.6793745', 'lessons_completed': '0.0', 'projects_completed': '0.0'}
{'creation_date': '2015-01-14', 'completion_date': '2015-01-16', 'assigned_rating': 'UNGRADED', 'account_key': '256', 'lesson_key': '3176718735', 'processing_state': 'EVALUATED'}


In [9]:
# import os
# import csv
from datetime import datetime as dt


def get_file_path(filename):
    """Join the working directory, ``../data/`` and ``filename`` into a path."""
    base_dir = os.getcwd()
    data_dir = "../data/"
    return os.path.join(base_dir, data_dir, filename)


def read_csv(filename):
    """Read a CSV file from the data folder into a list of row dicts.

    Args:
        filename: Name of the CSV file; resolved via ``get_file_path``.

    Returns:
        A list with one dict per data row, keyed by the header names.
        All values are strings until one of the ``clean_*`` helpers
        converts them.
    """
    file_path = get_file_path(filename)
    # ``newline=''`` lets the csv module do its own newline handling, as
    # its documentation requires for file objects.
    with open(file_path, 'r', newline='') as f:
        reader = csv.DictReader(f)  # reader here is an iterator.
        data_dict = list(reader)

    return data_dict


# Converts a date string in 'YYYY-MM-DD' form into a Python datetime object.
# An empty string means "no date given" and yields None.
def parse_date(date):
    if date != '':
        return dt.strptime(date, '%Y-%m-%d')
    return None


# Converts a string holding an integer into an int.
# An empty string stands for a missing value and yields None.
def parse_maybe_int(i):
    return None if i == '' else int(i)


def clean_enrollment_data(enrollments):
    """Convert the string fields of every enrollment record, in place.

    Dates become datetime objects (or None), ``days_to_cancel`` becomes an
    int (or None) and the two flags become booleans. Nothing is returned;
    the dicts inside ``enrollments`` are modified directly.
    """
    for record in enrollments:
        for date_field in ('cancel_date', 'join_date'):
            record[date_field] = parse_date(record[date_field])

        record['days_to_cancel'] = parse_maybe_int(record['days_to_cancel'])

        # Only the exact text 'True' counts as true; anything else is False.
        for flag_field in ('is_canceled', 'is_udacity'):
            record[flag_field] = record[flag_field] == 'True'


def clean_engagement_data(engagements):
    """Convert the string fields of every engagement record, in place.

    ``utc_date`` becomes a datetime (or None), the three counters become
    ints and ``total_minutes_visited`` becomes a float. Nothing is
    returned; the dicts inside ``engagements`` are modified directly.
    """
    counter_fields = (
        'num_courses_visited',
        'lessons_completed',
        'projects_completed',
    )
    for record in engagements:
        record['utc_date'] = parse_date(record['utc_date'])

        # Counters are stored like '1.0', so go through float before int.
        for field in counter_fields:
            record[field] = int(float(record[field]))

        record['total_minutes_visited'] = float(record['total_minutes_visited'])


def clean_submission_data(submissions):
    """Parse the two date fields of every submission record, in place.

    Nothing is returned; the dicts inside ``submissions`` are modified
    directly.
    """
    for record in submissions:
        for date_field in ('completion_date', 'creation_date'):
            record[date_field] = parse_date(record[date_field])


def main():
    """Load each CSV file, fix its data types and print its first record."""
    # Each entry pairs a CSV file with the function that cleans its rows.
    pipeline = (
        ('enrollments.csv', clean_enrollment_data),
        ('daily_engagement.csv', clean_engagement_data),
        ('project_submissions.csv', clean_submission_data),
    )

    cleaned_tables = []
    for csv_name, clean in pipeline:
        rows = read_csv(csv_name)
        clean(rows)  # converts the rows in place
        cleaned_tables.append(rows)

    # Printing happens only after every file has been read and cleaned.
    for rows in cleaned_tables:
        print(rows[0])


if __name__ == '__main__':
    main()


{'account_key': '448', 'status': 'canceled', 'join_date': datetime.datetime(2014, 11, 10, 0, 0), 'cancel_date': datetime.datetime(2015, 1, 14, 0, 0), 'days_to_cancel': 65, 'is_udacity': True, 'is_canceled': True}
{'acct': '0', 'utc_date': datetime.datetime(2015, 1, 9, 0, 0), 'num_courses_visited': 1, 'total_minutes_visited': 11.6793745, 'lessons_completed': 0, 'projects_completed': 0}
{'creation_date': datetime.datetime(2015, 1, 14, 0, 0), 'completion_date': datetime.datetime(2015, 1, 16, 0, 0), 'assigned_rating': 'UNGRADED', 'account_key': '256', 'lesson_key': '3176718735', 'processing_state': 'EVALUATED'}


In [13]:
import os
import pandas as pd


def get_file_path(filename):
    """Return where ``filename`` lives under ``../data/``.

    The path starts from the current working directory.
    """
    parts = [os.getcwd(), "../data/", filename]
    return os.path.join(*parts)


def read_csv(filename):
    """Load a CSV file from the data folder into a pandas DataFrame."""
    return pd.read_csv(get_file_path(filename))


def main():
    """Print a preview, row count and distinct-key count for each CSV file."""
    # Each entry pairs a CSV file with the column whose values are counted.
    sources = (
        ('enrollments.csv', 'account_key'),
        ('daily_engagement.csv', 'acct'),
        ('project_submissions.csv', 'account_key'),
    )

    for csv_name, key_column in sources:
        frame = read_csv(csv_name)
        print(frame.head())
        print(len(getattr(frame, key_column)))  # number of rows
        print(frame[key_column].nunique())      # number of distinct keys


if __name__ == '__main__':
    main()


   account_key    status   join_date cancel_date  days_to_cancel  is_udacity  \
0          448  canceled  2014-11-10  2015-01-14            65.0        True   
1          448  canceled  2014-11-05  2014-11-10             5.0        True   
2          448  canceled  2015-01-27  2015-01-27             0.0        True   
3          448  canceled  2014-11-10  2014-11-10             0.0        True   
4          448   current  2015-03-10         NaN             NaN        True   

   is_canceled  
0         True  
1         True  
2         True  
3         True  
4        False  
1640
1302
   acct    utc_date  num_courses_visited  total_minutes_visited  \
0     0  2015-01-09                  1.0              11.679374   
1     0  2015-01-10                  2.0              37.284887   
2     0  2015-01-11                  2.0              53.633746   
3     0  2015-01-12                  1.0              33.489270   
4     0  2015-01-13                  1.0              64.779678   

   l