In [10]:
#standard libraries
import pandas as pd, numpy as np

#randomization
from faker import Faker

#date and time
from datetime import date

In [2]:
#read the enrolment sheet from the sketch workbook
enrolment = pd.read_excel(
    './experimental/tables_to_use/dashboard_sketches.xlsx',
    sheet_name='enrolment',
)

#preview the first three rows
enrolment.head(3)

Unnamed: 0,COURSE_ID,CYCLE_ID,STUDENT_ID,ENROLLMENT_DATE,COMPLETION_DATE,CANCELLED,CANCELLATION_REASON,Unnamed: 7,Unnamed: 8,Unnamed: 9,ADDENDUM,NEED
0,,,S000000,,,,,,,,COURSE_ID,X
1,,,S000001,,,,,,,,CYCLE_ID,X
2,,,S000002,,,,,,,,STUDENT_ID,X


In [3]:
#remove the uninformative features together with the date features that are
#created again at a later stage; labels missing from the table are skipped
enrolment = enrolment.drop(
    columns=['CYCLE_ID',
             'Unnamed: 7', 'Unnamed: 8', 'Unnamed: 9',
             'CANCELLATION_REASON', 'ADDENDUM', 'NEED',
             'ENROLLMENT_DATE', 'COMPLETION_DATE'],
    errors='ignore',
)

In [4]:
#preview the first three rows to confirm which columns are left after the drops
enrolment.head(3)

Unnamed: 0,COURSE_ID,STUDENT_ID,CANCELLED
0,,S000000,
1,,S000001,
2,,S000002,


#### CREATE CANCELLED BY RANDOMIZATION

Let's say a student cancels if their row ID (which mirrors the number in their student_id for the rows shown) is divisible by 7 without a remainder.

In [5]:
#move the current row index into a regular column (it is renamed to 'ID' in the next cell)
#NOTE(review): running this cell again adds a further index column - run it once per fresh kernel
enrolment = enrolment.reset_index()

In [6]:
#give the former index column its final name
enrolment = enrolment.rename(columns={'index': 'ID'})

In [7]:
#flag a student as cancelled when their ID is divisible by 7 without a remainder
def cancelled_students(data):
    """Return True when the row's 'ID' is an exact multiple of 7, else False.

    `data` is a row-like object (e.g. one row passed in by DataFrame.apply
    with axis=1) that can be indexed with the key 'ID'.
    """
    #bool() keeps the result a plain Python True/False
    return bool(data['ID'] % 7 == 0)

In [8]:
#evaluate every row with cancelled_students and store the flag in CANCELLED
enrolment['CANCELLED'] = enrolment.apply(cancelled_students, axis='columns')

In [9]:
#check that the change was applied: count how many rows are True / False in CANCELLED
enrolment['CANCELLED'].value_counts()

CANCELLED
False    171
True      29
Name: count, dtype: int64

that's 14.5% (29 of 200) that are marked as cancelled, i.e. ended their studies for any reason

#### CREATE ENROLMENT_DATE BY RANDOMIZATION

In [11]:
#seed faker's shared random generator so a fresh run reproduces the same dates
Faker.seed(42)

#create the object class
fake = Faker()

#one enrolment date is needed per row of the table (200 rows in the sketch data);
#deriving the count from the table keeps the list and the table the same length
limit = len(enrolment)

#set the start and end dates, converted to datetime
s_date = pd.to_datetime('2020-01-01')
e_date = pd.to_datetime('2024-08-01')

#draw one random date inside the window for every row
enrol_lst = [fake.date_between(start_date=s_date, end_date=e_date)
             for _ in range(limit)]

In [14]:
#preview the table with the new ENROLMENT_DATE column
#NOTE(review): the output shows ENROLMENT_DATE, so the insert cell that follows has to be run before this one
enrolment.head()

Unnamed: 0,ID,COURSE_ID,STUDENT_ID,CANCELLED,ENROLMENT_DATE
0,0,,S000000,True,2021-04-12
1,1,,S000001,False,2023-10-31
2,2,,S000002,False,2022-06-19
3,3,,S000003,False,2021-12-13
4,4,,S000004,False,2022-05-18


In [13]:
#insert the new feature; a copy left behind by an earlier run is dropped first so
#that running the cell again cannot create a second ENROLMENT_DATE column
if 'ENROLMENT_DATE' in enrolment.columns:
    enrolment = enrolment.drop(columns='ENROLMENT_DATE')
enrolment.insert(4, 'ENROLMENT_DATE', enrol_lst)

#### CREATE COMPLETION_DATE BY RANDOMIZATION

In [15]:
#create the object class once; building a new Faker for every row is needless work
_completion_faker = Faker()

#write a function to create the completion date
def create_completion_date(data, end_date=None):
    """Draw a random completion date between the row's enrolment date and end_date.

    Parameters
    ----------
    data : row-like
        Must be indexable with 'ENROLMENT_DATE'; that value is the earliest
        date that can be drawn.
    end_date : date, optional
        Latest date that can be drawn. When omitted, today's date is used
        (looked up on every call).

    Returns
    -------
    The value produced by Faker's date_between for that window.
    """
    #set the end date
    if end_date is None:
        end_date = date.today()

    #randomize between enrolment and end date
    return _completion_faker.date_between(start_date=data['ENROLMENT_DATE'],
                                          end_date=end_date)

In [16]:
#draw a completion date for every row and store it in COMPLETION_DATE
enrolment['COMPLETION_DATE'] = enrolment.apply(create_completion_date, axis='columns')

LOGIC CHECK

enrolment date cannot be after completion date

In [18]:
#list every row whose enrolment date is later than its completion date (expected: none)
enrolment.loc[enrolment['ENROLMENT_DATE'].gt(enrolment['COMPLETION_DATE'])]

Unnamed: 0,ID,COURSE_ID,STUDENT_ID,CANCELLED,ENROLMENT_DATE,COMPLETION_DATE


In [19]:
#preview the first two rows of the finished table
enrolment.head(2)

Unnamed: 0,ID,COURSE_ID,STUDENT_ID,CANCELLED,ENROLMENT_DATE,COMPLETION_DATE
0,0,,S000000,True,2021-04-12,2021-08-30
1,1,,S000001,False,2023-10-31,2024-01-06


EXPORT THE TABLE

In [20]:
#name of the workbook to write
xlx_name = 'enrolment.xlsx'

#write the finished table to the created_tables folder
enrolment.to_excel(
    './experimental/created_tables/' + xlx_name,
    sheet_name='enrolment_details',
)