In [3]:
#standard libraries
import pandas as pd, numpy as np

#randomization
import random
from faker import Faker

In [4]:
#export the file(s)
students = pd.read_excel('./experimental/created_tables/students.xlsx',
                         index_col=0)

course_details = pd.read_excel('./experimental/created_tables/course_details.xlsx',
                               index_col=0)

class_details = pd.read_excel('./experimental/created_tables/class_details.xlsx',
                              index_col=0)

enrolment = pd.read_excel('./experimental/created_tables/enrolment.xlsx',
                         index_col=0)

In [5]:
#find dimensions
print(f'students: {students.shape}, \n course_details: {course_details.shape}, \n class_details: {class_details.shape}, \n enrolment: {enrolment.shape}')

students: (200, 6), 
 course_details: (5, 2), 
 class_details: (37, 3), 
 enrolment: (200, 6)


In [6]:
#display the data sets
display(students.head(2))
print()
display(course_details.head(2))
print()
display(class_details.head(2))
print()
display(enrolment.head(2))

Unnamed: 0,STUDENT_ID,FIRST_NAME,LAST_NAME,CITY,BIRTH_DATE,E-MAIL
0,S000000,James,Smith,Moscow,1967-10-01,jamessmith@yandex.com
1,S000001,Michael,Johnson,Nizhniy Novgorod,1993-08-30,michaeljohnson@yandex.com





Unnamed: 0,COURSE_ID,COURSE_TITLE
0,C0001,"CIRCULAR ECONOMY, RESPONSIBLE PRODUCTION AND C..."
1,C0002,SUSTAINABLE FOOD SYSTEMS





Unnamed: 0,COURSE_ID,CLASS_ID,CLASS_TITLE
0,C0001,CERPC101,"001 - CIRCULAR ECONOMY, RESPONSIBLE PRODUCTION..."
1,C0001,CERPC102,"002 - CIRCULAR ECONOMY, RESPONSIBLE PRODUCTION..."





Unnamed: 0,ID,COURSE_ID,STUDENT_ID,CANCELLED,ENROLMENT_DATE,COMPLETION_DATE
0,0,,S000000,True,2021-04-12,2021-08-30
1,1,,S000001,False,2023-10-31,2024-01-06


#### **MERGE STUDENTS + ENROLMENT**

In [7]:
#combine the students and enrolment
enrolled_students = students.merge(enrolment,
                                   how='left',
                                   on='STUDENT_ID')

enrolled_students.head(2)

Unnamed: 0,STUDENT_ID,FIRST_NAME,LAST_NAME,CITY,BIRTH_DATE,E-MAIL,ID,COURSE_ID,CANCELLED,ENROLMENT_DATE,COMPLETION_DATE
0,S000000,James,Smith,Moscow,1967-10-01,jamessmith@yandex.com,0,,True,2021-04-12,2021-08-30
1,S000001,Michael,Johnson,Nizhniy Novgorod,1993-08-30,michaeljohnson@yandex.com,1,,False,2023-10-31,2024-01-06


In [8]:
#check for null values
enrolled_students.isnull().sum()[enrolled_students.isnull().sum() > 0]

COURSE_ID    200
dtype: int64

In [9]:
#dimensions
enrolled_students.shape

(200, 11)

In [10]:
#concat vertically - 3x times
merged_thrice = pd.concat([enrolled_students, enrolled_students, enrolled_students],
                          axis=0)

merged_thrice.head(2)

Unnamed: 0,STUDENT_ID,FIRST_NAME,LAST_NAME,CITY,BIRTH_DATE,E-MAIL,ID,COURSE_ID,CANCELLED,ENROLMENT_DATE,COMPLETION_DATE
0,S000000,James,Smith,Moscow,1967-10-01,jamessmith@yandex.com,0,,True,2021-04-12,2021-08-30
1,S000001,Michael,Johnson,Nizhniy Novgorod,1993-08-30,michaeljohnson@yandex.com,1,,False,2023-10-31,2024-01-06


In [11]:
#dimensions
merged_thrice.shape

(600, 11)

#### **CREATE COURSE_ID BY RANDOMIZATION**

create a random list of courses (from 1-5) and distribute per student

In [12]:
#number of courses available
course_details['COURSE_ID'].nunique()

5

In [13]:
#number of courses
course_list = [1, 2, 3, 4, 5]

#relative weights to choose elements from the list with different probability
course_num = random.choices(course_list,
                            weights=(19, 21.5, 21.5, 14, 24),
                            k=600)

In [14]:
#insert new features
merged_thrice['COURSE_NUMBER'] = None

merged_thrice = merged_thrice.assign(COURSE_NUMBER=course_num)

In [15]:
#course distribution
merged_thrice['COURSE_NUMBER'].value_counts()

COURSE_NUMBER
5    151
3    140
2    117
1    108
4     84
Name: count, dtype: int64

In [16]:
#dimensions
merged_thrice.shape

(600, 12)

In [17]:
#write a function to return course_id
def create_course_id(data):
    return 'C000' + str(data['COURSE_NUMBER'])

#apply the function
merged_thrice['COURSE_ID'] = merged_thrice.apply(create_course_id,
                                             axis=1)

In [18]:
merged_thrice.head(3)

Unnamed: 0,STUDENT_ID,FIRST_NAME,LAST_NAME,CITY,BIRTH_DATE,E-MAIL,ID,COURSE_ID,CANCELLED,ENROLMENT_DATE,COMPLETION_DATE,COURSE_NUMBER
0,S000000,James,Smith,Moscow,1967-10-01,jamessmith@yandex.com,0,C0003,True,2021-04-12,2021-08-30,3
1,S000001,Michael,Johnson,Nizhniy Novgorod,1993-08-30,michaeljohnson@yandex.com,1,C0004,False,2023-10-31,2024-01-06,4
2,S000002,Robert,Williams,Rostov,1993-10-02,robertwilliams@yandex.com,2,C0001,False,2022-06-19,2023-01-24,1


#### **COMBINE CLASSES**

In [19]:
#add classes
enrolled_students_studies = merged_thrice.merge(class_details,
                                                how='outer',
                                                on=['COURSE_ID'])

In [20]:
#check for null values
enrolled_students_studies.isnull().sum()[enrolled_students_studies.isnull().sum() > 0]

Series([], dtype: int64)

In [21]:
#dimensions
enrolled_students_studies.shape

(4306, 14)

In [22]:
#no of students on file
enrolled_students_studies['STUDENT_ID'].nunique()

200

In [23]:
enrolled_students_studies.head(2)

Unnamed: 0,STUDENT_ID,FIRST_NAME,LAST_NAME,CITY,BIRTH_DATE,E-MAIL,ID,COURSE_ID,CANCELLED,ENROLMENT_DATE,COMPLETION_DATE,COURSE_NUMBER,CLASS_ID,CLASS_TITLE
0,S000002,Robert,Williams,Rostov,1993-10-02,robertwilliams@yandex.com,2,C0001,False,2022-06-19,2023-01-24,1,CERPC101,"001 - CIRCULAR ECONOMY, RESPONSIBLE PRODUCTION..."
1,S000002,Robert,Williams,Rostov,1993-10-02,robertwilliams@yandex.com,2,C0001,False,2022-06-19,2023-01-24,1,CERPC102,"002 - CIRCULAR ECONOMY, RESPONSIBLE PRODUCTION..."


#### **COMBINE COURSES**

In [24]:
#add courses
student_study = enrolled_students_studies.merge(course_details,
                                                how='outer',
                                                on=['COURSE_ID'])

In [25]:
#check for null values
student_study.isnull().sum()[student_study.isnull().sum() > 0]

Series([], dtype: int64)

In [26]:
#dimensions
student_study.shape

(4306, 15)

In [27]:
#no of students on file
student_study['STUDENT_ID'].nunique()

200

In [28]:
student_study.head(2)

Unnamed: 0,STUDENT_ID,FIRST_NAME,LAST_NAME,CITY,BIRTH_DATE,E-MAIL,ID,COURSE_ID,CANCELLED,ENROLMENT_DATE,COMPLETION_DATE,COURSE_NUMBER,CLASS_ID,CLASS_TITLE,COURSE_TITLE
0,S000002,Robert,Williams,Rostov,1993-10-02,robertwilliams@yandex.com,2,C0001,False,2022-06-19,2023-01-24,1,CERPC101,"001 - CIRCULAR ECONOMY, RESPONSIBLE PRODUCTION...","CIRCULAR ECONOMY, RESPONSIBLE PRODUCTION AND C..."
1,S000002,Robert,Williams,Rostov,1993-10-02,robertwilliams@yandex.com,2,C0001,False,2022-06-19,2023-01-24,1,CERPC102,"002 - CIRCULAR ECONOMY, RESPONSIBLE PRODUCTION...","CIRCULAR ECONOMY, RESPONSIBLE PRODUCTION AND C..."


In [29]:
#remove uninformative features
student_study = student_study.drop(['COURSE_NUMBER',],
                                   axis=1,
                                   errors='ignore')

#### **CREATE COURSE_START_DATE / COURSE_END_DATE**

In [30]:
#slice the table
slice_course_dates = student_study.iloc[:, [0, 6, 7, 9, 10]]
slice_course_dates.head(2)

Unnamed: 0,STUDENT_ID,ID,COURSE_ID,ENROLMENT_DATE,COMPLETION_DATE
0,S000002,2,C0001,2022-06-19,2023-01-24
1,S000002,2,C0001,2022-06-19,2023-01-24


In [31]:
#check for duplicate values
dupl_columns = list(slice_course_dates.columns)

mask = slice_course_dates.duplicated(subset=dupl_columns)
slice_duplicates = slice_course_dates[mask]
print(f'Number of Duplicates: {slice_duplicates.shape[0]}')

Number of Duplicates: 3830


In [32]:
#remove duplicates
slice_course_dates = slice_course_dates.drop_duplicates(subset=dupl_columns)
print(f'New Dimensions: {slice_course_dates.shape[0]}')

New Dimensions: 476


In [33]:
#reset the index
slice_course_dates = slice_course_dates.reset_index()

#remove uninformative feature
slice_course_dates = slice_course_dates.drop('index',
                                             axis=1,
                                             errors='ignore')

In [34]:
slice_course_dates.head(2)

Unnamed: 0,STUDENT_ID,ID,COURSE_ID,ENROLMENT_DATE,COMPLETION_DATE
0,S000002,2,C0001,2022-06-19,2023-01-24
1,S000008,8,C0001,2022-12-09,2023-09-03


In [35]:
from datetime import date

#write a function to create the course start date
def create_course_start(data):
    #create the object class
    fake_start = Faker()
    
    #randomize between enrolment and completion date
    return fake_start.date_between(start_date=data['ENROLMENT_DATE'],
                                   end_date=data['COMPLETION_DATE'])

In [36]:
#apply the function
slice_course_dates['COURSE_START_DATE'] = slice_course_dates.apply(create_course_start,
                                                                   axis=1)

In [37]:
from datetime import date

#write a function to create the course end date
def create_course_end(data):
    #create the object class
    fake_end = Faker()
    
    #randomize between course start and completion date
    return fake_end.date_between(start_date=data['COURSE_START_DATE'],
                                 end_date=data['COMPLETION_DATE'])

In [38]:
#apply the function
slice_course_dates['COURSE_END_DATE'] = slice_course_dates.apply(create_course_end,
                                                                 axis=1)

LOGIC CHECK

In [39]:
#enrolment cannot be after completion
slice_course_dates[slice_course_dates['ENROLMENT_DATE'] > slice_course_dates['COMPLETION_DATE']]

Unnamed: 0,STUDENT_ID,ID,COURSE_ID,ENROLMENT_DATE,COMPLETION_DATE,COURSE_START_DATE,COURSE_END_DATE


In [40]:
#start date cannot be before enrolment
slice_course_dates[slice_course_dates['COURSE_START_DATE'] < slice_course_dates['ENROLMENT_DATE']]

Unnamed: 0,STUDENT_ID,ID,COURSE_ID,ENROLMENT_DATE,COMPLETION_DATE,COURSE_START_DATE,COURSE_END_DATE


In [41]:
#start cannot be after completion
slice_course_dates[slice_course_dates['COURSE_START_DATE'] > slice_course_dates['COURSE_END_DATE']]

Unnamed: 0,STUDENT_ID,ID,COURSE_ID,ENROLMENT_DATE,COMPLETION_DATE,COURSE_START_DATE,COURSE_END_DATE


In [42]:
student_study.shape

(4306, 14)

In [43]:
slice_course_dates.head(2)

Unnamed: 0,STUDENT_ID,ID,COURSE_ID,ENROLMENT_DATE,COMPLETION_DATE,COURSE_START_DATE,COURSE_END_DATE
0,S000002,2,C0001,2022-06-19,2023-01-24,2022-07-20,2023-01-21
1,S000008,8,C0001,2022-12-09,2023-09-03,2023-04-16,2023-06-06


In [44]:
#add course_dates to the main merged table
student_course_data = student_study.merge(slice_course_dates,
                                          how='inner',
                                          on=['STUDENT_ID', 'COURSE_ID', 'ID', 'ENROLMENT_DATE', 'COMPLETION_DATE'])

In [45]:
student_course_data.head(2)

Unnamed: 0,STUDENT_ID,FIRST_NAME,LAST_NAME,CITY,BIRTH_DATE,E-MAIL,ID,COURSE_ID,CANCELLED,ENROLMENT_DATE,COMPLETION_DATE,CLASS_ID,CLASS_TITLE,COURSE_TITLE,COURSE_START_DATE,COURSE_END_DATE
0,S000002,Robert,Williams,Rostov,1993-10-02,robertwilliams@yandex.com,2,C0001,False,2022-06-19,2023-01-24,CERPC101,"001 - CIRCULAR ECONOMY, RESPONSIBLE PRODUCTION...","CIRCULAR ECONOMY, RESPONSIBLE PRODUCTION AND C...",2022-07-20,2023-01-21
1,S000002,Robert,Williams,Rostov,1993-10-02,robertwilliams@yandex.com,2,C0001,False,2022-06-19,2023-01-24,CERPC102,"002 - CIRCULAR ECONOMY, RESPONSIBLE PRODUCTION...","CIRCULAR ECONOMY, RESPONSIBLE PRODUCTION AND C...",2022-07-20,2023-01-21


In [46]:
student_course_data.shape

(4306, 16)

#### **CREATE CLASS_START_DATE / CLASS_END_DATE**

In [47]:
#slice the table
slice_class_dates = student_course_data.iloc[:, [0, 6, 7, 9, 10, 11, 14, 15]]
slice_class_dates.head(2)

Unnamed: 0,STUDENT_ID,ID,COURSE_ID,ENROLMENT_DATE,COMPLETION_DATE,CLASS_ID,COURSE_START_DATE,COURSE_END_DATE
0,S000002,2,C0001,2022-06-19,2023-01-24,CERPC101,2022-07-20,2023-01-21
1,S000002,2,C0001,2022-06-19,2023-01-24,CERPC102,2022-07-20,2023-01-21


In [48]:
#check for duplicate values
dupl_columns = list(slice_class_dates.columns)

mask = slice_class_dates.duplicated(subset=dupl_columns)
slice_class_duplicates = slice_class_dates[mask]
print(f'Number of Duplicates: {slice_class_duplicates.shape[0]}')

Number of Duplicates: 873


In [49]:
#remove duplicates
slice_class_dates = slice_class_dates.drop_duplicates(subset=dupl_columns)
print(f'New Dimensions: {slice_class_dates.shape[0]}')

New Dimensions: 3433


In [50]:
#reset the index
slice_class_dates = slice_class_dates.reset_index()

#remove uninformative feature
slice_class_dates = slice_class_dates.drop('index',
                                           axis=1,
                                           errors='ignore')

In [51]:
slice_class_dates.head(2)

Unnamed: 0,STUDENT_ID,ID,COURSE_ID,ENROLMENT_DATE,COMPLETION_DATE,CLASS_ID,COURSE_START_DATE,COURSE_END_DATE
0,S000002,2,C0001,2022-06-19,2023-01-24,CERPC101,2022-07-20,2023-01-21
1,S000002,2,C0001,2022-06-19,2023-01-24,CERPC102,2022-07-20,2023-01-21


In [52]:
from datetime import date

#write a function to create the class start date
def create_class_start(data):
    #create the object class
    fake_start = Faker()
    
    #randomize between course start and course end
    return fake_start.date_between(start_date=data['COURSE_START_DATE'],
                                   end_date=data['COURSE_END_DATE'])

In [53]:
#apply the function
slice_class_dates['CLASS_START_DATE'] = slice_class_dates.apply(create_class_start,
                                                                axis=1)

In [54]:
from datetime import date

#write a function to create the class end date
def create_class_end(data):
    #create the object class
    fake_end = Faker()
    
    #randomize between class start and course end
    return fake_end.date_between(start_date=data['CLASS_START_DATE'],
                                 end_date=data['COURSE_END_DATE'])

In [55]:
#apply the function
slice_class_dates['CLASS_END_DATE'] = slice_class_dates.apply(create_class_end,
                                                              axis=1)

LOGIC CHECK

In [56]:
#class start cannot be after course end
slice_class_dates[slice_class_dates['CLASS_START_DATE'] > slice_class_dates['COURSE_END_DATE']]

Unnamed: 0,STUDENT_ID,ID,COURSE_ID,ENROLMENT_DATE,COMPLETION_DATE,CLASS_ID,COURSE_START_DATE,COURSE_END_DATE,CLASS_START_DATE,CLASS_END_DATE


In [57]:
#class end cannot be before class start
slice_class_dates[slice_class_dates['CLASS_START_DATE'] > slice_class_dates['CLASS_END_DATE']]

Unnamed: 0,STUDENT_ID,ID,COURSE_ID,ENROLMENT_DATE,COMPLETION_DATE,CLASS_ID,COURSE_START_DATE,COURSE_END_DATE,CLASS_START_DATE,CLASS_END_DATE


In [58]:
student_course_data.shape

(4306, 16)

In [59]:
#add class dates to the main merged table
student_class_data = student_course_data.merge(slice_class_dates,
                                               how='inner',
                                               on=['STUDENT_ID', 'ID',
                                                   'ENROLMENT_DATE', 'COMPLETION_DATE',
                                                   'CLASS_ID', 'COURSE_ID',
                                                   'COURSE_START_DATE', 'COURSE_END_DATE'])

In [60]:
student_class_data.head(2)

Unnamed: 0,STUDENT_ID,FIRST_NAME,LAST_NAME,CITY,BIRTH_DATE,E-MAIL,ID,COURSE_ID,CANCELLED,ENROLMENT_DATE,COMPLETION_DATE,CLASS_ID,CLASS_TITLE,COURSE_TITLE,COURSE_START_DATE,COURSE_END_DATE,CLASS_START_DATE,CLASS_END_DATE
0,S000002,Robert,Williams,Rostov,1993-10-02,robertwilliams@yandex.com,2,C0001,False,2022-06-19,2023-01-24,CERPC101,"001 - CIRCULAR ECONOMY, RESPONSIBLE PRODUCTION...","CIRCULAR ECONOMY, RESPONSIBLE PRODUCTION AND C...",2022-07-20,2023-01-21,2022-09-19,2023-01-19
1,S000002,Robert,Williams,Rostov,1993-10-02,robertwilliams@yandex.com,2,C0001,False,2022-06-19,2023-01-24,CERPC102,"002 - CIRCULAR ECONOMY, RESPONSIBLE PRODUCTION...","CIRCULAR ECONOMY, RESPONSIBLE PRODUCTION AND C...",2022-07-20,2023-01-21,2022-11-15,2022-12-15


In [61]:
student_class_data.shape

(4306, 18)

#### **CREATE TEST_NUMBER**

create test_number by combining course_id + T000 + class_id

In [62]:
#slice two characters from class id
student_class_data['TNUM'] = student_class_data['CLASS_ID'].apply(lambda x: str(x)[-2:])

In [63]:
#write a function to create the test id
def create_test_number(data):
    return str(data['COURSE_ID']) + 'T000' + str(data['TNUM'])

#apply a function
student_class_data['TEST_NUMBER'] = student_class_data.apply(create_test_number,
                                                             axis=1)

In [64]:
student_class_data.head(2)

Unnamed: 0,STUDENT_ID,FIRST_NAME,LAST_NAME,CITY,BIRTH_DATE,E-MAIL,ID,COURSE_ID,CANCELLED,ENROLMENT_DATE,COMPLETION_DATE,CLASS_ID,CLASS_TITLE,COURSE_TITLE,COURSE_START_DATE,COURSE_END_DATE,CLASS_START_DATE,CLASS_END_DATE,TNUM,TEST_NUMBER
0,S000002,Robert,Williams,Rostov,1993-10-02,robertwilliams@yandex.com,2,C0001,False,2022-06-19,2023-01-24,CERPC101,"001 - CIRCULAR ECONOMY, RESPONSIBLE PRODUCTION...","CIRCULAR ECONOMY, RESPONSIBLE PRODUCTION AND C...",2022-07-20,2023-01-21,2022-09-19,2023-01-19,1,C0001T00001
1,S000002,Robert,Williams,Rostov,1993-10-02,robertwilliams@yandex.com,2,C0001,False,2022-06-19,2023-01-24,CERPC102,"002 - CIRCULAR ECONOMY, RESPONSIBLE PRODUCTION...","CIRCULAR ECONOMY, RESPONSIBLE PRODUCTION AND C...",2022-07-20,2023-01-21,2022-11-15,2022-12-15,2,C0001T00002


CREATE TESTS TABLE

In [65]:
#combine courses + classes
tests = course_details.merge(class_details,
                             how='left',
                             on=['COURSE_ID'])

In [66]:
#slice two characters from class id
tests['TNUM'] = tests['CLASS_ID'].apply(lambda x: str(x)[-2:])

In [67]:
#apply a function
tests['TEST_NUMBER'] = tests.apply(create_test_number,
                                   axis=1)

In [68]:
#remove uninformative feature(s)
tests = tests.drop('TNUM',
                   axis=1,
                   errors='ignore')

In [69]:
tests.head(2)

Unnamed: 0,COURSE_ID,COURSE_TITLE,CLASS_ID,CLASS_TITLE,TEST_NUMBER
0,C0001,"CIRCULAR ECONOMY, RESPONSIBLE PRODUCTION AND C...",CERPC101,"001 - CIRCULAR ECONOMY, RESPONSIBLE PRODUCTION...",C0001T00001
1,C0001,"CIRCULAR ECONOMY, RESPONSIBLE PRODUCTION AND C...",CERPC102,"002 - CIRCULAR ECONOMY, RESPONSIBLE PRODUCTION...",C0001T00002


EXPORT TABLE

In [70]:
#create a file name
xlx_name = 'test_details.xlsx'

#export to excel
tests.to_excel('./experimental/created_tables/' + xlx_name,
               sheet_name='test_details')

#### **CREATE TEST_SCORE**

create a list of test scores with a different set of probabilities

In [71]:
#75-100%, 0-100%
test_score1, test_score2 = [75, 80, 85, 90, 95, 100], [0, 5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60, 65, 70, 75, 80, 85, 90, 95, 100]

#probability
prob1, prob2 = 100 / len(test_score1), 100 / len(test_score2)

#split
split1 = round(student_class_data.shape[0] / 2, 0)
split2 = student_class_data.shape[0] - split1

#relative weights to choose elements from the list with different probability
scores1 = random.choices(test_score1,
                         weights=(prob1, prob1, prob1, prob1, prob1, prob1),
                         k=int(split1))

#relative weights to choose elements from the list with different probability
scores2 = random.choices(test_score2,
                         weights=(prob2, prob2, prob2, prob2, prob2, prob2, prob2, prob2, prob2, prob2,
                                  prob2, prob2, prob2, prob2, prob2, prob2, prob2, prob2, prob2, prob2,
                                  prob2),
                         k=int(split2))

#add two lists together
combine_test_scores = scores1 + scores2

In [72]:
#insert a new feature
student_class_data['TEST_SCORE'] = None

student_class_data = student_class_data.assign(TEST_SCORE=combine_test_scores)

In [73]:
#write a function for test_status
def pass_or_fail(data):
    if data['TEST_SCORE'] > 50:
        return 'PASS'
    else:
        return 'FAIL'

#create a new feature
student_class_data['TEST_STATUS'] = student_class_data.apply(pass_or_fail,
                                                             axis=1)

In [74]:
student_class_data.head(2)

Unnamed: 0,STUDENT_ID,FIRST_NAME,LAST_NAME,CITY,BIRTH_DATE,E-MAIL,ID,COURSE_ID,CANCELLED,ENROLMENT_DATE,...,CLASS_TITLE,COURSE_TITLE,COURSE_START_DATE,COURSE_END_DATE,CLASS_START_DATE,CLASS_END_DATE,TNUM,TEST_NUMBER,TEST_SCORE,TEST_STATUS
0,S000002,Robert,Williams,Rostov,1993-10-02,robertwilliams@yandex.com,2,C0001,False,2022-06-19,...,"001 - CIRCULAR ECONOMY, RESPONSIBLE PRODUCTION...","CIRCULAR ECONOMY, RESPONSIBLE PRODUCTION AND C...",2022-07-20,2023-01-21,2022-09-19,2023-01-19,1,C0001T00001,95,PASS
1,S000002,Robert,Williams,Rostov,1993-10-02,robertwilliams@yandex.com,2,C0001,False,2022-06-19,...,"002 - CIRCULAR ECONOMY, RESPONSIBLE PRODUCTION...","CIRCULAR ECONOMY, RESPONSIBLE PRODUCTION AND C...",2022-07-20,2023-01-21,2022-11-15,2022-12-15,2,C0001T00002,75,PASS


#### **CREATE ACTIVE STUDENTS**

In [75]:
#create a new feature: student status
student_class_data['STUDENT_STATUS'] = student_class_data['CANCELLED'].apply(lambda x: 'ACTIVE' if x == False else 'INACTIVE')

In [76]:
#remove uninformative feature(s)
student_class_data = student_class_data.drop(['TNUM'],
                                             axis=1,
                                             errors='ignore')

In [77]:
student_class_data.head(2)

Unnamed: 0,STUDENT_ID,FIRST_NAME,LAST_NAME,CITY,BIRTH_DATE,E-MAIL,ID,COURSE_ID,CANCELLED,ENROLMENT_DATE,...,CLASS_TITLE,COURSE_TITLE,COURSE_START_DATE,COURSE_END_DATE,CLASS_START_DATE,CLASS_END_DATE,TEST_NUMBER,TEST_SCORE,TEST_STATUS,STUDENT_STATUS
0,S000002,Robert,Williams,Rostov,1993-10-02,robertwilliams@yandex.com,2,C0001,False,2022-06-19,...,"001 - CIRCULAR ECONOMY, RESPONSIBLE PRODUCTION...","CIRCULAR ECONOMY, RESPONSIBLE PRODUCTION AND C...",2022-07-20,2023-01-21,2022-09-19,2023-01-19,C0001T00001,95,PASS,ACTIVE
1,S000002,Robert,Williams,Rostov,1993-10-02,robertwilliams@yandex.com,2,C0001,False,2022-06-19,...,"002 - CIRCULAR ECONOMY, RESPONSIBLE PRODUCTION...","CIRCULAR ECONOMY, RESPONSIBLE PRODUCTION AND C...",2022-07-20,2023-01-21,2022-11-15,2022-12-15,C0001T00002,75,PASS,ACTIVE


#### **CREATE UNFINISHED COURSES**

In [78]:
student_class_data = student_class_data.reset_index()

remove some records to create unfinished courses

In [79]:
#slice two characters from class id
student_class_data['TNUM'] = student_class_data['CLASS_ID'].apply(lambda x: str(x)[-2:])

COURSE ONE

In [80]:
#filter by course one and last two classes
course_one_indx = student_class_data[(student_class_data['COURSE_ID'] == 'C0001') &
                                     ((student_class_data['TNUM'] == '09') | (student_class_data['TNUM'] == '10'))]
course_one_indx.index

Index([   8,    9,   18,   19,   28,   29,   38,   39,   48,   49,
       ...
       1038, 1039, 1048, 1049, 1058, 1059, 1068, 1069, 1078, 1079],
      dtype='int64', length=216)

In [81]:
#create a list to store course one records to remove
delete_one = []

#find records divisable by 4 without a remainder
for r in course_one_indx.index.to_list():
    if r % 4 == 0:
        delete_one.append(r)

delete_one
len(delete_one)

54

COURSE TWO AND THREE

In [82]:
#filter by course two and three and last two classes
courses_two_three_indx = student_class_data[((student_class_data['COURSE_ID'] == 'C0002') | (student_class_data['COURSE_ID'] == 'C0003')) &
                                            ((student_class_data['TNUM'] == '06') | (student_class_data['TNUM'] == '07'))]
courses_two_three_indx.index

Index([1085, 1086, 1092, 1093, 1099, 1100, 1106, 1107, 1113, 1114,
       ...
       2849, 2850, 2856, 2857, 2863, 2864, 2870, 2871, 2877, 2878],
      dtype='int64', length=514)

In [83]:
#create a list to store course two and three records to remove
delete_two_three = []

#find records divisable by 3 without a remainder
for r in courses_two_three_indx.index.to_list():
    if r % 3 == 0:
        delete_two_three.append(r)

delete_two_three
len(delete_two_three)

172

COURSE FOUR

In [84]:
#filter by course four and last class
course_four_indx = student_class_data[(student_class_data['COURSE_ID'] == 'C0004') & (student_class_data['TNUM'] == '08')]
course_four_indx.index

Index([2886, 2894, 2902, 2910, 2918, 2926, 2934, 2942, 2950, 2958, 2966, 2974,
       2982, 2990, 2998, 3006, 3014, 3022, 3030, 3038, 3046, 3054, 3062, 3070,
       3078, 3086, 3094, 3102, 3110, 3118, 3126, 3134, 3142, 3150, 3158, 3166,
       3174, 3182, 3190, 3198, 3206, 3214, 3222, 3230, 3238, 3246, 3254, 3262,
       3270, 3278, 3286, 3294, 3302, 3310, 3318, 3326, 3334, 3342, 3350, 3358,
       3366, 3374, 3382, 3390, 3398, 3406, 3414, 3422, 3430, 3438, 3446, 3454,
       3462, 3470, 3478, 3486, 3494, 3502, 3510, 3518, 3526, 3534, 3542, 3550],
      dtype='int64')

In [85]:
#create a list to store course four records to remove
delete_four = []

#find records divisable by 5 without a remainder
for r in course_four_indx.index.to_list():
    if r % 5 == 0:
        delete_four.append(r)

delete_four
len(delete_four)

17

COURSE FIVE

In [86]:
#filter by course five and last class
course_five_indx = student_class_data[(student_class_data['COURSE_ID'] == 'C0005') & (student_class_data['TNUM'] == '05')]
course_five_indx.index

Index([3555, 3560, 3565, 3570, 3575, 3580, 3585, 3590, 3595, 3600,
       ...
       4260, 4265, 4270, 4275, 4280, 4285, 4290, 4295, 4300, 4305],
      dtype='int64', length=151)

In [87]:
#create a list to store course five records to remove
delete_five = []

#find records divisable by 12 without a remainder
for r in course_five_indx.index.to_list():
    if r % 12 == 0:
        delete_five.append(r)

delete_five
len(delete_five)

12

REMOVE RECORDS TO CREATE UNFINISHED COURSES

In [88]:
student_class_data.shape

(4306, 24)

In [89]:
#combine into one index
delete_records = delete_one + delete_two_three + delete_four + delete_five

#remove course records
student_class_data = student_class_data.drop(student_class_data.index[delete_records],
                                             axis=0,
                                             errors='ignore')

In [90]:
student_class_data.shape

(4051, 24)

In [91]:
#remove uninformative feature(s)
student_class_data = student_class_data.drop(['index', 'TNUM'],
                                             axis=1,
                                             errors='ignore')

#### **CREATE LOGS**

In [92]:
#slice the table
attend = student_class_data[['STUDENT_ID', 'ID', 'COURSE_ID', 'CLASS_ID', 'CLASS_START_DATE', 'CLASS_END_DATE']]
attend.head(3)

Unnamed: 0,STUDENT_ID,ID,COURSE_ID,CLASS_ID,CLASS_START_DATE,CLASS_END_DATE
0,S000002,2,C0001,CERPC101,2022-09-19,2023-01-19
1,S000002,2,C0001,CERPC102,2022-11-15,2022-12-15
2,S000002,2,C0001,CERPC103,2022-10-01,2022-10-05


In [93]:
#check for duplicate values
dupl_columns = list(attend.columns)

mask = attend.duplicated(subset=dupl_columns)
attend_duplicates = attend[mask]
print(f'Number of Duplicates: {attend_duplicates.shape[0]}')

Number of Duplicates: 774


In [94]:
#remove duplicates
attend = attend.drop_duplicates(subset=dupl_columns)
print(f'New Dimensions: {attend.shape[0]}')

New Dimensions: 3277


In [95]:
#convert to specific format to avoid log_out < log_in
attend['CLASS_START_DATE'] = pd.to_datetime(attend['CLASS_START_DATE'],
                                       format='%Y-%m-%d %H:%M:%S')

attend['CLASS_END_DATE'] = pd.to_datetime(attend['CLASS_END_DATE'],
                                     format='%Y-%m-%d %H:%M:%S')

In [96]:
import datetime
from datetime import date

#write a function to create the log in
def create_session_start(data):
    #create the object class
    fake_timezone = Faker()
    
    #randomize between class start and class end
    return fake_timezone.date_time_between(start_date=data['CLASS_START_DATE'],
                                             end_date=data['CLASS_END_DATE'])

#apply the function
attend['LOG_IN'] = attend.apply(create_session_start,
                                axis=1)

In [97]:
#write a function to create the log out
def create_session_end(data):
    #create the object class
    fake_timezone = Faker()
    
    #randomize between log in and class end
    return fake_timezone.date_time_between(start_date=data['LOG_IN'],
                                              end_date=data['CLASS_END_DATE'])

#apply the function
attend['LOG_OUT'] = attend.apply(create_session_end,
                                 axis=1)

In [98]:
attend.head(2)

Unnamed: 0,STUDENT_ID,ID,COURSE_ID,CLASS_ID,CLASS_START_DATE,CLASS_END_DATE,LOG_IN,LOG_OUT
0,S000002,2,C0001,CERPC101,2022-09-19,2023-01-19,2022-09-23 11:28:34.121015,2022-11-10 05:12:53.524904
1,S000002,2,C0001,CERPC102,2022-11-15,2022-12-15,2022-11-24 08:45:31.068376,2022-12-04 16:44:31.439917


In [99]:
#find the session time
attend['DIF'] = abs(attend['LOG_OUT'] - attend['LOG_IN'])

#convert to string
attend['DIF'] = attend['DIF'].astype('string')

In [100]:
from datetime import datetime
from operator import attrgetter

#write a function to return session time (in minutes) to 2dp to signify seconds, i.e. 0.05 = 3 seconds (0.05 * 60 = 3)
def str_datetime_to_minute(data):
    #split the string
    days, time = data['DIF'].split(' days ')
    
    #separate elements
    hours, minutes, seconds, microseconds = attrgetter(
        'hour', 'minute', 'second', 'microsecond'
    )(datetime.strptime(time, '%H:%M:%S.%f'))
    
    #convert to minutes rounding down to 2dp
    return round(int(days) * 24 * 60
            + hours * 60
            + minutes
            + seconds / 60
            + microseconds / 60 / 1000000, 2)

In [101]:
#apply the function to create the new feature
attend['SESSION_TIME'] = attend.apply(str_datetime_to_minute,
                                      axis=1)

In [102]:
attend.head(2)

Unnamed: 0,STUDENT_ID,ID,COURSE_ID,CLASS_ID,CLASS_START_DATE,CLASS_END_DATE,LOG_IN,LOG_OUT,DIF,SESSION_TIME
0,S000002,2,C0001,CERPC101,2022-09-19,2023-01-19,2022-09-23 11:28:34.121015,2022-11-10 05:12:53.524904,47 days 17:44:19.403889,68744.32
1,S000002,2,C0001,CERPC102,2022-11-15,2022-12-15,2022-11-24 08:45:31.068376,2022-12-04 16:44:31.439917,10 days 07:59:00.371541,14879.01


In [103]:
#distribution
attend['SESSION_TIME'].value_counts().sort_values(ascending=True)

SESSION_TIME
51.70        1
484.13       1
18.81        1
336.88       1
3160.65      1
          ... 
59.43        2
9040.18      2
0.02         5
0.00       290
0.01       301
Name: count, Length: 2678, dtype: int64

not creating multiple sessions, so outliers possible

In [104]:
#create the sessions
sessions = list(range(1, (attend.shape[0]+1)))
len(sessions)

3277

In [105]:
#insert the new feature
attend.insert(0, 'SESSION_ID', sessions, True)

In [106]:
attend = attend.reset_index()

In [107]:
#remove uninformative feature(s)
attend = attend.drop('index',
                     axis=1,
                     errors='ignore')

In [108]:
attend.head(2)

Unnamed: 0,SESSION_ID,STUDENT_ID,ID,COURSE_ID,CLASS_ID,CLASS_START_DATE,CLASS_END_DATE,LOG_IN,LOG_OUT,DIF,SESSION_TIME
0,1,S000002,2,C0001,CERPC101,2022-09-19,2023-01-19,2022-09-23 11:28:34.121015,2022-11-10 05:12:53.524904,47 days 17:44:19.403889,68744.32
1,2,S000002,2,C0001,CERPC102,2022-11-15,2022-12-15,2022-11-24 08:45:31.068376,2022-12-04 16:44:31.439917,10 days 07:59:00.371541,14879.01


EXPORT TABLE

In [109]:
#create a file name
xlx_name = 'attendance.xlsx'

#export to excel
attend.to_excel('./experimental/created_tables/' + xlx_name,
                sheet_name='logging_data')

#### **CREATE COURSE_STATUS**

In [110]:
#group the test_scores by course (per student)
average_score_per_course = student_class_data.groupby(['STUDENT_ID', 'COURSE_ID'])['TEST_SCORE'].mean().round(0)
average_score_per_course[:5]

STUDENT_ID  COURSE_ID
S000000     C0001        90.0
            C0003        66.0
S000001     C0004        49.0
            C0005        57.0
S000002     C0001        86.0
Name: TEST_SCORE, dtype: float64

In [111]:
#convert series to dataframe
average_score_per_course = average_score_per_course.to_frame()
type(average_score_per_course)

pandas.core.frame.DataFrame

In [112]:
#write a function for course_status
def course_pass(data):
    if data['TEST_SCORE'] >= 50:
        return 'PASS'
    else:
        return 'FAIL'
    
#apply the function
average_score_per_course['COURSE_STATUS'] = average_score_per_course.apply(course_pass,
                                                                           axis=1)

In [113]:
average_score_per_course.shape

(476, 2)

In [114]:
#create an index to fill in the blanks
average_score_per_course = average_score_per_course.reset_index()

In [115]:
#confirm that the change was applied correctly
average_score_per_course.head(3)

Unnamed: 0,STUDENT_ID,COURSE_ID,TEST_SCORE,COURSE_STATUS
0,S000000,C0001,90.0,PASS
1,S000000,C0003,66.0,PASS
2,S000001,C0004,49.0,FAIL


In [116]:
#rename the column
average_score_per_course = average_score_per_course.rename(columns={'TEST_SCORE': 'COURSE_SCORE',
                                                                    })

In [117]:
average_score_per_course.shape

(476, 4)

In [118]:
#add course_score and course_status to the main merged table
student_data = student_class_data.merge(average_score_per_course,
                                        how='outer',
                                        on=['STUDENT_ID', 'COURSE_ID'])

In [119]:
student_data.shape

(4051, 24)

In [120]:
student_data.head(2)

Unnamed: 0,STUDENT_ID,FIRST_NAME,LAST_NAME,CITY,BIRTH_DATE,E-MAIL,ID,COURSE_ID,CANCELLED,ENROLMENT_DATE,...,COURSE_START_DATE,COURSE_END_DATE,CLASS_START_DATE,CLASS_END_DATE,TEST_NUMBER,TEST_SCORE,TEST_STATUS,STUDENT_STATUS,COURSE_SCORE,COURSE_STATUS
0,S000000,James,Smith,Moscow,1967-10-01,jamessmith@yandex.com,0,C0001,True,2021-04-12,...,2021-05-16,2021-07-08,2021-07-07,2021-07-07,C0001T00001,90,PASS,INACTIVE,90.0,PASS
1,S000000,James,Smith,Moscow,1967-10-01,jamessmith@yandex.com,0,C0001,True,2021-04-12,...,2021-05-16,2021-07-08,2021-05-29,2021-06-22,C0001T00002,95,PASS,INACTIVE,90.0,PASS


#### **CREATE ACHIEVEMENTS AND POINTS**

SCORING: 0-100

ACHIEVEMENTS:
> GOLD: 90-100% \
> SILVER: 80-89% \
> BRONZE: 70-79%

POINTS:
- GOLD (100)
- SILVER (75)
- BRONZE (50)

In [121]:
#write a function for achievements
def create_achievements(data):
    if 90 <= data['TEST_SCORE'] <= 100:
        return 'GOLD'
    elif 80 <= data['TEST_SCORE'] <= 89:
        return 'SILVER'
    elif 70 <= data['TEST_SCORE'] <= 79:
        return 'BRONZE'
    else:
        return None

In [122]:
#apply the function
student_data['ACHIEVEMENT'] = student_data.apply(create_achievements,
                                                 axis=1)

In [123]:
#write a function for points
def get_points(data):
    if data['ACHIEVEMENT'] == 'GOLD':
        return 100
    elif data['ACHIEVEMENT'] == 'SILVER':
        return 75
    elif data['ACHIEVEMENT'] == 'BRONZE':
        return 50
    else:
        return 0

In [124]:
#apply the function
student_data['POINTS'] = student_data.apply(get_points,
                                            axis=1)

In [125]:
student_data.head(2)

Unnamed: 0,STUDENT_ID,FIRST_NAME,LAST_NAME,CITY,BIRTH_DATE,E-MAIL,ID,COURSE_ID,CANCELLED,ENROLMENT_DATE,...,CLASS_START_DATE,CLASS_END_DATE,TEST_NUMBER,TEST_SCORE,TEST_STATUS,STUDENT_STATUS,COURSE_SCORE,COURSE_STATUS,ACHIEVEMENT,POINTS
0,S000000,James,Smith,Moscow,1967-10-01,jamessmith@yandex.com,0,C0001,True,2021-04-12,...,2021-07-07,2021-07-07,C0001T00001,90,PASS,INACTIVE,90.0,PASS,GOLD,100
1,S000000,James,Smith,Moscow,1967-10-01,jamessmith@yandex.com,0,C0001,True,2021-04-12,...,2021-05-29,2021-06-22,C0001T00002,95,PASS,INACTIVE,90.0,PASS,GOLD,100


ADD LOGGING DETAILS

In [127]:
#add class dates to the main merged table
student_data = student_data.merge(attend,
                                  how='left',
                                  on=['STUDENT_ID', 'ID',
                                      'COURSE_ID', 'CLASS_ID'])

In [128]:
#check for null values
student_data.isnull().sum()[student_data.isnull().sum() > 0]

ACHIEVEMENT    1351
dtype: int64

##### **CORRECTIONS**

In [129]:
student_data.head(3)

Unnamed: 0,STUDENT_ID,FIRST_NAME,LAST_NAME,CITY,BIRTH_DATE,E-MAIL,ID,COURSE_ID,CANCELLED,ENROLMENT_DATE,...,COURSE_STATUS,ACHIEVEMENT,POINTS,SESSION_ID,CLASS_START_DATE_y,CLASS_END_DATE_y,LOG_IN,LOG_OUT,DIF,SESSION_TIME
0,S000000,James,Smith,Moscow,1967-10-01,jamessmith@yandex.com,0,C0001,True,2021-04-12,...,PASS,GOLD,100,553,2021-07-07,2021-07-07,2021-07-07 00:00:00.047659,2021-07-07 00:00:00.829958,0 days 00:00:00.782299,0.01
1,S000000,James,Smith,Moscow,1967-10-01,jamessmith@yandex.com,0,C0001,True,2021-04-12,...,PASS,GOLD,100,554,2021-05-29,2021-06-22,2021-06-10 20:03:09.436395,2021-06-21 01:27:26.322408,10 days 05:24:16.886013,14724.28
2,S000000,James,Smith,Moscow,1967-10-01,jamessmith@yandex.com,0,C0001,True,2021-04-12,...,PASS,SILVER,75,555,2021-05-25,2021-06-18,2021-06-16 22:47:55.012603,2021-06-17 07:15:20.502716,0 days 08:27:25.490113,507.42


In [131]:
student_data[['CLASS_START_DATE_x', 'CLASS_START_DATE_y', 'CLASS_END_DATE_x', 'CLASS_END_DATE_y']].head(2)

Unnamed: 0,CLASS_START_DATE_x,CLASS_START_DATE_y,CLASS_END_DATE_x,CLASS_END_DATE_y
0,2021-07-07,2021-07-07,2021-07-07,2021-07-07
1,2021-05-29,2021-05-29,2021-06-22,2021-06-22


LOGIC CHECK

In [133]:
student_data[student_data['CLASS_START_DATE_x'] != student_data['CLASS_START_DATE_y']]

Unnamed: 0,STUDENT_ID,FIRST_NAME,LAST_NAME,CITY,BIRTH_DATE,E-MAIL,ID,COURSE_ID,CANCELLED,ENROLMENT_DATE,...,COURSE_STATUS,ACHIEVEMENT,POINTS,SESSION_ID,CLASS_START_DATE_y,CLASS_END_DATE_y,LOG_IN,LOG_OUT,DIF,SESSION_TIME


In [134]:
student_data[student_data['CLASS_END_DATE_x'] != student_data['CLASS_END_DATE_y']]

Unnamed: 0,STUDENT_ID,FIRST_NAME,LAST_NAME,CITY,BIRTH_DATE,E-MAIL,ID,COURSE_ID,CANCELLED,ENROLMENT_DATE,...,COURSE_STATUS,ACHIEVEMENT,POINTS,SESSION_ID,CLASS_START_DATE_y,CLASS_END_DATE_y,LOG_IN,LOG_OUT,DIF,SESSION_TIME


In [135]:
#rename the columns
student_data = student_data.rename(columns={'CLASS_START_DATE_x': 'CLASS_START_DATE',
                                            'CLASS_END_DATE_x': 'CLASS_END_DATE',
                                            })

In [136]:
#drop unnecessary feature(s)
student_data = student_data.drop(['CLASS_START_DATE_y', 'CLASS_END_DATE_y'],
                                 axis=1,
                                 errors='ignore')

In [144]:
#drop unnecessary feature(s)
student_data = student_data.drop(['SESSION_ID', 'DIF'],
                                 axis=1,
                                 errors='ignore')

In [137]:
student_data.head(2)

Unnamed: 0,STUDENT_ID,FIRST_NAME,LAST_NAME,CITY,BIRTH_DATE,E-MAIL,ID,COURSE_ID,CANCELLED,ENROLMENT_DATE,...,STUDENT_STATUS,COURSE_SCORE,COURSE_STATUS,ACHIEVEMENT,POINTS,SESSION_ID,LOG_IN,LOG_OUT,DIF,SESSION_TIME
0,S000000,James,Smith,Moscow,1967-10-01,jamessmith@yandex.com,0,C0001,True,2021-04-12,...,INACTIVE,90.0,PASS,GOLD,100,553,2021-07-07 00:00:00.047659,2021-07-07 00:00:00.829958,0 days 00:00:00.782299,0.01
1,S000000,James,Smith,Moscow,1967-10-01,jamessmith@yandex.com,0,C0001,True,2021-04-12,...,INACTIVE,90.0,PASS,GOLD,100,554,2021-06-10 20:03:09.436395,2021-06-21 01:27:26.322408,10 days 05:24:16.886013,14724.28


In [146]:
#set to datetime
student_data['BIRTH_DATE'] = pd.to_datetime(student_data['BIRTH_DATE'])
student_data['ENROLMENT_DATE'] = pd.to_datetime(student_data['ENROLMENT_DATE'])
student_data['COMPLETION_DATE'] = pd.to_datetime(student_data['COMPLETION_DATE'])
student_data['COURSE_START_DATE'] = pd.to_datetime(student_data['COURSE_START_DATE'])
student_data['COURSE_END_DATE'] = pd.to_datetime(student_data['COURSE_END_DATE'])
student_data['CLASS_START_DATE'] = pd.to_datetime(student_data['CLASS_START_DATE'])
student_data['CLASS_END_DATE'] = pd.to_datetime(student_data['CLASS_END_DATE'])

In [147]:
#change object to string
student_data['STUDENT_ID'] = student_data['STUDENT_ID'].astype('string')
student_data['FIRST_NAME'] = student_data['FIRST_NAME'].astype('string')
student_data['LAST_NAME'] = student_data['LAST_NAME'].astype('string')
student_data['CITY'] = student_data['CITY'].astype('string')
student_data['E-MAIL'] = student_data['E-MAIL'].astype('string')
student_data['COURSE_ID'] = student_data['COURSE_ID'].astype('string')
student_data['CLASS_ID'] = student_data['CLASS_ID'].astype('string')
student_data['CLASS_TITLE'] = student_data['CLASS_TITLE'].astype('string')
student_data['COURSE_TITLE'] = student_data['COURSE_TITLE'].astype('string')
student_data['TEST_NUMBER'] = student_data['TEST_NUMBER'].astype('string')
student_data['TEST_STATUS'] = student_data['TEST_STATUS'].astype('string')
student_data['STUDENT_STATUS'] = student_data['STUDENT_STATUS'].astype('string')
student_data['COURSE_STATUS'] = student_data['COURSE_STATUS'].astype('string')
student_data['ACHIEVEMENT'] = student_data['ACHIEVEMENT'].astype('string')

In [148]:
#change object to integer
student_data['ID'] = student_data['ID'].to_numpy('int16')
student_data['TEST_SCORE'] = student_data['TEST_SCORE'].to_numpy('int8')
student_data['POINTS'] = student_data['POINTS'].to_numpy('int8')

##### **EXPORT TABLES**

In [150]:
# #remove timezone from columns
# student_data[''] = student_data['Programme_Completion'].dt.tz_localize(None)

In [151]:
#create a file name
xlx_name = 'merged_tables.xlsx'

#export to excel
student_data.to_excel('./experimental/database_tables/' + xlx_name,
                       sheet_name='merged_tables')

TEST_SCORES

In [152]:
#slice the merged table
test_scores = student_data[['STUDENT_ID', 
                            'COURSE_ID', 'CLASS_ID',
                            'TEST_NUMBER', 'TEST_SCORE', 'TEST_STATUS',
                            'STUDENT_STATUS',
                            'ACHIEVEMENT', 'POINTS']]

test_scores.head(2)

Unnamed: 0,STUDENT_ID,COURSE_ID,CLASS_ID,TEST_NUMBER,TEST_SCORE,TEST_STATUS,STUDENT_STATUS,ACHIEVEMENT,POINTS
0,S000000,C0001,CERPC101,C0001T00001,90,PASS,INACTIVE,GOLD,100
1,S000000,C0001,CERPC102,C0001T00002,95,PASS,INACTIVE,GOLD,100


In [153]:
#create a file name
xlx_name = 'test_scores.xlsx'

#export to excel
test_scores.to_excel('./experimental/created_tables/' + xlx_name,
                       sheet_name='test_scores')

CURRENT COURSES

In [154]:
#slice the merged table
courses = student_data[['STUDENT_ID',
                        'COURSE_ID',
                        'COURSE_START_DATE', 'COURSE_END_DATE', 'COURSE_SCORE', 'COURSE_STATUS',
                        'STUDENT_STATUS']]

courses.head(2)

Unnamed: 0,STUDENT_ID,COURSE_ID,COURSE_START_DATE,COURSE_END_DATE,COURSE_SCORE,COURSE_STATUS,STUDENT_STATUS
0,S000000,C0001,2021-05-16,2021-07-08,90.0,PASS,INACTIVE
1,S000000,C0001,2021-05-16,2021-07-08,90.0,PASS,INACTIVE


In [155]:
#create a file name
xlx_name = 'courses.xlsx'

#export to excel
courses.to_excel('./experimental/created_tables/' + xlx_name,
                 sheet_name='courses')

CURRENT CLASSES

In [156]:
#slice the merged table
classes = student_data[['STUDENT_ID',
                        'COURSE_ID', 'CLASS_ID',
                        'COURSE_START_DATE', 'COURSE_END_DATE',
                        'CLASS_START_DATE', 'CLASS_END_DATE', 'TEST_STATUS',
                        'STUDENT_STATUS']]

classes.head(2)

Unnamed: 0,STUDENT_ID,COURSE_ID,CLASS_ID,COURSE_START_DATE,COURSE_END_DATE,CLASS_START_DATE,CLASS_END_DATE,TEST_STATUS,STUDENT_STATUS
0,S000000,C0001,CERPC101,2021-05-16,2021-07-08,2021-07-07,2021-07-07,PASS,INACTIVE
1,S000000,C0001,CERPC102,2021-05-16,2021-07-08,2021-05-29,2021-06-22,PASS,INACTIVE


In [157]:
#rename the feature(s)
classes = classes.rename(columns={'TEST_STATUS': 'CLASS_STATUS',
                                  })

In [158]:
#create a file name
xlx_name = 'classes.xlsx'

#export to excel
classes.to_excel('./experimental/created_tables/' + xlx_name,
                 sheet_name='classes')