# Data Engineering Project

## Loading Dependencies & Extracting Data

In [1]:
import pandas as pd
from sqlalchemy import create_engine
from pathlib import Path
import logging

logger =  logging.getLogger('notebook')

In [2]:
project_root = Path.cwd().parent
dataset = 'data/Cohort 4 Capstone Project - Dataset.xlsx'

In [3]:
data_path = Path.joinpath(project_root, dataset)
data_path

PosixPath('/Users/talentmatch/Documents/Projects/code_projects/katlego-engineering-c4-project/data/Cohort 4 Capstone Project - Dataset.xlsx')

In [4]:
xls = pd.ExcelFile(data_path)

In [5]:
xls.sheet_names

['Cohort 3 DS', 'Cohort 3 DA']

In [6]:
dfs = xls.parse(0)
dfa = xls.parse(1)

In [7]:
# data loading function
def load(path):
    logger.info("Uploading data")
    try:
        file = pd.ExcelFile(path)
        sheets = file.sheet_names
        names = ", ".join(str(name) for name in xls.sheet_names)
        logger.info(f"Successfully uploaded {len(sheets)} named {names} from file.")
        frames = [xls.parse(sheet) for sheet in sheets]
        return frames
    except Exception as e: 
        logger.error("Cannot upload sheets from file")


In [8]:
frames = load(data_path)

## Exploratory Data Analysis & Data Transformation

In [9]:
dfs.head()

Unnamed: 0,Timestamp,Id. No,Age range,Gender,Country,Where did you hear about Everything Data?,How many years of learning experience do you have in the field of data?,Which track are you applying for?,How many hours per week can you commit to learning?,What is your main aim for joining the mentorship program?,What is your motivation to join the Everything Data mentorship program?,How best would you describe your skill level in the track you are applying for?,Have you completed the everything data aptitude test for your track?,Total score,Graduated
0,2024-12-01 23:50:47.202,DS301,18-24 years,Male,Kenya,Word of mouth,Less than six months,Data science,less than 6 hours,Upskill,to enter into the data analysis career,Beginner - I have NO learning or work experien...,Yes,58.666667,No
1,2024-12-03 09:35:19.407,DS302,25-34 years,Male,Kenya,WhatsApp,6 months - 1 year,Data science,more than 14 hours,Upskill,To grow and improve my skills in data science ...,Elementary - I have theoretical understanding ...,Yes,70.0,No
2,2024-12-03 19:16:49.376,DS303,18-24 years,Female,Kenya,WhatsApp,6 months - 1 year,Data science,more than 14 hours,Upskill,I’m motivated to join Everything Data to enhan...,Intermediate - I have theoretical knowledge an...,Yes,64.333333,Yes
3,2024-12-03 12:52:35.541,DS304,18-24 years,Female,Kenya,WhatsApp,6 months - 1 year,Data science,7-14 hours,Upskill,I'd like to upskill and Join the Data Community,Intermediate - I have theoretical knowledge an...,Yes,75.0,No
4,2024-12-03 18:12:27.159,DS305,18-24 years,Male,Kenya,WhatsApp,Less than six months,Data science,7-14 hours,Upskill,I aim to join the mentorship program to enhanc...,Beginner - I have NO learning or work experien...,Yes,59.0,No


In [10]:
dfa.head()

Unnamed: 0,Timestamp,ID No.,Age range,Gender,Country,Where did you hear about Everything Data?,How many years of learning experience do you have in the field of data?,Which track are you applying for?,How many hours per week can you commit to learning?,What is your main aim for joining the mentorship program?,What is your motivation to join the Everything Data mentorship program?,How best would you describe your skill level in the track you are applying for?,Have you completed the everything data aptitude test for your track?,Total score,Graduated
0,2024-12-03 16:25:30.208,DA301,18-24 years,Female,Kenya,WhatsApp,Less than six months,Data analysis,more than 14 hours,Learn data afresh,The hands-on program offers valuable real-worl...,Elementary - I have theoretical understanding ...,Yes,67.333333,Yes
1,2024-12-02 18:17:14.522,DA302,18-24 years,Male,Kenya,Twitter,Less than six months,Data analysis,7-14 hours,Build a project portfolio,The mentorship will be a catalytic opportunity...,Beginner - I have NO learning or work experien...,Yes,62.0,No
2,2024-11-29 08:05:14.371,DA303,18-24 years,Male,Kenya,WhatsApp,6 months - 1 year,Data analysis,7-14 hours,Upskill,Desire to learn,Beginner - I have NO learning or work experien...,Yes,74.0,No
3,2024-11-28 21:37:53.455,DA304,25-34 years,Male,Kenya,Twitter,1-3 years,Data analysis,7-14 hours,Build a project portfolio,My background knowledge in Computer Science dr...,Elementary - I have theoretical understanding ...,Yes,65.333333,No
4,2024-12-03 17:15:14.443,DA305,18-24 years,Male,Kenya,LinkedIn,6 months - 1 year,Data analysis,more than 14 hours,Build a project portfolio,To horn my skills in data analysis\nLand a dat...,Intermediate - I have theoretical knowledge an...,Yes,69.333333,No


## Standardising Columns

In [11]:
dfa.columns == dfs.columns

array([ True, False,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True])

In [12]:
dfa_cols_lower = [col.lower() for col in dfa.columns]
dfs_cols_lower = [col.lower() for col in dfs.columns]

In [13]:
dfa_cols_lower == dfs_cols_lower

False

In [14]:
dfs.columns = dfa.columns = dfa_cols_lower

In [15]:
dfsa = pd.concat([dfs, dfa], axis=0)

In [16]:
print(dfs.shape)
print(dfa.shape)
print(dfsa.shape)

(63, 15)
(52, 15)
(115, 15)


In [17]:
columns = dfs.columns

In [18]:
print(columns)

Index(['timestamp', 'id no.', 'age range', 'gender', 'country',
       'where did you hear about everything data?',
       'how many years of learning experience do you have in the field of data?',
       'which track are you applying for?',
       'how many hours per week can you commit to learning?',
       'what is your main aim for joining the mentorship program?',
       'what is your motivation to join the everything data mentorship program?',
       'how best would you describe your skill level in the track you are applying for?',
       'have you completed the everything data aptitude test for your track?',
       'total score', 'graduated'],
      dtype='object')


In [19]:
new_cols = [
    'timestamp', 'id', 'age_range', 'gender', 'country', 'referral', 
    'experience', 'track', 'hours_available', 'aim', 'motivation', 'skill_level',
    'completed_aptitude', 'aptitude_score', 'graduated'
]
map_dict = {col: columns[num] for num, col in enumerate(new_cols)}

In [20]:
map_dict

{'timestamp': 'timestamp',
 'id': 'id no.',
 'age_range': 'age range',
 'gender': 'gender',
 'country': 'country',
 'referral': 'where did you hear about everything data?',
 'experience': 'how many years of learning experience do you have in the field of data?',
 'track': 'which track are you applying for?',
 'hours_available': 'how many hours per week can you commit to learning?',
 'aim': 'what is your main aim for joining the mentorship program?',
 'motivation': 'what is your motivation to join the everything data mentorship program?',
 'skill_level': 'how best would you describe your skill level in the track you are applying for?',
 'completed_aptitude': 'have you completed the everything data aptitude test for your track?',
 'aptitude_score': 'total score',
 'graduated': 'graduated'}

In [21]:
map_dict.keys()

dict_keys(['timestamp', 'id', 'age_range', 'gender', 'country', 'referral', 'experience', 'track', 'hours_available', 'aim', 'motivation', 'skill_level', 'completed_aptitude', 'aptitude_score', 'graduated'])

In [22]:
dfsa.columns = [*map_dict.keys()]

In [23]:
dfsa.head()

Unnamed: 0,timestamp,id,age_range,gender,country,referral,experience,track,hours_available,aim,motivation,skill_level,completed_aptitude,aptitude_score,graduated
0,2024-12-01 23:50:47.202,DS301,18-24 years,Male,Kenya,Word of mouth,Less than six months,Data science,less than 6 hours,Upskill,to enter into the data analysis career,Beginner - I have NO learning or work experien...,Yes,58.666667,No
1,2024-12-03 09:35:19.407,DS302,25-34 years,Male,Kenya,WhatsApp,6 months - 1 year,Data science,more than 14 hours,Upskill,To grow and improve my skills in data science ...,Elementary - I have theoretical understanding ...,Yes,70.0,No
2,2024-12-03 19:16:49.376,DS303,18-24 years,Female,Kenya,WhatsApp,6 months - 1 year,Data science,more than 14 hours,Upskill,I’m motivated to join Everything Data to enhan...,Intermediate - I have theoretical knowledge an...,Yes,64.333333,Yes
3,2024-12-03 12:52:35.541,DS304,18-24 years,Female,Kenya,WhatsApp,6 months - 1 year,Data science,7-14 hours,Upskill,I'd like to upskill and Join the Data Community,Intermediate - I have theoretical knowledge an...,Yes,75.0,No
4,2024-12-03 18:12:27.159,DS305,18-24 years,Male,Kenya,WhatsApp,Less than six months,Data science,7-14 hours,Upskill,I aim to join the mentorship program to enhanc...,Beginner - I have NO learning or work experien...,Yes,59.0,No


In [24]:
dfsa.dtypes

timestamp             datetime64[ns]
id                            object
age_range                     object
gender                        object
country                       object
referral                      object
experience                    object
track                         object
hours_available               object
aim                           object
motivation                    object
skill_level                   object
completed_aptitude            object
aptitude_score               float64
graduated                     object
dtype: object

In [25]:
# column name standardisation function
def standardise_column_titles(frames):
    logger.info("Standardising column names")

    try: 
        frame_cols = [frame.columns for frame in frames]
        prime_frame_cols = frame_cols[0]
        prime_frame_cols_lower = [col.lower() for col in prime_frame_cols]

        new_frames = []

        flag = {}
        
        for i, frame in enumerate(frames):
            if [col.lower() for col in prime_frame_cols] == prime_frame_cols_lower:
                frame.columns = prime_frame_cols_lower
                new_frames.append(frame)
            else: 
                flag[i] = frame

        new_frame = pd.concat(new_frames, axis=0)

        logger.info("Column names successfully standardised.")

        return new_frame
    except Exception as e:
        logger.error(f"Column name standardisation failed due to {str(e)}")


frame = standardise_column_titles(frames)

if type(frame) == pd.DataFrame: 
    print(frame.head(1))


                timestamp id. no    age range gender country  \
0 2024-12-01 23:50:47.202  DS301  18-24 years   Male   Kenya   

  where did you hear about everything data?  \
0                             Word of mouth   

  how many years of learning experience do you have in the field of data?  \
0                               Less than six months                        

  which track are you applying for?  \
0                      Data science   

  how many hours per week can you commit to learning?  \
0                                  less than 6 hours    

  what is your main aim for joining the mentorship program?  \
0                                            Upskill          

  what is your motivation to join the everything data mentorship program?  \
0             to enter into the data analysis career                        

  how best would you describe your skill level in the track you are applying for?  \
0  Beginner - I have NO learning or work experien...        

In [26]:
# column name shortening function
def map_short_column_names(frame, new_cols):
    logger.info("Shortening standardised column names")
    if len(new_cols) == len(frame.columns):
        try:
            map_dict = {col: columns[num] for num, col in enumerate(new_cols)}
            
            frame.columns = [*map_dict.keys()]

            logger.info("Successfully shortened standardised column names.")

            return frame
        except Exception as e:
            logger.error(f"An error has occured attempting to shorten the standardised column names: {str(e)}")
            


In [27]:
new_cols = [
    'timestamp', 'id', 'age_range_years', 'gender', 'country', 'referral', 
    'experience', 'track', 'hours_available', 'aim', 'motivation', 'skill_level',
    'completed_aptitude', 'aptitude_score', 'graduated'
]

frame = map_short_column_names(frame, new_cols)

if type(frame) == pd.DataFrame:
    frame.head()

### Standardising Other Column Values

Standardise by removing `years`. 

In [28]:
dfsa.age_range.value_counts()

age_range
18-24 years    62
25-34 years    48
35-44 years     4
45-54 years     1
Name: count, dtype: int64

Standardise by removing replacing "through a geeks for geeks webinar" with `Geeks for Geeks`.

In [29]:
dfsa.referral.value_counts()

referral
WhatsApp                             53
Twitter                              38
LinkedIn                             13
Word of mouth                         7
Instagram                             2
through a geeks for geeks webinar     1
Friend                                1
Name: count, dtype: int64

Standardise by removing hours and correcting ranges.


Source             | Standardised  |
-------------------|---------------|
less than 6 hours  | less than  7   |
7 - 14 hours       | between 7 to 14  |
mort than 14 hours | more than 14  |


In [30]:
dfsa.hours_available.value_counts()

hours_available
7-14 hours            64
more than 14 hours    35
less than 6 hours     16
Name: count, dtype: int64

Create a skills level table and code levels ordinally: 

1 - Beginner.  
2 - Elementary.  
3 - Intermediate.  
4 - Advanced.  

In [31]:
dfsa.skill_level.value_counts()

skill_level
Elementary - I have theoretical understanding of basic data analysis/ data science concepts         56
Beginner - I have NO learning or work experience in data analysis/ data science                     42
Intermediate - I have theoretical knowledge and experience in data analysis/ data science           16
Advanced - I have deep knowledge and experience in advanced data analysis/ data science concepts     1
Name: count, dtype: int64

Standardise values by replacing "Less than six months" with `Less than 6 months`

In [32]:
dfsa.experience.value_counts()

experience
Less than six months    72
6 months - 1 year       30
1-3 years                9
4-6 years                4
Name: count, dtype: int64

In [33]:
dfsa.graduated.value_counts()

graduated
No     84
Yes    31
Name: count, dtype: int64

In [34]:
dfsa.aim.value_counts()

aim
Upskill                                                                         73
Learn data afresh                                                               23
Build a project portfolio                                                       15
Connect with fellow data professionals                                           2
both upskilling and connecting with fellow data professionals                    1
Learn more about data analysis and also network with fellow data enthusiasts     1
Name: count, dtype: int64

In [35]:
dfsa.completed_aptitude.value_counts()

completed_aptitude
Yes    112
No       3
Name: count, dtype: int64

In [36]:
dfsa_cleaned = dfsa.copy()

In [37]:
dfsa_cleaned['age_range'] =  dfsa_cleaned['age_range'].apply(lambda row: row.replace('years', ''))
dfsa_cleaned.head()

Unnamed: 0,timestamp,id,age_range,gender,country,referral,experience,track,hours_available,aim,motivation,skill_level,completed_aptitude,aptitude_score,graduated
0,2024-12-01 23:50:47.202,DS301,18-24,Male,Kenya,Word of mouth,Less than six months,Data science,less than 6 hours,Upskill,to enter into the data analysis career,Beginner - I have NO learning or work experien...,Yes,58.666667,No
1,2024-12-03 09:35:19.407,DS302,25-34,Male,Kenya,WhatsApp,6 months - 1 year,Data science,more than 14 hours,Upskill,To grow and improve my skills in data science ...,Elementary - I have theoretical understanding ...,Yes,70.0,No
2,2024-12-03 19:16:49.376,DS303,18-24,Female,Kenya,WhatsApp,6 months - 1 year,Data science,more than 14 hours,Upskill,I’m motivated to join Everything Data to enhan...,Intermediate - I have theoretical knowledge an...,Yes,64.333333,Yes
3,2024-12-03 12:52:35.541,DS304,18-24,Female,Kenya,WhatsApp,6 months - 1 year,Data science,7-14 hours,Upskill,I'd like to upskill and Join the Data Community,Intermediate - I have theoretical knowledge an...,Yes,75.0,No
4,2024-12-03 18:12:27.159,DS305,18-24,Male,Kenya,WhatsApp,Less than six months,Data science,7-14 hours,Upskill,I aim to join the mentorship program to enhanc...,Beginner - I have NO learning or work experien...,Yes,59.0,No


In [38]:
dfsa_cleaned['referral'] = dfsa_cleaned['referral'].apply(lambda row: row.replace('through a geeks for geeks webinar', 'Geeks for Geeks'))
dfsa_cleaned[dfsa_cleaned['referral'] == 'Geeks for Geeks']

Unnamed: 0,timestamp,id,age_range,gender,country,referral,experience,track,hours_available,aim,motivation,skill_level,completed_aptitude,aptitude_score,graduated
35,2024-12-01 18:49:03.203,DS336,18-24,Female,Kenya,Geeks for Geeks,Less than six months,Data science,7-14 hours,both upskilling and connecting with fellow dat...,Joining the Everything Data Mentorship program...,Elementary - I have theoretical understanding ...,Yes,74.333333,Yes


In [39]:
dfsa_cleaned['experience'] = dfsa_cleaned['experience'].apply(lambda row: row.replace('six', '6'))
dfsa_cleaned[dfsa_cleaned['experience'] == 'Less than 6 months']

Unnamed: 0,timestamp,id,age_range,gender,country,referral,experience,track,hours_available,aim,motivation,skill_level,completed_aptitude,aptitude_score,graduated
0,2024-12-01 23:50:47.202,DS301,18-24,Male,Kenya,Word of mouth,Less than 6 months,Data science,less than 6 hours,Upskill,to enter into the data analysis career,Beginner - I have NO learning or work experien...,Yes,58.666667,No
4,2024-12-03 18:12:27.159,DS305,18-24,Male,Kenya,WhatsApp,Less than 6 months,Data science,7-14 hours,Upskill,I aim to join the mentorship program to enhanc...,Beginner - I have NO learning or work experien...,Yes,59.000000,No
6,2024-11-28 14:42:44.559,DS307,18-24,Female,Kenya,WhatsApp,Less than 6 months,Data science,7-14 hours,Learn data afresh,To learn data science and develop skills neces...,Beginner - I have NO learning or work experien...,Yes,73.333333,No
7,2024-12-03 12:58:05.404,DS308,18-24,Male,Kenya,WhatsApp,Less than 6 months,Data science,more than 14 hours,Connect with fellow data professionals,I'm eager to gain a solid foundation in data f...,Elementary - I have theoretical understanding ...,Yes,71.000000,No
8,2024-12-01 22:02:22.734,DS309,25-34,Female,Kenya,Twitter,Less than 6 months,Data science,less than 6 hours,Upskill,I am inspired to make more data driven decisio...,Beginner - I have NO learning or work experien...,Yes,74.000000,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
43,2024-11-28 19:53:42.862,DA344,25-34,Female,Kenya,WhatsApp,Less than 6 months,Data analysis,less than 6 hours,Upskill,"To connect with fellow data professionals, sta...",Elementary - I have theoretical understanding ...,Yes,78.000000,No
45,2024-12-02 13:08:20.517,DA346,25-34,Female,Kenya,WhatsApp,Less than 6 months,Data analysis,7-14 hours,Upskill,This is a program which will offer me an oppor...,Elementary - I have theoretical understanding ...,Yes,64.666667,Yes
47,2024-12-02 21:06:50.072,DA348,25-34,Male,Kenya,WhatsApp,Less than 6 months,Data analysis,7-14 hours,Upskill,My motivation to join the Everything Data ment...,Elementary - I have theoretical understanding ...,No,62.333333,No
48,2024-12-03 20:11:36.575,DA349,25-34,Male,Kenya,LinkedIn,Less than 6 months,Data analysis,7-14 hours,Build a project portfolio,Heavy inspiration from the collaboration and i...,Elementary - I have theoretical understanding ...,Yes,70.666667,No


In [40]:
hours_mapping_dict = {
    "7-14 hours":  "7-14 hours",           
    "more than 14 hours": "More than 14 hours",
    "less than 6 hours": "Less than 7 hours", 
}

dfsa_cleaned['hours_available'] = dfsa_cleaned['hours_available'].apply(lambda row: hours_mapping_dict.get(row, row))
dfsa_cleaned.head()

Unnamed: 0,timestamp,id,age_range,gender,country,referral,experience,track,hours_available,aim,motivation,skill_level,completed_aptitude,aptitude_score,graduated
0,2024-12-01 23:50:47.202,DS301,18-24,Male,Kenya,Word of mouth,Less than 6 months,Data science,Less than 7 hours,Upskill,to enter into the data analysis career,Beginner - I have NO learning or work experien...,Yes,58.666667,No
1,2024-12-03 09:35:19.407,DS302,25-34,Male,Kenya,WhatsApp,6 months - 1 year,Data science,More than 14 hours,Upskill,To grow and improve my skills in data science ...,Elementary - I have theoretical understanding ...,Yes,70.0,No
2,2024-12-03 19:16:49.376,DS303,18-24,Female,Kenya,WhatsApp,6 months - 1 year,Data science,More than 14 hours,Upskill,I’m motivated to join Everything Data to enhan...,Intermediate - I have theoretical knowledge an...,Yes,64.333333,Yes
3,2024-12-03 12:52:35.541,DS304,18-24,Female,Kenya,WhatsApp,6 months - 1 year,Data science,7-14 hours,Upskill,I'd like to upskill and Join the Data Community,Intermediate - I have theoretical knowledge an...,Yes,75.0,No
4,2024-12-03 18:12:27.159,DS305,18-24,Male,Kenya,WhatsApp,Less than 6 months,Data science,7-14 hours,Upskill,I aim to join the mentorship program to enhanc...,Beginner - I have NO learning or work experien...,Yes,59.0,No


In [41]:
dfsa_cleaned['motivation'] = dfsa_cleaned['motivation'].str.lower()
dfsa_cleaned.head()

Unnamed: 0,timestamp,id,age_range,gender,country,referral,experience,track,hours_available,aim,motivation,skill_level,completed_aptitude,aptitude_score,graduated
0,2024-12-01 23:50:47.202,DS301,18-24,Male,Kenya,Word of mouth,Less than 6 months,Data science,Less than 7 hours,Upskill,to enter into the data analysis career,Beginner - I have NO learning or work experien...,Yes,58.666667,No
1,2024-12-03 09:35:19.407,DS302,25-34,Male,Kenya,WhatsApp,6 months - 1 year,Data science,More than 14 hours,Upskill,to grow and improve my skills in data science ...,Elementary - I have theoretical understanding ...,Yes,70.0,No
2,2024-12-03 19:16:49.376,DS303,18-24,Female,Kenya,WhatsApp,6 months - 1 year,Data science,More than 14 hours,Upskill,i’m motivated to join everything data to enhan...,Intermediate - I have theoretical knowledge an...,Yes,64.333333,Yes
3,2024-12-03 12:52:35.541,DS304,18-24,Female,Kenya,WhatsApp,6 months - 1 year,Data science,7-14 hours,Upskill,i'd like to upskill and join the data community,Intermediate - I have theoretical knowledge an...,Yes,75.0,No
4,2024-12-03 18:12:27.159,DS305,18-24,Male,Kenya,WhatsApp,Less than 6 months,Data science,7-14 hours,Upskill,i aim to join the mentorship program to enhanc...,Beginner - I have NO learning or work experien...,Yes,59.0,No


In [42]:
skill_level = dfsa['skill_level']
split_skill_level = skill_level.str.split('-')
skill_label = split_skill_level.map(lambda x: x[0].strip())
skill_description = split_skill_level.map(lambda x: x[1].lower().strip())
dfsa_cleaned = dfsa_cleaned.drop(columns=['skill_level'])
dfsa_cleaned['skill_level'] = skill_label.str.lower()
dfsa_cleaned['skill_level_description'] = skill_description
dfsa_cleaned['track'] = dfsa_cleaned['track'].str.lower()
if 'timestamp' in dfsa_cleaned.columns: 
       dfsa_cleaned['registration_date'] =   dfsa_cleaned['timestamp'].dt.date
       dfsa_cleaned['registration_time'] = dfsa_cleaned['timestamp'].dt.time

dfsa_cleaned = dfsa_cleaned[['registration_date', 'registration_time', 'id', 'age_range', 'gender', 'country', 'referral',
       'experience', 'track', 'hours_available', 'aim', 'motivation','skill_level','skill_level_description',
       'completed_aptitude', 'aptitude_score', 'graduated'
       ]]
dfsa_cleaned.head()

Unnamed: 0,registration_date,registration_time,id,age_range,gender,country,referral,experience,track,hours_available,aim,motivation,skill_level,skill_level_description,completed_aptitude,aptitude_score,graduated
0,2024-12-01,23:50:47.202000,DS301,18-24,Male,Kenya,Word of mouth,Less than 6 months,data science,Less than 7 hours,Upskill,to enter into the data analysis career,beginner,i have no learning or work experience in data ...,Yes,58.666667,No
1,2024-12-03,09:35:19.407000,DS302,25-34,Male,Kenya,WhatsApp,6 months - 1 year,data science,More than 14 hours,Upskill,to grow and improve my skills in data science ...,elementary,i have theoretical understanding of basic data...,Yes,70.0,No
2,2024-12-03,19:16:49.376000,DS303,18-24,Female,Kenya,WhatsApp,6 months - 1 year,data science,More than 14 hours,Upskill,i’m motivated to join everything data to enhan...,intermediate,i have theoretical knowledge and experience in...,Yes,64.333333,Yes
3,2024-12-03,12:52:35.541000,DS304,18-24,Female,Kenya,WhatsApp,6 months - 1 year,data science,7-14 hours,Upskill,i'd like to upskill and join the data community,intermediate,i have theoretical knowledge and experience in...,Yes,75.0,No
4,2024-12-03,18:12:27.159000,DS305,18-24,Male,Kenya,WhatsApp,Less than 6 months,data science,7-14 hours,Upskill,i aim to join the mentorship program to enhanc...,beginner,i have no learning or work experience in data ...,Yes,59.0,No


In [43]:
map_aims = {
    'upskill': 'upskill', 
    'data': 'learn', 
    'connect': 'network',
    'build': 'enhance portfolio',
    'both': 'network & upskill',
    'more': 'learn & network'
}

extract_aim = lambda x:  x.split(" ")[1] if x.split(" ")[0] == 'Learn' else x.split(" ")[0]
map_aims_func = lambda x: map_aims.get(extract_aim(x).lower(), x)


dfsa_cleaned['aim'] = dfsa_cleaned['aim'].apply(lambda x: map_aims_func(x))

In [44]:
dfsa_cleaned.head()

Unnamed: 0,registration_date,registration_time,id,age_range,gender,country,referral,experience,track,hours_available,aim,motivation,skill_level,skill_level_description,completed_aptitude,aptitude_score,graduated
0,2024-12-01,23:50:47.202000,DS301,18-24,Male,Kenya,Word of mouth,Less than 6 months,data science,Less than 7 hours,upskill,to enter into the data analysis career,beginner,i have no learning or work experience in data ...,Yes,58.666667,No
1,2024-12-03,09:35:19.407000,DS302,25-34,Male,Kenya,WhatsApp,6 months - 1 year,data science,More than 14 hours,upskill,to grow and improve my skills in data science ...,elementary,i have theoretical understanding of basic data...,Yes,70.0,No
2,2024-12-03,19:16:49.376000,DS303,18-24,Female,Kenya,WhatsApp,6 months - 1 year,data science,More than 14 hours,upskill,i’m motivated to join everything data to enhan...,intermediate,i have theoretical knowledge and experience in...,Yes,64.333333,Yes
3,2024-12-03,12:52:35.541000,DS304,18-24,Female,Kenya,WhatsApp,6 months - 1 year,data science,7-14 hours,upskill,i'd like to upskill and join the data community,intermediate,i have theoretical knowledge and experience in...,Yes,75.0,No
4,2024-12-03,18:12:27.159000,DS305,18-24,Male,Kenya,WhatsApp,Less than 6 months,data science,7-14 hours,upskill,i aim to join the mentorship program to enhanc...,beginner,i have no learning or work experience in data ...,Yes,59.0,No


In [45]:
# column name standardisation function 

def standardise_column_values(frame, hours_map): 
    logger.info("Standardising column values.")
    try: 
        copy = frame.copy()
        copy['referral'] = copy['referral'].apply(lambda row: row.replace(
            'through a geeks for geeks webinar', 'Geeks for Geeks')
        )
        copy['motivation'] = copy['motivation'].str.lower()
        copy['experience'] = copy['experience'].apply(
            lambda row: row.replace('six', '6')
        )
        copy['age_range_years'] =  copy['age_range_years'].apply(
            lambda row: row.replace('years', '')
        )
        copy['hours_available'] = copy['hours_available'].apply(
            lambda row: hours_map.get(row, row)
        )
        copy['track'] = copy['track'].str.lower()
        if 'timestamp' in copy.columns:
            copy['registration_date'] = copy['timestamp'].dt.date
            copy['registration_time'] = copy['timestamp'].dt.time
            copy.drop(columns=['timestamp'], inplace=True)

        logger.info("Successfuly standardised column values.")
        return copy
    
    
    except Exception as e:
        logger.error(f"Failed to standardise column values due to the following error: {str(e)}")

    
    

In [46]:
hours_mapping_dict = {
        "7-14 hours":  "7-14 hours",           
        "more than 14 hours": "More than 14 hours",
        "less than 6 hours": "Less than 7 hours", 
}
frame = standardise_column_values(frame, hours_mapping_dict)

if type(frame) == pd.DataFrame: 
    print(frame.head(1))

      id age_range_years gender country       referral          experience  \
0  DS301          18-24    Male   Kenya  Word of mouth  Less than 6 months   

          track    hours_available      aim  \
0  data science  Less than 7 hours  Upskill   

                               motivation  \
0  to enter into the data analysis career   

                                         skill_level completed_aptitude  \
0  Beginner - I have NO learning or work experien...                Yes   

   aptitude_score graduated registration_date registration_time  
0       58.666667        No        2024-12-01   23:50:47.202000  


In [47]:
# skill level splitting column
def split_skill_level(frame, standard_cols):
    logger.info("Splitting the 'skill level' column into category and description.")
    try: 
        copy = frame.copy()
        skill_level = copy['skill_level']
        split_skill_level = skill_level.str.split('-')
        skill_label = split_skill_level.map(
            lambda x: x[0].strip()
        )
        skill_description = split_skill_level.map(
            lambda x: x[1].lower().strip()
        )
        copy.drop(columns=['skill_level'], inplace=True)
        copy['skill_level'] = skill_label.lower()
        copy['skill_level_description'] = skill_description
        copy = copy[standard_cols]

        logger.info("Successfully split the 'skill level' column.")
        return copy
    
    except Exception as e:
        logger.error(f"An error has occured attempting to split the 'skill level' column: {str(e)}")

In [48]:
standard_cols = [
    'registration_date', 'registration_time', 'id', 'age_range_years', 'gender', 'country', 'referral',
    'experience', 'track', 'hours_available', 'aim', 'motivation',
    'skill_level','skill_level_description','completed_aptitude', 
    'aptitude_score', 'graduated'
]
frame = split_skill_level(frame, standard_cols)
if type(frame) == pd.DataFrame: 
    print(frame.head(1))

An error has occured attempting to split the 'skill level' column: 'Series' object has no attribute 'lower'


In [49]:
# aim categories mapping function
def map_aim_categories(frame, aims_map):
    logger.info("Mapping 'aim' column categories to standardised values")

    try:
        extract_aim = lambda x:  x.split(" ")[1] if x.split(" ")[0] == 'Learn' else x.split(" ")[0]
        map_aims_func = lambda x: aims_map[extract_aim(x).lower()]
        frame['aim'] = frame['aim'].apply(lambda x: map_aims_func(x))

        logger.info("Successfully mapped 'aim' column categories" )
        return frame
    except Exception as e:
        logger.error(f"Failed to map 'aim' column categories: {str(e)}")

In [50]:
aims_map = {
    'upskill': 'upskill', 
    'data': 'learn', 
    'connect': 'network',
    'build': 'enhance portfolio',
    'both': 'network & upskill',
    'more': 'learn & network'
}
frame = map_aim_categories(frame, aims_map)
if type(frame) == pd.DataFrame: 
    print(frame.head(1))

Failed to map 'aim' column categories: 'NoneType' object is not subscriptable


## Preprocessing Datat for Database Loading

In [51]:
def index_to_map(column):
    index = column.to_dict()
    index_map = {v: k for k, v in index.items()}
    return index_map

In [75]:
def prepare_age_range():
    age_range = pd.DataFrame(dfsa_cleaned['age_range'].unique())
    age_range.rename(columns={0: 'age_range'}, inplace=True)
    return age_range
prepare_age_range().head(1)

Unnamed: 0,age_range
0,18-24


In [85]:
def prepare_registration():
    registration = dfsa_cleaned[['registration_date', 'registration_time',  'id']]
    return registration.copy()
prepare_registration().head(1)

Unnamed: 0,registration_date,registration_time,id
0,2024-12-01,23:50:47.202000,DS301


In [77]:
def prepare_country():
    country = pd.DataFrame(dfsa_cleaned['country'].unique())
    country.rename(columns={0: 'country'}, inplace=True)
    return country
prepare_country().head(1)

Unnamed: 0,country
0,Kenya


In [78]:
def prepare_referral():
    referral = pd.DataFrame(dfsa_cleaned['referral'].unique())
    referral.rename(columns={0: 'source'}, inplace=True)
    return referral
prepare_referral().head(1)

Unnamed: 0,source
0,Word of mouth


In [79]:
def prepare_experience():
    experience = pd.DataFrame(dfsa_cleaned['experience'].unique())
    experience.rename(columns={0: 'experience_level'}, inplace=True)
    return experience
prepare_experience().head(1)

Unnamed: 0,experience_level
0,Less than 6 months


In [80]:
def prepare_track():
    track = pd.DataFrame(dfsa_cleaned['track'].unique())
    track.rename(columns={0: 'track'}, inplace=True)
    return track
prepare_track().head(1)

Unnamed: 0,track
0,data science


In [81]:
def prepare_aims():
    aim = pd.DataFrame(dfsa_cleaned['aim'].unique())
    aim.rename(columns={0: 'aim'}, inplace=True)
    return aim
prepare_aims().head(1)

Unnamed: 0,aim
0,upskill


In [None]:
def prepare_skills():
    skills = dfsa_cleaned[['skill_level', 'skill_level_description']].drop_duplicates(subset=['skill_level', 'skill_level_description'])
    skills.index = pd.Index(range(len(skills.index)))
    return skills.copy()
prepare_skills().head(1)

Unnamed: 0,skill_level,skill_level_description
0,beginner,i have no learning or work experience in data ...


In [84]:
def prepare_student_outcomes():
    outcomes = dfsa_cleaned[['id','completed_aptitude', 'aptitude_score', 'graduated']]
    outcomes = outcomes.copy()
    outcomes.rename(columns={'id':'student_id'}, inplace=True)
    return outcomes
prepare_student_outcomes().head(1)

Unnamed: 0,student_id,completed_aptitude,aptitude_score,graduated
0,DS301,Yes,58.666667,No


In [60]:
def prepare_key_mappings(*args):
    mappings = {} 
    
    if "aim" in args:
        aim = prepare_aims()
        aim_keys = index_to_map(aim['aim'])
        mappings["aim"] = aim_keys
    if "age_range" in args:
        age_range = prepare_age_range()
        age_keys = index_to_map(age_range['age_range'])
        mappings["age_range"] = age_keys
    if "country" in args:
        country = prepare_country()
        country_keys = index_to_map(country['country'])
        mappings["country"] = country_keys
    if "experience" in args:
        experience = prepare_experience()
        experience_keys = index_to_map(experience['experience_level'])
        mappings["experience"] = experience_keys
    if "track" in args:
        track = prepare_track()
        track_keys = index_to_map(track['track'])
        mappings["track"] = track_keys
    if "referral" in args:
        referral = prepare_referral()
        referral_keys = index_to_map(referral['source'])
        mappings["referral"] = referral_keys
    if "skill_level" in args: 
        skill = prepare_skills()
        skill_keys = index_to_map(skill['skill_level'])
        mappings["skill_level"] = skill_keys

    return mappings

In [61]:
aims_map = prepare_key_mappings('aim')
aims_map

{'aim': {'upskill': 0,
  'learn': 1,
  'network': 2,
  'enhance portfolio': 3,
  'network & upskill': 4,
  'learn & network': 5}}

In [73]:
motivation = dfsa_cleaned[['id', 'aim', 'motivation']]
motivation = motivation.copy()
mappings = prepare_key_mappings('aim')
motivation['aim_id'] = motivation['aim'].apply(lambda x: mappings['aim'].get(x))
motivation.drop(columns=['aim'], inplace=True)
motivation.rename(columns={'id':'student_id'}, inplace=True)
motivation

Unnamed: 0,student_id,motivation,aim_id
0,DS301,to enter into the data analysis career,0
1,DS302,to grow and improve my skills in data science ...,0
2,DS303,i’m motivated to join everything data to enhan...,0
3,DS304,i'd like to upskill and join the data community,0
4,DS305,i aim to join the mentorship program to enhanc...,0
...,...,...,...
47,DA348,my motivation to join the everything data ment...,0
48,DA349,heavy inspiration from the collaboration and i...,3
49,DA350,i am interested in building my data skills so ...,1
50,DA351,i’m eager to join the the mentorship program t...,3


In [63]:
student_cols = ['id', 'gender', 'age_range', 'country', 'experience', 'track', 'referral', 'skill_level']
students = dfsa_cleaned[student_cols]
students = students.copy()
students.head(1)

Unnamed: 0,id,gender,age_range,country,experience,track,referral,skill_level
0,DS301,Male,18-24,Kenya,Less than 6 months,data science,Word of mouth,beginner


In [64]:
mappings = prepare_key_mappings(*student_cols)
mappings

{'age_range': {'18-24 ': 0, '25-34 ': 1, '45-54 ': 2, '35-44 ': 3},
 'country': {'Kenya': 0, 'South Africa': 1},
 'experience': {'Less than 6 months': 0,
  '6 months - 1 year': 1,
  '1-3 years': 2,
  '4-6 years': 3},
 'track': {'data science': 0, 'data analysis': 1},
 'referral': {'Word of mouth': 0,
  'WhatsApp': 1,
  'Twitter': 2,
  'LinkedIn': 3,
  'Geeks for Geeks': 4,
  'Instagram': 5,
  'Friend': 6},
 'skill_level': {'beginner': 0,
  'elementary': 1,
  'intermediate': 2,
  'advanced': 3}}

In [65]:
foreign_key_columns = ['age_range', 'country', 'experience', 'track', 'referral', 'skill_level']
students['age_range_id'] = students['age_range'].apply(lambda x: mappings['age_range'].get(x))
students['country_id'] = students['country'].apply(lambda x: mappings['country'].get(x))
students['experience_id'] = students['experience'].apply(lambda x: mappings['experience'].get(x))
students['track_id'] = students['track'].apply(lambda x: mappings['track'].get(x))
students['referral_id'] = students['referral'].apply(lambda x: mappings['referral'].get(x))
students['skill_level_id'] = students['skill_level'].apply(lambda x: mappings['skill_level'].get(x))
students.drop(columns=foreign_key_columns, inplace=True)

In [66]:
students.head()

Unnamed: 0,id,gender,age_range_id,country_id,experience_id,track_id,referral_id,skill_level_id
0,DS301,Male,0,0,0,0,0,0
1,DS302,Male,1,0,1,0,1,1
2,DS303,Female,0,0,1,0,1,2
3,DS304,Female,0,0,1,0,1,2
4,DS305,Male,0,0,0,0,1,0


In [67]:
class EverythinDataExtract:
    def __init__(self, data_path) -> None:
        self.data = self.extract_data(data_path)
        
    def extract_data(self, path):
        logger.info("Uploading data")
        try:
            file = pd.ExcelFile(path)
            sheets = file.sheet_names
            names = ", ".join(str(name) for name in xls.sheet_names)
            logger.info(f"Successfully uploaded {len(sheets)} named {names} from file.")
            frames = [xls.parse(sheet) for sheet in sheets]
            return frames
        except Exception as e: 
            logger.error("Cannot upload sheets from file")

In [68]:
class EverythinDataExtract():
    def __init__(self, data_path) -> None:
        self.data_path = data_path
        self.merged_frame = standardise_column_titles(frames)

    def merge_frames(self):
        logger.info("Standardising column names")

        try: 
            frames = self._extract()
            frame_cols = [frame.columns for frame in frames]
            prime_frame_cols = frame_cols[0]
            prime_frame_cols_lower = [col.lower() for col in prime_frame_cols]

            new_frames = []

            flag = {}
            
            for i, frame in enumerate(frames):
                if [col.lower() for col in prime_frame_cols] == prime_frame_cols_lower:
                    frame.columns = prime_frame_cols_lower
                    new_frames.append(frame)
                else: 
                    flag[i] = frame

            new_frame = pd.concat(new_frames, axis=0)

            logger.info("Column names successfully standardised.")

            return new_frame
        except Exception as e:
            logger.error(f"Column name standardisation failed due to {str(e)}")


    def _extract(self):
        logger.info("Uploading data")
        try:
            file = pd.ExcelFile(self.data_path)
            sheets = file.sheet_names
            names = ", ".join(str(name) for name in xls.sheet_names)
            logger.info(f"Successfully uploaded {len(sheets)} named {names} from file.")
            frames = [xls.parse(sheet) for sheet in sheets]
            return frames
        except Exception as e: 
            logger.error("Cannot upload sheets from file")

In [69]:
class EverythinDataTransform:
    def __init__(self, frame, hours_map, standard_cols, aims_map) -> None:
        self.frame = frame
        self.hours_map = hours_map
        self.standard_cols = standard_cols
        self.aims_map = aims_map
        self.clead_data = self.clean_data()

    def clean_data(self):
        std_cols_df = self._standardise_columns()
        split_skills_df = self._split_skill_level(std_cols_df)
        mapped_aims = self._map_aim_categories(split_skills_df)

        return mapped_aims

    def _standardise_columns(self): 
        logger.info("Standardising column values.")
        try: 
            copy = self.frame.copy()
            copy['referral'] = copy['referral'].apply(lambda row: row.replace(
                'through a geeks for geeks webinar', 'Geeks for Geeks')
            )
            copy['motivation'] = copy['motivation'].str.lower()
            copy['experience'] = copy['experience'].apply(
                lambda row: row.replace('six', '6')
            )
            copy['age_range_years'] =  copy['age_range_years'].apply(
                lambda row: row.replace('years', '')
            )
            copy['hours_available'] = copy['hours_available'].apply(
                lambda row: self.hours_map.get(row, row)
            )
            copy['track'] = copy['track'].str.lower()
            if 'timestamp' in copy.columns:
                copy['registration_date'] = copy['timestamp'].dt.date
                copy['registration_time'] = copy['timestamp'].dt.time
                copy.drop(columns=['timestamp'], inplace=True)

            logger.info("Successfuly standardised column values.")
            return copy
    
        except Exception as e:
            logger.error(f"Failed to standardise column values due to the following error: {str(e)}")

    def _split_skill_level(self, frame):
        logger.info("Splitting the 'skill level' column into category and description.")
        try: 
            copy = frame.copy()
            skill_level = copy['skill_level']
            split_skill_level = skill_level.str.split('-')
            skill_label = split_skill_level.map(
                lambda x: x[0].strip()
            )
            skill_description = split_skill_level.map(
                lambda x: x[1].lower().strip()
            )
            copy.drop(columns=['skill_level'], inplace=True)
            copy['skill_level'] = skill_label.lower()
            copy['skill_level_description'] = skill_description
            copy = copy[self.standard_cols]

            logger.info("Successfully split the 'skill level' column.")
            return copy
        
        except Exception as e:
            logger.error(f"An error has occured attempting to split the 'skill level' column: {str(e)}")

    def _map_aim_categories(self, frame):
        logger.info("Mapping 'aim' column categories to standardised values")

        try:
            copy = frame.copy()
            extract_aim = lambda x:  x.split(" ")[1] if x.split(" ")[0] == 'Learn' else x.split(" ")[0]
            map_aims_func = lambda x: self.aims_map[extract_aim(x).lower()]
            copy['aim'] = copy['aim'].apply(lambda x: map_aims_func(x))

            logger.info("Successfully mapped 'aim' column categories" )
            return copy
        except Exception as e:
            logger.error(f"Failed to map 'aim' column categories: {str(e)}")
            

In [None]:
class EverythinDataLoad: 
    def __init__(self, frame) -> None:
        self.frame = frame
        
    def _index_to_map(self, col):
        index = col.to_dict()
        index_map = {v: k for k, v in index.items()}
        return index_map

    def _prepare_age_range(self):
        copy = self.frame.copy()
        age_range = pd.DataFrame(copy['age_range'].unique())
        age_range.rename(columns={0: 'age_range'}, inplace=True)
        return age_range
    
    def _prepare_country(self):
        copy = self.frame.copy()
        country = pd.DataFrame(copy['country'].unique())
        country.rename(columns={0: 'country'}, inplace=True)
        return country
    
    def _prepare_experience(self):
        copy = self.frame.copy()
        experience = pd.DataFrame(copy['experience'].unique())
        experience.rename(columns={0: 'experience_level'}, inplace=True)
        return experience

    def _prepare_track(self):
        copy = self.frame.copy()
        tracks = pd.DataFrame(copy['track'].unique())
        tracks.rename(columns={0: 'track'}, inplace=True)
        return tracks
    
    def _prepare_referral(self):
        copy = self.frame.copy()
        referral = pd.DataFrame(copy['referral'].unique())
        referral.rename(columns={0: 'source'}, inplace=True)
        return referral
    
    def _prepare_skills(self):
        copy = self.frame.copy()
        skills = copy[['skill_level', 'skill_level_description']].drop_duplicates(
            subset=['skill_level', 'skill_level_description']
        )
        skills.index = pd.Index(range(len(skills.index)))
        return skills.copy()

    def _prepare_aims(self):
        copy = self.frame.copy()
        aims = pd.DataFrame(copy['aim'].unique())
        return aims
    
    def _prepare_registrations(self):
        copy = self.frame.copy()
        registrations = copy[['id', 'registration_date', 'registration_time']]
        registrations = registrations.copy()
        registrations = registrations.rename(columns={'id', 'student_id'})
        return registrations
    
    def _prepare_student_outcomes(self):
        copy = self.frame.copy()
        outcomes = copy[['id','completed_aptitude', 'aptitude_score', 'graduated']]
        outcomes = outcomes.copy()
        outcomes.rename(columns={'id':'student_id'}, inplace=True)
        return outcomes
    
    def _prepare_motivation(self):
        copy = self.frame.copy()
        mappings = self._prepare_key_mappings('aim')['aim']
        motivation = copy[['id', 'aim', 'motivation']] 
        motivation = motivation.copy()
        motivation['aim_id'] = motivation['aim'].apply(lambda x: mappings['aim'].get(x))
        motivation.drop(columns=['aim'], inplace=True)
        motivation.rename(columns={'id':'student_id'}, inplace=True)
        return motivation
    
    def _prepare_students(self):
        copy = self.frame.copy()
        foreign_key_columns = ['age_range', 'country', 'experience', 'track', 'referral', 'skill_level']
        mappings = self._prepare_key_mappings(foreign_key_columns)
        students = copy[['id', 'gender', 'age_range', 'country', 'experience', 'track', 'referral', 'skill_level']]
        students = students.copy()
        students['age_range_id'] = students['age_range'].apply(lambda x: mappings['age_range'].get(x))
        students['country_id'] = students['country'].apply(lambda x: mappings['country'].get(x))
        students['experience_id'] = students['experience'].apply(lambda x: mappings['experience'].get(x))
        students['track_id'] = students['track'].apply(lambda x: mappings['track'].get(x))
        students['referral_id'] = students['referral'].apply(lambda x: mappings['referral'].get(x))
        students['skill_level_id'] = students['skill_level'].apply(lambda x: mappings['skill_level'].get(x))
        students.drop(columns=foreign_key_columns, inplace=True)
        return students
        
    
    def _prepare_key_mappings(self, *args):
        mappings = {} 
        
        if "aim" in args:
            aim = self._prepare_aims()
            aim_keys = self._index_to_map(aim['aim'])
            mappings["aims"] = aim_keys
        if "age_range" in args:
            age_range = self._prepare_age_range()
            age_keys = self._index_to_map(age_range['age_range'])
            mappings["age_range"] = age_keys
        if "country" in args:
            country = self._prepare_country()
            country_keys = self._index_to_map(country['country'])
            mappings["country"] = country_keys
        if "experience" in args:
            experience = self._prepare_experience()
            experience_keys = self._index_to_map(experience['experience_level'])
            mappings["experience"] = experience_keys
        if "track" in args:
            track = self._prepare_track()
            track_keys = self._index_to_map(track['track'])
            mappings["track"] = track_keys
        if "referral" in args:
            referral = self._prepare_referral()
            referral_keys = self._index_to_map(referral['source'])
            mappings["referral"] = referral_keys
        if "skill" in args: 
            skill = self._prepare_skills()
            skill_keys = self._index_to_map(skill['skill_level'])
            mappings["skill"] = skill_keys
        return mappings