# Data Engineering Project

## Loading Data & Dependencies

In [1]:
import pandas as pd
from sqlalchemy import create_engine
from pathlib import Path

In [2]:
project_root = Path.cwd().parent
dataset = 'data/Cohort 4 Capstone Project - Dataset.xlsx'

In [3]:
data_path = Path.joinpath(project_root, dataset)
data_path

PosixPath('/Users/talentmatch/Documents/Projects/code_projects/katlego-engineering-c4-project/data/Cohort 4 Capstone Project - Dataset.xlsx')

In [4]:
xls = pd.ExcelFile(data_path)

In [5]:
xls.sheet_names

['Cohort 3 DS', 'Cohort 3 DA']

In [6]:
dfs = xls.parse(0)
dfa = xls.parse(1)

## Exploratory Data Analysis

In [7]:
dfs.head()

Unnamed: 0,Timestamp,Id. No,Age range,Gender,Country,Where did you hear about Everything Data?,How many years of learning experience do you have in the field of data?,Which track are you applying for?,How many hours per week can you commit to learning?,What is your main aim for joining the mentorship program?,What is your motivation to join the Everything Data mentorship program?,How best would you describe your skill level in the track you are applying for?,Have you completed the everything data aptitude test for your track?,Total score,Graduated
0,2024-12-01 23:50:47.202,DS301,18-24 years,Male,Kenya,Word of mouth,Less than six months,Data science,less than 6 hours,Upskill,to enter into the data analysis career,Beginner - I have NO learning or work experien...,Yes,58.666667,No
1,2024-12-03 09:35:19.407,DS302,25-34 years,Male,Kenya,WhatsApp,6 months - 1 year,Data science,more than 14 hours,Upskill,To grow and improve my skills in data science ...,Elementary - I have theoretical understanding ...,Yes,70.0,No
2,2024-12-03 19:16:49.376,DS303,18-24 years,Female,Kenya,WhatsApp,6 months - 1 year,Data science,more than 14 hours,Upskill,I’m motivated to join Everything Data to enhan...,Intermediate - I have theoretical knowledge an...,Yes,64.333333,Yes
3,2024-12-03 12:52:35.541,DS304,18-24 years,Female,Kenya,WhatsApp,6 months - 1 year,Data science,7-14 hours,Upskill,I'd like to upskill and Join the Data Community,Intermediate - I have theoretical knowledge an...,Yes,75.0,No
4,2024-12-03 18:12:27.159,DS305,18-24 years,Male,Kenya,WhatsApp,Less than six months,Data science,7-14 hours,Upskill,I aim to join the mentorship program to enhanc...,Beginner - I have NO learning or work experien...,Yes,59.0,No


In [8]:
dfa.head()

Unnamed: 0,Timestamp,ID No.,Age range,Gender,Country,Where did you hear about Everything Data?,How many years of learning experience do you have in the field of data?,Which track are you applying for?,How many hours per week can you commit to learning?,What is your main aim for joining the mentorship program?,What is your motivation to join the Everything Data mentorship program?,How best would you describe your skill level in the track you are applying for?,Have you completed the everything data aptitude test for your track?,Total score,Graduated
0,2024-12-03 16:25:30.208,DA301,18-24 years,Female,Kenya,WhatsApp,Less than six months,Data analysis,more than 14 hours,Learn data afresh,The hands-on program offers valuable real-worl...,Elementary - I have theoretical understanding ...,Yes,67.333333,Yes
1,2024-12-02 18:17:14.522,DA302,18-24 years,Male,Kenya,Twitter,Less than six months,Data analysis,7-14 hours,Build a project portfolio,The mentorship will be a catalytic opportunity...,Beginner - I have NO learning or work experien...,Yes,62.0,No
2,2024-11-29 08:05:14.371,DA303,18-24 years,Male,Kenya,WhatsApp,6 months - 1 year,Data analysis,7-14 hours,Upskill,Desire to learn,Beginner - I have NO learning or work experien...,Yes,74.0,No
3,2024-11-28 21:37:53.455,DA304,25-34 years,Male,Kenya,Twitter,1-3 years,Data analysis,7-14 hours,Build a project portfolio,My background knowledge in Computer Science dr...,Elementary - I have theoretical understanding ...,Yes,65.333333,No
4,2024-12-03 17:15:14.443,DA305,18-24 years,Male,Kenya,LinkedIn,6 months - 1 year,Data analysis,more than 14 hours,Build a project portfolio,To horn my skills in data analysis\nLand a dat...,Intermediate - I have theoretical knowledge an...,Yes,69.333333,No


### Checking Column Name Consistency

In [9]:
dfa.columns == dfs.columns

array([ True, False,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True])

In [10]:
dfs.columns = dfa.columns

In [11]:
dfsa = pd.concat([dfs, dfa], axis=0)

In [12]:
print(dfs.shape)
print(dfa.shape)
print(dfsa.shape)

(63, 15)
(52, 15)
(115, 15)


In [13]:
dfs.columns == dfa.columns

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True])

### Renaming Columns For Shorter Names

In [14]:
columns = dfs.columns

In [15]:
print(columns)

Index(['Timestamp', 'ID No.', 'Age range', 'Gender', 'Country',
       'Where did you hear about Everything Data?',
       'How many years of learning experience do you have in the field of data?',
       'Which track are you applying for?',
       'How many hours per week can you commit to learning?',
       'What is your main aim for joining the mentorship program?',
       'What is your motivation to join the Everything Data mentorship program?',
       'How best would you describe your skill level in the track you are applying for?',
       'Have you completed the everything data aptitude test for your track?',
       'Total score', 'Graduated'],
      dtype='object')


In [16]:
new_cols = [
    'timestamp', 'id', 'age_range_years', 'gender', 'country', 'referral', 
    'experience', 'track', 'hours_available', 'aim', 'motivation', 'skill_level',
    'completed_aptitude', 'aptitude_score', 'graduated'
]
map_dict = {col: columns[num] for num, col in enumerate(new_cols)}

In [17]:
map_dict

{'timestamp': 'Timestamp',
 'id': 'ID No.',
 'age_range_years': 'Age range',
 'gender': 'Gender',
 'country': 'Country',
 'referral': 'Where did you hear about Everything Data?',
 'experience': 'How many years of learning experience do you have in the field of data?',
 'track': 'Which track are you applying for?',
 'hours_available': 'How many hours per week can you commit to learning?',
 'aim': 'What is your main aim for joining the mentorship program?',
 'motivation': 'What is your motivation to join the Everything Data mentorship program?',
 'skill_level': 'How best would you describe your skill level in the track you are applying for?',
 'completed_aptitude': 'Have you completed the everything data aptitude test for your track?',
 'aptitude_score': 'Total score',
 'graduated': 'Graduated'}

In [18]:
map_dict.keys()

dict_keys(['timestamp', 'id', 'age_range_years', 'gender', 'country', 'referral', 'experience', 'track', 'hours_available', 'aim', 'motivation', 'skill_level', 'completed_aptitude', 'aptitude_score', 'graduated'])

In [41]:
dfsa.columns = [*map_dict.keys()]

In [42]:
dfsa.head()

Unnamed: 0,timestamp,id,age_range_years,gender,country,referral,experience,track,hours_available,aim,motivation,skill_level,completed_aptitude,aptitude_score,graduated
0,2024-12-01 23:50:47.202,DS301,18-24 years,Male,Kenya,Word of mouth,Less than six months,Data science,less than 6 hours,Upskill,to enter into the data analysis career,Beginner - I have NO learning or work experien...,Yes,58.666667,No
1,2024-12-03 09:35:19.407,DS302,25-34 years,Male,Kenya,WhatsApp,6 months - 1 year,Data science,more than 14 hours,Upskill,To grow and improve my skills in data science ...,Elementary - I have theoretical understanding ...,Yes,70.0,No
2,2024-12-03 19:16:49.376,DS303,18-24 years,Female,Kenya,WhatsApp,6 months - 1 year,Data science,more than 14 hours,Upskill,I’m motivated to join Everything Data to enhan...,Intermediate - I have theoretical knowledge an...,Yes,64.333333,Yes
3,2024-12-03 12:52:35.541,DS304,18-24 years,Female,Kenya,WhatsApp,6 months - 1 year,Data science,7-14 hours,Upskill,I'd like to upskill and Join the Data Community,Intermediate - I have theoretical knowledge an...,Yes,75.0,No
4,2024-12-03 18:12:27.159,DS305,18-24 years,Male,Kenya,WhatsApp,Less than six months,Data science,7-14 hours,Upskill,I aim to join the mentorship program to enhanc...,Beginner - I have NO learning or work experien...,Yes,59.0,No


In [21]:
dfsa.dtypes

timestamp             datetime64[ns]
id                            object
age_range_years               object
gender                        object
country                       object
referral                      object
experience                    object
track                         object
hours_available               object
aim                           object
motivation                    object
skill_level                   object
completed_aptitude            object
aptitude_score               float64
graduated                     object
dtype: object

### Standardising Other Column Values

Standardise by removing `years`. 

In [22]:
dfsa.age_range_years.value_counts()

age_range_years
18-24 years    62
25-34 years    48
35-44 years     4
45-54 years     1
Name: count, dtype: int64

Standardise by removing replacing "through a geeks for geeks webinar" with `Geeks for Geeks`.

In [23]:
dfsa.referral.value_counts()

referral
WhatsApp                             53
Twitter                              38
LinkedIn                             13
Word of mouth                         7
Instagram                             2
through a geeks for geeks webinar     1
Friend                                1
Name: count, dtype: int64

Standardise by removing hours and correcting ranges.


Source             | Standardised  |
-------------------|---------------|
less than 6 hours  | less than  7   |
7 - 14 hours       | between 7 to 14  |
mort than 14 hours | more than 14  |


In [24]:
dfsa.hours_available.value_counts()

hours_available
7-14 hours            64
more than 14 hours    35
less than 6 hours     16
Name: count, dtype: int64

Create a skills level table and code levels ordinally: 

1 - Beginner.  
2 - Elementary.  
3 - Intermediate.  
4 - Advanced.  

In [25]:
dfsa.skill_level.value_counts()

skill_level
Elementary - I have theoretical understanding of basic data analysis/ data science concepts         56
Beginner - I have NO learning or work experience in data analysis/ data science                     42
Intermediate - I have theoretical knowledge and experience in data analysis/ data science           16
Advanced - I have deep knowledge and experience in advanced data analysis/ data science concepts     1
Name: count, dtype: int64

Standardise values by replacing "Less than six months" with `Less than 6 months`

In [26]:
dfsa.experience.value_counts()

experience
Less than six months    72
6 months - 1 year       30
1-3 years                9
4-6 years                4
Name: count, dtype: int64

In [27]:
dfsa.graduated.value_counts()

graduated
No     84
Yes    31
Name: count, dtype: int64

In [28]:
dfsa.aim.value_counts()

aim
Upskill                                                                         73
Learn data afresh                                                               23
Build a project portfolio                                                       15
Connect with fellow data professionals                                           2
both upskilling and connecting with fellow data professionals                    1
Learn more about data analysis and also network with fellow data enthusiasts     1
Name: count, dtype: int64

In [29]:
dfsa.completed_aptitude.value_counts()

completed_aptitude
Yes    112
No       3
Name: count, dtype: int64

In [30]:
dfsa_cleaned = dfsa.copy()

In [31]:
dfsa_cleaned['age_range_years'] =  dfsa_cleaned['age_range_years'].apply(lambda row: row.replace('years', ''))
dfsa_cleaned.head()

Unnamed: 0,timestamp,id,age_range_years,gender,country,referral,experience,track,hours_available,aim,motivation,skill_level,completed_aptitude,aptitude_score,graduated
0,2024-12-01 23:50:47.202,DS301,18-24,Male,Kenya,Word of mouth,Less than six months,Data science,less than 6 hours,Upskill,to enter into the data analysis career,Beginner - I have NO learning or work experien...,Yes,58.666667,No
1,2024-12-03 09:35:19.407,DS302,25-34,Male,Kenya,WhatsApp,6 months - 1 year,Data science,more than 14 hours,Upskill,To grow and improve my skills in data science ...,Elementary - I have theoretical understanding ...,Yes,70.0,No
2,2024-12-03 19:16:49.376,DS303,18-24,Female,Kenya,WhatsApp,6 months - 1 year,Data science,more than 14 hours,Upskill,I’m motivated to join Everything Data to enhan...,Intermediate - I have theoretical knowledge an...,Yes,64.333333,Yes
3,2024-12-03 12:52:35.541,DS304,18-24,Female,Kenya,WhatsApp,6 months - 1 year,Data science,7-14 hours,Upskill,I'd like to upskill and Join the Data Community,Intermediate - I have theoretical knowledge an...,Yes,75.0,No
4,2024-12-03 18:12:27.159,DS305,18-24,Male,Kenya,WhatsApp,Less than six months,Data science,7-14 hours,Upskill,I aim to join the mentorship program to enhanc...,Beginner - I have NO learning or work experien...,Yes,59.0,No


In [32]:
dfsa_cleaned['referral'] = dfsa_cleaned['referral'].apply(lambda row: row.replace('through a geeks for geeks webinar', 'Geeks for Geeks'))
dfsa_cleaned[dfsa_cleaned['referral'] == 'Geeks for Geeks']

Unnamed: 0,timestamp,id,age_range_years,gender,country,referral,experience,track,hours_available,aim,motivation,skill_level,completed_aptitude,aptitude_score,graduated
35,2024-12-01 18:49:03.203,DS336,18-24,Female,Kenya,Geeks for Geeks,Less than six months,Data science,7-14 hours,both upskilling and connecting with fellow dat...,Joining the Everything Data Mentorship program...,Elementary - I have theoretical understanding ...,Yes,74.333333,Yes


In [33]:
dfsa_cleaned['experience'] = dfsa_cleaned['experience'].apply(lambda row: row.replace('six', '6'))
dfsa_cleaned[dfsa_cleaned['experience'] == 'Less than 6 months']

Unnamed: 0,timestamp,id,age_range_years,gender,country,referral,experience,track,hours_available,aim,motivation,skill_level,completed_aptitude,aptitude_score,graduated
0,2024-12-01 23:50:47.202,DS301,18-24,Male,Kenya,Word of mouth,Less than 6 months,Data science,less than 6 hours,Upskill,to enter into the data analysis career,Beginner - I have NO learning or work experien...,Yes,58.666667,No
4,2024-12-03 18:12:27.159,DS305,18-24,Male,Kenya,WhatsApp,Less than 6 months,Data science,7-14 hours,Upskill,I aim to join the mentorship program to enhanc...,Beginner - I have NO learning or work experien...,Yes,59.000000,No
6,2024-11-28 14:42:44.559,DS307,18-24,Female,Kenya,WhatsApp,Less than 6 months,Data science,7-14 hours,Learn data afresh,To learn data science and develop skills neces...,Beginner - I have NO learning or work experien...,Yes,73.333333,No
7,2024-12-03 12:58:05.404,DS308,18-24,Male,Kenya,WhatsApp,Less than 6 months,Data science,more than 14 hours,Connect with fellow data professionals,I'm eager to gain a solid foundation in data f...,Elementary - I have theoretical understanding ...,Yes,71.000000,No
8,2024-12-01 22:02:22.734,DS309,25-34,Female,Kenya,Twitter,Less than 6 months,Data science,less than 6 hours,Upskill,I am inspired to make more data driven decisio...,Beginner - I have NO learning or work experien...,Yes,74.000000,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
43,2024-11-28 19:53:42.862,DA344,25-34,Female,Kenya,WhatsApp,Less than 6 months,Data analysis,less than 6 hours,Upskill,"To connect with fellow data professionals, sta...",Elementary - I have theoretical understanding ...,Yes,78.000000,No
45,2024-12-02 13:08:20.517,DA346,25-34,Female,Kenya,WhatsApp,Less than 6 months,Data analysis,7-14 hours,Upskill,This is a program which will offer me an oppor...,Elementary - I have theoretical understanding ...,Yes,64.666667,Yes
47,2024-12-02 21:06:50.072,DA348,25-34,Male,Kenya,WhatsApp,Less than 6 months,Data analysis,7-14 hours,Upskill,My motivation to join the Everything Data ment...,Elementary - I have theoretical understanding ...,No,62.333333,No
48,2024-12-03 20:11:36.575,DA349,25-34,Male,Kenya,LinkedIn,Less than 6 months,Data analysis,7-14 hours,Build a project portfolio,Heavy inspiration from the collaboration and i...,Elementary - I have theoretical understanding ...,Yes,70.666667,No


In [34]:
hours_mapping_dict = {
    "7-14 hours":  "7-14 hours",           
    "more than 14 hours": "More than 14 hours",
    "less than 6 hours": "Less than 7 hours", 
}

dfsa_cleaned['hours_available'] = dfsa_cleaned['hours_available'].apply(lambda row: hours_mapping_dict[row])
dfsa_cleaned.head()

Unnamed: 0,timestamp,id,age_range_years,gender,country,referral,experience,track,hours_available,aim,motivation,skill_level,completed_aptitude,aptitude_score,graduated
0,2024-12-01 23:50:47.202,DS301,18-24,Male,Kenya,Word of mouth,Less than 6 months,Data science,Less than 7 hours,Upskill,to enter into the data analysis career,Beginner - I have NO learning or work experien...,Yes,58.666667,No
1,2024-12-03 09:35:19.407,DS302,25-34,Male,Kenya,WhatsApp,6 months - 1 year,Data science,More than 14 hours,Upskill,To grow and improve my skills in data science ...,Elementary - I have theoretical understanding ...,Yes,70.0,No
2,2024-12-03 19:16:49.376,DS303,18-24,Female,Kenya,WhatsApp,6 months - 1 year,Data science,More than 14 hours,Upskill,I’m motivated to join Everything Data to enhan...,Intermediate - I have theoretical knowledge an...,Yes,64.333333,Yes
3,2024-12-03 12:52:35.541,DS304,18-24,Female,Kenya,WhatsApp,6 months - 1 year,Data science,7-14 hours,Upskill,I'd like to upskill and Join the Data Community,Intermediate - I have theoretical knowledge an...,Yes,75.0,No
4,2024-12-03 18:12:27.159,DS305,18-24,Male,Kenya,WhatsApp,Less than 6 months,Data science,7-14 hours,Upskill,I aim to join the mentorship program to enhanc...,Beginner - I have NO learning or work experien...,Yes,59.0,No


In [35]:
dfsa_cleaned['motivation'] = dfsa_cleaned['motivation'].str.lower()
dfsa_cleaned.head()

Unnamed: 0,timestamp,id,age_range_years,gender,country,referral,experience,track,hours_available,aim,motivation,skill_level,completed_aptitude,aptitude_score,graduated
0,2024-12-01 23:50:47.202,DS301,18-24,Male,Kenya,Word of mouth,Less than 6 months,Data science,Less than 7 hours,Upskill,to enter into the data analysis career,Beginner - I have NO learning or work experien...,Yes,58.666667,No
1,2024-12-03 09:35:19.407,DS302,25-34,Male,Kenya,WhatsApp,6 months - 1 year,Data science,More than 14 hours,Upskill,to grow and improve my skills in data science ...,Elementary - I have theoretical understanding ...,Yes,70.0,No
2,2024-12-03 19:16:49.376,DS303,18-24,Female,Kenya,WhatsApp,6 months - 1 year,Data science,More than 14 hours,Upskill,i’m motivated to join everything data to enhan...,Intermediate - I have theoretical knowledge an...,Yes,64.333333,Yes
3,2024-12-03 12:52:35.541,DS304,18-24,Female,Kenya,WhatsApp,6 months - 1 year,Data science,7-14 hours,Upskill,i'd like to upskill and join the data community,Intermediate - I have theoretical knowledge an...,Yes,75.0,No
4,2024-12-03 18:12:27.159,DS305,18-24,Male,Kenya,WhatsApp,Less than 6 months,Data science,7-14 hours,Upskill,i aim to join the mentorship program to enhanc...,Beginner - I have NO learning or work experien...,Yes,59.0,No


In [43]:
skill_level = dfsa['skill_level']
split_skill_level = skill_level.str.split('-')
skill_label = split_skill_level.map(lambda x: x[0].strip())
skill_description = split_skill_level.map(lambda x: x[1].lower().strip())
dfsa_cleaned = dfsa_cleaned.drop(columns=['skill_level'])
dfsa_cleaned['skill_level'] = skill_label
dfsa_cleaned['skill_level_description'] = skill_description

dfsa_cleaned = dfsa_cleaned[['timestamp', 'id', 'age_range_years', 'gender', 'country', 'referral',
       'experience', 'track', 'hours_available', 'aim', 'motivation','skill_level','skill_level_description',
       'completed_aptitude', 'aptitude_score', 'graduated'
       ]]
dfsa_cleaned.head()

Unnamed: 0,timestamp,id,age_range_years,gender,country,referral,experience,track,hours_available,aim,motivation,skill_level,skill_level_description,completed_aptitude,aptitude_score,graduated
0,2024-12-01 23:50:47.202,DS301,18-24,Male,Kenya,Word of mouth,Less than 6 months,Data science,Less than 7 hours,Upskill,to enter into the data analysis career,Beginner,i have no learning or work experience in data ...,Yes,58.666667,No
1,2024-12-03 09:35:19.407,DS302,25-34,Male,Kenya,WhatsApp,6 months - 1 year,Data science,More than 14 hours,Upskill,to grow and improve my skills in data science ...,Elementary,i have theoretical understanding of basic data...,Yes,70.0,No
2,2024-12-03 19:16:49.376,DS303,18-24,Female,Kenya,WhatsApp,6 months - 1 year,Data science,More than 14 hours,Upskill,i’m motivated to join everything data to enhan...,Intermediate,i have theoretical knowledge and experience in...,Yes,64.333333,Yes
3,2024-12-03 12:52:35.541,DS304,18-24,Female,Kenya,WhatsApp,6 months - 1 year,Data science,7-14 hours,Upskill,i'd like to upskill and join the data community,Intermediate,i have theoretical knowledge and experience in...,Yes,75.0,No
4,2024-12-03 18:12:27.159,DS305,18-24,Male,Kenya,WhatsApp,Less than 6 months,Data science,7-14 hours,Upskill,i aim to join the mentorship program to enhanc...,Beginner,i have no learning or work experience in data ...,Yes,59.0,No
