In [266]:
import pandas as pd
from sqlalchemy import create_engine
from pathlib import Path
import logging
from django_countries import countries
# Notebook-wide logger used by the extract and transform classes defined below.
# NOTE(review): no handler or level is configured in the visible cells, so
# INFO-level messages are presumably not displayed - confirm whether logging
# is configured elsewhere.
logger =  logging.getLogger('notebook')

In [267]:
# The project root is taken to be the parent of the current working directory;
# the workbook path is resolved relative to it.
project_root = Path.cwd().parent
dataset = 'data/Cohort 4 Capstone Project - Dataset.xlsx'
data_path = project_root / dataset
data_path

PosixPath('/Users/talentmatch/Documents/Projects/code_projects/katlego-engineering-c4-project/data/Cohort 4 Capstone Project - Dataset.xlsx')

In [268]:
class EverythinDataExtract:
    """Read every sheet of an Excel workbook and stack the sheets into one frame.

    Column names are lower-cased so that sheets whose headers differ only in
    letter case can be concatenated.
    """

    def __init__(self) -> None:
        # Default workbook location, taken from the notebook-level `data_path`.
        self.data_path = data_path

    def merge_frames(self, path=None):
        """Extract all sheets from ``path`` and concatenate the compatible ones.

        The first sheet's lower-cased column names are the reference. A sheet
        is kept only when its own lower-cased column names equal that
        reference; any other sheet is skipped and reported in a warning.

        Args:
            path: Workbook to read. Falls back to ``self.data_path`` when None.

        Returns:
            The row-wise concatenation of the kept sheets (original row labels
            are preserved), or None when extraction or merging fails.
        """
        logger.info("Standardising column names")
        if path is None:
            path = self.data_path
        try:
            frames = self._extract(path)
            if not isinstance(frames, list) or not frames:
                logger.error("Column name standardisation failed: no sheets were extracted.")
                return None

            prime_frame_cols_lower = [str(col).lower() for col in frames[0].columns]

            new_frames = []
            flagged = []  # positions of sheets whose headers do not match the first sheet
            for i, frame in enumerate(frames):
                # Compare THIS sheet's headers with the reference.
                frame_cols_lower = [str(col).lower() for col in frame.columns]
                if frame_cols_lower == prime_frame_cols_lower:
                    frame.columns = prime_frame_cols_lower
                    new_frames.append(frame)
                else:
                    flagged.append(i)

            if flagged:
                logger.warning(
                    f"Skipped sheets at positions {flagged}: column names do not match the first sheet."
                )

            new_frame = pd.concat(new_frames, axis=0)

            logger.info("Column names successfully standardised.")

            return new_frame
        except Exception as e:
            logger.error(f"Column name standardisation failed due to {str(e)}")
            return None

    def _extract(self, path=None):
        """Parse every sheet of the workbook at ``path`` into a DataFrame.

        Args:
            path: Workbook to read. Falls back to ``self.data_path`` when None.

        Returns:
            A list with one DataFrame per sheet, in workbook order, or None
            when the workbook cannot be read (the reason is logged).
        """
        logger.info("Uploading data")
        if path is None:
            path = self.data_path
        try:
            # The context manager closes the workbook handle once parsing is done.
            with pd.ExcelFile(path) as xls_file:
                sheets = xls_file.sheet_names
                names = ", ".join(str(name) for name in sheets)
                logger.info(f"Successfully uploaded {len(sheets)} named {names} from file.")
                return [xls_file.parse(sheet) for sheet in sheets]
        except Exception as e:
            logger.error(f"Cannot upload sheets from file: {e}")
            return None

In [269]:
class EverythinDataTransform:
    """Clean the merged registration frame in four sequential steps.

    Steps: rename columns to short names, standardise column values, split the
    skill level into label and description, and map the aim to a standard
    category. Each step logs its own failure and returns None; ``clean_data``
    stops at the first failed step.
    """

    def __init__(self, hours_map, standard_cols, aims_map) -> None:
        """Store the lookup tables used by the cleaning steps.

        Args:
            hours_map: dict mapping raw ``hours_available`` values to
                standardised ones; unmapped values are kept unchanged.
            standard_cols: short column names, applied positionally to the
                incoming frame. The first entry is treated as the timestamp
                column by ``_split_skill_level``.
            aims_map: dict mapping the lower-cased aim keyword to the
                standardised aim category.
        """
        self.frame = None
        self.hours_map = hours_map
        self.standard_cols = standard_cols
        self.aims_map = aims_map

    def clean_data(self, frame):
        """Run all cleaning steps on ``frame``.

        Returns:
            The cleaned DataFrame, or None when ``frame`` is None or any step
            fails. Later steps are not attempted after a failure.
        """
        steps = (
            self._map_short_column_names,
            self._standardise_columns,
            self._split_skill_level,
            self._map_aim_categories,
        )
        current = frame
        for step in steps:
            if current is None:
                # Stop here instead of letting every later step fail on None.
                logger.error(f"Cleaning stopped before {step.__name__}: no frame to process.")
                return None
            current = step(current)
        return current

    def _map_short_column_names(self, frame):
        """Rename the columns of a copy of ``frame`` to ``self.standard_cols``.

        Names are applied by position. Returns None (and logs why) when the
        number of columns differs from the number of standard names.
        """
        logger.info("Shortening standardised column names")
        if len(frame.columns) != len(self.standard_cols):
            logger.error(
                f"Cannot shorten column names: frame has {len(frame.columns)} columns "
                f"but {len(self.standard_cols)} standard names were given."
            )
            return None
        try:
            copy = frame.copy()
            copy.columns = list(self.standard_cols)
            logger.info("Successfully shortened standardised column names.")
            return copy
        except Exception as e:
            logger.error(f"An error has occured attempting to shorten the standardised column names: {str(e)}")
            return None

    def _standardise_columns(self, frame):
        """Normalise the values of several columns on a copy of ``frame``.

        Uses the vectorised ``.str`` accessor so missing values pass through
        instead of raising. When a ``timestamp`` column is present it is
        replaced by ``registration_date`` and ``registration_time``.

        Returns:
            The standardised copy, or None when a step raises.
        """
        logger.info("Standardising column values.")
        try:
            copy = frame.copy()
            copy['referral'] = copy['referral'].str.replace(
                'through a geeks for geeks webinar', 'Geeks for Geeks', regex=False
            )
            copy['motivation'] = copy['motivation'].str.lower()
            copy['experience'] = copy['experience'].str.replace('six', '6', regex=False)
            # Strip after removing the unit so '18-24 years' becomes '18-24', not '18-24 '.
            copy['age_range'] = (
                copy['age_range'].str.replace('years', '', regex=False).str.strip()
            )
            copy['hours_available'] = copy['hours_available'].map(
                lambda value: self.hours_map.get(value, value)
            )
            copy['track'] = copy['track'].str.lower()
            if 'timestamp' in copy.columns:
                copy['registration_date'] = copy['timestamp'].dt.date
                copy['registration_time'] = copy['timestamp'].dt.time
                copy = copy.drop(columns=['timestamp'])

            logger.info("Successfuly standardised column values.")
            return copy

        except Exception as e:
            logger.error(f"Failed to standardise column values due to the following error: {str(e)}")
            return None

    def _split_skill_level(self, frame):
        """Split ``skill_level`` at its first hyphen into label and description.

        Everything after the first hyphen is kept as the description, so a
        description that itself contains a hyphen is not truncated. Both parts
        are stripped and lower-cased. The result is reordered so that
        ``skill_description`` directly follows ``skill_level`` and the two
        registration columns come first.

        Returns:
            The reordered copy, or None when a step raises (for example when
            the registration columns are absent).
        """
        logger.info("Splitting the 'skill level' column into category and description.")
        try:
            copy = frame.copy()
            # partition always yields three columns: before, separator, after.
            parts = copy['skill_level'].str.partition('-')
            copy['skill_level'] = parts[0].str.strip().str.lower()
            copy['skill_description'] = parts[2].str.lower().str.strip()
            # standard_cols[0] is the timestamp, replaced earlier by the two registration columns.
            new_standard_cols = ["registration_date", "registration_time"] + list(self.standard_cols[1:])
            new_standard_cols.insert(new_standard_cols.index("skill_level") + 1, "skill_description")
            copy = copy[new_standard_cols]

            logger.info("Successfully split the 'skill level' column.")
            return copy

        except Exception as e:
            logger.error(f"An error has occured attempting to split the 'skill level' column: {str(e)}")
            return None

    def _map_aim_categories(self, frame):
        """Replace each ``aim`` value with its category from ``self.aims_map``.

        The lookup key is the first word of the aim, or the second word when
        the first is 'Learn', lower-cased. An aim whose key is missing from
        ``self.aims_map`` makes the whole step fail.

        Returns:
            The mapped copy, or None when a step raises.
        """
        logger.info("Mapping 'aim' column categories to standardised values")

        def to_standard_aim(raw_aim):
            words = raw_aim.split(" ")
            keyword = words[1] if words[0] == 'Learn' else words[0]
            return self.aims_map[keyword.lower()]

        try:
            copy = frame.copy()
            copy['aim'] = copy['aim'].apply(to_standard_aim)

            logger.info("Successfully mapped 'aim' column categories" )
            return copy
        except Exception as e:
            logger.error(f"Failed to map 'aim' column categories: {str(e)}")
            return None
            

In [270]:
def snake_to_pascal(snake_str):
    """Convert a snake_case string to PascalCase.

    Each underscore-separated piece goes through ``str.capitalize`` (first
    character upper-cased, the rest lower-cased) and the pieces are joined
    without a separator.
    """
    pieces = snake_str.split('_')
    return ''.join(map(str.capitalize, pieces))

In [None]:
class EverythinDataLoad: 
    """Split the cleaned frame into category lookup tables and fact tables.

    Category tables get 1-based ids in order of first appearance. The student
    and motivation tables reference those same ids through ``<column>_id``
    columns.
    """

    def __init__(self) -> None:
        self.frame = pd.DataFrame()
        # Names of the categorical columns; replaced by load_data.
        self.cat_frames = []

    def _index_to_map(self, col):
        """Invert a Series into a ``{value: index label}`` dict."""
        return {value: label for label, value in col.to_dict().items()}

    def _prepare_cat_frame(self, cat):
        """Build the lookup table for category column ``cat``.

        For ``"skill_level"`` the table holds every column whose name contains
        "skill", de-duplicated on all of them; otherwise it holds the unique
        values of ``cat``. The index is 1..n and serves as the category id.
        """
        source = self.frame
        if cat == "skill_level":
            cols = [col for col in source.columns if "skill" in col]
            frame = source[cols].drop_duplicates(subset=cols).copy()
        else:
            frame = pd.DataFrame({cat: source[cat].unique()})

        frame[cat] = frame[cat].astype('category')
        frame.index = pd.Index(range(1, len(frame.index) + 1))
        return frame

    def _category_ids(self, cat):
        """Return, per row of ``self.frame``, the id ``_prepare_cat_frame`` gives its category.

        The lookup key is the tuple of all columns in the category table, so
        multi-column tables (skill level + description) resolve to the right
        row. Rows whose key is not found get None.
        """
        lookup = self._prepare_cat_frame(cat)
        key_cols = list(lookup.columns)
        id_by_key = dict(zip(lookup.itertuples(index=False, name=None), lookup.index))
        row_keys = self.frame[key_cols].itertuples(index=False, name=None)
        return [id_by_key.get(key) for key in row_keys]

    def _prepare_students(self):
        """Build the student table: id, gender and one ``<column>_id`` per category column.

        The ids are the index values of the matching category tables, so they
        can be used as foreign keys into those tables.
        """
        category_cols = list(self.cat_frames)
        students = self.frame[['id', 'gender'] + category_cols].copy()
        for col in category_cols:
            # Assigned positionally; the frame's row labels may repeat across sheets.
            students[f'{col}_id'] = self._category_ids(col)
        return students.drop(columns=category_cols)

    def _prepare_motivation(self):
        """Build the motivation table: student_id, motivation text and aim_id."""
        motivation = self.frame[['id', 'motivation']].copy()
        motivation['aim_id'] = self._category_ids('aim')
        return motivation.rename(columns={'id': 'student_id'})

    def _prepare_registrations(self):
        """Build the registration table: student_id, registration date and time."""
        registrations = self.frame[['id', 'registration_date', 'registration_time']].copy()
        return registrations.rename(columns={'id': 'student_id'})

    def _prepare_student_outcomes(self):
        """Build the outcomes table: student_id, aptitude completion/score and graduation."""
        outcomes = self.frame[['id', 'completed_aptitude', 'aptitude_score', 'graduated']].copy()
        return outcomes.rename(columns={'id': 'student_id'})

    def _load_cat_frame(self, frame_dict:dict):
        """Placeholder for persisting one category table.

        ``frame_dict`` is a single-entry ``{name: DataFrame}`` dict. The entry
        is unpacked but nothing is written anywhere yet.
        """
        # TODO: persist `frame` under `key`; no storage target is defined in this class.
        key = list(frame_dict.keys())[0]
        frame = frame_dict[key]

    def load_data(self, frame, cat_frames):
        """Prepare every table from ``frame`` and print the first row of each fact table.

        Args:
            frame: the cleaned DataFrame.
            cat_frames: names of the columns to treat as categories.
        """
        self.frame = frame
        self.cat_frames = cat_frames
        category_tables = [{name: self._prepare_cat_frame(name)} for name in self.cat_frames]

        for frame_dict in category_tables:
            self._load_cat_frame(frame_dict)

        # Students, motivation, registrations, outcomes - in that order.
        preparers = (
            self._prepare_students,
            self._prepare_motivation,
            self._prepare_registrations,
            self._prepare_student_outcomes,
        )
        for prepare in preparers:
            print(prepare().head(1))
            print()

In [272]:
# Raw 'hours_available' value -> standardised label. The transformer looks the
# raw value up as-is and keeps it unchanged when it is not a key here.
# NOTE(review): 'less than 6 hours' maps to 'Less than 7 hours' - confirm the
# 6 vs 7 difference is intended.
hours_mapping_dict = {
    "7-14 hours": "7-14 hours",
    "more than 14 hours": "More than 14 hours",
    "less than 6 hours": "Less than 7 hours",
}

# Lower-cased aim keyword -> standardised aim category.
aims_mapping_dict = {
    'upskill': 'upskill',
    'data': 'learn',
    'connect': 'network',
    'build': 'enhance portfolio',
    'both': 'network & upskill',
    'more': 'learn & network',
}

# Short column names, applied by position to the merged frame.
standard_columns_list = [
    'timestamp',
    'id',
    'age_range',
    'gender',
    'country',
    'referral',
    'experience',
    'track',
    'hours_available',
    'aim',
    'motivation',
    'skill_level',
    'completed_aptitude',
    'aptitude_score',
    'graduated',
]

# Keyword arguments for the EverythinDataTransform constructor.
transform_kwargs = dict(
    hours_map=hours_mapping_dict,
    aims_map=aims_mapping_dict,
    standard_cols=standard_columns_list,
)

In [273]:
# One object per ETL stage: extract, transform, load.
# NOTE(review): `extactor` looks like a typo for `extractor`; the next cell
# refers to it by this name, so rename both together.
extactor = EverythinDataExtract()
transformer = EverythinDataTransform(**transform_kwargs)
loader = EverythinDataLoad()

In [274]:
# Merge all workbook sheets into one frame, then clean it.
# Either call returns None when its stage fails (the failure is only logged).
raw_data = extactor.merge_frames(data_path)
clean_data = transformer.clean_data(raw_data)

In [275]:
# Preview the first cleaned row.
clean_data.head(1)

Unnamed: 0,registration_date,registration_time,id,age_range,gender,country,referral,experience,track,hours_available,aim,motivation,skill_level,skill_description,completed_aptitude,aptitude_score,graduated
0,2024-12-01,23:50:47.202000,DS301,18-24,Male,Kenya,Word of mouth,Less than 6 months,data science,Less than 7 hours,upskill,to enter into the data analysis career,beginner,i have no learning or work experience in data ...,Yes,58.666667,No


In [276]:
# Columns that get their own category table; the student table refers to each
# of them through a `<column>_id` column.
category_columns = [
    'age_range', 'country', 'referral', 'experience', 'track', 'aim', 'skill_level'
]
loader.load_data(clean_data, category_columns)

{'id': 1} {'age_range': '18-24 '}
{'id': 2} {'age_range': '25-34 '}
{'id': 3} {'age_range': '45-54 '}
{'id': 4} {'age_range': '35-44 '}
{'id': 1} {'country': 'Kenya'}
{'id': 2} {'country': 'South Africa'}
{'id': 1} {'referral': 'Word of mouth'}
{'id': 2} {'referral': 'WhatsApp'}
{'id': 3} {'referral': 'Twitter'}
{'id': 4} {'referral': 'LinkedIn'}
{'id': 5} {'referral': 'Geeks for Geeks'}
{'id': 6} {'referral': 'Instagram'}
{'id': 7} {'referral': 'Friend'}
{'id': 1} {'experience': 'Less than 6 months'}
{'id': 2} {'experience': '6 months - 1 year'}
{'id': 3} {'experience': '1-3 years'}
{'id': 4} {'experience': '4-6 years'}
{'id': 1} {'track': 'data science'}
{'id': 2} {'track': 'data analysis'}
{'id': 1} {'aim': 'upskill'}
{'id': 2} {'aim': 'learn'}
{'id': 3} {'aim': 'network'}
{'id': 4} {'aim': 'enhance portfolio'}
{'id': 5} {'aim': 'network & upskill'}
{'id': 6} {'aim': 'learn & network'}
{'id': 1} {'skill_level': 'beginner', 'skill_description': 'i have no learning or work experience 