In [74]:
# src/extract.py

import pandas as pd
from utils.config import load_config
import logging

In [75]:
# src/feature_engineering.py
import pandas as pd
import re

def convert_salary_to_numeric(df):
    """
    Converts the 'salary' column to a numeric format (removes currency symbols, commas).
    """
    df['salary_numeric'] = df['salary'].replace({'₹': '', '/yr': '', ',': ''}, regex=True).astype(float)
    return df

def add_salary_band(df):
    """
    Adds a salary band column for easier grouping.
    """
    bins = [0, 500000, 1000000, 1500000, 2000000, float('inf')]
    labels = ['<5L', '5-10L', '10-15L', '15-20L', '20L+']
    df['salary_band'] = pd.cut(df['salary_numeric'], bins=bins, labels=labels)
    return df

def compute_avg_salary_by_group(df, group_col):
    """
    Computes the average salary grouped by a specified column.
    """
    avg_salary = df.groupby(group_col)['salary_numeric'].transform('mean')
    df[f'avg_salary_by_{group_col}'] = avg_salary
    return df

def add_salaries_per_reported(df):
    """
    Calculates the ratio of total salary to salaries reported.
    """
    df['salaries_per_reported'] = df['salary_numeric'] / df['salaries_reported']
    return df

def extract_job_seniority(df):
    """
    Extracts job seniority level from job title.
    """
    def seniority_level(title):
        if 'junior' in title.lower():
            return 'Junior'
        elif 'senior' in title.lower() or 'lead' in title.lower() or 'manager' in title.lower():
            return 'Senior'
        else:
            return 'Mid-level'
    
    df['job_seniority'] = df['job_title'].apply(seniority_level)
    return df

In [76]:
# src/extract.py

from src.feature_engineering import (
    convert_salary_to_numeric,
    add_salary_band,
    compute_avg_salary_by_group,
    add_salaries_per_reported,
    extract_job_seniority
)
from utils.config import load_config
import pandas as pd
import logging

# Initialize configurations and logging
config = load_config()
logging.basicConfig(level=config['logging']['level'])

def load_data(file_path=None):
    """
    Loads data from a CSV file into a DataFrame.
    Applies initial data cleaning steps like handling missing values and setting data types.
    """
    if file_path is None:
        file_path = config['data']['source_file']

    try:
        logging.info(f"Loading data from {file_path}")
        df = pd.read_csv(file_path)

        # Apply data type conversions and handle missing values
        df = handle_missing_values(df)
        df = set_column_dtypes(df)

        # Apply feature engineering
        df = convert_salary_to_numeric(df)
        df = add_salary_band(df)
        df = compute_avg_salary_by_group(df, group_col='company_name')  # Example: group by company_name
        df = add_salaries_per_reported(df)
        df = extract_job_seniority(df)

        logging.info("Data loaded and feature engineering applied successfully")
        return df

    except Exception as e:
        logging.error(f"Error loading data: {e}")
        return pd.DataFrame()  # Return empty DataFrame if there is an error


from utils.file_utils import to_snakecase, to_titlecase
from utils.config import load_config

config = load_config()

def standardize_column_names(df, case="snake"):
    """
    Standardizes column names based on the specified case.
    """
    if case == "snake":
        df.columns = [to_snakecase(col) for col in df.columns]
    elif case == "title":
        df.columns = [to_titlecase(col) for col in df.columns]
    return df

# In load_data
df = standardize_column_names(df, case=config['data']['column_name_case'])

def handle_missing_values(df: pd.DataFrame):
    """
    Fills or drops missing values based on configuration settings.
    """
    # Example: Fill numeric columns with mean and drop rows with missing categorical values
    for col in df.columns:
        if df[col].dtype in [float, int]:
            df[col].fillna(df[col].mean(), inplace=True)
        else:
            df[col].fillna("Unknown", inplace=True)

    logging.info("Handled missing values")
    return df

def set_column_dtypes(df: pd.DataFrame):
    """
    Sets column data types based on configuration or inferred data types.
    """
    date_format = config['data'].get('date_format', None)

    # Set columns to datetime format if date_format is provided
    for col in df.columns:
        if "date" in col.lower() and date_format:
            try:
                df[col] = pd.to_datetime(df[col], format=date_format)
            except Exception as e:
                logging.warning(f"Failed to parse dates in column {col}: {e}")

    logging.info("Data types set according to configuration")
    return df

def validate_data(df: pd.DataFrame):
    """
    Validates the data to ensure it meets basic quality criteria.
    Example: Check if critical columns exist and have no missing values.
    """
    required_columns = config['data'].get('required_columns', [])
    missing_columns = [col for col in required_columns if col not in df.columns]

    if missing_columns:
        logging.error(f"Missing required columns: {missing_columns}")
        return False

    # Additional custom validations could be added here
    logging.info("Data validation passed")
    return True



from utils.file_utils import to_snakecase, to_titlecase
from utils.config import load_config

config = load_config()

def standardize_column_names(df, case="snake"):
    """
    Standardizes column names based on the specified case.
    """
    if case == "snake":
        df.columns = [to_snakecase(col) for col in df.columns]
    elif case == "title":
        df.columns = [to_titlecase(col) for col in df.columns]
    return df

# In load_data
df = standardize_column_names(df, case=config['data']['column_name_case'])

def handle_missing_values(df: pd.DataFrame):
    """
    Fills or drops missing values based on configuration settings.
    """
    for col in df.columns:
        if df[col].dtype in [float, int]:
            df[col] = df[col].fillna(df[col].mean())  # Avoid inplace
        else:
            df[col] = df[col].fillna("Unknown")  # Avoid inplace

    logging.info("Handled missing values")
    return df


def set_column_dtypes(df: pd.DataFrame):
    """
    Sets column data types based on configuration or inferred data types.
    """
    date_format = config['data'].get('date_format', None)

    # Set columns to datetime format if date_format is provided
    for col in df.columns:
        if "date" in col.lower() and date_format:
            try:
                df[col] = pd.to_datetime(df[col], format=date_format)
            except Exception as e:
                logging.warning(f"Failed to parse dates in column {col}: {e}")

    logging.info("Data types set according to configuration")
    return df

def validate_data(df: pd.DataFrame):
    """
    Validates the data to ensure it meets basic quality criteria.
    Example: Check if critical columns exist and have no missing values.
    """
    required_columns = config['data'].get('required_columns', [])
    missing_columns = [col for col in required_columns if col not in df.columns]

    if missing_columns:
        logging.error(f"Missing required columns: {missing_columns}")
        return False

    # Additional custom validations could be added here
    logging.info("Data validation passed")
    return True

Calling Load Data in app.py

In [77]:
from src.extract import load_data, validate_data
import logging

# Initialize logging
logging.basicConfig(level=logging.INFO)

# Load the data
df = load_data()

# Validate the data
if validate_data(df):
    logging.info("Data validation successful. Proceeding with further analysis.")
    # Proceed with further analysis if data is valid
else:
    logging.error("Data validation failed. Please check the data source or configuration.")

INFO:root:Loading data from data/salary_dataset.csv
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna("Unknown", inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].mean(), inplace=True)
INFO:root:Handled missing values
INFO:root:Data types set according to configuration
INFO:root:

In [78]:
df.head(10)

Unnamed: 0,Company Name,Job Title,Salaries Reported,Location,Salary
0,Mu Sigma,Data Scientist,105.0,Bangalore,"₹6,48,573/yr"
1,IBM,Data Scientist,95.0,Bangalore,"₹11,91,950/yr"
2,Tata Consultancy Services,Data Scientist,66.0,Bangalore,"₹8,36,874/yr"
3,Impact Analytics,Data Scientist,40.0,Bangalore,"₹6,69,578/yr"
4,Accenture,Data Scientist,32.0,Bangalore,"₹9,44,110/yr"
5,Infosys,Data Scientist,30.0,Bangalore,"₹9,08,764/yr"
6,Capgemini,Data Scientist,28.0,Bangalore,"₹9,26,124/yr"
7,Cognizant Technology Solutions,Data Scientist,26.0,Bangalore,"₹7,36,708/yr"
8,Anheuser-Busch InBev,Data Scientist,25.0,Bangalore,"₹16,46,721/yr"
9,Fractal,Data Scientist,22.0,Bangalore,"₹13,92,960/yr"


In [79]:
# utils/file_utils.py

import re

def to_snakecase(text: str) -> str:
    """
    Converts a string to snake_case.
    Example: 'Column Name' -> 'column_name'
    """
    # Replace spaces with underscores, remove special characters, and make lowercase
    text = re.sub(r'[\W]+', '_', text).strip().lower()
    return text

def to_titlecase(text: str) -> str:
    """
    Converts a string to Title Case.
    Example: 'column_name' -> 'Column Name'
    """
    # Split by underscores, capitalize each word, and join with spaces
    text = text.replace('_', ' ')
    return text.title()

In [80]:
from utils.file_utils import to_snakecase, to_titlecase
from utils.config import load_config

config = load_config()

def standardize_column_names(df, case="snake"):
    """
    Standardizes column names based on the specified case.
    """
    if case == "snake":
        df.columns = [to_snakecase(col) for col in df.columns]
    elif case == "title":
        df.columns = [to_titlecase(col) for col in df.columns]
    return df

# In load_data
df = standardize_column_names(df, case=config['data']['column_name_case'])

In [81]:
df.head(10)

Unnamed: 0,company_name,job_title,salaries_reported,location,salary
0,Mu Sigma,Data Scientist,105.0,Bangalore,"₹6,48,573/yr"
1,IBM,Data Scientist,95.0,Bangalore,"₹11,91,950/yr"
2,Tata Consultancy Services,Data Scientist,66.0,Bangalore,"₹8,36,874/yr"
3,Impact Analytics,Data Scientist,40.0,Bangalore,"₹6,69,578/yr"
4,Accenture,Data Scientist,32.0,Bangalore,"₹9,44,110/yr"
5,Infosys,Data Scientist,30.0,Bangalore,"₹9,08,764/yr"
6,Capgemini,Data Scientist,28.0,Bangalore,"₹9,26,124/yr"
7,Cognizant Technology Solutions,Data Scientist,26.0,Bangalore,"₹7,36,708/yr"
8,Anheuser-Busch InBev,Data Scientist,25.0,Bangalore,"₹16,46,721/yr"
9,Fractal,Data Scientist,22.0,Bangalore,"₹13,92,960/yr"


In [82]:
df.describe(include='all')

Unnamed: 0,company_name,job_title,salaries_reported,location,salary
count,4344,4344,4344.0,4344,4344
unique,2530,26,,5,3101
top,Tata Consultancy Services,Data Scientist,,Bangalore,"₹10,00,000/yr"
freq,41,1844,,1584,24
mean,,,2.77591,,
std,,,5.145342,,
min,,,1.0,,
25%,,,1.0,,
50%,,,1.0,,
75%,,,3.0,,


In [83]:
df = handle_missing_values(df)
df = set_column_dtypes(df)

# Apply feature engineering
df = convert_salary_to_numeric(df)
df = add_salary_band(df)
df = compute_avg_salary_by_group(df, group_col='company_name')  # Example: group by company_name
df = add_salaries_per_reported(df)
df = extract_job_seniority(df)

INFO:root:Handled missing values
INFO:root:Data types set according to configuration


ValueError: could not convert string to float: '41271/mo'