In [1]:
import pandas as pd
import numpy as np

import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.io as pio
import plotly.figure_factory as ff
pio.templates.default = "plotly_dark"

import sys
from pathlib import Path

sys.path.append('..')
from paths import DATA_DIR


In [2]:
df = pd.read_csv(DATA_DIR / "data_model.csv")
df


Unnamed: 0,salary_usd,category,title_group,english_level,it_experience_years
0,2000.0,Analyst,Mid-level,Pre-Intermediate,2.0
1,4000.0,Quality Assurance,Senior-level,Intermediate,5.0
2,10000.0,DevOps,Tech leadership,Upper-Intermediate,9.0
3,3500.0,Management,Senior-level,Advanced,5.0
4,3000.0,Management,Tech leadership,Pre-Intermediate,18.0
...,...,...,...,...,...
7588,3227.0,HR,Tech leadership,Advanced,7.0
7589,7500.0,Data Scientist,Senior-level,Upper-Intermediate,4.0
7590,1450.0,Customer Support,Tech leadership,Pre-Intermediate,1.5
7591,9000.0,Software Engineer,Senior-level,Intermediate,20.0


In [3]:

fig = px.scatter(
    df,
    x='it_experience_years',
    y='salary_usd',
    color='title_group',
    hover_data=['title_group', 'category','it_experience_years', 'salary_usd'],
    title='Experience vs title',
    width=1500, height=500
)

fig.show()


In [4]:
df_test = df.copy()


In [5]:
# Діапазони стажу для українського IT ринку (базуючись на DOU 2025)

ukraine_experience_ranges = {
    'Entry-level': {
        'typical_range': (0, 2),      # типовий діапазон
        'acceptable': (0, 3),         # прийнятний діапазон
        'outlier_threshold': 4,       # більше = викид
        'critical_outlier': 6,        # критичний викид
        'note': 'Початківці, стажери, джуни'
    },

    'Mid-level': {
        'typical_range': (2, 5),
        'acceptable': (1, 7),         # може бути пізній перехід або рання кар'єра
        'outlier_threshold': 8,       # мідл з 8+ років - підозріло
        'critical_outlier': 10,
        'note': 'Середні розробники'
    },

    'Senior-level': {
        'typical_range': (4, 12),
        'acceptable': (3, 15),
        'outlier_threshold': 2,       # менше 3 років - червоний прапорець
        'critical_outlier': 1,        # 1-2 роки для сеньйора = критичний викид
        'max_outlier': 20,            # більше 20 років теж підозріло
        'note': 'Досвідчені розробники'
    },

    'Tech leadership': {  # Tech Lead, Staff, Principal
        'typical_range': (5, 15),
        'acceptable': (4, 20),
        'outlier_threshold': 3,       # менше 4 років - дуже підозріло
        'critical_outlier': 2,
        'max_outlier': 25,
        'note': 'Технічні лідери, Staff/Principal Engineers'
    },

    'Management': {  # Engineering Manager, Team Lead
        'typical_range': (5, 18),
        'acceptable': (4, 22),
        'outlier_threshold': 3,
        'critical_outlier': 2,
        'max_outlier': 30,
        'note': 'Менеджери команд, Team Leads'
    },

    'IC-high': {  # Individual Contributor високого рівня
        'typical_range': (7, 20),
        'acceptable': (6, 25),
        'outlier_threshold': 5,
        'critical_outlier': 4,
        'max_outlier': 30,
        'note': 'Високорівневі Individual Contributors'
    },

    'Executive': {  # CTO, VP Engineering тощо
        'typical_range': (10, 25),
        'acceptable': (8, 30),
        'outlier_threshold': 7,
        'critical_outlier': 5,
        'max_outlier': 35,
        'note': 'Топ-менеджмент, C-level'
    }
}


In [6]:
for index, row in df.iloc[56:58].iterrows():
    title = row['title_group']
    experience = row['it_experience_years']

    print(title, experience)

    if title in ukraine_experience_ranges:
        rules = ukraine_experience_ranges[title]
        print(rules)

        criticar_outlier = rules['critical_outlier']
        outlier_threshold = rules['outlier_threshold']
        acceptable_threshold = rules['acceptable']
        typical_range = rules['typical_range']

        #  першочергово перевіряємо на наявність max_outlier (присутній не в усіх ключах)
        if 'max_outlier' in rules and experience > rules['max_outlier']:
            print(f'st.5 - bad')
            score = 1.0
        # перевірка на наявність у типовому діапазоні
        elif typical_range[0] <= experience <= typical_range[1]:
            print(f'St.1: {experience} is within typical range. (ok)')
            score = 0.0
        # ще більш-менш прийнятний діапазон
        elif acceptable_threshold[0] <= experience <= acceptable_threshold[1]:
            print(f'St.2: {experience} is within acceptable threshold. (ok)')
            score = 0.25
        elif experience < criticar_outlier:
            print(f'St.4: {experience} < {criticar_outlier}')
            score = 0.75
        elif experience < outlier_threshold:
            print(f'St.3: {experience} < {outlier_threshold}. (bad)')
            score = 0.50
        else:
            print(f'St.6: {experience} is outside all ranges')
            score = 1.0
        print(f'Total score: {score}\n')


Mid-level 3.0
{'typical_range': (2, 5), 'acceptable': (1, 7), 'outlier_threshold': 8, 'critical_outlier': 10, 'note': 'Середні розробники'}
St.1: 3.0 is within typical range. (ok)
Total score: 0.0

Entry-level 3.0
{'typical_range': (0, 2), 'acceptable': (0, 3), 'outlier_threshold': 4, 'critical_outlier': 6, 'note': 'Початківці, стажери, джуни'}
St.2: 3.0 is within acceptable threshold. (ok)
Total score: 0.25



In [7]:
def detect_experience_outliers(df):

    # base
    df_copy = df.copy()
    df_copy['is_outlier'] = False
    df_copy['outlier_score'] = float(0.0)

    # even row
    for index, row in df_copy.iterrows():
        title = row['title_group'] # кожен рядок title
        experience = row['it_experience_years'] # кожен рядок experience

        score = 0.0

        if title in ukraine_experience_ranges:
            rules = ukraine_experience_ranges[title]

            # визначаємо діапазони для викидів
            critical_outlier = rules['critical_outlier']
            outlier_threshold = rules['outlier_threshold']
            acceptable_threshold = rules['acceptable'] # ok
            typical_range = rules['typical_range'] # ok

            if 'max_outlier' in rules and experience > rules['max_outlier']:
                score = 1.0
            elif typical_range[0] <= experience <= typical_range[1]:
                score = 0.0
            elif acceptable_threshold[0] <= experience <= acceptable_threshold[1]:
                score = 0.25
            elif experience < critical_outlier:
                score = 0.75
            elif experience < outlier_threshold:
                score = 0.50
            else:
                score = 1.0

        if score >= 0.5:
            df_copy.at[index, 'is_outlier'] = True
        df_copy.at[index, 'outlier_score'] = score

    return df_copy


In [8]:
new_df = detect_experience_outliers(df_test)


In [9]:
new_df['is_outlier'].value_counts()


is_outlier
False    6735
True      858
Name: count, dtype: int64

In [44]:
fig = px.scatter(
    new_df,
    x='it_experience_years',
    y='salary_usd',
    color='is_outlier',
    hover_data=['title_group', 'category','it_experience_years', 'salary_usd'],
    title='Outliers cleaning in "IT_Experience"',
    width=1500, height=500,
)

fig.show()


In [11]:
ukraine_salary_ranges = {
    'Entry-level': {
        'typical_range': (600, 1200),     # типовий діапазон для джунів
        'acceptable': (400, 1600),        # розширений прийнятний діапазон
        'critical_range': (200, 2500),              # нижче = критична помилка
        'outlier_range': (300, 2000),             # вище = підозріло для Entry
        'note': 'Джуніори, початківці'
    },

    'Mid-level': {
        'typical_range': (1800, 3200),    # основний діапазон мідлів
        'acceptable': (1200, 4000),       # розширений діапазон
        'critical_range': (800, 5500),              # менше 800 для мідла підозріло
        'outlier_range': (1000, 4500),              # менше 1000 - червоний прапорець
        'note': 'Розробники середнього рівня'
    },

    'Senior-level': {
        'typical_range': (3500, 6500),    # основний діапазон сеньйорів
        'acceptable': (2500, 8000),       # розширений діапазон
        'critical_range': (1500, 10000),             # менше 1500 = помилка даних
        'outlier_range': (2000, 8500),              # менше 2000 - підозріло
        'note': 'Досвідчені розробники'
    },

    'Tech leadership': {  # Tech Lead, Staff, Principal
        'typical_range': (4500, 8000),    # типовий діапазон тех лідів
        'acceptable': (3500, 10000),      # розширений діапазон
        'critical_range': (2500, 15000),             # менше 2500 для Tech Lead критично
        'outlier_range': (3000, 11000),              # менше 3000 підозріло
        'note': 'Технічні лідери, Staff/Principal Engineers'
    },

    'Management': {  # Engineering Manager, Team Lead
        'typical_range': (4500, 8500),    # менеджери команд
        'acceptable': (3500, 11000),      # розширений діапазон
        'critical_range': (2500, 150000),             # критично низько
        'outlier_range': (3000, 12000),              # підозріло низько
        'note': 'Менеджери команд, Engineering Managers'
    },

    'IC-high': {  # Individual Contributor високого рівня
        'typical_range': (6000, 12000),   # високорівневі IC
        'acceptable': (4500, 15000),      # широкий діапазон
        'critical_range': (3500, 20000),             # критично низько для IC-high
        'outlier_range': (4000, 16000),              # підозріло низько
        'note': 'Високорівневі Individual Contributors'
    },

    'Executive': {  # CTO, VP Engineering, тощо
        'typical_range': (8000, 20000),   # топ-менеджмент
        'acceptable': (6000, 30000),      # великий розкид
        'critical_range': (4000, 50000),             # критично низько для Executive
        'outlier_range': (5000, 35000),              # підозріло низько
        'note': 'Топ-менеджмент, C-level'
    }
}


In [None]:
# для перевірки
for index, row in df[16:17].iterrows():
    title = row['title_group']
    salary = row['salary_usd']
    print(title, salary)

    score = float(0.0)

    if title in ukraine_salary_ranges:
        rules = ukraine_salary_ranges[title]
        print(rules)

        acceptable_threshold = rules['acceptable']
        typical_range = rules['typical_range']
        outlier_threshold = rules['outlier_range']
        critical_range = rules['critical_range']

        if typical_range[0] <= salary <= typical_range[1]:
            print(f'st.1: {typical_range[0]} <= {salary} <= {typical_range[1]}')
            score = 0.0
        elif acceptable_threshold[0] <= salary <= acceptable_threshold[1]:
            print(f'st.2: {acceptable_threshold[0]} <= {salary} <= {acceptable_threshold[1]}')
            score = 0.25
        elif outlier_threshold[0] <= salary <= outlier_threshold[1]:
            print(f'st.3: {outlier_threshold[0]} <= {salary} <= {outlier_threshold[1]}')
            score = 0.75
        elif critical_range[0] <= salary <= critical_range[1]:
            print(f'st.4: {critical_range[0]} <= {salary} <= {critical_range[1]}')
            score = 1.0
        else:
            score = 1.0

        print('Total score:', score)


Executive 1200.0
{'typical_range': (8000, 20000), 'acceptable': (6000, 30000), 'critical_range': (4000, 50000), 'outlier_range': (5000, 35000), 'note': 'Топ-менеджмент, C-level'}
Total score: 1.0


In [None]:
def detect_salary_outliers(df):

    df_copy = df.copy()
    df_copy['is_salary_outlier'] = False
    df_copy['outlier_salary_score'] = float(0.0)

    for index, row in df_copy.iterrows():
        title = row['title_group']
        salary = row['salary_usd']

        score = float(0.0)

        if title in ukraine_salary_ranges:
            rules = ukraine_salary_ranges[title]

            acceptable_threshold = rules['acceptable']
            typical_range = rules['typical_range']
            outlier_threshold = rules['outlier_range']
            critical_range = rules['critical_range']

            if typical_range[0] <= salary <= typical_range[1]:
                score = 0.0
            elif acceptable_threshold[0] <= salary <= acceptable_threshold[1]:
                score = 0.25
            elif outlier_threshold[0] <= salary <= outlier_threshold[1]:
                score = 0.75
            elif critical_range[0] <= salary <= critical_range[1]:
                score = 1.0
            else:
                score = 1.0

        if score >= 0.5:
            df_copy.at[index, 'is_salary_outlier'] = True
        df_copy.at[index, 'outlier_salary_score'] = score

    return df_copy


In [None]:
new_df = detect_salary_outliers(new_df)


In [None]:
new_df


Unnamed: 0,salary_usd,category,title_group,english_level,it_experience_years,is_outlier,outlier_score,is_salary_outlier,outlier_salary_score
0,2000.0,Analyst,Mid-level,Pre-Intermediate,2.0,False,0.00,False,0.00
1,4000.0,Quality Assurance,Senior-level,Intermediate,5.0,False,0.00,False,0.00
2,10000.0,DevOps,Tech leadership,Upper-Intermediate,9.0,False,0.00,False,0.25
3,3500.0,Management,Senior-level,Advanced,5.0,False,0.00,False,0.00
4,3000.0,Management,Tech leadership,Pre-Intermediate,18.0,False,0.25,True,0.75
...,...,...,...,...,...,...,...,...,...
7588,3227.0,HR,Tech leadership,Advanced,7.0,False,0.00,True,0.75
7589,7500.0,Data Scientist,Senior-level,Upper-Intermediate,4.0,False,0.00,False,0.25
7590,1450.0,Customer Support,Tech leadership,Pre-Intermediate,1.5,True,0.75,True,1.00
7591,9000.0,Software Engineer,Senior-level,Intermediate,20.0,True,1.00,True,1.00


In [31]:
#  комбінований стовпець
new_df['outlier_status'] = new_df.apply(
    lambda row: (
        'Both Outliers' if row['is_outlier'] and row['is_salary_outlier'] else
        'Experience Outlier' if row['is_outlier'] else
        'Salary Outlier' if row['is_salary_outlier'] else
        'Normal'
    ),
    axis=1
)


In [46]:
fig = px.scatter(
    new_df,
    x='it_experience_years',
    y='salary_usd',
    hover_data=['salary_usd', 'title_group'],
    color='outlier_status',
)
fig.add_shape(
    type='rect',
    xref='paper',
    yref='y',
    x0=0, x1=1,
    y0=0, y1=1500,
    fillcolor='LightSalmon',
    opacity=0.3,
    layer='below',
    line_width=0
)
fig.show()


In [None]:
data_salary_V3 = new_df[new_df['outlier_status'] == 'Normal'].copy()
print(data_salary_V3['outlier_status'].value_counts())
data_salary_V3


outlier_status
Normal    5172
Name: count, dtype: int64


Unnamed: 0,salary_usd,category,title_group,english_level,it_experience_years,is_outlier,outlier_score,is_salary_outlier,outlier_salary_score,outlier_status
0,2000.0,Analyst,Mid-level,Pre-Intermediate,2.0,False,0.0,False,0.00,Normal
1,4000.0,Quality Assurance,Senior-level,Intermediate,5.0,False,0.0,False,0.00,Normal
2,10000.0,DevOps,Tech leadership,Upper-Intermediate,9.0,False,0.0,False,0.25,Normal
3,3500.0,Management,Senior-level,Advanced,5.0,False,0.0,False,0.00,Normal
5,1500.0,Analyst,Mid-level,Upper-Intermediate,5.0,False,0.0,False,0.25,Normal
...,...,...,...,...,...,...,...,...,...,...
7583,900.0,Software Engineer,Entry-level,Intermediate,1.5,False,0.0,False,0.00,Normal
7585,600.0,Quality Assurance,Entry-level,Upper-Intermediate,1.5,False,0.0,False,0.00,Normal
7586,4500.0,Analyst,Management,Upper-Intermediate,8.0,False,0.0,False,0.00,Normal
7587,1200.0,Quality Assurance,Mid-level,Intermediate,2.0,False,0.0,False,0.25,Normal


In [None]:
data_prepV3 = data_salary_V3[['category', 'title_group', 'english_level', 'it_experience_years', 'salary_usd']].copy()
data_prepV3


Unnamed: 0,category,title_group,english_level,it_experience_years,salary_usd
0,Analyst,Mid-level,Pre-Intermediate,2.0,2000.0
1,Quality Assurance,Senior-level,Intermediate,5.0,4000.0
2,DevOps,Tech leadership,Upper-Intermediate,9.0,10000.0
3,Management,Senior-level,Advanced,5.0,3500.0
5,Analyst,Mid-level,Upper-Intermediate,5.0,1500.0
...,...,...,...,...,...
7583,Software Engineer,Entry-level,Intermediate,1.5,900.0
7585,Quality Assurance,Entry-level,Upper-Intermediate,1.5,600.0
7586,Analyst,Management,Upper-Intermediate,8.0,4500.0
7587,Quality Assurance,Mid-level,Intermediate,2.0,1200.0


In [None]:
data_prepV3.to_csv(DATA_DIR / "data_prepV3.csv", index=False)
