In [1]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import LabelEncoder, OrdinalEncoder

import sys
from pathlib import Path
sys.path.append('..')
from paths import DATA_DIR


In [None]:
df = pd.read_csv(DATA_DIR / "prep_data_v2.csv")
df.head()


Unnamed: 0,salary_usd,title,category,english_level,it_experience_years
0,2000.0,Middle,Analyst,Pre-Intermediate,2.0
1,4000.0,Senior,Quality Assurance,Intermediate,5.0
2,10000.0,Lead / Team Lead,DevOps,Upper-Intermediate,9.0
3,3500.0,Senior,Management,Advanced,5.0
4,3000.0,Lead / Team Lead,Management,Pre-Intermediate,18.0


In [None]:
df['title'].value_counts()


title
Middle                                   2618
Senior                                   2058
Junior                                    996
Lead / Team Lead                          699
Manager                                   561
Немає тайтлу                              435
Head                                      242
CEO / C-level (Chief) / Director / VP     147
Tech Lead                                 114
Intern/Trainee                             70
Architect                                  44
Principal                                  23
Staff                                      22
Name: count, dtype: int64

In [None]:
df['english_level'].value_counts()


english_level
Upper-Intermediate    3571
Intermediate          2164
Advanced              1341
Pre-Intermediate       746
Elementary             192
Не знаю взагалі         15
Name: count, dtype: int64

In [None]:
df_test = df.copy()


In [6]:
categories_english = [
    [
        "Не знаю взагалі",
        "Elementary",
        "Pre-Intermediate",
        "Intermediate",
        "Upper-Intermediate",
        "Advanced",
    ]
]


In [None]:
english_decoded = {
    "Не знаю взагалі": 0,
    "Elementary": 1,
    "Pre-Intermediate": 2,
    "Intermediate": 3,
    "Upper-Intermediate": 4,
    "Advanced": 5
}


In [None]:
df_test['english_decoded'] = df_test['english_level'].map(english_decoded)


In [None]:
title_bins = {
    "Intern/Trainee": "Entry-level",
    "Junior": "Entry-level",

    "Middle": "Mid-level",
    "Senior": "Senior-level",

    "Tech Lead": "Tech leadership",
    "Lead / Team Lead": "Tech leadership",

    "Staff": "IC-high",
    "Principal": "IC-high",
    "Architect": "IC-high",

    "Manager": "Management",
    "Head": "Management",

    "CEO / C-level (Chief) / Director / VP": "Executive",

    "Немає тайтлу": "No title"
}

df_test['title_group'] = df_test['title'].map(title_bins)


In [None]:
df_test['title_group'].value_counts()


title_group
Mid-level          2618
Senior-level       2058
Entry-level        1066
Tech leadership     813
Management          803
No title            435
Executive           147
IC-high              89
Name: count, dtype: int64

In [None]:
df_test[df_test['title_group'] == 'No title'].head()


Unnamed: 0,salary_usd,title,category,english_level,it_experience_years,english_decoded,title_group
68,1700.0,Немає тайтлу,Design,Upper-Intermediate,0.1,4,No title
114,3229.0,Немає тайтлу,Data Scientist,Advanced,4.0,5,No title
115,2600.0,Немає тайтлу,Design,Upper-Intermediate,5.0,4,No title
184,1000.0,Немає тайтлу,Management,Upper-Intermediate,2.0,4,No title
188,900.0,Немає тайтлу,Design,Intermediate,7.0,3,No title


In [None]:
title_bins_ordered = {
    "No title": 0,
    "Entry-level": 1,
    "Mid-level": 2,
    "Senior-level": 3,
    "Tech leadership": 4,
    "IC-high": 5,
    "Management": 6,
    "Executive": 7
}

df_test['title_numeric'] = df_test['title_group'].map(title_bins_ordered)


In [None]:
df_test


Unnamed: 0,salary_usd,title,category,english_level,it_experience_years,english_decoded,title_group,title_numeric
0,2000.0,Middle,Analyst,Pre-Intermediate,2.0,2,Mid-level,2
1,4000.0,Senior,Quality Assurance,Intermediate,5.0,3,Senior-level,3
2,10000.0,Lead / Team Lead,DevOps,Upper-Intermediate,9.0,4,Tech leadership,4
3,3500.0,Senior,Management,Advanced,5.0,5,Senior-level,3
4,3000.0,Lead / Team Lead,Management,Pre-Intermediate,18.0,2,Tech leadership,4
...,...,...,...,...,...,...,...,...
8024,3227.0,Lead / Team Lead,HR,Advanced,7.0,5,Tech leadership,4
8025,7500.0,Senior,Data Scientist,Upper-Intermediate,4.0,4,Senior-level,3
8026,1450.0,Lead / Team Lead,Customer Support,Pre-Intermediate,1.5,2,Tech leadership,4
8027,9000.0,Senior,Software Engineer,Intermediate,20.0,3,Senior-level,3


In [None]:
corr_matrix = df_test.corr(numeric_only=True)
corr_matrix['salary_usd']


salary_usd             1.000000
it_experience_years    0.452051
english_decoded        0.230572
title_numeric          0.467597
Name: salary_usd, dtype: float64

In [None]:
df_test


Unnamed: 0,salary_usd,title,category,english_level,it_experience_years,english_decoded,title_group,title_numeric
0,2000.0,Middle,Analyst,Pre-Intermediate,2.0,2,Mid-level,2
1,4000.0,Senior,Quality Assurance,Intermediate,5.0,3,Senior-level,3
2,10000.0,Lead / Team Lead,DevOps,Upper-Intermediate,9.0,4,Tech leadership,4
3,3500.0,Senior,Management,Advanced,5.0,5,Senior-level,3
4,3000.0,Lead / Team Lead,Management,Pre-Intermediate,18.0,2,Tech leadership,4
...,...,...,...,...,...,...,...,...
8024,3227.0,Lead / Team Lead,HR,Advanced,7.0,5,Tech leadership,4
8025,7500.0,Senior,Data Scientist,Upper-Intermediate,4.0,4,Senior-level,3
8026,1450.0,Lead / Team Lead,Customer Support,Pre-Intermediate,1.5,2,Tech leadership,4
8027,9000.0,Senior,Software Engineer,Intermediate,20.0,3,Senior-level,3


In [None]:
df_test = df_test.drop(df_test[df_test['title_group'] == 'No title'].index)
df_test['title_group'].value_counts()


title_group
Mid-level          2618
Senior-level       2058
Entry-level        1066
Tech leadership     813
Management          803
Executive           147
IC-high              89
Name: count, dtype: int64

In [None]:
df_test = df_test.drop(columns=['title'])


In [None]:
df_test = df_test.drop(df_test[df_test['salary_usd'] >= 25000].index)


In [None]:
df_test


Unnamed: 0,salary_usd,category,english_level,it_experience_years,english_decoded,title_group,title_numeric
0,2000.0,Analyst,Pre-Intermediate,2.0,2,Mid-level,2
1,4000.0,Quality Assurance,Intermediate,5.0,3,Senior-level,3
2,10000.0,DevOps,Upper-Intermediate,9.0,4,Tech leadership,4
3,3500.0,Management,Advanced,5.0,5,Senior-level,3
4,3000.0,Management,Pre-Intermediate,18.0,2,Tech leadership,4
...,...,...,...,...,...,...,...
8024,3227.0,HR,Advanced,7.0,5,Tech leadership,4
8025,7500.0,Data Scientist,Upper-Intermediate,4.0,4,Senior-level,3
8026,1450.0,Customer Support,Pre-Intermediate,1.5,2,Tech leadership,4
8027,9000.0,Software Engineer,Intermediate,20.0,3,Senior-level,3


In [29]:
import plotly.figure_factory as ff

corr_matrix = df_test.corr(numeric_only=True)

fig = ff.create_annotated_heatmap(
    z=corr_matrix.values,
    x=list(corr_matrix.columns),
    y=list(corr_matrix.index),
    colorscale="RdBu_R",
    showscale=True,
    zmin = -1,
    zmax=1,
)

fig.show()


In [28]:
import plotly.express as px
import plotly.io as pio
import plotly.figure_factory as ff

pio.templates.default = "plotly_dark"


In [None]:
pio.templates


Templates configuration
-----------------------
    Default template: 'plotly_dark'
    Available templates:
        ['ggplot2', 'seaborn', 'simple_white', 'plotly',
         'plotly_white', 'plotly_dark', 'presentation', 'xgridoff',
         'ygridoff', 'gridon', 'none']

In [None]:
df_test.columns


Index(['salary_usd', 'category', 'english_level', 'it_experience_years',
       'english_decoded', 'title_group', 'title_numeric'],
      dtype='object')

In [60]:
fig = px.scatter(df_test, x = 'it_experience_years', y='salary_usd', hover_data='salary_usd',
                 color='salary_usd',
                 marginal_x='histogram',
                 trendline='lowess')

fig.add_shape(
    type='rect',
    xref='paper',
    yref='y',
    x0=0, x1=1,
    y0=0, y1=1500,
    fillcolor='LightSalmon',
    opacity=0.3,
    layer='below',
    line_width=0
)
fig.show()


In [61]:
fig = px.scatter(df_test, x = 'english_decoded', y='salary_usd', hover_data='salary_usd',
                 color='salary_usd',
                 marginal_x='histogram',
                 trendline='lowess')
fig.show()


In [None]:
df_model = df_test[['salary_usd', 'category', 'title_group', 'english_level', 'it_experience_years']].copy()


In [None]:
df_testing = df_model.copy()


In [None]:
def clean_1(df_testing):
    df_testing['anomaly_score'] = 0
    mask = (df_testing['title_group'] == 'Entry-level') & (df_testing['it_experience_years'] > 3)
    df_testing.loc[mask, 'anomaly_score'] = 1
    return df_testing
clean_1(df_testing)


Unnamed: 0,salary_usd,category,title_group,english_level,it_experience_years,anomaly_score
0,2000.0,Analyst,Mid-level,Pre-Intermediate,2.0,0
1,4000.0,Quality Assurance,Senior-level,Intermediate,5.0,0
2,10000.0,DevOps,Tech leadership,Upper-Intermediate,9.0,0
3,3500.0,Management,Senior-level,Advanced,5.0,0
4,3000.0,Management,Tech leadership,Pre-Intermediate,18.0,0
...,...,...,...,...,...,...
8024,3227.0,HR,Tech leadership,Advanced,7.0,0
8025,7500.0,Data Scientist,Senior-level,Upper-Intermediate,4.0,0
8026,1450.0,Customer Support,Tech leadership,Pre-Intermediate,1.5,0
8027,9000.0,Software Engineer,Senior-level,Intermediate,20.0,0


In [None]:
df_testing['anomaly_score'].value_counts()


anomaly_score
0    7555
1      38
Name: count, dtype: int64

In [None]:
df_testing[(df_testing['title_group'] == 'Entry-level') & (df['salary_usd'] > 2500) & (df_testing['anomaly_score'] == 0)]



Boolean Series key will be reindexed to match DataFrame index.



Unnamed: 0,salary_usd,category,title_group,english_level,it_experience_years,anomaly_score
363,3000.0,Quality Assurance,Entry-level,Intermediate,2.0,0
1366,2800.0,Analyst,Entry-level,Advanced,2.0,0
1496,3680.0,Software Engineer,Entry-level,Intermediate,1.0,0
1893,3090.0,Quality Assurance,Entry-level,Upper-Intermediate,2.0,0
1994,3000.0,Management,Entry-level,Upper-Intermediate,3.0,0
3352,3040.0,Software Engineer,Entry-level,Intermediate,2.0,0
3522,3000.0,Management,Entry-level,Advanced,0.5,0
4003,4800.0,Software Engineer,Entry-level,Upper-Intermediate,1.5,0
5705,3000.0,Management,Entry-level,Upper-Intermediate,0.5,0
5994,3500.0,Software Engineer,Entry-level,Intermediate,2.0,0
