In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import r2_score

## Dataset Config

In [2]:
df = pd.read_csv('ds_salaries.csv', index_col=0)
df = df.drop('salary_in_usd', axis=1)
df

Unnamed: 0,work_year,experience_level,employment_type,job_title,salary,salary_currency,employee_residence,remote_ratio,company_location,company_size
0,2020,MI,FT,Data Scientist,70000,EUR,DE,0,DE,L
1,2020,SE,FT,Machine Learning Scientist,260000,USD,JP,0,JP,S
2,2020,SE,FT,Big Data Engineer,85000,GBP,GB,50,GB,M
3,2020,MI,FT,Product Data Analyst,20000,USD,HN,0,HN,S
4,2020,SE,FT,Machine Learning Engineer,150000,USD,US,50,US,L
...,...,...,...,...,...,...,...,...,...,...
602,2022,SE,FT,Data Engineer,154000,USD,US,100,US,M
603,2022,SE,FT,Data Engineer,126000,USD,US,100,US,M
604,2022,SE,FT,Data Analyst,129000,USD,US,0,US,M
605,2022,SE,FT,Data Analyst,150000,USD,US,100,US,M


In [3]:
df.dtypes

work_year              int64
experience_level      object
employment_type       object
job_title             object
salary                 int64
salary_currency       object
employee_residence    object
remote_ratio           int64
company_location      object
company_size          object
dtype: object

## One Hot Encoder (Binarization)

In [4]:
ohe    = OneHotEncoder(categories='auto')

catego = df.select_dtypes(include=[object]).columns
binary = ohe.fit_transform(df[catego]).toarray()
labels = ohe.get_feature_names()

encoder = pd.DataFrame(binary, columns=labels)
encoder

Unnamed: 0,x0_EN,x0_EX,x0_MI,x0_SE,x1_CT,x1_FL,x1_FT,x1_PT,x2_3D Computer Vision Researcher,x2_AI Scientist,...,x5_RU,x5_SG,x5_SI,x5_TR,x5_UA,x5_US,x5_VN,x6_L,x6_M,x6_S
0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
602,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
603,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
604,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
605,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0


## Integration

In [5]:
df = df.drop(catego, axis=1)
df

Unnamed: 0,work_year,salary,remote_ratio
0,2020,70000,0
1,2020,260000,0
2,2020,85000,50
3,2020,20000,0
4,2020,150000,50
...,...,...,...
602,2022,154000,100
603,2022,126000,100
604,2022,129000,0
605,2022,150000,100


In [6]:
df = pd.concat([df, encoder], axis=1)
df

Unnamed: 0,work_year,salary,remote_ratio,x0_EN,x0_EX,x0_MI,x0_SE,x1_CT,x1_FL,x1_FT,...,x5_RU,x5_SG,x5_SI,x5_TR,x5_UA,x5_US,x5_VN,x6_L,x6_M,x6_S
0,2020,70000,0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,2020,260000,0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,2020,85000,50,0.0,0.0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,2020,20000,0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,2020,150000,50,0.0,0.0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
602,2022,154000,100,0.0,0.0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
603,2022,126000,100,0.0,0.0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
604,2022,129000,0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
605,2022,150000,100,0.0,0.0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0


## Normalization

In [7]:
scaler = MinMaxScaler().fit(df.values)
norm_array = scaler.transform(df.values)
norm_df = pd.DataFrame(norm_array, columns=df.columns)
norm_df

Unnamed: 0,work_year,salary,remote_ratio,x0_EN,x0_EX,x0_MI,x0_SE,x1_CT,x1_FL,x1_FT,...,x5_RU,x5_SG,x5_SI,x5_TR,x5_UA,x5_US,x5_VN,x6_L,x6_M,x6_S
0,0.0,0.002171,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,0.0,0.008422,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,0.0,0.002665,0.5,0.0,0.0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,0.0,0.000526,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,0.0,0.004803,0.5,0.0,0.0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
602,1.0,0.004935,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
603,1.0,0.004014,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
604,1.0,0.004112,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
605,1.0,0.004803,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0


## Holdout

In [8]:
X  = norm_df.drop(['salary'], axis=1).values
y  = norm_df.iloc[:, 1:2].values

x_train, x_test, y_train, y_test = train_test_split(X, y, train_size= 0.7, test_size = 0.3, random_state = 0)

model = LinearRegression()
model.fit(x_train, y_train)
y_pred = model.predict(x_train)

r2_score(y_pred, y_train)

0.9749133566421617

### O conjunto de teste não funciona, tem algumas inconsistência, provavelmente fiz algo nesse notebook de maneira errada.