In [199]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler


df = pd.read_csv('ds_salaries.csv')
df.drop(df.columns[8], axis = 1, inplace = True)


In [200]:
df.head()

Unnamed: 0,work_year,experience_level,employment_type,job_title,salary,salary_currency,salary_in_usd,employee_residence,company_location,company_size
0,2023,SE,FT,Principal Data Scientist,80000,EUR,85847,ES,ES,L
1,2023,MI,CT,ML Engineer,30000,USD,30000,US,US,S
2,2023,MI,CT,ML Engineer,25500,USD,25500,US,US,S
3,2023,SE,FT,Data Scientist,175000,USD,175000,CA,CA,M
4,2023,SE,FT,Data Scientist,120000,USD,120000,CA,CA,M


In [201]:
df.drop(df.columns[5], axis = 1, inplace = True)
df.head()

Unnamed: 0,work_year,experience_level,employment_type,job_title,salary,salary_in_usd,employee_residence,company_location,company_size
0,2023,SE,FT,Principal Data Scientist,80000,85847,ES,ES,L
1,2023,MI,CT,ML Engineer,30000,30000,US,US,S
2,2023,MI,CT,ML Engineer,25500,25500,US,US,S
3,2023,SE,FT,Data Scientist,175000,175000,CA,CA,M
4,2023,SE,FT,Data Scientist,120000,120000,CA,CA,M


In [202]:
df.drop(df.columns[8], axis = 1, inplace = True)
df.head()

Unnamed: 0,work_year,experience_level,employment_type,job_title,salary,salary_in_usd,employee_residence,company_location
0,2023,SE,FT,Principal Data Scientist,80000,85847,ES,ES
1,2023,MI,CT,ML Engineer,30000,30000,US,US
2,2023,MI,CT,ML Engineer,25500,25500,US,US
3,2023,SE,FT,Data Scientist,175000,175000,CA,CA
4,2023,SE,FT,Data Scientist,120000,120000,CA,CA


In [203]:
df.drop(df.columns[3], axis = 1, inplace = True)
df.head()

Unnamed: 0,work_year,experience_level,employment_type,salary,salary_in_usd,employee_residence,company_location
0,2023,SE,FT,80000,85847,ES,ES
1,2023,MI,CT,30000,30000,US,US
2,2023,MI,CT,25500,25500,US,US
3,2023,SE,FT,175000,175000,CA,CA
4,2023,SE,FT,120000,120000,CA,CA


In [204]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3755 entries, 0 to 3754
Data columns (total 7 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   work_year           3755 non-null   int64 
 1   experience_level    3755 non-null   object
 2   employment_type     3755 non-null   object
 3   salary              3755 non-null   int64 
 4   salary_in_usd       3755 non-null   int64 
 5   employee_residence  3755 non-null   object
 6   company_location    3755 non-null   object
dtypes: int64(3), object(4)
memory usage: 205.5+ KB


In [205]:
df["experience_level"].value_counts()

experience_level
SE    2516
MI     805
EN     320
EX     114
Name: count, dtype: int64

In [206]:
normalizando_experencia = {
    "EN":1,
    "MI":2,
    "SE":3,
    "EX":4
}

df['experience_level'] = df['experience_level'].map(normalizando_experencia)

df.head()

Unnamed: 0,work_year,experience_level,employment_type,salary,salary_in_usd,employee_residence,company_location
0,2023,3,FT,80000,85847,ES,ES
1,2023,2,CT,30000,30000,US,US
2,2023,2,CT,25500,25500,US,US
3,2023,3,FT,175000,175000,CA,CA
4,2023,3,FT,120000,120000,CA,CA


In [207]:
df.drop(df.columns[5], axis = 1, inplace = True)
df.head()

Unnamed: 0,work_year,experience_level,employment_type,salary,salary_in_usd,company_location
0,2023,3,FT,80000,85847,ES
1,2023,2,CT,30000,30000,US
2,2023,2,CT,25500,25500,US
3,2023,3,FT,175000,175000,CA
4,2023,3,FT,120000,120000,CA


In [208]:
df.drop(df.columns[5], axis = 1, inplace = True)
df.head()

Unnamed: 0,work_year,experience_level,employment_type,salary,salary_in_usd
0,2023,3,FT,80000,85847
1,2023,2,CT,30000,30000
2,2023,2,CT,25500,25500
3,2023,3,FT,175000,175000
4,2023,3,FT,120000,120000


In [209]:
normalizando_tipo_emprego = {
    "FT":1,
    "PT":2,
    "CT":3,
    "FL":4
}

df['employment_type'] = df['employment_type'].map(normalizando_tipo_emprego)

In [210]:
def find_outliers_IQR(column_name, df):
    q1, q3 = df[column_name].quantile([0.25, 0.75])
    IQR = q3 - q1
    outliers = df[(df[column_name] < q1 - 1.5*IQR) | (df[column_name] > q3 + 1.5*IQR)]
    return outliers

def print_outliers_IQR(df):
    for column in df.columns:
        outliers = find_outliers_IQR(column, df)
        print(f"Outliers da coluna {column}: {outliers.shape[0]}")

In [211]:
print_outliers_IQR(df.describe())

Outliers da coluna work_year: 2
Outliers da coluna experience_level: 1
Outliers da coluna employment_type: 2
Outliers da coluna salary: 2
Outliers da coluna salary_in_usd: 1


In [212]:
model = MLPClassifier()
X = df.iloc[:,:-1].values
y = df.iloc[:, -1].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Pré-processamento dos dados (normalização)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Criar o MLPClassifier
mlp = MLPClassifier(hidden_layer_sizes=(10, 10), activation='relu', random_state=42, )

# Treinar o MLPClassifier
mlp.fit(X_train_scaled, y_train)

# Fazer previsões
predictions = mlp.predict(X_test_scaled)

# Avaliar o desempenho
accuracy = mlp.score(X_test_scaled, y_test)
print(f"Acurácia: {accuracy}")

Acurácia: 0.03861517976031957


