In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
df = pd.read_csv("salaries_cyber.csv")

In [None]:
df.head()

In [None]:
df.shape

In [None]:
df.isnull().sum()

In [None]:
df.info()

# EDA

In [None]:
df["job_title"].value_counts().head(10).plot(kind="barh")

In [None]:
df["job_title"].value_counts().head(15).plot(kind="pie", autopct="%.2f%%")

In [None]:
df["experience_level"].value_counts().plot(kind="pie", autopct="%.2f%%")

In [None]:
df["employment_type"].value_counts().plot(kind="bar")

In [None]:
sns.boxplot(data=df, x="experience_level", y="salary_in_usd")

In [None]:
sns.boxplot(data=df, x="employment_type", y="salary_in_usd")

In [None]:
sns.violinplot(data=df, x="work_year", y="salary_in_usd")

In [None]:
job_salary = df.groupby("job_title", as_index=False)[["salary_in_usd"]].max().sort_values(by="salary_in_usd", ascending=False)
job_salary

In [None]:
df["job_title"].nunique()

In [None]:
sns.barplot(y=job_salary["job_title"].head(10), x=job_salary["salary_in_usd"].head(10))

In [None]:
experience_salary = df.groupby("experience_level", as_index=False)[["salary_in_usd"]].max().sort_values(by="salary_in_usd", ascending=False)
experience_salary

In [None]:
sns.barplot(y=experience_salary["experience_level"].head(10), x=experience_salary["salary_in_usd"].head(10))

In [None]:
employment_salary = df.groupby("employment_type", as_index=False)[["salary_in_usd"]].max().sort_values(by="salary_in_usd", ascending=False)
employment_salary

In [None]:
sns.barplot(y=employment_salary["employment_type"].head(10), x=employment_salary["salary_in_usd"].head(10))

In [None]:
from sklearn.preprocessing import LabelEncoder

def label_encoder(column):
    """Label-encode a column and report the classes that were learned.

    A fresh ``LabelEncoder`` is fitted on ``column``; the column's ``name``
    and the encoder's ``classes_`` are printed so the mapping can be read
    back from the cell output.

    Args:
        column: the values to encode; must expose a ``name`` attribute
            (e.g. a pandas Series).

    Returns:
        The result of ``LabelEncoder.transform(column)``.
    """
    encoder = LabelEncoder()
    encoder.fit(column)
    print(column.name, encoder.classes_)
    encoded = encoder.transform(column)
    return encoded

In [None]:
# Drop the salary / salary_currency columns; salary_in_usd stays and is the
# column used as the target further down.
df = df.drop(columns=["salary", "salary_currency"])

In [None]:
# Quartiles only make sense for numeric columns. Since pandas 2.0
# DataFrame.quantile defaults to numeric_only=False and raises TypeError when
# the frame also holds non-numeric columns (this frame has columns such as
# job_title that are label-encoded later), so ask for numeric columns explicitly.
Q1 = df.quantile(0.25, numeric_only=True)
Q3 = df.quantile(0.75, numeric_only=True)
# Interquartile range, one value per numeric column.
IQR = Q3 - Q1

In [None]:
# 1.5 x IQR rule: alt_sinir is the lower bound, ust_sinir the upper bound
# (the names are Turkish for "lower limit" / "upper limit").
alt_sinir = Q1 - IQR * 1.5
ust_sinir = Q3 + IQR * 1.5

In [None]:
# A row is an outlier if any of its values falls outside [alt_sinir, ust_sinir]
# for that column.
outlier_mask = ((df < alt_sinir) | (df > ust_sinir)).any(axis=1)
# Take an explicit copy: later cells assign new columns into t_df, and doing
# that on a slice of df triggers pandas' SettingWithCopyWarning.
t_df = df.loc[~outlier_mask].copy()

In [None]:
t_df.shape

In [None]:
t_df["work_year"] = label_encoder(t_df["work_year"])

In [None]:
experience_dummy = pd.get_dummies(t_df["experience_level"], dtype=np.int64)

In [None]:
t_df["employee_residence"] = label_encoder(t_df["employee_residence"])

In [None]:
employment_dummy = pd.get_dummies(t_df["employment_type"], dtype=np.int64)

In [None]:
t_df["company_location"] = label_encoder(t_df["company_location"])

In [None]:
company_dummy = pd.get_dummies(t_df["company_size"], dtype=np.int64)

In [None]:
t_df["job_title"] = label_encoder(t_df["job_title"])

In [None]:
ndf = pd.concat([t_df, experience_dummy, employment_dummy, company_dummy], axis=1)

In [None]:
ndf.drop(["experience_level", "employment_type", "company_size"], axis=1, inplace=True)

In [None]:
ndf.head()

In [None]:
ndf.info()

In [None]:
X = ndf.drop("salary_in_usd", axis=1)
y = ndf["salary_in_usd"]

In [None]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

linreg = LinearRegression()
linreg.fit(X_train, y_train)

In [None]:
ndf.info()

In [None]:
linreg_pred = linreg.predict(X_test)

In [None]:
# r2_score's signature is (y_true, y_pred). R^2 is not symmetric in its
# arguments, so the ground truth must be passed first.
r2_score(y_test, linreg_pred)

In [None]:
# Predicted vs. actual values on the held-out set for the linear model.
# Each artist carries its own label: the positional plt.legend([...]) form
# pairs labels with artists purely by order, which can mislabel them.
plt.figure()
plt.scatter(y_test, linreg_pred, label="Predicted")
plt.scatter(y_test, y_test, label="Original")
# y_test against itself is the y = x reference; a perfect prediction lies on it.
plt.plot(y_test, y_test, label="Regression Line")
plt.legend()
plt.show()

In [None]:
from xgboost import XGBRegressor

xgb = XGBRegressor()
xgb.fit(X_train, y_train)

In [None]:
xgb_pred = xgb.predict(X_test)

In [None]:
# r2_score's signature is (y_true, y_pred). R^2 is not symmetric in its
# arguments, so the ground truth must be passed first.
r2_score(y_test, xgb_pred)

In [None]:
# Predicted vs. actual values on the held-out set for the XGBoost model.
# Each artist carries its own label: the positional plt.legend([...]) form
# pairs labels with artists purely by order, which can mislabel them.
plt.figure()
plt.scatter(y_test, xgb_pred, label="Predicted")
plt.scatter(y_test, y_test, label="Original")
# y_test against itself is the y = x reference; a perfect prediction lies on it.
plt.plot(y_test, y_test, label="Regression Line")
plt.legend()
plt.show()

In [None]:
from sklearn.ensemble import ExtraTreesRegressor

extra = ExtraTreesRegressor()
extra.fit(X_train, y_train)

In [None]:
extra_pred = extra.predict(X_test)

In [None]:
# r2_score's signature is (y_true, y_pred). R^2 is not symmetric in its
# arguments, so the ground truth must be passed first.
r2_score(y_test, extra_pred)

In [None]:
# Predicted vs. actual values on the held-out set for the extra-trees model.
# Each artist carries its own label: the positional plt.legend([...]) form
# pairs labels with artists purely by order, which can mislabel them.
plt.figure()
plt.scatter(y_test, extra_pred, label="Predicted")
plt.scatter(y_test, y_test, label="Original")
# y_test against itself is the y = x reference; a perfect prediction lies on it.
plt.plot(y_test, y_test, label="Regression Line")
plt.legend()
plt.show()