In [1]:
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
df = pd.read_csv('ds_salaries (1) (1).csv')
df.head()

Unnamed: 0,work_year,experience_level,employment_type,job_title,salary,salary_currency,salary_in_usd,employee_residence,remote_ratio,company_location,company_size
0,2023,SE,FT,Principal Data Scientist,80000,EUR,85847,ES,100,ES,L
1,2023,MI,CT,ML Engineer,30000,USD,30000,US,100,US,S
2,2023,MI,CT,ML Engineer,25500,USD,25500,US,100,US,S
3,2023,SE,FT,Data Scientist,175000,USD,175000,CA,100,CA,M
4,2023,SE,FT,Data Scientist,120000,USD,120000,CA,100,CA,M


# data preprocessing



In [3]:
df.isnull().sum()

work_year             0
experience_level      0
employment_type       0
job_title             0
salary                0
salary_currency       0
salary_in_usd         0
employee_residence    0
remote_ratio          0
company_location      0
company_size          0
dtype: int64

In [4]:
df.duplicated().sum()

1171

In [5]:
df.drop_duplicates(inplace=True)
df.duplicated().sum()

0

In [6]:
df.dtypes

work_year              int64
experience_level      object
employment_type       object
job_title             object
salary                 int64
salary_currency       object
salary_in_usd          int64
employee_residence    object
remote_ratio           int64
company_location      object
company_size          object
dtype: object

In [7]:
cat_cols = df.dtypes[df.dtypes=='object'].index
print(cat_cols)
    

Index(['experience_level', 'employment_type', 'job_title', 'salary_currency',
       'employee_residence', 'company_location', 'company_size'],
      dtype='object')


In [8]:
for i in cat_cols:
    print('features: ',i)
    print(df[i].value_counts())
    print('*'*30)

features:  experience_level
SE    1554
MI     664
EN     270
EX      96
Name: experience_level, dtype: int64
******************************
features:  employment_type
FT    2547
PT      17
CT      10
FL      10
Name: employment_type, dtype: int64
******************************
features:  job_title
Data Engineer                598
Data Scientist               538
Data Analyst                 396
Machine Learning Engineer    206
Analytics Engineer            91
                            ... 
Compliance Data Analyst        1
Deep Learning Researcher       1
Staff Data Analyst             1
Data DevOps Engineer           1
Finance Data Analyst           1
Name: job_title, Length: 93, dtype: int64
******************************
features:  salary_currency
USD    2107
EUR     200
GBP     144
INR      59
CAD      25
AUD       9
SGD       6
BRL       6
PLN       5
CHF       4
HUF       3
DKK       3
JPY       3
TRY       3
THB       2
ILS       1
HKD       1
CZK       1
MXN       1
CLP       

In [9]:
df['company_size'].value_counts()

M    2028
L     409
S     147
Name: company_size, dtype: int64

In [10]:
print(cat_cols)

Index(['experience_level', 'employment_type', 'job_title', 'salary_currency',
       'employee_residence', 'company_location', 'company_size'],
      dtype='object')


# encoding

In [11]:
from sklearn.preprocessing import LabelEncoder

In [12]:
lb = LabelEncoder()
for i in cat_cols:
    df[i] = lb.fit_transform(df[i])

In [13]:
df.dtypes

work_year             int64
experience_level      int32
employment_type       int32
job_title             int32
salary                int64
salary_currency       int32
salary_in_usd         int64
employee_residence    int32
remote_ratio          int64
company_location      int32
company_size          int32
dtype: object

# selecting x and y


In [14]:
x1 = df[['work_year','experience_level','employment_type','job_title','salary_currency','salary_in_usd','employee_residence',
'remote_ratio','company_location','company_size']]
y1 = df['salary']
print(type(x1))
print(type(y1))


<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.series.Series'>


In [15]:
print(x1.shape)
print(y1.shape)

(2584, 10)
(2584,)


# spilt train and test

In [16]:
from sklearn.model_selection import train_test_split

In [17]:
print(x1.shape)
print(2584*0.30)

(2584, 10)
775.1999999999999


In [18]:
x_train,x_test,y_train,y_test = train_test_split(x1,y1,test_size=0.25,random_state=42)
print(x_train.shape)
print(x_test.shape)
print(y_train.shape)
print(y_test.shape)

(1938, 10)
(646, 10)
(1938,)
(646,)


# linearRegression model

In [22]:
from sklearn.linear_model import LinearRegression

In [23]:
m1 = LinearRegression()
m1.fit(x_train,y_train)

In [24]:
print('train score',m1.score(x_train,y_train))
print('test score',m1.score(x_test,y_test))


train score 0.03157066035782974
test score -0.04626475996595936


In [26]:
ypred_m1 = m1.predict(x_test)
print(ypred_m1)

[ 50000 100000 200000 120000 100000 200000 200000 120000 250000  50000
 250000 200000 100000 200000 120000 100000 100000 100000 250000 120000
  50000 200000  50000  50000 300000 100000 120000 150000 100000 150000
 200000 120000 100000 200000  50000  80000  60000 100000 200000  60000
 250000 100000  80000 120000 100000 150000 200000 200000 120000 200000
 100000 150000 200000  50000 160000  60000 150000 150000 150000 150000
  60000 200000  80000  50000 100000 200000 100000  50000  50000  50000
 100000 150000 150000 160000 200000 200000 150000  80000 150000 100000
 250000 160000 120000 160000 250000 100000  70000 200000 250000  50000
 200000  80000 150000 150000  50000 100000  50000 100000 150000  80000
 200000 200000 120000  70000 250000  50000  80000 100000  50000 200000
 150000 100000  80000 200000 120000 200000 120000  50000 120000 250000
 100000  60000 120000  50000 200000 100000 200000 200000  80000 150000
 150000 150000 250000 120000 250000 200000 150000 150000  80000 150000
 15000

# mse,rmse,r2

In [26]:
from sklearn.metrics import mean_squared_error,mean_absolute_error,r2_score

In [27]:
def eval_model(ytest,ypred):
    mae = mean_absolute_error(ytest,ypred)
    mse = mean_squared_error(ytest,ypred)
    rmse = np.sqrt(mean_squared_error(ytest,ypred))
    r2s = r2_score(ytest,ypred)
    print('MAE',mae)
    print('MSE',mse)
    print('RMSE',rmse)
    print('R2_Score',r2s)

In [28]:
eval_model(y_test,ypred_m1)

MAE 183051.33164286273
MSE 280530510105.7078
RMSE 529651.3099254149
R2_Score -0.04626475996595936


the best accurate model id r2_score

# logistcRegression

In [29]:
from sklearn.linear_model import LogisticRegression

In [30]:
model = LogisticRegression()
model.fit(x_train,y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [31]:
print('train score',model.score(x_train,y_train))
print('test score',model.score(x_test,y_test))

train score 0.07430340557275542
test score 0.04643962848297214


# knn classification

In [32]:
from sklearn.neighbors import KNeighborsClassifier

In [33]:
m1 = KNeighborsClassifier(n_neighbors=11)
m1.fit(x_train,y_train)

In [34]:
print('Train Score',m1.score(x_train,y_train))
print('Test Score',m1.score(x_test,y_test))

Train Score 0.5371517027863777
Test Score 0.44272445820433437


In [35]:
ypred_m1 = m1.predict(x_test)


In [36]:
from sklearn.metrics import confusion_matrix,classification_report

In [37]:
cm = confusion_matrix(y_test,ypred_m1)
print(cm)
print(classification_report(y_test,ypred_m1))

[[0 0 0 ... 0 0 0]
 [0 0 1 ... 0 0 0]
 [0 0 2 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]
              precision    recall  f1-score   support

        8000       0.00      0.00      0.00         0
        9272       0.00      0.00      0.00         1
       10000       0.67      1.00      0.80         2
       12000       0.50      1.00      0.67         1
       15000       0.00      0.00      0.00         0
       19000       0.00      0.00      0.00         1
       20000       0.43      1.00      0.60         3
       21000       0.00      0.00      0.00         1
       22000       0.00      0.00      0.00         1
       23000       0.00      0.00      0.00         0
       24000       0.00      0.00      0.00         2
       25000       0.00      0.00      0.00         1
       28500       0.00      0.00      0.00         1
       30000       0.40      1.00      0.57         2
       31000       0.00      0.00      0.00         1
       33000   

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [38]:
from sklearn.metrics import confusion_matrix, classification_report,accuracy_score

In [39]:
def gen_model(model,x_train,x_test,y_train,y_test):
    model.fit(x_train,y_train)
    print('Train Score',model.score(x_train,y_train))
    print('Test Score',model.score(x_test,y_test))
    ypred = model.predict(x_test)
    cm = confusion_matrix(y_test,ypred)
    print(cm)
    print(classification_report(y_test,ypred))

# decision tree

In [40]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

In [41]:
dt1 = DecisionTreeClassifier(criterion='gini',max_depth=8,min_samples_split=15)
gen_model(dt1,x_train,x_test,y_train,y_test)

Train Score 0.43498452012383904
Test Score 0.37306501547987614
[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]
              precision    recall  f1-score   support

        9272       0.00      0.00      0.00         1
       10000       0.00      0.00      0.00         2
       12000       0.00      0.00      0.00         1
       19000       0.00      0.00      0.00         1
       20000       0.11      1.00      0.20         3
       21000       0.00      0.00      0.00         1
       22000       0.00      0.00      0.00         1
       24000       0.00      0.00      0.00         2
       25000       0.00      0.00      0.00         1
       28500       0.00      0.00      0.00         1
       30000       0.17      1.00      0.29         2
       31000       0.00      0.00      0.00         1
       33000       0.00      0.00      0.00         3
       35000       0.00      0.00      0.00         2
      

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


# Randomforest classifier

In [42]:
rf1 = RandomForestClassifier(n_estimators=70,criterion='gini', max_depth=8, min_samples_split=15)
gen_model(rf1,x_train,x_test,y_train,y_test)

Train Score 0.4551083591331269
Test Score 0.22445820433436534
[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 1 0 0]
 [0 0 0 ... 0 0 0]]
              precision    recall  f1-score   support

        9272       0.00      0.00      0.00         1
       10000       0.00      0.00      0.00         2
       12000       0.00      0.00      0.00         1
       19000       0.00      0.00      0.00         1
       20000       0.00      0.00      0.00         3
       21000       0.00      0.00      0.00         1
       22000       0.00      0.00      0.00         1
       24000       0.00      0.00      0.00         2
       25000       0.00      0.00      0.00         1
       28500       0.00      0.00      0.00         1
       30000       0.14      0.50      0.22         2
       31000       0.00      0.00      0.00         1
       33000       0.00      0.00      0.00         3
       35000       0.00      0.00      0.00         2
       

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


# svc classifier

In [19]:
from sklearn.svm import SVC

In [32]:
m1 = SVC(C=1)
m1.fit(x_train,y_train)

In [21]:
print('Train Score',m1.score(x_train,y_train))  
print('Test Score',m1.score(x_test,y_test))     

Train Score 0.12332301341589268
Test Score 0.09907120743034056


In [22]:
ypred_m1 = m1.predict(x_test)
print(ypred_m1)

[ 50000 100000 200000 120000 100000 200000 200000 120000 250000  50000
 250000 200000 100000 200000 120000 100000 100000 100000 250000 120000
  50000 200000  50000  50000 300000 100000 120000 150000 100000 150000
 200000 120000 100000 200000  50000  80000  60000 100000 200000  60000
 250000 100000  80000 120000 100000 150000 200000 200000 120000 200000
 100000 150000 200000  50000 160000  60000 150000 150000 150000 150000
  60000 200000  80000  50000 100000 200000 100000  50000  50000  50000
 100000 150000 150000 160000 200000 200000 150000  80000 150000 100000
 250000 160000 120000 160000 250000 100000  70000 200000 250000  50000
 200000  80000 150000 150000  50000 100000  50000 100000 150000  80000
 200000 200000 120000  70000 250000  50000  80000 100000  50000 200000
 150000 100000  80000 200000 120000 200000 120000  50000 120000 250000
 100000  60000 120000  50000 200000 100000 200000 200000  80000 150000
 150000 150000 250000 120000 250000 200000 150000 150000  80000 150000
 15000

In [23]:
from sklearn.metrics import confusion_matrix,classification_report

In [24]:
def eval_model(ytest,ypred):
    cm = confusion_matrix(ytest,ypred)
    print(cm)
    print(classification_report(ytest,ypred))

In [25]:
eval_model(y_test,ypred_m1)

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]
              precision    recall  f1-score   support

        9272       0.00      0.00      0.00         1
       10000       0.00      0.00      0.00         2
       12000       0.00      0.00      0.00         1
       19000       0.00      0.00      0.00         1
       20000       0.00      0.00      0.00         3
       21000       0.00      0.00      0.00         1
       22000       0.00      0.00      0.00         1
       24000       0.00      0.00      0.00         2
       25000       0.00      0.00      0.00         1
       28500       0.00      0.00      0.00         1
       30000       0.00      0.00      0.00         2
       31000       0.00      0.00      0.00         1
       33000       0.00      0.00      0.00         3
       35000       0.00      0.00      0.00         2
       36000       0.00      0.00      0.00         2
       38000   

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
