# Data Processing

## Import data and select features

In [1]:
# import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Load the salary table and discard its leading (0th) column right away
df = pd.read_csv('ds_salaries.csv').iloc[:, 1:]

# Column positions are counted AFTER the leading column has been removed
feature_positions = [0, 1, 2, 3, 5, 8, 9, 10]  # independent variables (features)
target_position = 6                            # salary_in_usd, the value to predict

X = df.iloc[:, feature_positions].values
y = df.iloc[:, target_position].values

In [3]:
# Feature matrix: dimensions on the first line, (truncated) contents below
print(X.shape, X, sep='\n')

(607, 8)
[[2020 'MI' 'FT' ... 0 'DE' 'L']
 [2020 'SE' 'FT' ... 0 'JP' 'S']
 [2020 'SE' 'FT' ... 50 'GB' 'M']
 ...
 [2022 'SE' 'FT' ... 0 'US' 'M']
 [2022 'SE' 'FT' ... 100 'US' 'M']
 [2022 'MI' 'FT' ... 100 'US' 'L']]


In [4]:
# Target vector: dimensions on the first line, first ten values below
print(y.shape, y[:10], sep='\n')

(607,)
[ 79833 260000 109024  20000 150000  72000 190000  35735 135000 125000]


## Impute missing data

In [5]:
from sklearn.impute import SimpleImputer

# Fill NaNs in the third-from-last feature column with that column's mean.
# The index is a one-element list so the slice stays 2-D.
# NOTE(review): the imputer is fitted on every row of X, before the train/test
# split done later in this notebook, so the mean also reflects test-set rows.
impute_cols = [-3]
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
X[:, impute_cols] = imputer.fit_transform(X[:, impute_cols])

In [6]:
# Re-print X after imputation; the imputed column now holds floats (e.g. 0.0, 50.0)
print(X)

[[2020 'MI' 'FT' ... 0.0 'DE' 'L']
 [2020 'SE' 'FT' ... 0.0 'JP' 'S']
 [2020 'SE' 'FT' ... 50.0 'GB' 'M']
 ...
 [2022 'SE' 'FT' ... 0.0 'US' 'M']
 [2022 'SE' 'FT' ... 100.0 'US' 'M']
 [2022 'MI' 'FT' ... 100.0 'US' 'L']]


In [7]:
# Wrap the array in a DataFrame only to get a readable preview of the first rows
imputed_preview = pd.DataFrame(X)
imputed_preview.head()

Unnamed: 0,0,1,2,3,4,5,6,7
0,2020,MI,FT,Data Scientist,EUR,0.0,DE,L
1,2020,SE,FT,Machine Learning Scientist,USD,0.0,JP,S
2,2020,SE,FT,Big Data Engineer,GBP,50.0,GB,M
3,2020,MI,FT,Product Data Analyst,USD,0.0,HN,S
4,2020,SE,FT,Machine Learning Engineer,USD,50.0,US,L


## Encoding categorical data

### Encoding the independent variables

In [8]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder

# One-hot encode the categorical features at positions 1 and 2 of X.
#   * drop='first' discards the first category of each encoded feature, which
#     avoids the dummy-variable trap (perfectly collinear indicator columns).
#   * remainder='passthrough' keeps every column that is not listed; they are
#     appended, unchanged, after the encoded columns. The default is 'drop'.
# NOTE(review): the passed-through columns still include string-valued ones,
# which stay unencoded after this step.
one_hot = OneHotEncoder(categories='auto', drop='first')
ct = ColumnTransformer(
    transformers=[('encoder', one_hot, [1, 2])],
    remainder='passthrough',
)
X = ct.fit_transform(X)

In [9]:
# Dimensions after encoding, then a preview of the first rows
print(X.shape)
encoded_preview = pd.DataFrame(X)
encoded_preview.head()

(607, 12)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
0,0.0,1.0,0.0,0.0,1.0,0.0,2020,Data Scientist,EUR,0.0,DE,L
1,0.0,0.0,1.0,0.0,1.0,0.0,2020,Machine Learning Scientist,USD,0.0,JP,S
2,0.0,0.0,1.0,0.0,1.0,0.0,2020,Big Data Engineer,GBP,50.0,GB,M
3,0.0,1.0,0.0,0.0,1.0,0.0,2020,Product Data Analyst,USD,0.0,HN,S
4,0.0,0.0,1.0,0.0,1.0,0.0,2020,Machine Learning Engineer,USD,50.0,US,L


### Encoding the dependent variable

In [10]:
# LabelEncoder can be used to encode a categorical target. We won't do that
# here: y (salary_in_usd) is already numeric. For reference:
# from sklearn.preprocessing import LabelEncoder
# le = LabelEncoder()
# y = le.fit_transform(y)

## Splitting the dataset into training and test set

In [11]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
)

In [12]:
# Training features: dimensions on the first line, (truncated) contents below
print(X_train.shape, X_train, sep='\n')

(485, 12)
[[0.0 1.0 0.0 ... 50.0 'CA' 'L']
 [0.0 0.0 1.0 ... 100.0 'US' 'L']
 [0.0 0.0 1.0 ... 100.0 'US' 'M']
 ...
 [0.0 0.0 0.0 ... 50.0 'GB' 'L']
 [0.0 1.0 0.0 ... 0.0 'US' 'S']
 [0.0 0.0 0.0 ... 50.0 'US' 'L']]


In [13]:
# Test features: dimensions on the first line, (truncated) contents below
print(X_test.shape, X_test, sep='\n')

(122, 12)
[[0.0 0.0 0.0 ... 50.0 'DE' 'S']
 [0.0 1.0 0.0 ... 100.0 'ES' 'M']
 [0.0 1.0 0.0 ... 100.0 'US' 'L']
 ...
 [0.0 0.0 1.0 ... 0.0 'US' 'L']
 [0.0 0.0 0.0 ... 100.0 'US' 'L']
 [0.0 1.0 0.0 ... 100.0 'IR' 'M']]


## Feature Scaling

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardise only the numeric, non-dummy columns.
# After the one-hot step the layout of X is:
#   0-5   one-hot indicator columns (left unscaled)
#   6     the year column (numeric)
#   7, 8  string columns
#   9     the column that was mean-imputed earlier (numeric)
#   10-11 string columns
# The previous slice `[:, 6:]` also covered the string columns, which
# StandardScaler cannot convert to float, so the cell could not run.
numeric_cols = [6, 9]

sc = StandardScaler()
# Fit on the training rows only, then reuse the same mean/std for the test rows
# so no test-set statistics leak into the scaler.
# astype(float) is needed because the arrays have object dtype (mixed columns).
X_train[:, numeric_cols] = sc.fit_transform(X_train[:, numeric_cols].astype(float))
X_test[:, numeric_cols] = sc.transform(X_test[:, numeric_cols].astype(float))
# NOTE(review): columns 7, 8, 10 and 11 are still strings and will need
# encoding before X can be passed to a numeric estimator.