In [115]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

import joblib

In [2]:
data = pd.read_csv("salary_predict_dataset.csv")

In [3]:
data

Unnamed: 0,experience,test_score,interview_score,Salary
0,,8.0,8.0,50000
1,,5.0,4.0,22000
2,three,6.0,5.0,30000
3,five,9.0,9.0,55000
4,six,3.0,5.0,13000
5,,2.0,1.0,9000
6,ten,8.0,6.0,48000
7,one,1.0,2.0,500
8,fifteen,9.0,9.0,60000
9,thirteen,7.5,7.5,45000


In [4]:
data.isna().sum()

experience         5
test_score         1
interview_score    1
Salary             0
dtype: int64

In [12]:
pipe = Pipeline(steps=[('impute',SimpleImputer())])

In [17]:
X= data.drop(columns=["experience","Salary"])

In [18]:
y = data["Salary"]

In [19]:
pipe.fit_transform(X)

array([[8.        , 8.        ],
       [5.        , 4.        ],
       [6.        , 5.        ],
       [9.        , 9.        ],
       [3.        , 5.        ],
       [2.        , 1.        ],
       [8.        , 6.        ],
       [1.        , 2.        ],
       [9.        , 9.        ],
       [7.5       , 7.5       ],
       [5.44736842, 5.        ],
       [1.        , 1.        ],
       [3.        , 6.        ],
       [5.        , 4.        ],
       [5.        , 5.65789474],
       [9.        , 8.        ],
       [9.        , 9.        ],
       [2.        , 6.        ],
       [5.        , 5.        ],
       [6.        , 7.        ]])

In [41]:
pipe = Pipeline(steps=[('encode',OrdinalEncoder())])

X= data[["experience"]]
 
X = X.replace({np.nan:'unknown'})

y = data["Salary"]

pipe.fit_transform(X)

array([[10.],
       [10.],
       [ 8.],
       [ 2.],
       [ 5.],
       [10.],
       [ 6.],
       [ 4.],
       [ 1.],
       [ 7.],
       [ 6.],
       [10.],
       [ 4.],
       [ 3.],
       [ 5.],
       [ 0.],
       [ 9.],
       [10.],
       [ 8.],
       [ 2.]])

In [63]:
category_col = data.select_dtypes(exclude = np.number).columns

In [64]:
numeric_col = data.select_dtypes(include = np.number).drop(columns="Salary").columns

In [65]:
category_col

Index(['experience'], dtype='object')

In [66]:
numeric_col

Index(['test_score', 'interview_score'], dtype='object')

In [67]:
category_pipe = Pipeline(steps=[('impute', SimpleImputer(strategy='constant',fill_value = "unknown")),
                                ('encode', OrdinalEncoder())])

In [68]:
numeric_pipe = Pipeline(steps=[('impute',SimpleImputer()),
                               ('scale', StandardScaler())])

In [69]:
preprocessing_pipe = ColumnTransformer([
                    ('cat_pipe', category_pipe, category_col),
                    ('numeric_pipe', numeric_pipe, numeric_col)
                 ])

In [70]:
pipe = Pipeline([
                ('preprocess',preprocessing_pipe),
                ('linearRegression',LinearRegression())
                ])

In [71]:
X = data.drop(columns="Salary")

In [72]:
y = data["Salary"]

In [73]:
pipe.fit(X,y)

Pipeline(steps=[('preprocess',
                 ColumnTransformer(transformers=[('cat_pipe',
                                                  Pipeline(steps=[('impute',
                                                                   SimpleImputer(fill_value='unknown',
                                                                                 strategy='constant')),
                                                                  ('encode',
                                                                   OrdinalEncoder())]),
                                                  Index(['experience'], dtype='object')),
                                                 ('numeric_pipe',
                                                  Pipeline(steps=[('impute',
                                                                   SimpleImputer()),
                                                                  ('scale',
                                                                

In [116]:
pipe

Pipeline(steps=[('preprocess',
                 ColumnTransformer(transformers=[('cat_pipe',
                                                  Pipeline(steps=[('impute',
                                                                   SimpleImputer(fill_value='unknown',
                                                                                 strategy='constant')),
                                                                  ('encode',
                                                                   OrdinalEncoder())]),
                                                  Index(['experience'], dtype='object')),
                                                 ('numeric_pipe',
                                                  Pipeline(steps=[('impute',
                                                                   SimpleImputer()),
                                                                  ('scale',
                                                                

In [110]:
test_data = pd.DataFrame(["five", np.nan, np.nan]).T

In [111]:
test_data.columns = X.columns

In [112]:
test_data

Unnamed: 0,experience,test_score,interview_score
0,five,,


In [113]:
pipe.predict(test_data)

array([31267.68686948])

In [117]:
#Model Dump

In [118]:
joblib.dump(pipe, "pipe.pkl")

['pipe.pkl']