In [491]:
from sklearn.model_selection import (train_test_split , KFold , cross_val_score , cross_validate , StratifiedKFold)
from sklearn.datasets import make_regression , load_iris
from sklearn.preprocessing import (StandardScaler , MinMaxScaler , RobustScaler , OneHotEncoder , LabelEncoder , OrdinalEncoder)
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import (SimpleImputer , KNNImputer , IterativeImputer)
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [492]:
import pandas as pd
import numpy as np

# **Model selection**

## **Cross Validation**

In [592]:
X_iris , y_iris = load_iris(return_X_y=True)
data = load_iris()
print('feature names:\n' , data.feature_names)
print('-------------------')
print('target names:\n' , data.target_names)

feature names:
 ['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']
-------------------
target names:
 ['setosa' 'versicolor' 'virginica']


In [494]:
model = LogisticRegression(max_iter=1000)
cv = KFold(n_splits=5 , shuffle = True , random_state=42)

scores = cross_val_score(model , X_iris , y_iris , cv = cv , scoring ='accuracy' , n_jobs=-1)

print('fold scores' , scores)
print('mean accuracy' , scores.mean())
print('std' , scores.std())

fold scores [1.         1.         0.93333333 0.96666667 0.96666667]
mean accuracy 0.9733333333333334
std 0.024944382578492935


In [495]:
scores_validate = cross_validate(model , X_iris , y_iris , cv = cv , n_jobs=-1)
print(scores_validate)

{'fit_time': array([0.02089238, 0.02612638, 0.02998519, 0.01643515, 0.01784372]), 'score_time': array([0.00091958, 0.0011127 , 0.00086069, 0.00105882, 0.00063181]), 'test_score': array([1.        , 1.        , 0.93333333, 0.96666667, 0.96666667])}


### **with pipeline**

In [496]:
pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
     ('scaler', StandardScaler()),
    ('clf' , LogisticRegression(max_iter = 1000))
])

cv = KFold(n_splits=5 , shuffle = True , random_state=42)
scores = cross_val_score(pipe , X_iris , y_iris , n_jobs=-1 , cv=cv)
validate_scores = cross_validate(pipe , X_iris , y_iris , cv = cv , n_jobs=-1)
print('scores:\n' , scores)
print('mean:\n' , scores.mean())
print('std:\n' , scores.std())
print()
print('---------------------')
print()
print('validate scores , fit time:\n' , validate_scores['fit_time'])
print('validate scores , score time:\n' , validate_scores['score_time'])
print('validate scores , test score:\n' , validate_scores['test_score'])

scores:
 [1.         0.96666667 0.93333333 0.9        0.96666667]
mean:
 0.9533333333333334
std:
 0.03399346342395189

---------------------

validate scores , fit time:
 [0.01229215 0.01071429 0.01260829 0.01489902 0.00947213]
validate scores , score time:
 [0.00250411 0.00184274 0.00214052 0.00222778 0.00132704]
validate scores , test score:
 [1.         0.96666667 0.93333333 0.9        0.96666667]


In [497]:
cv = StratifiedKFold(n_splits=5 , shuffle = True , random_state = True)

scores = cross_val_score(pipe , X_iris , y_iris , n_jobs=-1 , cv=cv)
validate_scores = cross_validate(pipe , X_iris , y_iris , cv = cv , n_jobs=-1)
print('scores:\n' , scores)
print('mean:\n' , scores.mean())
print('std:\n' , scores.std())
print()
print('---------------------')
print()
print('validate scores , fit time:\n' , validate_scores['fit_time'])
print('validate scores , score time:\n' , validate_scores['score_time'])
print('validate scores , test score:\n' , validate_scores['test_score'])


scores:
 [0.96666667 1.         0.96666667 1.         0.9       ]
mean:
 0.9666666666666668
std:
 0.036514837167011066

---------------------

validate scores , fit time:
 [0.01057458 0.01053953 0.01167727 0.01154494 0.00616765]
validate scores , score time:
 [0.00188208 0.00174618 0.00211263 0.00183535 0.00119615]
validate scores , test score:
 [0.96666667 1.         0.96666667 1.         0.9       ]


## **Train Test Split**

### **making dataset**

In [498]:
X, y = make_regression(n_samples=2000, n_features=5, noise=10, random_state=42)

In [499]:
X_train , X_test , y_train , y_test = train_test_split(X , y , test_size = 0.2 , random_state=42 , shuffle = True)

# **Scaling**

## **Standard Scaler**

In [500]:
standard_scaler = StandardScaler()
standard_scaler.fit(X_train)
standard_scaler_X_train = standard_scaler.transform(X_train)
standard_scaler_X_test = standard_scaler.transform(X_test)

In [501]:
print('standard scaler , mean of each features of dataset:' , standard_scaler.mean_)
print('standard scaler , std of each features of dataset:' , standard_scaler.scale_)

standard scaler , mean of each features of dataset: [-0.01246863 -0.04134265  0.01382606  0.03929304 -0.01703299]
standard scaler , std of each features of dataset: [1.01228085 0.97371981 1.02965    0.98499564 1.01172027]


In [502]:
print(standard_scaler_X_train[:10])

[[-0.73697601  0.95668287 -0.23719397  1.01112762 -0.8966907 ]
 [-1.20526994  2.19730984 -0.67820889 -0.18007732  0.46886209]
 [ 0.60824635  1.12913485  0.27614472  0.33332818  0.64809729]
 [ 0.96477624  2.04808406 -0.31007697 -0.10191979 -1.69231551]
 [ 1.07040366  1.19459842 -2.02817695 -0.22941421 -0.00494392]
 [-0.54314857 -0.8900727  -0.99709335 -1.4737088   0.32744261]
 [ 0.29639258 -0.84035557 -1.14395832 -0.94402636 -0.33700762]
 [-0.6090891  -1.92663866 -0.12769483 -1.53342755  0.03168934]
 [-0.20503144 -0.26337505 -0.13716596  0.6029796   0.11829211]
 [ 1.05414348 -0.98332006  0.9818366  -1.28087639  1.17187863]]


## **Min Max Scaler**

In [503]:
min_max_scaler = MinMaxScaler()
min_max_scaler_X_train = min_max_scaler.fit_transform(X_train)
min_max_scaler_X_test = min_max_scaler.transform(X_test)

In [504]:
print('Min Max scaler , min of each features of dataset:' , min_max_scaler.data_min_)
print('Min Max scaler , max of each features of dataset:' , min_max_scaler.data_max_)

Min Max scaler , min of each features of dataset: [-3.92240025 -3.00763234 -3.68836529 -3.13851467 -3.6010851 ]
Min Max scaler , max of each features of dataset: [3.92623771 3.11768113 3.24309297 3.85273149 3.15205673]


In [505]:
print('Min Max scaler , max of all features after scaling' , min_max_scaler_X_train.max())
print('Min Max scaler , min of all features after scaling' , min_max_scaler_X_train.min())

Min Max scaler , max of all features after scaling 1.0
Min Max scaler , min of all features after scaling 0.0


In [506]:
print(min_max_scaler_X_train[:10])

[[0.40311516 0.63634796 0.49887981 0.59699858 0.39638616]
 [0.34271678 0.83356613 0.4333682  0.42916989 0.60096611]
 [0.57661568 0.66376206 0.57513493 0.50150351 0.62781819]
 [0.62259925 0.80984422 0.48805323 0.44018149 0.27718983]
 [0.63622259 0.67416857 0.23283397 0.42221882 0.52998298]
 [0.42811412 0.34277564 0.38599874 0.24691034 0.57977939]
 [0.53639424 0.350679   0.36418234 0.32153722 0.48023494]
 [0.41960942 0.17799635 0.51514562 0.23849657 0.53547118]
 [0.4717229  0.44239959 0.51373871 0.53949466 0.54844556]
 [0.63412542 0.32795244 0.67996361 0.27407847 0.70628836]]


### **using feature range -> select the min and max**

In [507]:
min_max_scaler_v2 = MinMaxScaler(feature_range=(-10 , 10))

min_max_scaler_v2_X_train = min_max_scaler_v2.fit_transform(X_train)
min_max_scaler_v2_X_test = min_max_scaler_v2.transform(X_test)

In [508]:
print('Min Max scaler v2, min of each features of dataset:' , min_max_scaler_v2.data_min_)
print('Min Max scaler v2, max of each features of dataset:' , min_max_scaler_v2.data_max_)

Min Max scaler v2, min of each features of dataset: [-3.92240025 -3.00763234 -3.68836529 -3.13851467 -3.6010851 ]
Min Max scaler v2, max of each features of dataset: [3.92623771 3.11768113 3.24309297 3.85273149 3.15205673]


In [509]:
print('Min Max scaler v2, max of all features after scaling:' , min_max_scaler_v2_X_train.max())
print('Min Max scaler v2, min of all features after scaling:' , min_max_scaler_v2_X_train.min())

Min Max scaler v2, max of all features after scaling: 10.0
Min Max scaler v2, min of all features after scaling: -10.0


In [510]:
min_max_scaler_v2_X_train[:10]

array([[-1.93769687,  2.72695927, -0.02240381,  1.93997152, -2.0722768 ],
       [-3.14566437,  6.67132251, -1.33263608, -1.41660224,  2.01932221],
       [ 1.5323137 ,  3.27524113,  1.50269859,  0.03007026,  2.55636376],
       [ 2.45198508,  6.19688444, -0.23893541, -1.19637018, -4.45620347],
       [ 2.72445176,  3.48337147, -5.34332061, -1.5556236 ,  0.59965961],
       [-1.43771763, -3.14448714, -2.28002515, -5.0617932 ,  1.59558774],
       [ 0.72788471, -2.98641992, -2.71635329, -3.56925559, -0.39530119],
       [-1.60781168, -6.44007296,  0.30291239, -5.23006851,  0.70942368],
       [-0.56554209, -1.15200815,  0.27477418,  0.78989322,  0.96891113],
       [ 2.6825085 , -3.44095129,  3.59927226, -4.5184306 ,  4.12576718]])

## **Robust Scaler**

In [511]:
robust_scaler = RobustScaler()

robust_scaler_X_train = robust_scaler.fit_transform(X_train)
robust_scaler_X_test = robust_scaler.transform(X_test)

In [512]:
print('robust sclaer, median of each feature', robust_scaler.center_)
print('robust scaler, IQR of each feature' , robust_scaler.scale_)

robust sclaer, median of each feature [ 0.00498441 -0.00458973 -0.02429709  0.01569002 -0.02279351]
robust scaler, IQR of each feature [1.34496684 1.29358683 1.37874312 1.32731872 1.35668339]


In [513]:
print(robust_scaler_X_train[:10])

[[-5.67656929e-01  6.91710925e-01 -1.49486605e-01  7.68134515e-01
  -6.64443630e-01]
 [-9.20115415e-01  1.62556633e+00 -4.78838034e-01 -1.15851867e-01
   3.53890813e-01]
 [ 4.44816234e-01  8.21520466e-01  2.33876455e-01  2.65143420e-01
   4.87551982e-01]
 [ 7.13156220e-01  1.51323983e+00 -2.03915880e-01 -5.78516096e-02
  -1.25776537e+00]
 [ 7.92656036e-01  8.70796771e-01 -1.48699873e+00 -1.52464496e-01
   5.59189742e-04]
 [-4.21773924e-01 -6.98394817e-01 -7.16982026e-01 -1.07584840e+00
   2.48429989e-01]
 [ 2.10101455e-01 -6.60971306e-01 -8.26661277e-01 -6.82774089e-01
  -2.47070856e-01]
 [-4.71403647e-01 -1.47864766e+00 -6.77122835e-02 -1.12016534e+00
   2.78777333e-02]
 [-1.67292185e-01 -2.26661569e-01 -7.47853493e-02  4.65250197e-01
   9.24600661e-02]
 [ 7.80417911e-01 -7.68584766e-01  7.60889528e-01 -9.32748568e-01
   8.78151734e-01]]


# **Encoding**

## **One Hot Encoder**

In [514]:
data = {
    'City': ['Tehran', 'Shiraz', 'Tehran', 'Tabriz', 'Shiraz'],
    'Job': ['Engineer', 'Doctor', 'Artist', 'Engineer', 'Doctor'],
    'Experience': [5, 7, 2, 10, 3],
    'Purchased': ['Yes', 'No', 'Yes', 'No', 'Yes']  # target
}

df = pd.DataFrame(data)
df

Unnamed: 0,City,Job,Experience,Purchased
0,Tehran,Engineer,5,Yes
1,Shiraz,Doctor,7,No
2,Tehran,Artist,2,Yes
3,Tabriz,Engineer,10,No
4,Shiraz,Doctor,3,Yes


In [515]:
X = df.drop(['Experience'  , 'Purchased'] , axis = 1)
y= df['Purchased']

In [516]:
onehot_encoder = OneHotEncoder(sparse_output=False , handle_unknown='ignore')

encoded_X = onehot_encoder.fit_transform(X)

In [517]:
encoded_df = pd.DataFrame(encoded_X , columns = onehot_encoder.get_feature_names_out(X.columns))
encoded_df['Experience'] = df['Experience']

In [518]:
encoded_df

Unnamed: 0,City_Shiraz,City_Tabriz,City_Tehran,Job_Artist,Job_Doctor,Job_Engineer,Experience
0,0.0,0.0,1.0,0.0,0.0,1.0,5
1,1.0,0.0,0.0,0.0,1.0,0.0,7
2,0.0,0.0,1.0,1.0,0.0,0.0,2
3,0.0,1.0,0.0,0.0,0.0,1.0,10
4,1.0,0.0,0.0,0.0,1.0,0.0,3


## **Label Encoder**

In [519]:
label_encoder = LabelEncoder()

In [520]:
df[['City' , 'Job']] = df[['City' , 'Job']].apply(label_encoder.fit_transform)

In [521]:
label_encoder.classes_

array(['Artist', 'Doctor', 'Engineer'], dtype=object)

In [522]:
label_encoder.inverse_transform([0 , 1, 2])

array(['Artist', 'Doctor', 'Engineer'], dtype=object)

In [523]:
data = {
    'City': ['Tehran', 'Shiraz', 'Tehran', 'Tabriz', 'Shiraz'],
    'Job': ['Engineer', 'Doctor', 'Artist', 'Engineer', 'Doctor'],
    'Experience': [5, 7, 2, 10, 3],
    'Purchased': ['Yes', 'No', 'Yes', 'No', 'Yes']  # target
}

df = pd.DataFrame(data)
df

Unnamed: 0,City,Job,Experience,Purchased
0,Tehran,Engineer,5,Yes
1,Shiraz,Doctor,7,No
2,Tehran,Artist,2,Yes
3,Tabriz,Engineer,10,No
4,Shiraz,Doctor,3,Yes


In [524]:
encoders = {}

for col in ['City' , 'Job']:
  label_encoder = LabelEncoder()
  df[col] = label_encoder.fit_transform(df[col])
  encoders[col] = label_encoder.classes_


encoders

{'City': array(['Shiraz', 'Tabriz', 'Tehran'], dtype=object),
 'Job': array(['Artist', 'Doctor', 'Engineer'], dtype=object)}

## **Ordinal Encoder**

In [525]:
data = {
    'City': ['Tehran', 'Shiraz', 'Tehran', 'Tabriz', 'Shiraz'],
    'Job': ['Engineer', 'Doctor', 'Artist', 'Engineer', 'Doctor'],
    'Experience': [5, 7, 2, 10, 3],
    'Purchased': ['Yes', 'No', 'Yes', 'No', 'Yes']  # target
}

df = pd.DataFrame(data)
df

Unnamed: 0,City,Job,Experience,Purchased
0,Tehran,Engineer,5,Yes
1,Shiraz,Doctor,7,No
2,Tehran,Artist,2,Yes
3,Tabriz,Engineer,10,No
4,Shiraz,Doctor,3,Yes


In [526]:
ordinal_encoder = OrdinalEncoder()
X_encoded = ordinal_encoder.fit_transform(X)

In [527]:
ordinal_encoder.categories_

[array(['Shiraz', 'Tabriz', 'Tehran'], dtype=object),
 array(['Artist', 'Doctor', 'Engineer'], dtype=object)]

In [528]:
encoded_df = pd.DataFrame(X_encoded , columns = ordinal_encoder.get_feature_names_out(X.columns))

In [529]:
encoded_df['Experience'] = df['Experience']
encoded_df

Unnamed: 0,City,Job,Experience
0,2.0,2.0,5
1,0.0,1.0,7
2,2.0,0.0,2
3,1.0,2.0,10
4,0.0,1.0,3


# **Imputation**

## **Simple Imputer**

In [530]:
data = {
    'Age': [25, np.nan, 28, 35, np.nan, 40, 23],
    'Salary': [50000, 54000, np.nan, 58000, 60000, np.nan, 52000],
    'City': ['Tehran', 'Shiraz', np.nan, 'Tabriz', 'Shiraz', 'Tehran', np.nan],
    'Purchased': ['Yes', 'No', 'Yes', np.nan, 'No', 'Yes', 'No']
}

df = pd.DataFrame(data)
df

Unnamed: 0,Age,Salary,City,Purchased
0,25.0,50000.0,Tehran,Yes
1,,54000.0,Shiraz,No
2,28.0,,,Yes
3,35.0,58000.0,Tabriz,
4,,60000.0,Shiraz,No
5,40.0,,Tehran,Yes
6,23.0,52000.0,,No


In [531]:
simple_imputer = SimpleImputer(strategy='most_frequent')

In [532]:
df_imputed_most_frequent = simple_imputer.fit_transform(df)

In [533]:
df_imputed_most_frequent

array([[25.0, 50000.0, 'Tehran', 'Yes'],
       [23.0, 54000.0, 'Shiraz', 'No'],
       [28.0, 50000.0, 'Shiraz', 'Yes'],
       [35.0, 58000.0, 'Tabriz', 'No'],
       [23.0, 60000.0, 'Shiraz', 'No'],
       [40.0, 50000.0, 'Tehran', 'Yes'],
       [23.0, 52000.0, 'Shiraz', 'No']], dtype=object)

In [534]:
numeric_only = df.select_dtypes(include=['number'])
cat_only = df.select_dtypes(include=['object'])

In [535]:
simple_imputer = SimpleImputer(strategy = 'mean')

numeric_only_imputed = simple_imputer.fit_transform(numeric_only)

print(numeric_only_imputed)

[[2.50e+01 5.00e+04]
 [3.02e+01 5.40e+04]
 [2.80e+01 5.48e+04]
 [3.50e+01 5.80e+04]
 [3.02e+01 6.00e+04]
 [4.00e+01 5.48e+04]
 [2.30e+01 5.20e+04]]


In [536]:
simple_imputer = SimpleImputer(strategy='median')

numeric_only_imputed = simple_imputer.fit_transform(numeric_only)

print(numeric_only_imputed ,"\n\n", numeric_only)

[[2.5e+01 5.0e+04]
 [2.8e+01 5.4e+04]
 [2.8e+01 5.4e+04]
 [3.5e+01 5.8e+04]
 [2.8e+01 6.0e+04]
 [4.0e+01 5.4e+04]
 [2.3e+01 5.2e+04]] 

     Age   Salary
0  25.0  50000.0
1   NaN  54000.0
2  28.0      NaN
3  35.0  58000.0
4   NaN  60000.0
5  40.0      NaN
6  23.0  52000.0


In [537]:
simple_imputer = SimpleImputer(strategy='constant' , fill_value = -1)

numeric_only_imputed = simple_imputer.fit_transform(numeric_only)

print(numeric_only_imputed ,"\n\n", numeric_only)

[[ 2.5e+01  5.0e+04]
 [-1.0e+00  5.4e+04]
 [ 2.8e+01 -1.0e+00]
 [ 3.5e+01  5.8e+04]
 [-1.0e+00  6.0e+04]
 [ 4.0e+01 -1.0e+00]
 [ 2.3e+01  5.2e+04]] 

     Age   Salary
0  25.0  50000.0
1   NaN  54000.0
2  28.0      NaN
3  35.0  58000.0
4   NaN  60000.0
5  40.0      NaN
6  23.0  52000.0


## **KNN Imputer**

In [538]:
data = {
    'Age': [25, np.nan, 28, 35, np.nan, 40, 23],
    'Salary': [50000, 54000, np.nan, 58000, 60000, np.nan, 52000],
    'City': ['Tehran', 'Shiraz', np.nan, 'Tabriz', 'Shiraz', 'Tehran', np.nan],
    'Purchased': ['Yes', 'No', 'Yes', np.nan, 'No', 'Yes', 'No']
}

df = pd.DataFrame(data)
df

Unnamed: 0,Age,Salary,City,Purchased
0,25.0,50000.0,Tehran,Yes
1,,54000.0,Shiraz,No
2,28.0,,,Yes
3,35.0,58000.0,Tabriz,
4,,60000.0,Shiraz,No
5,40.0,,Tehran,Yes
6,23.0,52000.0,,No


In [539]:
imputer = KNNImputer(n_neighbors=2)

numeric_only_KNN_imputed = imputer.fit_transform(numeric_only)

print(numeric_only_KNN_imputed , '\n\n' , numeric_only)

[[2.5e+01 5.0e+04]
 [2.9e+01 5.4e+04]
 [2.8e+01 5.1e+04]
 [3.5e+01 5.8e+04]
 [2.9e+01 6.0e+04]
 [4.0e+01 5.4e+04]
 [2.3e+01 5.2e+04]] 

     Age   Salary
0  25.0  50000.0
1   NaN  54000.0
2  28.0      NaN
3  35.0  58000.0
4   NaN  60000.0
5  40.0      NaN
6  23.0  52000.0


## **Iterative Imputer**

In [540]:
data = {
    'Age': [25, np.nan, 28, 35, np.nan, 40, 23],
    'Salary': [50000, 54000, np.nan, 58000, 60000, np.nan, 52000],
    'City': ['Tehran', 'Shiraz', np.nan, 'Tabriz', 'Shiraz', 'Tehran', np.nan],
    'Purchased': ['Yes', 'No', 'Yes', np.nan, 'No', 'Yes', 'No']
}

df = pd.DataFrame(data)
df

Unnamed: 0,Age,Salary,City,Purchased
0,25.0,50000.0,Tehran,Yes
1,,54000.0,Shiraz,No
2,28.0,,,Yes
3,35.0,58000.0,Tabriz,
4,,60000.0,Shiraz,No
5,40.0,,Tehran,Yes
6,23.0,52000.0,,No


In [541]:
imputer = IterativeImputer(random_state=42)
df['Age'] = imputer.fit_transform(numeric_only)

In [542]:
df

Unnamed: 0,Age,Salary,City,Purchased
0,25.0,50000.0,Tehran,Yes
1,28.843558,54000.0,Shiraz,No
2,28.0,,,Yes
3,35.0,58000.0,Tabriz,
4,38.044728,60000.0,Shiraz,No
5,40.0,,Tehran,Yes
6,23.0,52000.0,,No


# **Pipe Line**

In [543]:
pipe = Pipeline([
    ('imputer' , SimpleImputer(strategy = 'mean')),
    ('scaler' , StandardScaler()),
    ('clf' , LogisticRegression(max_iter = 5000))
])

In [544]:
scores = cross_val_score(pipe , X_iris , y_iris , cv = 5)
print(scores)

[0.96666667 1.         0.93333333 0.9        1.        ]


# **Column Transformer**

In [545]:
data = {
    'City': ['Tehran', 'Shiraz', 'Tabriz', 'Mashhad', 'Tehran', np.nan],
    'Job': ['Engineer', 'Doctor', 'Artist', 'Lawyer', np.nan, 'Doctor'],
    'Experience': [5, 7, 2, np.nan, 10, 3],
    'Salary': [50000, 54000, np.nan, 62000, 60000, 58000],
    'Purchased': ['Yes', 'No', 'Yes', 'No', 'Yes', 'No']  # target
}

df = pd.DataFrame(data)

df

Unnamed: 0,City,Job,Experience,Salary,Purchased
0,Tehran,Engineer,5.0,50000.0,Yes
1,Shiraz,Doctor,7.0,54000.0,No
2,Tabriz,Artist,2.0,,Yes
3,Mashhad,Lawyer,,62000.0,No
4,Tehran,,10.0,60000.0,Yes
5,,Doctor,3.0,58000.0,No


In [565]:
X= df.drop('Purchased' , axis=1)
y= df["Purchased"]


In [566]:
num_cols = X.select_dtypes(include =['number']).columns.tolist()
cat_cols = X.select_dtypes(include = ['object']).columns.tolist()

In [567]:
preprocessor = ColumnTransformer(transformers=[
    ('num' , Pipeline([
        ('imputer' , SimpleImputer(strategy='mean')),
        ('scaler' , RobustScaler()),
    ]) , num_cols) ,
    ('cat' , Pipeline([
        ('imputer' , SimpleImputer(strategy='most_frequent')),
        ('ohe' , OneHotEncoder())
    ]) , cat_cols)
])

In [568]:
pipe = Pipeline([
    ('preprocessor' , preprocessor),
    ('logistic' , LogisticRegression(max_iter=1000))
])

In [569]:
pipe.fit(X , y)

In [591]:
pipe.predict(pd.DataFrame([{'City':'Shiraz' , 'Job' :'Artist' , 'Experience': 1 , 'Salary':3}]))

array(['Yes'], dtype=object)