In [1]:
# HR Analytics: Job Change of Data Scientists
# https://www.kaggle.com/arashnic/hr-analytics-job-change-of-data-scientists?select=aug_train.csv

In [2]:
import os
import pandas as pd

def load_data():
    csv_path = os.path.join("aug_train.csv")
    return pd.read_csv(csv_path)

In [3]:
project_data = load_data()
project_data.head()

Unnamed: 0,enrollee_id,city,city_development_index,gender,relevent_experience,enrolled_university,education_level,major_discipline,experience,company_size,company_type,last_new_job,training_hours,target
0,8949,city_103,0.92,Male,Has relevent experience,no_enrollment,Graduate,STEM,>20,,,1,36,1.0
1,29725,city_40,0.776,Male,No relevent experience,no_enrollment,Graduate,STEM,15,50-99,Pvt Ltd,>4,47,0.0
2,11561,city_21,0.624,,No relevent experience,Full time course,Graduate,STEM,5,,,never,83,0.0
3,33241,city_115,0.789,,No relevent experience,,Graduate,Business Degree,<1,,Pvt Ltd,never,52,1.0
4,666,city_162,0.767,Male,Has relevent experience,no_enrollment,Masters,STEM,>20,50-99,Funded Startup,4,8,0.0


In [4]:
fields = ["relevent_experience", "gender", "enrolled_university", "education_level", "major_discipline",
          "experience", "company_size", "company_type", "last_new_job", "city"]

for field in project_data:
    print(f"NaN: {project_data[field].isna().sum()}")
    print(project_data[field].value_counts())
    print()

NaN: 0
2047     1
2692     1
4743     1
27272    1
25225    1
        ..
17682    1
19731    1
29972    1
25878    1
6147     1
Name: enrollee_id, Length: 19158, dtype: int64

NaN: 0
city_103    4355
city_21     2702
city_16     1533
city_114    1336
city_160     845
            ... 
city_121       3
city_111       3
city_129       3
city_171       1
city_140       1
Name: city, Length: 123, dtype: int64

NaN: 0
0.920    5200
0.624    2702
0.910    1533
0.926    1336
0.698     683
         ... 
0.649       4
0.807       4
0.781       3
0.625       3
0.664       1
Name: city_development_index, Length: 93, dtype: int64

NaN: 4508
Male      13221
Female     1238
Other       191
Name: gender, dtype: int64

NaN: 0
Has relevent experience    13792
No relevent experience      5366
Name: relevent_experience, dtype: int64

NaN: 386
no_enrollment       13817
Full time course     3757
Part time course     1198
Name: enrolled_university, dtype: int64

NaN: 460
Graduate          11598
Masters      

In [5]:
project_data

Unnamed: 0,enrollee_id,city,city_development_index,gender,relevent_experience,enrolled_university,education_level,major_discipline,experience,company_size,company_type,last_new_job,training_hours,target
0,8949,city_103,0.920,Male,Has relevent experience,no_enrollment,Graduate,STEM,>20,,,1,36,1.0
1,29725,city_40,0.776,Male,No relevent experience,no_enrollment,Graduate,STEM,15,50-99,Pvt Ltd,>4,47,0.0
2,11561,city_21,0.624,,No relevent experience,Full time course,Graduate,STEM,5,,,never,83,0.0
3,33241,city_115,0.789,,No relevent experience,,Graduate,Business Degree,<1,,Pvt Ltd,never,52,1.0
4,666,city_162,0.767,Male,Has relevent experience,no_enrollment,Masters,STEM,>20,50-99,Funded Startup,4,8,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19153,7386,city_173,0.878,Male,No relevent experience,no_enrollment,Graduate,Humanities,14,,,1,42,1.0
19154,31398,city_103,0.920,Male,Has relevent experience,no_enrollment,Graduate,STEM,14,,,4,52,1.0
19155,24576,city_103,0.920,Male,Has relevent experience,no_enrollment,Graduate,STEM,>20,50-99,Pvt Ltd,4,44,0.0
19156,5756,city_65,0.802,Male,Has relevent experience,no_enrollment,High School,,<1,500-999,Pvt Ltd,2,97,0.0


In [6]:
project_data.describe()

Unnamed: 0,enrollee_id,city_development_index,training_hours,target
count,19158.0,19158.0,19158.0,19158.0
mean,16875.358179,0.828848,65.366896,0.249348
std,9616.292592,0.123362,60.058462,0.432647
min,1.0,0.448,1.0,0.0
25%,8554.25,0.74,23.0,0.0
50%,16982.5,0.903,47.0,0.0
75%,25169.75,0.92,88.0,0.0
max,33380.0,0.949,336.0,1.0


In [7]:
import numpy as np

np.random.seed(42)
project_data.shape

(19158, 14)

In [8]:
corr_matrix = project_data.corr()

In [9]:
corr_matrix

Unnamed: 0,enrollee_id,city_development_index,training_hours,target
enrollee_id,1.0,-0.040455,0.000998,0.049475
city_development_index,-0.040455,1.0,0.00192,-0.341665
training_hours,0.000998,0.00192,1.0,-0.021577
target,0.049475,-0.341665,-0.021577,1.0


In [10]:
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline

numeric_attributes = ["city_development_index", "training_hours"]
almost_numeric_attributes = ["last_new_job", "company_size", "experience", "education_level"]

# numeric_data = project_data.select_dtypes(include=np.number)
numeric_data = project_data[numeric_attributes]

num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('std_scaler', StandardScaler()),
])

num_pipeline.fit_transform(numeric_data)

array([[ 0.73891926, -0.4889846 ],
       [-0.42840976, -0.30582494],
       [-1.66059039,  0.29360665],
       ...,
       [ 0.73891926, -0.35577758],
       [-0.21764202,  0.52671894],
       [ 0.21199991,  1.02624527]])

In [11]:
from sklearn.base import BaseEstimator, TransformerMixin

def replace_end_value(x):
    if x == '>20':
        return 21
    elif x == '<1':
        return 0
    else:
        try:
            return int(x)
        except:
            # return np.nan
            return 0

class ExperienceTransformer(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        return X.applymap(replace_end_value)
    
experience_data = project_data[["experience"]]
experience_pipeline = Pipeline([
    ('extreme_value_removal', ExperienceTransformer()),
    ('std_scaler', StandardScaler()),
])

experience_pipeline.fit_transform(experience_data)

array([[ 1.61014154],
       [ 0.7265914 ],
       [-0.74599217],
       ...,
       [ 1.61014154],
       [-1.48228396],
       [-1.18776725]])

In [12]:
def binary_hre(x):
    if x == "Has relevent experience":
        return 1
    else:
        return 0
    
class RelevantExperienceTransformer(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        return X.applymap(binary_hre)

re_data = project_data[["relevent_experience"]]
relevant_experience_pipeline = Pipeline([
    ('set_to_binary', RelevantExperienceTransformer()),
])
relevant_experience_pipeline.fit_transform(re_data)

Unnamed: 0,relevent_experience
0,1
1,0
2,0
3,0
4,1
...,...
19153,0
19154,1
19155,1
19156,1


In [13]:
class PandarizerTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, columns):
        self.columns = columns
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        return pd.DataFrame(X, columns=self.columns)

project_data[["enrolled_university"]]

def to_enroll_transform(x):
    if x == "no_enrollment" or x == "0":
        return 0
    elif x == "Part time course":
        return 1
    elif x == "Full time course":
        return 2
    else:
        return x

class EnrollmentTransformer(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        return X.applymap(to_enroll_transform)


enrollment_pipeline = Pipeline([
    ('fill_imputer', SimpleImputer(strategy='constant', fill_value="0")),
    ("pandarizer", PandarizerTransformer(["enrolled_university"])),
    ('enroll_numeric_transformer', EnrollmentTransformer()),
])
enrollment_pipeline.fit_transform(project_data[["enrolled_university"]])

Unnamed: 0,enrolled_university
0,0
1,0
2,2
3,0
4,0
...,...
19153,0
19154,0
19155,0
19156,0


In [14]:
def to_education_level_transform(x):
    if x == "Primary School":
        return 0.00
    elif x == "High School":
        return 0.25
    elif x == "Graduate":
        return 0.50
    elif x == "Masters":
        return 0.75
    elif x == "Phd":
        return 1.00
    else:
        return 0.00

class EducationLevelTransformer(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        return X.applymap(to_education_level_transform)

education_level_pipeline = Pipeline([
    ('education_level_numeric_transformer', EducationLevelTransformer()),
])

education_level_pipeline.fit_transform(project_data[["education_level"]])

Unnamed: 0,education_level
0,0.50
1,0.50
2,0.50
3,0.50
4,0.75
...,...
19153,0.50
19154,0.50
19155,0.50
19156,0.25


In [15]:
'''
NaN: 5938
50-99        3083
100-500      2571
10000+       2019
10/49        1471
1000-4999    1328
<10          1308
500-999       877
5000-9999     563
'''

def to_company_size_transform(x):
    if x == "<10":
        return 5
    elif x == "10000+":
        return 10000
    elif x == "10/49":
        return 30
    elif '-' in x:
        a, b = list(map(int, x.split('-')))
        return (a+b) // 2
    else:
        return int(x)

class CompanySizeTransformer:
    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        return X.applymap(to_company_size_transform)

company_size_pipeline = Pipeline([
    ('fill_imputer', SimpleImputer(strategy="constant", fill_value="15")),
    ('pandarizer', PandarizerTransformer(["company_size"])),
    ('company_size_numeric_transformer', CompanySizeTransformer()),
    ("std_scaler", StandardScaler()),
])

company_size_pipeline.fit_transform(project_data[["company_size"]])

array([[-0.48470471],
       [-0.46638324],
       [-0.48470471],
       ...,
       [-0.46638324],
       [-0.25677317],
       [-0.48470471]])

In [16]:
# NaN: 423
# 1        8040
# >4       3290
# 2        2900
# never    2452
# 4        1029
# 3        1024
# Name: last_new_job, dtype: int64

def lnj_to_numeric(x):
    if x in ("1", "2", "3", "4"):
        return int(x)
    elif x == ">4":
        return 5
    elif x == "never":
        return 10
    else:
        return x

class LNJNumericTransformer:
    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        return X.applymap(lnj_to_numeric)

last_new_job_pipeline = Pipeline([
    ('numeric_transformer', LNJNumericTransformer()),
    ('fill_nan_imputer', SimpleImputer()),
    ('pandarizer', PandarizerTransformer(["last_new_job"])),
])

last_new_job_pipeline.fit_transform(project_data[["last_new_job"]])

Unnamed: 0,last_new_job
0,1.0
1,5.0
2,10.0
3,10.0
4,4.0
...,...
19153,1.0
19154,4.0
19155,4.0
19156,2.0


In [17]:
# NaN: 2813
# STEM               14492
# Humanities           669
# Other                381
# Business Degree      327
# Arts                 253
# No Major             223
# Name: major_discipline, dtype: int64

# md_imputer = SimpleImputer(fill_value="Unlisted Major", strategy="constant")
# md_no_nan = md_imputer.fit_transform(project_data[["major_discipline"]])
# print(md_no_nan)

from sklearn.preprocessing import OneHotEncoder

major_discipline_pipeline = Pipeline([
    ('md_imputer', SimpleImputer(fill_value="Unlisted Major", strategy="constant")),
    ('md_one_hot_encoder', OneHotEncoder())
])

# print(project_data[["major_discipline"]].head(5))
# major_discipline_pipeline.fit_transform(project_data[["major_discipline"]]).toarray()[:5]
major_discipline_pipeline.fit_transform(project_data[["major_discipline"]]).toarray()

# [array(['Arts', 'Business Degree', 'Humanities', 'No Major', 'Other',
#         'STEM', 'Unlisted Major'], dtype=object)]

# major_discipline_encoder = OneHotEncoder()
# md_ohe = major_discipline_encoder.fit_transform(md_no_nan)
# md_ohe.toarray()
# major_discipline_encoder.categories_

array([[0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 1., 0.],
       ...,
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 0., 1.],
       [0., 0., 0., ..., 0., 0., 1.]])

In [18]:
# NaN: 4508
# Male      13221
# Female     1238
# Other       191
# Name: gender, dtype: int64

# Categories: [array(['Female', 'Male', 'Other', 'Unlisted Gender'], dtype=object)]

gender_encoding_pipeline = Pipeline([
    ('gender_imputer', SimpleImputer(fill_value="Unlisted Gender", strategy="constant")),
    ('gender_one_hot_encoder', OneHotEncoder()),
])

gender_encoding_pipeline.fit_transform(project_data[["gender"]]).toarray()

array([[0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 0., 0., 1.],
       ...,
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 0., 0., 1.]])

In [19]:
# NaN: 6140
# Pvt Ltd                9817
# Funded Startup         1001
# Public Sector           955
# Early Stage Startup     603
# NGO                     521
# Other                   121
# Name: company_type, dtype: int64

# categories: [array(['Early Stage Startup', 'Funded Startup', 'NGO', 'Other',
#         'Public Sector', 'Pvt Ltd', 'Unlisted Company Type'], dtype=object)]

company_type_pipeline = Pipeline([
    ("company_type_imputer", SimpleImputer(fill_value="Unlisted Company Type", strategy="constant")),
    ("company_type_one_hot_encoder", OneHotEncoder()),
])

company_type_pipeline.fit_transform(project_data[["company_type"]]).toarray()

array([[0., 0., 0., ..., 0., 0., 1.],
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 0., 1.],
       ...,
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 0., 1.]])

In [20]:
result_pandarizer = PandarizerTransformer([[
    "city_development_index", "training_hours", "experience", "has_relevant_experience", "enrolled_university",
    "education_level", "company_size", "last_new_job", "major_discipline_Arts", "major_discipline_Business",
    "major_discipline_Humanities", "major_discipline_No_Major", "major_discipline_Other", "major_discipline_STEM",
    "major_discipline_Unlisted_Major", "gender_female", "gender_male", "gender_other", "gender_unlisted",
    "company_early_startup", "company_funded_startup", "company_ngo", "company_other", "company_public_sector",
    "company_pvt_ltd", "company_unlisted", "target"
]])

In [21]:
class IdentityTransformer(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        return X

In [22]:
from sklearn.compose import ColumnTransformer

full_pipeline = ColumnTransformer([
    ("numeric_attributes", num_pipeline, ["city_development_index", "training_hours"]),
    ("experience", experience_pipeline, ["experience"]),
    ("relevant_experience", relevant_experience_pipeline, ["relevent_experience"]),
    ("enrolled_university", enrollment_pipeline, ["enrolled_university"]),
    ("education_level", education_level_pipeline, ["education_level"]),
    ("company_size", company_size_pipeline, ["company_size"]),
    ("last_new_job", last_new_job_pipeline, ["last_new_job"]),
    ("major_discipline", major_discipline_pipeline, ["major_discipline"]),
    ("gender", gender_encoding_pipeline, ["gender"]),
    ("company_type", company_type_pipeline, ["company_type"]),
    ("target", IdentityTransformer(), ["target"])
#     ("pandarizer", result_pandarizer, []),
#     ("pandarizer", PandarizerTransformer, ["city_development_index", "training_hours", "experience",
#                                           "has_relevant_experience", "enrolled_university", "education_level",
#                                           "company_size", "last_new_job", "major_discipline_Arts",
#                                           "major_discipline_Business", "major_discipline_Humanities",
#                                           "major_discipline_No_Major", "major_discipline_Other",
#                                           "major_discipline_STEM", "major_discipline_Unlisted_Major",
#                                           "gender_female", "gender_male", "gender_other", "gender_unlisted",
#                                           "company_early_startup", "company_funded_startup", "company_ngo",
#                                           "company_other", "company_public_sector", "company_pvt_ltd",
#                                           "company_unlisted", "target"]),
])

# [array(['Early Stage Startup', 'Funded Startup', 'NGO', 'Other',
#         'Public Sector', 'Pvt Ltd', 'Unlisted Company Type'], dtype=object)]
data_prepared = result_pandarizer.fit_transform(full_pipeline.fit_transform(project_data))
# print(data_prepared.head(1))
data_prepared.corr()

Unnamed: 0,city_development_index,training_hours,experience,has_relevant_experience,enrolled_university,education_level,company_size,last_new_job,major_discipline_Arts,major_discipline_Business,...,gender_other,gender_unlisted,company_early_startup,company_funded_startup,company_ngo,company_other,company_public_sector,company_pvt_ltd,company_unlisted,target
city_development_index,1.0,0.00192,0.334238,0.059808,-0.165555,0.041468,0.073157,-0.054952,0.066403,0.036718,...,0.030444,-0.161336,-0.051432,0.03456,0.023154,0.015417,0.060069,0.033483,-0.071801,-0.341665
training_hours,0.00192,1.0,0.000369,0.011566,-0.003943,-0.013522,-0.006639,-0.009102,-0.009813,0.001805,...,-0.002503,0.001779,0.012374,0.006373,0.00693,0.004823,-0.012649,-0.00153,-0.003365,-0.021577
experience,0.334238,0.000369,1.0,0.338764,-0.336866,0.270663,0.10606,-0.020254,0.006891,0.017059,...,-0.016679,-0.164116,-0.054093,0.020969,0.008775,0.006407,0.018687,0.137482,-0.149879,-0.176898
has_relevant_experience,0.059808,0.011566,0.338764,1.0,-0.372919,0.243095,0.106335,-0.286443,0.013098,-0.016522,...,-0.035689,-0.105871,0.049863,0.096309,-0.007198,0.015981,-0.069175,0.305266,-0.359494,-0.12843
enrolled_university,-0.165555,-0.003943,-0.336866,-0.372919,1.0,-0.143825,-0.096102,0.090617,-0.044033,-0.039136,...,0.017824,0.097047,0.007768,-0.063089,0.005246,-0.005784,0.047878,-0.197633,0.215688,0.149539
education_level,0.041468,-0.013522,0.270663,0.243095,-0.143825,1.0,0.129585,-0.209047,0.015952,0.034129,...,-0.030771,-0.047973,0.010474,0.037055,0.056206,-0.002032,0.097464,0.120589,-0.21545,-0.006527
company_size,0.073157,-0.006639,0.10606,0.106335,-0.096102,0.129585,1.0,-0.055276,-0.018931,0.004298,...,0.001147,-0.025924,-0.085951,-0.104696,0.007226,0.016965,0.089728,0.300549,-0.28708,-0.074445
last_new_job,-0.054952,-0.009102,-0.020254,-0.286443,0.090617,-0.209047,-0.055276,1.0,-0.017872,-0.01301,...,0.010292,0.039231,-0.055377,-0.109869,-0.037141,-0.006431,-0.028577,-0.098859,0.206361,0.009036
major_discipline_Arts,0.066403,-0.009813,0.006891,0.013098,-0.044033,0.015952,-0.018931,-0.017872,1.0,-0.015244,...,0.020608,-0.010275,0.013191,0.007769,0.017204,-0.003451,-0.001285,-0.016139,0.003836,-0.010659
major_discipline_Business,0.036718,0.001805,0.017059,-0.016522,-0.039136,0.034129,0.004298,-0.01301,-0.015244,1.0,...,-0.001055,-0.003748,-0.016831,-0.000155,-0.002212,-0.000332,-0.011667,0.008415,0.003626,0.004157


In [23]:
from numpy import ravel

y_train = ravel(data_prepared[["target"]])
print(y_train)

X_train = data_prepared.drop(["target"], axis=1)
X_train

[1. 0. 0. ... 0. 0. 0.]


  obj = obj._drop_axis(labels, axis, level=level, errors=errors)


Unnamed: 0,city_development_index,training_hours,experience,has_relevant_experience,enrolled_university,education_level,company_size,last_new_job,major_discipline_Arts,major_discipline_Business,...,gender_male,gender_other,gender_unlisted,company_early_startup,company_funded_startup,company_ngo,company_other,company_public_sector,company_pvt_ltd,company_unlisted
0,0.738919,-0.488985,1.610142,1.0,0.0,0.50,-0.484705,1.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,-0.428410,-0.305825,0.726591,0.0,0.0,0.50,-0.466383,5.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,-1.660590,0.293607,-0.745992,0.0,2.0,0.50,-0.484705,10.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,-0.323026,-0.222571,-1.482284,0.0,0.0,0.50,-0.484705,10.0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,-0.501368,-0.955209,1.610142,1.0,0.0,0.75,-0.466383,4.0,0.0,0.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19153,0.398448,-0.389079,0.579333,0.0,0.0,0.50,-0.484705,1.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
19154,0.738919,-0.222571,0.579333,1.0,0.0,0.50,-0.484705,4.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
19155,0.738919,-0.355778,1.610142,1.0,0.0,0.50,-0.466383,4.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
19156,-0.217642,0.526719,-1.482284,1.0,0.0,0.25,-0.256773,2.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [45]:
X_train.drop(labels=["company_other", "gender_other"], axis=1)

  obj = obj._drop_axis(labels, axis, level=level, errors=errors)


Unnamed: 0,city_development_index,training_hours,experience,has_relevant_experience,enrolled_university,education_level,company_size,last_new_job,major_discipline_Arts,major_discipline_Business,...,major_discipline_Unlisted_Major,gender_female,gender_male,gender_unlisted,company_early_startup,company_funded_startup,company_ngo,company_public_sector,company_pvt_ltd,company_unlisted
0,0.738919,-0.488985,1.610142,1.0,0.0,0.50,-0.484705,1.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,-0.428410,-0.305825,0.726591,0.0,0.0,0.50,-0.466383,5.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,-1.660590,0.293607,-0.745992,0.0,2.0,0.50,-0.484705,10.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
3,-0.323026,-0.222571,-1.482284,0.0,0.0,0.50,-0.484705,10.0,0.0,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
4,-0.501368,-0.955209,1.610142,1.0,0.0,0.75,-0.466383,4.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19153,0.398448,-0.389079,0.579333,0.0,0.0,0.50,-0.484705,1.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
19154,0.738919,-0.222571,0.579333,1.0,0.0,0.50,-0.484705,4.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
19155,0.738919,-0.355778,1.610142,1.0,0.0,0.50,-0.466383,4.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
19156,-0.217642,0.526719,-1.482284,1.0,0.0,0.25,-0.256773,2.0,0.0,0.0,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [46]:
from sklearn.linear_model import SGDClassifier

clf = SGDClassifier(loss="log", penalty="l2", max_iter=100)
clf.fit(X_train, y_train)

SGDClassifier(alpha=0.0001, average=False, class_weight=None,
              early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,
              l1_ratio=0.15, learning_rate='optimal', loss='log', max_iter=100,
              n_iter_no_change=5, n_jobs=None, penalty='l2', power_t=0.5,
              random_state=None, shuffle=True, tol=0.001,
              validation_fraction=0.1, verbose=0, warm_start=False)

In [47]:
s = clf.score(X_train, y_train)
print(f"score: {s}\n")

v = clf.predict(X_train.iloc[:30])
print(f"v: {v}\n")

ds = clf.decision_function(X_train.iloc[:30])
print(f"ds: {ds}\n")

score: 0.7614573546299196

v: [0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0.]

ds: [-0.95549954 -1.67629556 -0.21528353 -2.33680722 -2.24411012  0.00912639
 -3.41717826 -1.846745   -2.02196433 -2.82831325 -1.24691032 -2.02833082
 -2.34615987 -1.35553197 -1.26118702 -2.69922992 -3.53366801 -0.64130906
 -1.9088908  -0.21103846 -1.84300292 -2.39651085 -0.56144689 -1.21256962
 -0.54736011 -2.79642736 -0.5972614  -0.9323013  -2.95174572 -1.10174459]



In [48]:
from sklearn.linear_model import LogisticRegression

log_reg_clf = LogisticRegression(random_state=11, max_iter=300)
log_reg_clf.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=300,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=11, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [49]:
log_reg_clf.classes_

array([0., 1.])

In [50]:
log_reg_clf.decision_function(X_train[:30])

array([-1.13895279, -1.19036381,  1.03776852, -0.97933527, -1.8907794 ,
        0.09441056, -3.30796953, -1.35401897, -2.0555783 , -2.38280284,
        0.09190724, -2.08418182, -2.28052011, -0.02933093,  0.05207204,
       -2.34813549, -2.30583378, -0.23007942, -0.61516047, -0.15125525,
       -1.85699839, -2.35815222, -0.43634199, -1.08915709, -0.24137119,
       -1.51781674, -0.36905581, -0.85190594, -2.92727285, -0.83042112])

In [53]:
# log_reg_clf.score(X_train, y_train)
log_reg_clf.predict_proba(X_train)

array([[0.75748732, 0.24251268],
       [0.76680612, 0.23319388],
       [0.26158079, 0.73841921],
       ...,
       [0.91489008, 0.08510992],
       [0.91573746, 0.08426254],
       [0.74689949, 0.25310051]])