<a href="https://colab.research.google.com/github/unjuken/scalene_svm/blob/main/scalene_svm.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np 
import pandas as pd

from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

import seaborn as sns

In [2]:
# Load the raw CSV into `df`. The file is semicolon-separated and uses a
# comma as the decimal mark, so both are stated explicitly.
df = pd.read_csv(
    r'/content/IMB553-XLS-ENG.csv',
    sep=';',
    decimal=',',
)
df

Unnamed: 0,Candidate Ref,DOJ Extended,Duration to accept offer,Notice period,Offered band,Pecent hike expected in CTC,Percent hike offered in CTC,Percent difference CTC,Joining Bonus,Candidate relocate actual,Gender,Candidate Source,Rex in Yrs,LOB,Location,Age,Status
0,2110407,Yes,14.0,30,E2,-20.79,13.16,42.86,No,No,Female,Agency,7,ERS,Noida,34,Joined
1,2112635,No,18.0,30,E2,50.00,320.00,180.00,No,No,Male,Employee Referral,8,INFRA,Chennai,34,Joined
2,2112838,No,3.0,45,E2,42.84,42.84,0.00,No,No,Male,Agency,4,INFRA,Noida,27,Joined
3,2115021,No,26.0,30,E2,42.84,42.84,0.00,No,No,Male,Employee Referral,4,INFRA,Noida,34,Joined
4,2115125,Yes,1.0,120,E2,42.59,42.59,0.00,No,Yes,Male,Employee Referral,6,INFRA,Noida,34,Joined
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12328,3828260,No,,0,E0,47.54,42.86,-3.18,No,No,Male,Direct,0,BSERV,Chennai,34,Joined
12329,3830270,No,,0,E0,47.54,42.86,-3.18,No,No,Male,Direct,0,BSERV,Chennai,34,Joined
12330,3834159,No,0.0,0,E2,35.42,35.42,0.00,No,No,Male,Direct,5,INFRA,Noida,34,Joined
12331,3835433,No,0.0,30,E1,76.92,53.85,-13.04,No,No,Male,Direct,4,INFRA,Noida,34,Joined


In [3]:
# Summarise the freshly loaded frame: column names, non-null counts and dtypes.
df.info() 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12333 entries, 0 to 12332
Data columns (total 17 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   Candidate Ref                12333 non-null  int64  
 1   DOJ Extended                 12333 non-null  object 
 2   Duration to accept offer     9614 non-null   float64
 3   Notice period                12333 non-null  int64  
 4   Offered band                 12333 non-null  object 
 5   Pecent hike expected in CTC  11586 non-null  float64
 6   Percent hike offered in CTC  11737 non-null  float64
 7   Percent difference CTC       11482 non-null  float64
 8   Joining Bonus                12333 non-null  object 
 9   Candidate relocate actual    12333 non-null  object 
 10  Gender                       12333 non-null  object 
 11  Candidate Source             12333 non-null  object 
 12  Rex in Yrs                   12333 non-null  int64  
 13  LOB             

In [4]:
# Convert each of the listed columns to pandas' `category` dtype, one column
# at a time (same effect as nine separate assignments).
categorical_columns = [
    'DOJ Extended',
    'Offered band',
    'Joining Bonus',
    'Candidate relocate actual',
    'Gender',
    'Candidate Source',
    'LOB',
    'Location',
    'Status',
]
for column in categorical_columns:
    df[column] = df[column].astype('category')

In [5]:
# Re-inspect the dtypes to confirm the columns cast above now report `category`.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12333 entries, 0 to 12332
Data columns (total 17 columns):
 #   Column                       Non-Null Count  Dtype   
---  ------                       --------------  -----   
 0   Candidate Ref                12333 non-null  int64   
 1   DOJ Extended                 12333 non-null  category
 2   Duration to accept offer     9614 non-null   float64 
 3   Notice period                12333 non-null  int64   
 4   Offered band                 12333 non-null  category
 5   Pecent hike expected in CTC  11586 non-null  float64 
 6   Percent hike offered in CTC  11737 non-null  float64 
 7   Percent difference CTC       11482 non-null  float64 
 8   Joining Bonus                12333 non-null  category
 9   Candidate relocate actual    12333 non-null  category
 10  Gender                       12333 non-null  category
 11  Candidate Source             12333 non-null  category
 12  Rex in Yrs                   12333 non-null  int64   
 13  L

In [6]:
# Check missing values per column, expressed as a percentage of all rows.
null_counts = df.isna().sum()
percent_missing = null_counts * 100 / len(df)

# One row per column: its name and the share of rows where it is null.
missing_value_df = pd.DataFrame(
    {
        'column_name': df.columns,
        'percent_missing': percent_missing,
    }
)
missing_value_df

Unnamed: 0,column_name,percent_missing
Candidate Ref,Candidate Ref,0.0
DOJ Extended,DOJ Extended,0.0
Duration to accept offer,Duration to accept offer,22.046542
Notice period,Notice period,0.0
Offered band,Offered band,0.0
Pecent hike expected in CTC,Pecent hike expected in CTC,6.05692
Percent hike offered in CTC,Percent hike offered in CTC,4.832563
Percent difference CTC,Percent difference CTC,6.900186
Joining Bonus,Joining Bonus,0.0
Candidate relocate actual,Candidate relocate actual,0.0


In [16]:
# --- Surrogate (missing-indicator) variables --------------------------------
# For every column that contains at least one NaN, add a 0/1 companion column
# `<name>_surrogate` flagging the rows that were missing. This has to happen
# before imputation, otherwise the information is lost.
for col in df.columns[df.isna().any()]:
    # Explicit int64: plain `int` is a 32-bit integer on some platforms
    # (Windows with NumPy < 2). An int32 column would slip past the
    # int64/float64 dtype filters used below and in the later one-hot
    # encoding cell, and would end up being dummy-encoded.
    df[col + '_surrogate'] = df[col].isna().astype('int64')

# --- Impute categorical variables -------------------------------------------
# Everything that is not int64/float64 is treated as categorical and filled
# with SimpleImputer's constant strategy (default fill value).
# Side effect: the imputer returns a plain array, so `category` columns come
# back as `object` (visible in the df.info() output that follows this cell).
categorical_cols = df.select_dtypes(exclude=['int64', 'float64']).columns
imputer = SimpleImputer(missing_values=np.nan, strategy='constant')
df[categorical_cols] = imputer.fit_transform(df[categorical_cols])

# --- Impute numeric variables -----------------------------------------------
# Replace NaN with the column median. The selection covers every int64/float64
# column, including the surrogate flags created above.
# Side effect: the result is a float array, so integer columns (e.g.
# `Candidate Ref`, `Notice period`) become float64.
# NOTE(review): the imputer is fitted on the full frame; if a train/test split
# happens later, fit it on the training part only to avoid leakage - confirm.
numeric_cols = df.select_dtypes(include=['int64', 'float64']).columns
imputer = SimpleImputer(missing_values=np.nan, strategy='median')
df[numeric_cols] = imputer.fit_transform(df[numeric_cols])

In [17]:
# Verify the imputation: no column should report missing values any more.
# The summary also shows the dtype changes caused by the imputers.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12333 entries, 0 to 12332
Data columns (total 21 columns):
 #   Column                                 Non-Null Count  Dtype  
---  ------                                 --------------  -----  
 0   Candidate Ref                          12333 non-null  float64
 1   DOJ Extended                           12333 non-null  object 
 2   Duration to accept offer               12333 non-null  float64
 3   Notice period                          12333 non-null  float64
 4   Offered band                           12333 non-null  object 
 5   Pecent hike expected in CTC            12333 non-null  float64
 6   Percent hike offered in CTC            12333 non-null  float64
 7   Percent difference CTC                 12333 non-null  float64
 8   Joining Bonus                          12333 non-null  object 
 9   Candidate relocate actual              12333 non-null  object 
 10  Gender                                 12333 non-null  object 
 11  Ca

In [18]:
# Display the imputed frame, including the new `*_surrogate` indicator columns.
df

Unnamed: 0,Candidate Ref,DOJ Extended,Duration to accept offer,Notice period,Offered band,Pecent hike expected in CTC,Percent hike offered in CTC,Percent difference CTC,Joining Bonus,Candidate relocate actual,Gender,Candidate Source,Rex in Yrs,LOB,Location,Age,Status,Duration to accept offer_surrogate,Pecent hike expected in CTC_surrogate,Percent hike offered in CTC_surrogate,Percent difference CTC_surrogate
0,2110407.0,Yes,14.0,30.0,E2,-20.79,13.16,42.86,No,No,Female,Agency,7.0,ERS,Noida,34.0,Joined,0.0,0.0,0.0,0.0
1,2112635.0,No,18.0,30.0,E2,50.00,320.00,180.00,No,No,Male,Employee Referral,8.0,INFRA,Chennai,34.0,Joined,0.0,0.0,0.0,0.0
2,2112838.0,No,3.0,45.0,E2,42.84,42.84,0.00,No,No,Male,Agency,4.0,INFRA,Noida,27.0,Joined,0.0,0.0,0.0,0.0
3,2115021.0,No,26.0,30.0,E2,42.84,42.84,0.00,No,No,Male,Employee Referral,4.0,INFRA,Noida,34.0,Joined,0.0,0.0,0.0,0.0
4,2115125.0,Yes,1.0,120.0,E2,42.59,42.59,0.00,No,Yes,Male,Employee Referral,6.0,INFRA,Noida,34.0,Joined,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12328,3828260.0,No,9.0,0.0,E0,47.54,42.86,-3.18,No,No,Male,Direct,0.0,BSERV,Chennai,34.0,Joined,1.0,0.0,0.0,0.0
12329,3830270.0,No,9.0,0.0,E0,47.54,42.86,-3.18,No,No,Male,Direct,0.0,BSERV,Chennai,34.0,Joined,1.0,0.0,0.0,0.0
12330,3834159.0,No,0.0,0.0,E2,35.42,35.42,0.00,No,No,Male,Direct,5.0,INFRA,Noida,34.0,Joined,0.0,0.0,0.0,0.0
12331,3835433.0,No,0.0,30.0,E1,76.92,53.85,-13.04,No,No,Male,Direct,4.0,INFRA,Noida,34.0,Joined,0.0,0.0,0.0,0.0


In [19]:
# Column/dtype summary of the imputed frame.
# NOTE(review): this repeats the df.info() call made right after imputation
# and produces the same output; one of the two cells could be removed.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12333 entries, 0 to 12332
Data columns (total 21 columns):
 #   Column                                 Non-Null Count  Dtype  
---  ------                                 --------------  -----  
 0   Candidate Ref                          12333 non-null  float64
 1   DOJ Extended                           12333 non-null  object 
 2   Duration to accept offer               12333 non-null  float64
 3   Notice period                          12333 non-null  float64
 4   Offered band                           12333 non-null  object 
 5   Pecent hike expected in CTC            12333 non-null  float64
 6   Percent hike offered in CTC            12333 non-null  float64
 7   Percent difference CTC                 12333 non-null  float64
 8   Joining Bonus                          12333 non-null  object 
 9   Candidate relocate actual              12333 non-null  object 
 10  Gender                                 12333 non-null  object 
 11  Ca

In [26]:
# Remove the limit on the number of displayed columns (by default only 20 are
# shown), so the wide one-hot encoded frame is fully visible.
pd.options.display.max_columns = None

# One-hot encode every non-numeric feature except `Status`, which is kept
# as-is (presumably the label to predict - confirm). `drop_first=True` drops
# the first level of each feature.
#
# Columns are selected as "neither numeric nor bool" rather than
# "not int64/float64". On a first run the result is identical, but the dummy
# columns produced by get_dummies (uint8 or bool, depending on the pandas
# version) are no longer picked up when the cell is executed again, so
# re-running it leaves `df` unchanged instead of encoding the dummies a
# second time. It also keeps integer columns of other widths (e.g. int32)
# out of the encoding.
columns_to_encode = (
    df.select_dtypes(exclude=['number', 'bool']).columns.drop('Status')
)
df = pd.get_dummies(df, columns=columns_to_encode, drop_first=True)

df.head()

Unnamed: 0,Candidate Ref,Duration to accept offer,Notice period,Pecent hike expected in CTC,Percent hike offered in CTC,Percent difference CTC,Rex in Yrs,Age,Status,Duration to accept offer_surrogate,Pecent hike expected in CTC_surrogate,Percent hike offered in CTC_surrogate,Percent difference CTC_surrogate,DOJ Extended_Yes,Offered band_E1,Offered band_E2,Offered band_E3,Offered band_E4,Offered band_E5,Offered band_E6,Joining Bonus_Yes,Candidate relocate actual_Yes,Gender_Male,Candidate Source_Direct,Candidate Source_Employee Referral,LOB_BFSI,LOB_BSERV,LOB_CORP,LOB_CSMP,LOB_EAS,LOB_ERS,LOB_ETS,LOB_Healthcare,LOB_INFRA,LOB_MMS,LOB_SALES,Location_Bangalore,Location_Chennai,Location_Cochin,Location_Coimbatore,Location_Gurgaon,Location_Hyderabad,Location_Kolkata,Location_Mumbai,Location_Noida,Location_Others,Location_Pune
0,2110407.0,14.0,30.0,-20.79,13.16,42.86,7.0,34.0,Joined,0.0,0.0,0.0,0.0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
1,2112635.0,18.0,30.0,50.0,320.0,180.0,8.0,34.0,Joined,0.0,0.0,0.0,0.0,0,0,1,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0
2,2112838.0,3.0,45.0,42.84,42.84,0.0,4.0,27.0,Joined,0.0,0.0,0.0,0.0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0
3,2115021.0,26.0,30.0,42.84,42.84,0.0,4.0,34.0,Joined,0.0,0.0,0.0,0.0,0,0,1,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0
4,2115125.0,1.0,120.0,42.59,42.59,0.0,6.0,34.0,Joined,0.0,0.0,0.0,0.0,1,0,1,0,0,0,0,0,1,1,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0


In [27]:
# Inspect the columns and dtypes of the one-hot encoded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12333 entries, 0 to 12332
Data columns (total 47 columns):
 #   Column                                 Non-Null Count  Dtype  
---  ------                                 --------------  -----  
 0   Candidate Ref                          12333 non-null  float64
 1   Duration to accept offer               12333 non-null  float64
 2   Notice period                          12333 non-null  float64
 3   Pecent hike expected in CTC            12333 non-null  float64
 4   Percent hike offered in CTC            12333 non-null  float64
 5   Percent difference CTC                 12333 non-null  float64
 6   Rex in Yrs                             12333 non-null  float64
 7   Age                                    12333 non-null  float64
 8   Status                                 12333 non-null  object 
 9   Duration to accept offer_surrogate     12333 non-null  float64
 10  Pecent hike expected in CTC_surrogate  12333 non-null  float64
 11  Pe