In [1]:
import pandas as pd

In [83]:
df = pd.read_csv('XY_train.csv',encoding='latin-1')
df.head(5)

Unnamed: 0,enrollee_id,city,city_development_index,gender,relevent_experience,enrolled_university,education_level,major_discipline,experience,company_size,company_type,last_new_job,training_hours,target
0,13163,city_16,0.91,,No relevent experience,no_enrollment,High School,,2,,,never,16,1
1,9357,city_136,0.897,Male,Has relevent experience,no_enrollment,Masters,STEM,6,1000-4999,Pvt Ltd,1,262,0
2,604,city_16,0.91,Male,No relevent experience,,,,3,,,never,24,0
3,19033,city_16,0.91,Male,No relevent experience,no_enrollment,High School,,3,,,1,35,0
4,4825,city_145,0.555,Male,Has relevent experience,Full time course,Graduate,STEM,4,,,1,43,1


# Pre-processing

## Dealing with missing values

#### "gender", "company_size", "company_type"  - I added an unknown category
#### "enrolled_university", "major_discipline", "education_level", "last_new_job" - fillna with the mode value: "no_enrollment", "STEM"

In [84]:
df["gender"] = df["gender"].fillna("unknown")
df["company_size"] = df["company_size"].fillna("unknown")
df["company_type"] = df["company_type"].fillna("unknown")
print(df["enrolled_university"].mode())
df["enrolled_university"] = df["enrolled_university"].fillna("no_enrollment")
print(df["major_discipline"].mode())
df["major_discipline"] = df["major_discipline"].fillna("STEM")
print(df["last_new_job"].mode())
df["last_new_job"] = df["last_new_job"].fillna("1")
print(df["education_level"].mode())
df["education_level"] = df["education_level"].fillna("Graduate")

0    no_enrollment
dtype: object
0    STEM
dtype: object
0    1
dtype: object
0    Graduate
dtype: object


#### "experience" - cut to 4 level ("low", "low-middle", "middle-high", "high")

In [85]:
print(df["experience"].unique())
df["experience"] = df["experience"].replace("<1","1")
df["experience"] = df["experience"].replace(">20","20")
df["experience"] = pd.to_numeric(df["experience"])
print(df["experience"].unique())
print(df["experience"].mode())
df["experience"] = df["experience"].fillna(20)
print(df["experience"].value_counts())
bins = [0,5,9,15,20]
labels = ["low", "low-middle", "middle-high", "high"]
df["experience"] = pd.cut(df["experience"], bins= bins, labels= labels)
print(df["experience"].value_counts())

['2' '6' '3' '4' '>20' '13' '19' '10' '1' '5' '7' '20' '14' '16' '12' '15'
 '8' '<1' '17' '9' '11' '18' nan]
[ 2.  6.  3.  4. 20. 13. 19. 10.  1.  5.  7. 14. 16. 12. 15.  8. 17.  9.
 11. 18. nan]
0    20.0
dtype: float64
20.0    2783
5.0     1161
4.0     1097
3.0     1084
6.0      979
2.0      889
1.0      841
7.0      825
9.0      804
10.0     789
8.0      636
15.0     572
11.0     525
14.0     478
16.0     403
12.0     393
13.0     313
17.0     282
19.0     249
18.0     223
Name: experience, dtype: int64
low            5072
high           3940
low-middle     3244
middle-high    3070
Name: experience, dtype: int64


In [86]:
df.isna().sum()

enrollee_id               0
city                      0
city_development_index    0
gender                    0
relevent_experience       0
enrolled_university       0
education_level           0
major_discipline          0
experience                0
company_size              0
company_type              0
last_new_job              0
training_hours            0
target                    0
dtype: int64

In [87]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15326 entries, 0 to 15325
Data columns (total 14 columns):
 #   Column                  Non-Null Count  Dtype   
---  ------                  --------------  -----   
 0   enrollee_id             15326 non-null  int64   
 1   city                    15326 non-null  object  
 2   city_development_index  15326 non-null  float64 
 3   gender                  15326 non-null  object  
 4   relevent_experience     15326 non-null  object  
 5   enrolled_university     15326 non-null  object  
 6   education_level         15326 non-null  object  
 7   major_discipline        15326 non-null  object  
 8   experience              15326 non-null  category
 9   company_size            15326 non-null  object  
 10  company_type            15326 non-null  object  
 11  last_new_job            15326 non-null  object  
 12  training_hours          15326 non-null  int64   
 13  target                  15326 non-null  int64   
dtypes: category(1), float6

In [64]:
df_city = df[["city","city_development_index"]]
df_city_group = df_city.groupby(by=["city","city_development_index"]).count().sort_values(by="city_development_index")
df_city_group.head()

# Feature Representation

### LabelEncoder - Order is important

In [88]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

In [106]:
cols = ["education_level", "experience", "relevent_experience", "gender", "major_discipline", "last_new_job", "company_size", "enrolled_university", "company_type"]
for column in cols:
    df[column] = le.fit_transform(df[column])
df.head(10)

Unnamed: 0,enrollee_id,city,city_development_index,gender,relevent_experience,enrolled_university,education_level,major_discipline,experience,company_size,company_type,last_new_job,training_hours,target
0,13163,city_16,0.91,3,1,2,1,5,1,7,6,5,16,1
1,9357,city_136,0.897,1,0,2,2,5,2,1,5,0,262,0
2,604,city_16,0.91,1,1,2,0,5,1,7,6,5,24,0
3,19033,city_16,0.91,1,1,2,1,5,1,7,6,0,35,0
4,4825,city_145,0.555,1,0,0,0,5,1,7,6,0,43,1
5,3759,city_136,0.897,1,0,2,2,5,0,4,5,1,18,0
6,23234,city_160,0.92,1,0,2,0,5,2,6,5,1,29,0
7,32330,city_84,0.698,1,0,2,2,5,3,3,5,4,122,1
8,22965,city_139,0.487,1,0,2,2,5,0,7,6,0,52,1
9,28416,city_160,0.92,1,0,0,0,5,3,7,6,1,41,0


### get_dummies - No matter the order

In [94]:
cols_02 = ["gender", "major_discipline", "last_new_job", "company_size"]
df_dummies = df["enrollee_id"]
df_dummies = pd.get_dummies(df["gender"], drop_first=True)
df_dummies.head(10)

Unnamed: 0,Male,Other,unknown
0,0,0,1
1,1,0,0
2,1,0,0
3,1,0,0
4,1,0,0
5,1,0,0
6,1,0,0
7,1,0,0
8,1,0,0
9,1,0,0


# Feature Selection – Wrappers

### Implementing Forward selection using built-in functions in Python: SequentialFeatureSelector

In [99]:
from sklearn.feature_selection import SequentialFeatureSelector as SFS
from sklearn.linear_model import LogisticRegression as LGR
from sklearn.ensemble import RandomForestClassifier as rfc

In [None]:
y = df["target"]
df = df.drop(columns = ["target","city"])
x = df

In [121]:
sfs_selector = SFS(estimator=LGR, n_features_to_select = 5, cv =10, direction ="forward")
sfs_selector.fit(x, y)
x.columns[sfs_selector.get_support()]

Index(['city_development_index', 'education_level', 'major_discipline',
       'experience', 'company_type'],
      dtype='object')