# Import libraries

In [45]:
# Mount Google Drive into the Colab runtime at /content/gdrive so the
# dataset files stored on Drive can be read like local files.
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [46]:
# Make the dataset folder the working directory so the relative file names
# used further down (e.g. 'aug_test.csv') resolve against it.
import os
os.chdir('/content/gdrive/My Drive/data/archive')

In [47]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from scipy import stats
from scipy.stats import norm
from scipy.stats import multivariate_normal as mn

In [48]:
!pip install category_encoders
import category_encoders as ce

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


# **1. Import the Test Set - Preprocess, get ready for modeling**

In [49]:
# Load the raw test split.
test = pd.read_csv('aug_test.csv')

# The source data misspells "relevant" both in the column name and in the
# column's values; fix both so later code can use the correct spelling.
test = test.rename(columns={'relevent_experience': 'relevant_experience'})
# Vectorised, literal (non-regex) replacement; unlike a Python-level
# map/lambda it does not raise if the column ever contains a missing value.
test['relevant_experience'] = test['relevant_experience'].str.replace(
    'relevent', 'relevant', regex=False
)
test.head()

Unnamed: 0,enrollee_id,city,city_development_index,gender,relevant_experience,enrolled_university,education_level,major_discipline,experience,company_size,company_type,last_new_job,training_hours
0,32403,city_41,0.827,Male,Has relevant experience,Full time course,Graduate,STEM,9,<10,,1,21
1,9858,city_103,0.92,Female,Has relevant experience,no_enrollment,Graduate,STEM,5,,Pvt Ltd,1,98
2,31806,city_21,0.624,Male,No relevant experience,no_enrollment,High School,,<1,,Pvt Ltd,never,15
3,27385,city_13,0.827,Male,Has relevant experience,no_enrollment,Masters,STEM,11,10/49,Pvt Ltd,1,39
4,27724,city_103,0.92,Male,Has relevant experience,no_enrollment,Graduate,STEM,>20,10000+,Pvt Ltd,>4,72


The test set and its labels are stored in separate files. Let's merge them into a single DataFrame.

In [50]:
# The labels for the test rows are stored separately as a NumPy array file.
test_values = np.load('jobchange_test_target_values.npy')
test_values

array([1., 0., 1., ..., 0., 0., 0.])

In [51]:
# Merge labels into test set
# The array is attached by position (element i becomes the target of row i).
# NOTE(review): this assumes the .npy file is in the same row order as
# aug_test.csv — confirm against the data source.
test['target'] = test_values
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2129 entries, 0 to 2128
Data columns (total 14 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   enrollee_id             2129 non-null   int64  
 1   city                    2129 non-null   object 
 2   city_development_index  2129 non-null   float64
 3   gender                  1621 non-null   object 
 4   relevant_experience     2129 non-null   object 
 5   enrolled_university     2098 non-null   object 
 6   education_level         2077 non-null   object 
 7   major_discipline        1817 non-null   object 
 8   experience              2124 non-null   object 
 9   company_size            1507 non-null   object 
 10  company_type            1495 non-null   object 
 11  last_new_job            2089 non-null   object 
 12  training_hours          2129 non-null   int64  
 13  target                  2129 non-null   float64
dtypes: float64(2), int64(2), object(10)
memo

# **2. Preprocessing**

## **2.1. Data Cleaning**

### **2.1.1. Check Duplicates**

In [52]:
# Check for duplicate values
# `duplicated()` flags rows that repeat an earlier row across all columns;
# printing the selection lists them (an empty frame means there are none).
dup = test.duplicated()
print(test[dup])

Empty DataFrame
Columns: [enrollee_id, city, city_development_index, gender, relevant_experience, enrolled_university, education_level, major_discipline, experience, company_size, company_type, last_new_job, training_hours, target]
Index: []


### **2.1.2. Dealing with unnecessary columns & incorrect data types**

In [53]:
# Remove the two columns that are not used further, and store the enrollee
# identifier as text rather than as a number.
test = (
    test
    .drop(columns=['city', 'company_size'])
    .astype({'enrollee_id': str})
)
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2129 entries, 0 to 2128
Data columns (total 12 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   enrollee_id             2129 non-null   object 
 1   city_development_index  2129 non-null   float64
 2   gender                  1621 non-null   object 
 3   relevant_experience     2129 non-null   object 
 4   enrolled_university     2098 non-null   object 
 5   education_level         2077 non-null   object 
 6   major_discipline        1817 non-null   object 
 7   experience              2124 non-null   object 
 8   company_type            1495 non-null   object 
 9   last_new_job            2089 non-null   object 
 10  training_hours          2129 non-null   int64  
 11  target                  2129 non-null   float64
dtypes: float64(2), int64(1), object(9)
memory usage: 199.7+ KB


### **2.1.3. Removing special characters**

In [54]:
# Map the open-ended / textual levels onto numeric strings so both columns
# can be converted to numbers:
#   experience:   '>20' -> 21, '<1'    -> 0.5
#   last_new_job: '>4'  -> 5,  'never' -> 0
test['experience'] = test['experience'].replace({'>20':'21', '<1':'0.5'})
test['last_new_job'] = test['last_new_job'].replace({'>4':'5', 'never':'0'})
change = ['experience', 'last_new_job']
# Convert column by column (apply's default axis). The earlier axis=1 call
# invoked pd.to_numeric once per row, which is far slower and mixes the two
# columns' values in every conversion.
test[change] = test[change].apply(pd.to_numeric)
print(test['experience'].dtype, test['last_new_job'].dtype)

float64 float64


### **2.1.4. Handling Missing Values**

In [55]:
# Count the missing values in every column before any imputation.
test.isna().sum()

enrollee_id                 0
city_development_index      0
gender                    508
relevant_experience         0
enrolled_university        31
education_level            52
major_discipline          312
experience                  5
company_type              634
last_new_job               40
training_hours              0
target                      0
dtype: int64

In [56]:
# Rows without an `experience` value are discarded rather than imputed.
test = test.dropna(subset=['experience'])

In [57]:
# Sanity check: `test` should still be a DataFrame after dropping rows.
print(type(test))

<class 'pandas.core.frame.DataFrame'>


In [58]:
# Fill missing categorical values with a new explicit class "Unknown".
col_with_null = ['gender', 'enrolled_university', 'education_level', 'major_discipline', 'company_type']

# Assign the filled columns back to the frame. The chained form
# `test[col].fillna(..., inplace=True)` operates on a temporary column
# object: recent pandas versions warn about it, and under Copy-on-Write it
# leaves `test` unchanged.
test[col_with_null] = test[col_with_null].fillna('Unknown')

# The numeric `last_new_job` column is imputed with its own mean, rounded to
# two decimals.
test['last_new_job'] = test['last_new_job'].fillna(test['last_new_job'].mean().round(2))

In [59]:
# Re-count missing values per column to confirm the imputation above.
test.isna().sum()

enrollee_id               0
city_development_index    0
gender                    0
relevant_experience       0
enrolled_university       0
education_level           0
major_discipline          0
experience                0
company_type              0
last_new_job              0
training_hours            0
target                    0
dtype: int64

### **2.1.5 Handling Outliers**

In [60]:
def z_score(df, threshold=1):
    """Flag the values that lie within `threshold` standard deviations of the mean.

    Parameters
    ----------
    df : array-like of numbers
        The values to screen (the notebook passes a single DataFrame column).
    threshold : float, default 1
        Cut-off applied to the absolute z-score.

    Returns
    -------
    Boolean mask aligned with `df`: True where |z| < threshold (value kept),
    False where the value is rejected as an outlier.

    Side effects
    ------------
    Prints the two-sided normal tail probability for `threshold` and the
    number of rejected points.
    """
    mean, std = np.mean(df), np.std(df)
    abs_z = np.abs((df - mean) / std)
    good = abs_z < threshold
    # The tail probability has to be evaluated at the requested threshold; it
    # was previously hard-coded to 3, so the message was wrong for any other
    # threshold (including the default of 1).
    tail_prob = 100 * 2 * norm.sf(threshold)
    print(f"z-score of {threshold} corresponds to a prob of {tail_prob:0.2f}%")
    print(f"Rejection {(~good).sum()} points")
    return good

In [61]:
# Screen `training_hours` with a 3-sigma cut-off, keep the rows that pass,
# and report the largest value that survived.
good = z_score(test['training_hours'], threshold=3)
good_hours = test.loc[good]
print(good_hours['training_hours'].max())

z-score of 3 corresponds to a prob of 0.27%
Rejection 57 points
244


In [62]:
# Use the largest `training_hours` value that passed the z-score screen as
# the cut-off instead of a hard-coded number, so the filter stays correct if
# the data or the threshold changes (for this dataset the cut-off is 244).
max_training_hours = good_hours['training_hours'].max()
condition = test['training_hours'] <= max_training_hours
test = test[condition]
# Outliers in training_hours have been removed; the column's maximum is now
# the cut-off computed above.
test.describe()

Unnamed: 0,city_development_index,experience,last_new_job,training_hours,target
count,2067.0,2067.0,2067.0,2067.0,2067.0
mean,0.824981,10.12119,2.005206,58.879052,0.272375
std,0.125156,6.828503,1.630119,47.890403,0.44529
min,0.448,0.5,0.0,1.0,0.0
25%,0.698,4.0,1.0,23.0,0.0
50%,0.903,9.0,1.0,46.0,0.0
75%,0.92,16.0,3.0,83.0,1.0
max,0.949,21.0,5.0,244.0,1.0


## **2.2. Encoding categorical data**

In [63]:
# Nominal attributes (no order between categories) -> one-hot encoded below
nom = [
    'gender',
    'major_discipline',
    'company_type',
]
# Ordinal attributes (categories with a rank) -> mapped to integer codes
# NOTE(review): the name `ord` shadows Python's builtin `ord` from here on.
ord = [
    'education_level',
    'enrolled_university',
    'relevant_experience',
]

In [64]:
def Ordinal_Encode(df):
  """Return a copy of `df` with the three ordinal text columns mapped to integer codes.

  Columns encoded (higher code = higher level):
    education_level     : Unknown=0, Primary School=1, High School=2,
                          Graduate=3, Masters=4, Phd=5
    enrolled_university : Unknown=0, no_enrollment=1, Part time course=2,
                          Full time course=3
    relevant_experience : No relevant experience=0, Has relevant experience=1

  The input frame is not modified. A value that is not listed in a column's
  mapping becomes NaN in the result.
  """
  ordinal_maps = {
      'education_level': {'Unknown': 0, 'Primary School': 1, 'High School': 2,
                          'Graduate': 3, 'Masters': 4, 'Phd': 5},
      'enrolled_university': {'Unknown': 0, 'no_enrollment': 1,
                              'Part time course': 2, 'Full time course': 3},
      'relevant_experience': {'No relevant experience': 0,
                              'Has relevant experience': 1},
  }

  data = df.copy()
  for column, mapping in ordinal_maps.items():
    # Plain column assignment replaces the column, so it takes the numeric
    # dtype of the mapped codes. `data.loc[:, column] = ...` writes into the
    # existing column in place on pandas >= 2.0, leaving the codes stored
    # with object dtype.
    data[column] = data[column].map(mapping)
  return data

# Replace the three ordinal text columns with their integer codes and
# preview the result.
test = Ordinal_Encode(test)
test.head()

Unnamed: 0,enrollee_id,city_development_index,gender,relevant_experience,enrolled_university,education_level,major_discipline,experience,company_type,last_new_job,training_hours,target
0,32403,0.827,Male,1,3,3,STEM,9.0,Unknown,1.0,21,1.0
1,9858,0.92,Female,1,1,3,STEM,5.0,Pvt Ltd,1.0,98,0.0
2,31806,0.624,Male,0,1,2,Unknown,0.5,Pvt Ltd,0.0,15,1.0
3,27385,0.827,Male,1,1,4,STEM,11.0,Pvt Ltd,1.0,39,0.0
4,27724,0.92,Male,1,1,3,STEM,21.0,Pvt Ltd,5.0,72,1.0


In [66]:
nom = ['gender', 'major_discipline', 'company_type']  # nominal columns to expand into dummy columns

# One Hot encoding base function
def one_hot_encoding(df,col):
    """Fit a category_encoders OneHotEncoder on `col` and return the encoded frame.

    `col` is handed to the encoder's `cols` argument. `use_cat_names=True`
    requests dummy columns named after the category values, and
    `return_df=True` requests a DataFrame result.
    """
    encoder = ce.OneHotEncoder(cols=col, use_cat_names=True, return_df=True)
    return encoder.fit_transform(df)

# Expand one nominal column per pass, feeding each result into the next pass.
for col in nom:
  test = one_hot_encoding(test, col)

test.head(5)

Unnamed: 0,enrollee_id,city_development_index,gender_Male,gender_Female,gender_Unknown,gender_Other,relevant_experience,enrolled_university,education_level,major_discipline_STEM,...,company_type_Unknown,company_type_Pvt Ltd,company_type_Funded Startup,company_type_Other,company_type_Public Sector,company_type_Early Stage Startup,company_type_NGO,last_new_job,training_hours,target
0,32403,0.827,1,0,0,0,1,3,3,1,...,1,0,0,0,0,0,0,1.0,21,1.0
1,9858,0.92,0,1,0,0,1,1,3,1,...,0,1,0,0,0,0,0,1.0,98,0.0
2,31806,0.624,1,0,0,0,0,1,2,0,...,0,1,0,0,0,0,0,0.0,15,1.0
3,27385,0.827,1,0,0,0,1,1,4,1,...,0,1,0,0,0,0,0,1.0,39,0.0
4,27724,0.92,1,0,0,0,1,1,3,1,...,0,1,0,0,0,0,0,5.0,72,1.0


In [67]:
def investigator(df):
  """Print a quick summary of `df`: its shape, per-column null counts and dtypes.

  Nothing is returned; the three reports are written to standard output.
  """
  frame_shape = df.shape
  nulls_per_column = df.isna().sum()
  column_dtypes = df.dtypes

  print("Shape of the DataFrame: ", frame_shape, "\n")
  print("Num of missing values: ", "\n", nulls_per_column, "\n")
  print("Datatypes of all features in the DataFrame: ", column_dtypes)
  

# Final check of the encoded frame: shape, remaining nulls and dtypes.
investigator(test)

Shape of the DataFrame:  (2067, 27) 

Num of missing values:  
 enrollee_id                         0
city_development_index              0
gender_Male                         0
gender_Female                       0
gender_Unknown                      0
gender_Other                        0
relevant_experience                 0
enrolled_university                 0
education_level                     0
major_discipline_STEM               0
major_discipline_Unknown            0
major_discipline_Other              0
major_discipline_Business Degree    0
major_discipline_Arts               0
major_discipline_Humanities         0
major_discipline_No Major           0
experience                          0
company_type_Unknown                0
company_type_Pvt Ltd                0
company_type_Funded Startup         0
company_type_Other                  0
company_type_Public Sector          0
company_type_Early Stage Startup    0
company_type_NGO                    0
last_new_job            

# **3. Export the final test set**
so that we can dive into modeling at any time.

In [68]:
# Write the cleaned, encoded test set to disk without the index column, then
# read the file back and preview it to confirm the export.
# NOTE(review): enrollee_id was cast to str earlier; read_csv will presumably
# infer it as an integer column again — confirm if the dtype matters later.
test.to_csv('test_with_labels.csv', index = False)
test_cleaned = pd.read_csv('test_with_labels.csv')
test_cleaned.head()

Unnamed: 0,enrollee_id,city_development_index,gender_Male,gender_Female,gender_Unknown,gender_Other,relevant_experience,enrolled_university,education_level,major_discipline_STEM,...,company_type_Unknown,company_type_Pvt Ltd,company_type_Funded Startup,company_type_Other,company_type_Public Sector,company_type_Early Stage Startup,company_type_NGO,last_new_job,training_hours,target
0,32403,0.827,1,0,0,0,1,3,3,1,...,1,0,0,0,0,0,0,1.0,21,1.0
1,9858,0.92,0,1,0,0,1,1,3,1,...,0,1,0,0,0,0,0,1.0,98,0.0
2,31806,0.624,1,0,0,0,0,1,2,0,...,0,1,0,0,0,0,0,0.0,15,1.0
3,27385,0.827,1,0,0,0,1,1,4,1,...,0,1,0,0,0,0,0,1.0,39,0.0
4,27724,0.92,1,0,0,0,1,1,3,1,...,0,1,0,0,0,0,0,5.0,72,1.0


In [69]:
# Number of rows in the re-loaded, exported test set.
print(len(test_cleaned))

2067
