IMPORT

In [11]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder
import re
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from imblearn.over_sampling import RandomOverSampler
from sklearn.feature_selection import SelectPercentile ,chi2

READ DATA

In [2]:
# Load the job-postings sheet from the OpenDocument file; dtype=str asks
# pandas to read every column as text.
df = pd.read_excel(
    "career_levels.ods",
    engine="odf",
    dtype=str,
)

In [3]:
# Preview the first rows to confirm the columns were parsed as expected.
df.head()

Unnamed: 0,title,location,description,function,industry,career_level
0,Technical Professional Lead - Process,"Houston, TX","Responsible for the study, design, and specifi...",production_manufacturing,Machinery and Industrial Facilities Engineering,senior_specialist_or_project_manager
1,Cnslt - Systems Eng- Midrange 1,"Seattle, WA","Participates in design, development and implem...",information_technology_telecommunications,Financial Services,senior_specialist_or_project_manager
2,SharePoint Developers and Solution Architects,"Dallas, TX",We are currently in need of Developers who can...,consulting,IT Consulting,senior_specialist_or_project_manager
3,Business Information Services - Strategic Acco...,North Carolina,Experian is seeking an experienced Account Exe...,sales,"Security, Risk, Restructuring Consulting",senior_specialist_or_project_manager
4,Strategic Development Director (procurement),"Austin, TX",Â Want to join a world-class global procuremen...,procurement_materials_logistics,Information Technology,bereichsleiter


In [4]:
# Count missing values per column before deciding how to handle them.
df.isnull().sum()

title           0
location        0
description     1
function        0
industry        0
career_level    0
dtype: int64

In [5]:
# Drop every row that has a missing value in any column, then print the
# frame summary to confirm that no nulls remain.
df = df.dropna()
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 8073 entries, 0 to 8073
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   title         8073 non-null   object
 1   location      8073 non-null   object
 2   description   8073 non-null   object
 3   function      8073 non-null   object
 4   industry      8073 non-null   object
 5   career_level  8073 non-null   object
dtypes: object(6)
memory usage: 441.5+ KB


In [6]:
# Class distribution of the target label; check it for imbalance before
# splitting, since the split below is stratified on this column.
df["career_level"].value_counts()

career_level
senior_specialist_or_project_manager      4337
manager_team_leader                       2672
bereichsleiter                             960
director_business_unit_leader               70
specialist                                  30
managing_director_small_medium_company       4
Name: count, dtype: int64

DATA PREPROCESSING

In [7]:
def filter_location(string):
    """Collapse a ``"City, ST"`` location to its two-letter state code.

    Parameters
    ----------
    string : str
        Raw location text, e.g. ``"Houston, TX"``.

    Returns
    -------
    str
        The trailing two upper-case letters when ``string`` ends with
        ``", XX"``; otherwise ``string`` unchanged (e.g. ``"North Carolina"``).
    """
    # Raw string: the previous non-raw pattern used the invalid escapes "\," and
    # "\ ", which emit a DeprecationWarning (SyntaxWarning on Python 3.12+).
    # The "$" anchor means there can be at most one match, so search suffices.
    match = re.search(r", [A-Z]{2}$", string)
    if match is None:
        return string
    return match.group(0)[-2:]
target = "career_level"

# Normalise locations BEFORE building the feature frame. DataFrame.drop returns
# a copy, so the previous order (drop first, then edit df) left x holding the
# raw, unfiltered location strings and the clean-up never reached the model.
df["location"] = df["location"].apply(filter_location)
x = df.drop(target, axis=1)
y = df[target]

# Stratify on the label so each class keeps its proportion in both splits.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=27, stratify=y
)

# Oversample the training split only, so the test split keeps the original
# class distribution.
# NOTE(review): random oversampling duplicates rows, and the later 5-fold
# cross-validation runs on this already-resampled x_train, so copies of one row
# can land in both the fitting and validation folds and inflate CV scores.
# Consider moving the sampler into an imblearn.pipeline.Pipeline so it is
# applied inside each fold instead.
ros = RandomOverSampler(random_state=27)
x_train, y_train = ros.fit_resample(x_train, y_train)

In [8]:
# Per-column feature extraction. Text columns are selected with a bare string
# (TfidfVectorizer expects a 1-D column of documents); categorical columns are
# selected with a list (OneHotEncoder expects 2-D input).
column_steps = [
    (
        "title",
        TfidfVectorizer(stop_words="english", ngram_range=(1, 1)),
        "title",
    ),
    (
        "location",
        OneHotEncoder(handle_unknown="ignore"),
        ["location"],
    ),
    (
        # Float min_df / max_df are document-frequency proportions: terms in
        # fewer than 1% or more than 95% of descriptions are discarded.
        "description",
        TfidfVectorizer(
            stop_words="english",
            ngram_range=(1, 2),
            min_df=0.01,
            max_df=0.95,
        ),
        "description",
    ),
    (
        "function",
        OneHotEncoder(handle_unknown="ignore"),
        ["function"],
    ),
    (
        "industry",
        TfidfVectorizer(stop_words="english", ngram_range=(1, 1)),
        "industry",
    ),
]
preprocessor = ColumnTransformer(transformers=column_steps)

In [9]:
# Full modelling pipeline: vectorise/encode the columns, keep the top
# percentile of features by chi-squared score, then fit a random forest.
cls = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("feature_selection", SelectPercentile(chi2, percentile=5)),
    # Seeded with the same value as the split and the oversampler so a
    # Restart & Run All reproduces the same forest.
    ("model", RandomForestClassifier(random_state=27)),
])

# Force the description column to unicode text for TfidfVectorizer. The cast is
# now applied to BOTH splits; previously only x_train was converted, so the
# frame used for prediction was not guaranteed to get the same treatment.
# .assign returns a new frame, which avoids writing into a slice of x.
x_train = x_train.assign(description=x_train["description"].values.astype("U"))
x_test = x_test.assign(description=x_test["description"].values.astype("U"))

TRAINING MODEL

In [12]:
# Hyper-parameter search space; keys follow the "<step>__<param>" convention of
# the pipeline defined above.
para_grid = {
    "model__n_estimators": [50, 100, 200],
    # "log_loss" was dropped: scikit-learn documents it as the same Shannon
    # information-gain criterion as "entropy", so listing both only spent
    # search iterations on duplicate candidates.
    "model__criterion": ["gini", "entropy"],
    "feature_selection__percentile": [1, 5, 8],
    "preprocessor__description__ngram_range": [(1, 1), (1, 2)],
}

# Sample 5 candidates from the space and score each with 5-fold CV.
# random_state pins WHICH candidates are sampled; without it every run
# evaluated a different subset and results could not be reproduced.
rand_search = RandomizedSearchCV(
    cls,
    param_distributions=para_grid,
    n_iter=5,
    cv=5,
    scoring="f1_weighted",
    n_jobs=6,
    verbose=1,
    random_state=27,
)
rand_search.fit(x_train, y_train)

# Evaluate the refit best estimator on the untouched (non-oversampled) test split.
y_predict = rand_search.predict(x_test)
print(classification_report(y_test, y_predict))

Fitting 5 folds for each of 54 candidates, totalling 270 fits


KeyboardInterrupt: 