In [16]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder
import re
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, recall_score, f1_score
from sklearn.feature_selection import SelectKBest, chi2, SelectPercentile
from sklearn.svm import SVC
from imblearn.over_sampling import RandomOverSampler, SMOTEN

def filter_location(location):
    result = re.findall("\,\s[A-Z]{2}$", location)
    if len(result) > 0:
        return result[0][2:]
    else:
        return location
    

data = pd.read_excel("../datasets/final_project.ods", engine="odf", dtype=str)
# print(data.info())  # xem co cot bi khuyet khong
data.dropna(axis=0) # bo di o bi khuyet
# print(len(data["industry"].unique()))
data["location"] = data["location"].apply(filter_location)

#Step1: Divide data
target = "career_level"
x = data.drop(target, axis=1)
y = data[target]
# print(y.value_counts())
# print("--------------------")

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=2201, stratify=y) # stratify: phan chia dong deu cho tung class
# print(y_train.value_counts())

# Balance data
# ros = RandomOverSampler(random_state=42)
x_train["description"] = x_train["description"].fillna("")
ros = SMOTEN(random_state=42, k_neighbors=2, sampling_strategy={
    "bereichsleiter": 1000,
    "director_business_unit_leader": 500,
    "specialist": 500,
    "managing_director_small_medium_company": 100
})
print(y_train.value_counts())
print("--------------------")
x_train, y_train = ros.fit_resample(x_train, y_train)
print(y_train.value_counts())

# career_level
# senior_specialist_or_project_manager      3470
# manager_team_leader                       2138
# bereichsleiter                             768
# director_business_unit_leader               56
# specialist                                  24
# managing_director_small_medium_company       3
# Name: count, dtype: int64
# --------------------
# career_level
# senior_specialist_or_project_manager      3470
# bereichsleiter                            3470
# manager_team_leader                       3470
# specialist                                3470
# director_business_unit_leader             3470
# managing_director_small_medium_company    3470
# Name: count, dtype: int64

#Step2: Data preprocessing

# Preprocessing for "title" column
# vectorizer = TfidfVectorizer()
# output = vectorizer.fit_transform(x_train["title"])
# output = pd.DataFrame.sparse.from_spmatrix(output)
# print(vectorizer.vocabulary_)
# print(len(vectorizer.vocabulary_))
# print(output.shape)

# Preprocessing for "location" column
# encoder = OneHotEncoder()
# output = encoder.fit_transform(x_train[["location"]])
# print(output.shape)

# Preprocessing for "description" column
# x_train["description"] = x_train["description"].fillna("")
# vectorizer = TfidfVectorizer(ngram_range=(1,2))
# output = vectorizer.fit_transform(x_train["description"])
# print(output.shape)
# Only unigram: (6459, 67690)
# Unigram and Bigram: (6459, 759408)


# Step3: Traning model
# preprocessor = ColumnTransformer(transformers=[
#     ("title", TfidfVectorizer(stop_words="english"), "title"),
#     ("location", OneHotEncoder(handle_unknown="ignore"), ["location"]),
#     ("description", TfidfVectorizer(stop_words="english", ngram_range=(1,2), min_df=0.01, max_df=0.95), "description"),
#     ("function", OneHotEncoder(), ["function"]),
#     ("industry", TfidfVectorizer(stop_words="english"), "industry")
# ])

# output = preprocessor.fit_transform(x_train)
# print(x_train.shape)
# print(output.shape)

# classifier = Pipeline(steps=[
#     ("preprocessor", preprocessor),
#     # ("feature_selector", SelectKBest(chi2, k=800)),
#     ("feature_selector", SelectPercentile(chi2, percentile=10)), # giu lai bao nhieu phan tram
#     ("classifier", RandomForestClassifier(random_state=2201))
# ])

# params = {
#     "feature_selector__percentile": [10, 5, 2],
#     "preprocessor__description__min_df": [0.01, 0.05],
#     "preprocessor__description__max_df": [0.95, 0.99]
# }

# model = GridSearchCV(
#     estimator=classifier,
#     param_grid=params,   # param_distributions=params (dung cho RandomizedSearchCV)
#     scoring="recall_weighted",
#     cv=6,
#     verbose=2,
#     n_jobs=-1
# )

# model.fit(x_train, y_train)
# y_predict = model.predict(x_test)
# print(model.best_params_)
# print(model.best_score_)
# print(classification_report(y_test, y_predict))

# Default random forest ~ 850k features
#                                         precision    recall  f1-score   support

#                         bereichsleiter       0.62      0.05      0.10       192
#          director_business_unit_leader       1.00      0.07      0.13        14
#                    manager_team_leader       0.63      0.44      0.52       534
# managing_director_small_medium_company       0.00      0.00      0.00         1
#   senior_specialist_or_project_manager       0.69      0.98      0.81       868
#                             specialist       0.00      0.00      0.00         6

#                               accuracy                           0.68      1615
#                              macro avg       0.49      0.26      0.26      1615
#                           weighted avg       0.66      0.68      0.62      1615


# random forest (+ min_df, max_df) ~ 8k features
# After add the min_df, the max_df for TfidfVectorizer => Select feature important in "description" column
#                                         precision    recall  f1-score   support

#                         bereichsleiter       0.62      0.07      0.12       192
#          director_business_unit_leader       1.00      0.07      0.13        14
#                    manager_team_leader       0.64      0.67      0.65       534
# managing_director_small_medium_company       0.00      0.00      0.00         1
#   senior_specialist_or_project_manager       0.80      0.94      0.86       868
#                             specialist       0.00      0.00      0.00         6

#                               accuracy                           0.74      1615
#                              macro avg       0.51      0.29      0.30      1615
#                           weighted avg       0.72      0.74      0.70      1615


# random forest (+ min_df + max_df + select k best) = 800 features
#                                         precision    recall  f1-score   support

#                         bereichsleiter       0.74      0.13      0.22       192
#          director_business_unit_leader       1.00      0.07      0.13        14
#                    manager_team_leader       0.65      0.75      0.70       534
# managing_director_small_medium_company       0.00      0.00      0.00         1
#   senior_specialist_or_project_manager       0.84      0.94      0.88       868
#                             specialist       0.00      0.00      0.00         6

#                               accuracy                           0.77      1615
#                              macro avg       0.54      0.31      0.32      1615
#                           weighted avg       0.76      0.77      0.73      1615


# GridSearchCV
# {'feature_selector__percentile': 5, 'preprocessor__description__max_df': 0.95, 'preprocessor__description__min_df': 0.01}
# 0.7586298048988711
#                                         precision    recall  f1-score   support

#                         bereichsleiter       0.67      0.18      0.29       192
#          director_business_unit_leader       1.00      0.07      0.13        14
#                    manager_team_leader       0.67      0.73      0.70       534
# managing_director_small_medium_company       0.00      0.00      0.00         1
#   senior_specialist_or_project_manager       0.83      0.94      0.88       868
#                             specialist       0.00      0.00      0.00         6

#                               accuracy                           0.77      1615
#                              macro avg       0.53      0.32      0.33      1615
#                           weighted avg       0.76      0.77      0.74      1615


career_level
senior_specialist_or_project_manager      3470
manager_team_leader                       2138
bereichsleiter                             768
director_business_unit_leader               56
specialist                                  24
managing_director_small_medium_company       3
Name: count, dtype: int64
--------------------
career_level
senior_specialist_or_project_manager      3470
manager_team_leader                       2138
bereichsleiter                            1000
specialist                                 500
director_business_unit_leader              500
managing_director_small_medium_company     100
Name: count, dtype: int64
