In [9]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.feature_selection import SelectKBest, chi2, SelectPercentile

def filter_location(location: str) -> str:
    """Return the region part of a ``"City, Region"`` location string.

    The string is split on commas and the second field is returned with
    surrounding whitespace removed (``"Houston, TX"`` -> ``"TX"``).
    Stripping, rather than dropping the first character unconditionally,
    keeps the result correct when there is no space after the comma
    (``"Houston,TX"`` -> ``"TX"``).

    Args:
        location: Raw location text.

    Returns:
        The stripped second comma-separated field, or ``location``
        unchanged when it contains no comma.
    """
    parts = location.split(",")
    if len(parts) > 1:
        # Take the field at index 1 and trim the whitespace around it.
        return parts[1].strip()
    return location

# Load the job postings spreadsheet with every column read as text, and drop
# any row that has a missing value.
data = pd.read_excel(
    "week8/job_dataset.ods",
    engine="odf",
    dtype="str",
).dropna(axis=0)

# Reduce each location to the part after the first comma (see filter_location).
data["location"] = data["location"].apply(filter_location)

target = "career_level"

# Separate the label column from the feature columns.
y = data[target]
x = data.drop(columns=target)

# Hold out 20% of the rows for testing; stratify on the label so both splits
# keep the same class proportions, and fix the seed for reproducibility.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=100,
    stratify=y,
)

# vectorizer = TfidfVectorizer(stop_words="english", ngram_range=(1, 2), min_df=0.01, max_df=0.95)
# processed_data = vectorizer.fit_transform(x_train["description"])
# print (vectorizer.vocabulary_)
# print (len (vectorizer.vocabulary_))
# print(processed_data.shape)
# Vocabulary size of the description TF-IDF (from the experiments above):
#   unigrams only:                        66674
#   unigrams + bigrams:                   846809
#   unigrams + bigrams + max_df + min_df: 4342
# encoder = OneHotEncoder ()
# processed_data = encoder.fit_transform(x_train[["industry"]])
# print(processed_data.shape)

# Per-column feature extraction. Text columns go through TF-IDF and are
# selected by a plain string (the vectorizer expects a 1-D column of
# documents); categorical columns are one-hot encoded and selected by a list
# (the encoder expects 2-D input). Columns not listed here are not passed on.
preprocessor = ColumnTransformer(
    [
        (
            "title",
            TfidfVectorizer(stop_words="english", ngram_range=(1, 1)),
            "title",
        ),
        (
            "location",
            # Categories unseen during fit are ignored instead of raising.
            OneHotEncoder(handle_unknown='ignore'),
            ["location"],
        ),
        (
            "description",
            # Unigrams + bigrams; min_df/max_df prune very rare and very
            # common terms to keep the vocabulary manageable.
            TfidfVectorizer(
                stop_words="english",
                ngram_range=(1, 2),
                min_df=0.01,
                max_df=0.99,
            ),
            "description",
        ),
        (
            "function",
            OneHotEncoder(handle_unknown='ignore'),
            ["function"],
        ),
        (
            "industry",
            TfidfVectorizer(stop_words="english", ngram_range=(1, 1)),
            "industry",
        ),
    ]
)

# End-to-end pipeline: feature extraction -> chi-squared feature selection ->
# random forest. The final step is a classifier even though it is named
# "regressor"; the name is kept so existing references to it keep working.
model = Pipeline(
    steps=[
        ("pre_processor", preprocessor),
        # Alternative tried earlier: keep a fixed number of features.
        # ("feature_selector", SelectKBest(chi2, k=400)),
        # Keep the top 5% of features ranked by the chi-squared score.
        ("feature_selector", SelectPercentile(chi2, percentile=5)),
        ("regressor", RandomForestClassifier(random_state=100)),
    ]
)

# Train on the training split, then predict labels for the held-out split.
model.fit(x_train, y_train)
y_predict = model.predict(x_test)

# Per-class precision/recall/F1 on the test split. Some rare classes are never
# predicted, which makes their precision undefined; zero_division=0 reports
# those as 0.0 (the same value the default produces) without emitting an
# UndefinedMetricWarning for each affected metric.
print(classification_report(y_test, y_predict, zero_division=0))

                                        precision    recall  f1-score   support

                        bereichsleiter       0.77      0.16      0.26       192
         director_business_unit_leader       1.00      0.07      0.13        14
                   manager_team_leader       0.65      0.76      0.70       534
managing_director_small_medium_company       0.00      0.00      0.00         1
  senior_specialist_or_project_manager       0.84      0.92      0.88       868
                            specialist       0.00      0.00      0.00         6

                              accuracy                           0.77      1615
                             macro avg       0.54      0.32      0.33      1615
                          weighted avg       0.77      0.77      0.74      1615



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


# without max_df, min_df
                                        precision    recall  f1-score   support

                        bereichsleiter       0.44      0.02      0.04       192
         director_business_unit_leader       1.00      0.07      0.13        14
                   manager_team_leader       0.63      0.55      0.59       534
managing_director_small_medium_company       0.00      0.00      0.00         1
  senior_specialist_or_project_manager       0.72      0.95      0.82       868
                            specialist       0.00      0.00      0.00         6

                              accuracy                           0.70      1615
                             macro avg       0.47      0.27      0.26      1615
                          weighted avg       0.66      0.70      0.64      1615

# with max_df, min_df
                                        precision    recall  f1-score   support

                        bereichsleiter       0.45      0.03      0.05       192
         director_business_unit_leader       1.00      0.07      0.13        14
                   manager_team_leader       0.62      0.67      0.65       534
managing_director_small_medium_company       0.00      0.00      0.00         1
  senior_specialist_or_project_manager       0.79      0.93      0.85       868
                            specialist       0.00      0.00      0.00         6

                              accuracy                           0.73      1615
                             macro avg       0.48      0.28      0.28      1615
                          weighted avg       0.69      0.73      0.68      1615

# with max_df, min_df, top 400 features
                                        precision    recall  f1-score   support

                        bereichsleiter       0.71      0.15      0.25       192
         director_business_unit_leader       1.00      0.07      0.13        14
                   manager_team_leader       0.66      0.77      0.71       534
managing_director_small_medium_company       0.00      0.00      0.00         1
  senior_specialist_or_project_manager       0.85      0.92      0.88       868
                            specialist       0.00      0.00      0.00         6

                              accuracy                           0.77      1615
                             macro avg       0.53      0.32      0.33      1615
                          weighted avg       0.76      0.77      0.74      1615


              