In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

In [8]:
def run_model(df, text_col='Resume_str', label_col='Category',
              test_size=0.2, random_state=42, model=None, vectorizer=None):
    """Fit a TF-IDF text classifier on ``df`` and print its hold-out report.

    With only ``df`` supplied this is the original baseline: a default
    ``TfidfVectorizer`` feeding a default ``LogisticRegression``, evaluated on
    a 20% hold-out drawn with seed 42. The extra keyword arguments let the same
    helper evaluate other columns, splits, vectorizers, or estimators without
    copy-pasting the pipeline.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns named by ``text_col`` and ``label_col``.
    text_col : str, default 'Resume_str'
        Column holding the raw document text.
    label_col : str, default 'Category'
        Column holding the class label.
    test_size : float or int, default 0.2
        Forwarded to ``train_test_split``.
    random_state : int or None, default 42
        Seed forwarded to ``train_test_split``.
    model : estimator or None, default None
        Unfitted classifier exposing ``fit``/``predict``. It is fitted in
        place. ``None`` means a fresh ``LogisticRegression()``.
    vectorizer : transformer or None, default None
        Unfitted text vectorizer exposing ``fit_transform``/``transform``. It
        is fitted in place. ``None`` means a fresh ``TfidfVectorizer()``.

    Returns
    -------
    None
        The classification report is printed, not returned.
    """
    # Build fresh defaults per call so repeated runs never share fitted state.
    if vectorizer is None:
        vectorizer = TfidfVectorizer()
    if model is None:
        model = LogisticRegression()

    # Split the data
    X_train, X_test, y_train, y_test = train_test_split(
        df[text_col], df[label_col], test_size=test_size, random_state=random_state
    )

    # Vectorize the text; the vectorizer is fitted on the training split only
    # and then applied unchanged to the test split.
    X_train_vec = vectorizer.fit_transform(X_train)
    X_test_vec = vectorizer.transform(X_test)

    # Train the model
    model.fit(X_train_vec, y_train)

    # Evaluate the model; zero_division=0 reports 0 for classes that end up
    # with an undefined precision/recall.
    y_pred = model.predict(X_test_vec)
    print(classification_report(y_test, y_pred, zero_division=0))

In [11]:
# Two versions of the resume dataset are compared:
#   df_cleaned  - after the simple preprocessing step only
#   df_filtered - after EDA and further processing
df_cleaned = pd.read_csv('dataset/Resume/cleaned-Resume.csv')
df_filtered = pd.read_csv('dataset/Resume/filtered-Resume.csv')

# Run the same baseline on each frame, printing a heading before every report
# so the two sets of results can be told apart in the output.
for heading, frame in (
    ("Model results for clean data (after simple preprocess only):\n", df_cleaned),
    ("\nModel results for filtered data (after EDA and further processing):\n", df_filtered),
):
    print(heading)
    run_model(frame)

Model results for clean data (after simple preprocess only):

                        precision    recall  f1-score   support

            ACCOUNTANT       0.86      0.89      0.88        28
              ADVOCATE       0.58      0.50      0.54        30
           AGRICULTURE       1.00      0.10      0.18        10
               APPAREL       0.47      0.50      0.48        14
                  ARTS       0.29      0.28      0.29        18
            AUTOMOBILE       0.00      0.00      0.00         5
              AVIATION       0.80      0.83      0.82        24
               BANKING       0.60      0.75      0.67        16
                   BPO       0.00      0.00      0.00         3
  BUSINESS-DEVELOPMENT       0.59      0.57      0.58        23
                  CHEF       0.81      0.81      0.81        31
          CONSTRUCTION       0.89      0.75      0.81        32
            CONSULTANT       0.75      0.26      0.39        23
              DESIGNER       0.82      0.

Let's try stronger models to see whether we can get better results.

In [15]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

# Hold out 20% of the filtered resumes for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    df_filtered['Resume_str'],
    df_filtered['Category'],
    test_size=0.2,
    random_state=42,
)

# TF-IDF features with English stop words removed.
# The vectorizer is fitted on the training text only, then applied to the test text.
vectorizer = TfidfVectorizer(stop_words='english')
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# Two tree ensembles with 100 estimators each, both seeded.
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
gb_model = GradientBoostingClassifier(n_estimators=100, random_state=42)

# Train both ensembles on the same training matrix.
rf_model.fit(X_train_vec, y_train)
gb_model.fit(X_train_vec, y_train)


In [16]:
# Predict the held-out categories with each fitted ensemble
rf_predictions = rf_model.predict(X_test_vec)
gb_predictions = gb_model.predict(X_test_vec)

# Evaluate.
# The title and report are joined into one string instead of being passed as two
# print() arguments: print's default separator would put a space in front of the
# report and shift its header row out of line with the rows beneath it.
# zero_division=0 matches run_model and avoids undefined-metric warnings for
# categories a model never predicts; the reported numbers are unchanged.
print(
    "Random Forest Classification Report:\n"
    + classification_report(y_test, rf_predictions, zero_division=0)
)
print(
    "Gradient Boosting Classification Report:\n"
    + classification_report(y_test, gb_predictions, zero_division=0)
)

Random Forest Classification Report:
                         precision    recall  f1-score   support

            ACCOUNTANT       0.69      0.93      0.79        29
              ADVOCATE       0.85      0.42      0.56        26
                  ARTS       0.20      0.06      0.09        17
              AVIATION       0.71      0.74      0.72        23
               BANKING       0.70      0.52      0.60        27
  BUSINESS-DEVELOPMENT       0.52      0.57      0.54        23
                  CHEF       0.65      0.88      0.75        17
          CONSTRUCTION       0.86      0.72      0.78        25
            CONSULTANT       0.67      0.26      0.38        23
              DESIGNER       0.88      0.75      0.81        20
           ENGINEERING       0.67      0.77      0.72        31
               FINANCE       0.57      0.62      0.59        21
               FITNESS       0.86      0.46      0.60        26
            HEALTHCARE       0.34      0.71      0.47        14
 

The gradient boosting model produced much better results!