In [84]:
import pandas as pd
import os
import PyPDF2
import re
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
import joblib

# Load job descriptions from an Excel file
job_descriptions = pd.read_csv('sample_resumes/Resume/job_descriptions.xlsx.csv')

# Display the first few rows of the DataFrame to check the content
print(job_descriptions.head())

# Extract text from a PDF resume
def extract_text_from_resume(resume_path):
    """Extract text from a PDF resume."""
    text = ""
    try:
        with open(resume_path, "rb") as file:
            reader = PyPDF2.PdfReader(file)
            for page in reader.pages:
                text += page.extract_text() + "\n"
    except Exception as e:
        print(f"Error extracting text from {resume_path}: {e}")
    return text.strip()

# Load resumes from the specified folder and extract text
def load_resumes(folder_path):
    """Load resumes from the specified folder and extract text."""
    resumes = []
    for filename in os.listdir(folder_path):
        if filename.endswith('.pdf'):
            resume_path = os.path.join(folder_path, filename)
            resume_text = extract_text_from_resume(resume_path)
            resumes.append(resume_text)
    return resumes

# Update the path to the absolute path on your system
resumes_folder = r'C:\Users\Lenovo\Contacts\sample_resumes\data\data\accountant'  # Use your actual path here
resumes = load_resumes(resumes_folder)

# Display the number of resumes loaded
print(f"Loaded {len(resumes)} resumes.")

# Clean text function
def clean_text(text):
    text = text.lower()
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

# Clean resumes
cleaned_resumes = [clean_text(resume) for resume in resumes]

# Load SpaCy English model
nlp = spacy.load("en_core_web_sm")

# Preprocess text function
def preprocess_text(text):
    doc = nlp(text)
    tokens = [token.lemma_ for token in doc if not token.is_stop and token.is_alpha]
    return ' '.join(tokens)

# Process job descriptions
job_descriptions['Processed_Resume_str'] = job_descriptions['Resume_str'].apply(preprocess_text)

# TF-IDF vectorization
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(job_descriptions['Processed_Resume_str'])

# Train a Logistic Regression model
X = tfidf_matrix
y = job_descriptions['Category']  # Assuming you have a 'Category' column

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# Save the model
joblib.dump(model, 'model.pkl')


         ID                                         Resume_str  \
0  16852973           HR ADMINISTRATOR/MARKETING ASSOCIATE\...   
1  22323967           HR SPECIALIST, US HR OPERATIONS      ...   
2  33176873           HR DIRECTOR       Summary      Over 2...   
3  27018550           HR SPECIALIST       Summary    Dedica...   
4  17812897           HR MANAGER         Skill Highlights  ...   

                                         Resume_html Category  
0  <div class="fontsize fontface vmargins hmargin...       HR  
1  <div class="fontsize fontface vmargins hmargin...       HR  
2  <div class="fontsize fontface vmargins hmargin...       HR  
3  <div class="fontsize fontface vmargins hmargin...       HR  
4  <div class="fontsize fontface vmargins hmargin...       HR  
Loaded 118 resumes.


['model.pkl']

## Installing libraries

In [53]:
!pip install Flask pandas numpy scikit-learn nltk spacy joblib openpyxl




In [11]:
!pip install pandas PyPDF2 python-docx



## Dataset Preparation

In [12]:
import pandas as pd

# Load job descriptions
job_descriptions = pd.read_csv('sample_resumes/Resume/job_descriptions.xlsx.csv')

# Display the first few rows of the DataFrame to check the content
print(job_descriptions.head())

         ID                                         Resume_str  \
0  16852973           HR ADMINISTRATOR/MARKETING ASSOCIATE\...   
1  22323967           HR SPECIALIST, US HR OPERATIONS      ...   
2  33176873           HR DIRECTOR       Summary      Over 2...   
3  27018550           HR SPECIALIST       Summary    Dedica...   
4  17812897           HR MANAGER         Skill Highlights  ...   

                                         Resume_html Category  
0  <div class="fontsize fontface vmargins hmargin...       HR  
1  <div class="fontsize fontface vmargins hmargin...       HR  
2  <div class="fontsize fontface vmargins hmargin...       HR  
3  <div class="fontsize fontface vmargins hmargin...       HR  
4  <div class="fontsize fontface vmargins hmargin...       HR  


### Load the resumes

In [22]:
import os
import PyPDF2

def extract_text_from_resume(resume_path):
    """Extract text from a PDF resume."""
    text = ""
    try:
        with open(resume_path, "rb") as file:
            reader = PyPDF2.PdfReader(file)
            for page in reader.pages:
                text += page.extract_text() + "\n"
    except Exception as e:
        print(f"Error extracting text from {resume_path}: {e}")
    return text.strip()

def load_resumes(folder_path):
    """Load resumes from the specified folder and extract text."""
    resumes = []
    for filename in os.listdir(folder_path):
        if filename.endswith('.pdf'):
            resume_path = os.path.join(folder_path, filename)
            resume_text = extract_text_from_resume(resume_path)
            resumes.append(resume_text)
    return resumes

# Update the path to the absolute path on your system
resumes_folder = r'C:\Users\Lenovo\Contacts\sample_resumes\data\data\accountant'  # Use your actual path here
resumes = load_resumes(resumes_folder)

# Display the number of resumes loaded
print(f"Loaded {len(resumes)} resumes.")

# Optional: Check the first resume text
if resumes:
    print("Sample Resume Text:", resumes[0])
else:
    print("No resumes loaded.")


Loaded 118 resumes.
Sample Resume Text: ACCOUNTANT
Summary
Financial Accountant specializing in financial planning, reporting and analysis within the Department of Defense.
Highlights
Account reconciliations
Results-oriented
Financial reporting
Critical thinking
Accounting operations professional
Analysis of financial systems
ERP (Enterprise Resource Planning) software.
Excellent facilitator
Accomplishments
Served on a tiger team which identified and resolved General Ledger postings in DEAMS totaling $360B in accounting adjustments. This allowed
for the first successful fiscal year-end close for 2012.
In collaboration with DFAS Europe, developed an automated tool that identified duplicate obligations. This tool allowed HQ USAFE to
deobligate over $5M in duplicate obligations.
Experience
Company Name
 
July 2011
 
to 
November 2012
 
Accountant
 
City
 
, 
State
Enterprise Resource Planning Office (ERO)
In this position as an Accountant assigned to the Defense Enterprise Accounting and 

### Clean Job Descriptions

In [29]:
!pip install openpyxl



In [37]:
import pandas as pd

# Load job descriptions
job_descriptions = pd.read_csv('sample_resumes/Resume/job_descriptions.xlsx.csv')
# Display the first few rows of the DataFrame to check the content
print(job_descriptions.head())

         ID                                         Resume_str  \
0  16852973           HR ADMINISTRATOR/MARKETING ASSOCIATE\...   
1  22323967           HR SPECIALIST, US HR OPERATIONS      ...   
2  33176873           HR DIRECTOR       Summary      Over 2...   
3  27018550           HR SPECIALIST       Summary    Dedica...   
4  17812897           HR MANAGER         Skill Highlights  ...   

                                         Resume_html Category  
0  <div class="fontsize fontface vmargins hmargin...       HR  
1  <div class="fontsize fontface vmargins hmargin...       HR  
2  <div class="fontsize fontface vmargins hmargin...       HR  
3  <div class="fontsize fontface vmargins hmargin...       HR  
4  <div class="fontsize fontface vmargins hmargin...       HR  


In [38]:
print("Columns in the Excel file:", job_descriptions.columns)

Columns in the Excel file: Index(['ID', 'Resume_str', 'Resume_html', 'Category'], dtype='object')


In [41]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
# Clean Job Descriptions
def clean_text(text):
    # Convert to lowercase
    text = text.lower()
    # Remove special characters, numbers, and extra whitespace
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

# Apply cleaning function to the 'Resume_str' column
job_descriptions['Cleaned_Resume_str'] = job_descriptions['Resume_str'].apply(clean_text)

# Display cleaned job descriptions
print(job_descriptions[['Resume_str', 'Cleaned_Resume_str']].head())

# Clean Resumes (assuming you have resumes in a list)
def clean_resumes(resumes):
    cleaned_resumes = []
    for resume in resumes:
        cleaned_resumes.append(clean_text(resume))
    return cleaned_resumes

# Assuming you have already loaded resumes
cleaned_resumes = clean_resumes(resumes)

# Display some cleaned resumes
print("Sample Cleaned Resume Text:", cleaned_resumes[0])

                                          Resume_str  \
0           HR ADMINISTRATOR/MARKETING ASSOCIATE\...   
1           HR SPECIALIST, US HR OPERATIONS      ...   
2           HR DIRECTOR       Summary      Over 2...   
3           HR SPECIALIST       Summary    Dedica...   
4           HR MANAGER         Skill Highlights  ...   

                                  Cleaned_Resume_str  
0  hr administratormarketing associate hr adminis...  
1  hr specialist us hr operations summary versati...  
2  hr director summary over years experience in r...  
3  hr specialist summary dedicated driven and dyn...  
4  hr manager skill highlights hr skills hr depar...  
Sample Cleaned Resume Text: accountant summary financial accountant specializing in financial planning reporting and analysis within the department of defense highlights account reconciliations resultsoriented financial reporting critical thinking accounting operations professional analysis of financial systems erp enterprise resou

In [47]:
import pandas as pd
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer

# Load the spaCy English model
nlp = spacy.load("en_core_web_sm")

# Load job descriptions
job_descriptions = pd.read_csv('sample_resumes/Resume/job_descriptions.xlsx.csv')

# Display columns to confirm the structure
print("Columns in the DataFrame:", job_descriptions.columns)

# Check the first few rows of the DataFrame
print("Job Descriptions Sample:")
print(job_descriptions.head())

# Step 1: Text Processing Function
def preprocess_text(text):
    # Tokenization and Lemmatization
    doc = nlp(text)
    # Remove stop words and create a list of clean tokens
    tokens = [token.lemma_ for token in doc if not token.is_stop and token.is_alpha]
    return ' '.join(tokens)

# Apply the preprocessing function to the first few rows
sample_resumes = job_descriptions['Resume_str'].head()  # Get a sample of the first few resumes
print("\nProcessing Sample Resumes:")
for resume in sample_resumes:
    print("Original Resume:", resume)
    processed_resume = preprocess_text(resume)
    print("Processed Resume:", processed_resume)

# If the above works, apply the preprocessing function to the entire column
job_descriptions['Processed_Resume_str'] = job_descriptions['Resume_str'].apply(preprocess_text)

# Display processed job descriptions
print("\nProcessed Job Descriptions Sample:")
print(job_descriptions[['Resume_str', 'Processed_Resume_str']].head())

# Step 2: Vectorization using TF-IDF
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(job_descriptions['Processed_Resume_str'])

# Convert TF-IDF matrix to DataFrame for better visualization
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=vectorizer.get_feature_names_out())

# Display TF-IDF DataFrame
print("\nTF-IDF Matrix Sample:")
print(tfidf_df.head())


Columns in the DataFrame: Index(['ID', 'Resume_str', 'Resume_html', 'Category'], dtype='object')
Job Descriptions Sample:
         ID                                         Resume_str  \
0  16852973           HR ADMINISTRATOR/MARKETING ASSOCIATE\...   
1  22323967           HR SPECIALIST, US HR OPERATIONS      ...   
2  33176873           HR DIRECTOR       Summary      Over 2...   
3  27018550           HR SPECIALIST       Summary    Dedica...   
4  17812897           HR MANAGER         Skill Highlights  ...   

                                         Resume_html Category  
0  <div class="fontsize fontface vmargins hmargin...       HR  
1  <div class="fontsize fontface vmargins hmargin...       HR  
2  <div class="fontsize fontface vmargins hmargin...       HR  
3  <div class="fontsize fontface vmargins hmargin...       HR  
4  <div class="fontsize fontface vmargins hmargin...       HR  

Processing Sample Resumes:
Original Resume:          HR ADMINISTRATOR/MARKETING ASSOCIATE

HR AD

Processed Resume: HR SPECIALIST hr operation Summary Versatile medium professional background Communications Marketing Human Resources Technology experience current HR Specialist HR Operations Company City State manage communication launch Operations group policy change system outage design standard work job aid create comprehensive training program new employee contractor audit job posting old pende hold draft position Audited union hourly non union hourly salary background check drug screen conduct monthly new hire benefit briefing new employee business unit serve link HR Managers vendor handle question resolve system relate issue provide real time process improvement feedback key metric initiative successfully brand HR Operations SharePoint site Business Unit project manager RFI RFP Background Check Drug Screen vendor Marketing Communications Co op Company City state post new article change update corporate SharePoint site include graphic visual communication research draft article 

Processed Resume: hr director Summary year experience recruiting plus year Human Resources Executive Management year HRIS development maintenance year work Healthcare Enviroment Skills Recruiting FMLA EEO FLSA HRIS Development Benefit Administration Policy Development Web Page Development Accomplishments Kansas Health Institute Outcomes State Kansas Memberships Accolades Project Management Institute Member SHRM Chamber Commerce Friends University President Honor Roll Friends University Dean Honor Roll Student Liaison Friends University Topeka member mother Mother member Topeka Advertising Federation production piece create nominate ADDY Awards receive recognition outstanding customer service assistance State Kansas Travel Tourism Department ASHHRA KAHHR ACM additional Information lead Change instrumental development implementation Adjutant General Retention Research project involve survey development analyze result survey present Adjutant General help retain qualified talent Department

Processed Resume: HR MANAGER Skill Highlights hr SKILLS HR Department Startup New Organization Startups Employment Law FMLA ADA EEO WC Mediation Advocacy HR Policies Procedures Staff Recruitment Retention Salary Negotiations Employee Relations Benefits Administration Unemployment Administration Worker Compensation Administration Orientation Boarding HRIS Technologies Training Development Performance Management Organizational Development HR Program Project Management HRIS application Lawson Paychex Kronos ADP MS Office Word Excel PowerPoint Publisher Access Visio Outlook Professional Experience HR Manager Jan current Company City State manage Human Resource function e Cycle corporate office fulfillment datum center consist benefit compensation administration payroll employee relation policy compliance recruitment Key result Foster associate orient culture emphasize continuous improvement work high performance quality ensure organizational conformation applicable hr relate regulation sta

### Model Training

In [61]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Example data (replace with your actual job descriptions and resume data)
# Assuming 'Category' is the label for classification

# Step 1: Define Features (TF-IDF matrix) and Labels (Job categories or another target label)
X = tfidf_df  # TF-IDF features
y = job_descriptions['Category']  # Target label (job categories)

# Step 2: Split the data into training and testing sets (80% training, 20% testing)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 3: Initialize and train a Logistic Regression model
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# Step 4: Make predictions on the test set
y_pred = model.predict(X_test)

# Step 5: Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy:.2f}")

# Step 6: Display detailed classification report
print("\nClassification Report:")
print(classification_report(y_test, y_pred))


Model Accuracy: 0.64

Classification Report:
                        precision    recall  f1-score   support

            ACCOUNTANT       0.81      0.90      0.85        29
              ADVOCATE       0.55      0.57      0.56        30
           AGRICULTURE       0.50      0.12      0.20         8
               APPAREL       0.53      0.40      0.46        20
                  ARTS       0.12      0.11      0.12        18
            AUTOMOBILE       0.00      0.00      0.00         6
              AVIATION       0.72      0.86      0.78        21
               BANKING       0.75      0.65      0.70        23
                   BPO       0.00      0.00      0.00         2
  BUSINESS-DEVELOPMENT       0.84      0.59      0.70        27
                  CHEF       0.85      0.71      0.77        24
          CONSTRUCTION       0.87      0.76      0.81        34
            CONSULTANT       0.45      0.25      0.32        20
              DESIGNER       0.75      0.79      0.77     

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


### Random Forest model

In [62]:
from sklearn.ensemble import RandomForestClassifier

# Initialize the Random Forest model
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the model
rf_model.fit(X_train, y_train)

# Make predictions
y_pred_rf = rf_model.predict(X_test)

# Evaluate the model
rf_accuracy = accuracy_score(y_test, y_pred_rf)
print(f"Random Forest Model Accuracy: {rf_accuracy:.2f}")

# Detailed classification report
print("\nRandom Forest Classification Report:")
print(classification_report(y_test, y_pred_rf))


Random Forest Model Accuracy: 0.64

Random Forest Classification Report:
                        precision    recall  f1-score   support

            ACCOUNTANT       0.70      0.97      0.81        29
              ADVOCATE       0.84      0.70      0.76        30
           AGRICULTURE       0.50      0.12      0.20         8
               APPAREL       0.50      0.50      0.50        20
                  ARTS       0.17      0.06      0.08        18
            AUTOMOBILE       0.00      0.00      0.00         6
              AVIATION       0.82      0.86      0.84        21
               BANKING       0.70      0.61      0.65        23
                   BPO       0.00      0.00      0.00         2
  BUSINESS-DEVELOPMENT       0.64      0.52      0.57        27
                  CHEF       0.74      0.71      0.72        24
          CONSTRUCTION       0.90      0.82      0.86        34
            CONSULTANT       0.60      0.30      0.40        20
              DESIGNER       0

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [81]:
!pip install joblib




In [82]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import joblib  # Change this line

# Example function to load and preprocess data
def load_and_preprocess_data(resume_path, job_description_path):
    # Load your resume and job description files
    resumes = pd.read_csv(resume_path)  # Update this if necessary
    job_descriptions = pd.read_excel(job_description_path)

    # Your data processing code goes here
    # For example, extracting text from resumes and preparing the TF-IDF Vectorizer
    vectorizer = TfidfVectorizer()
    
    # Fit and transform your data here if required
    # Example:
    # X = vectorizer.fit_transform(resumes['text_column'])
    
    return resumes, job_descriptions, vectorizer

# Add other necessary functions or models


In [79]:
import joblib

# Assuming 'model' is your trained model
joblib.dump(model, 'project_folder/model.pkl')


['project_folder/model.pkl']

In [80]:
import joblib

# Save the model
joblib.dump(model, 'model.pkl')


['model.pkl']

In [None]:
!pip install Flask pandas numpy scikit-learn nltk spacy joblib openpyxl
import pandas as pd

# Load job descriptions
job_descriptions = pd.read_csv('sample_resumes/Resume/job_descriptions.xlsx.csv')

# Display the first few rows of the DataFrame to check the content
print(job_descriptions.head())

import os
import PyPDF2

def extract_text_from_resume(resume_path):
    """Extract text from a PDF resume."""
    text = ""
    try:
        with open(resume_path, "rb") as file:
            reader = PyPDF2.PdfReader(file)
            for page in reader.pages:
                text += page.extract_text() + "\n"
    except Exception as e:
        print(f"Error extracting text from {resume_path}: {e}")
    return text.strip()

def load_resumes(folder_path):
    """Load resumes from the specified folder and extract text."""
    resumes = []
    for filename in os.listdir(folder_path):
        if filename.endswith('.pdf'):
            resume_path = os.path.join(folder_path, filename)
            resume_text = extract_text_from_resume(resume_path)
            resumes.append(resume_text)
    return resumes

# Update the path to the absolute path on your system
resumes_folder = r'C:\Users\Lenovo\Contacts\sample_resumes\data\data\accountant'  # Use your actual path here
resumes = load_resumes(resumes_folder)

# Display the number of resumes loaded
print(f"Loaded {len(resumes)} resumes.")

# Optional: Check the first resume text
if resumes:
    print("Sample Resume Text:", resumes[0])
else:
    print("No resumes loaded.")
    
!pip install openpyxl

import pandas as pd

# Load job descriptions
job_descriptions = pd.read_csv('sample_resumes/Resume/job_descriptions.xlsx.csv')
# Display the first few rows of the DataFrame to check the content
print(job_descriptions.head())

print("Columns in the Excel file:", job_descriptions.columns)

import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
# Clean Job Descriptions
def clean_text(text):
    # Convert to lowercase
    text = text.lower()
    # Remove special characters, numbers, and extra whitespace
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

# Apply cleaning function to the 'Resume_str' column
job_descriptions['Cleaned_Resume_str'] = job_descriptions['Resume_str'].apply(clean_text)

# Display cleaned job descriptions
print(job_descriptions[['Resume_str', 'Cleaned_Resume_str']].head())

# Clean Resumes (assuming you have resumes in a list)
def clean_resumes(resumes):
    cleaned_resumes = []
    for resume in resumes:
        cleaned_resumes.append(clean_text(resume))
    return cleaned_resumes

# Assuming you have already loaded resumes
cleaned_resumes = clean_resumes(resumes)

# Display some cleaned resumes
print("Sample Cleaned Resume Text:", cleaned_resumes[0])

import pandas as pd
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer

# Load the spaCy English model
nlp = spacy.load("en_core_web_sm")

# Load job descriptions
job_descriptions = pd.read_csv('sample_resumes/Resume/job_descriptions.xlsx.csv')

# Display columns to confirm the structure
print("Columns in the DataFrame:", job_descriptions.columns)

# Check the first few rows of the DataFrame
print("Job Descriptions Sample:")
print(job_descriptions.head())

# Step 1: Text Processing Function
def preprocess_text(text):
    # Tokenization and Lemmatization
    doc = nlp(text)
    # Remove stop words and create a list of clean tokens
    tokens = [token.lemma_ for token in doc if not token.is_stop and token.is_alpha]
    return ' '.join(tokens)

# Apply the preprocessing function to the first few rows
sample_resumes = job_descriptions['Resume_str'].head()  # Get a sample of the first few resumes
print("\nProcessing Sample Resumes:")
for resume in sample_resumes:
    print("Original Resume:", resume)
    processed_resume = preprocess_text(resume)
    print("Processed Resume:", processed_resume)

# If the above works, apply the preprocessing function to the entire column
job_descriptions['Processed_Resume_str'] = job_descriptions['Resume_str'].apply(preprocess_text)

# Display processed job descriptions
print("\nProcessed Job Descriptions Sample:")
print(job_descriptions[['Resume_str', 'Processed_Resume_str']].head())

# Step 2: Vectorization using TF-IDF
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(job_descriptions['Processed_Resume_str'])

# Convert TF-IDF matrix to DataFrame for better visualization
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=vectorizer.get_feature_names_out())

# Display TF-IDF DataFrame
print("\nTF-IDF Matrix Sample:")
print(tfidf_df.head())

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Example data (replace with your actual job descriptions and resume data)
# Assuming 'Category' is the label for classification

# Step 1: Define Features (TF-IDF matrix) and Labels (Job categories or another target label)
X = tfidf_df  # TF-IDF features
y = job_descriptions['Category']  # Target label (job categories)

# Step 2: Split the data into training and testing sets (80% training, 20% testing)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 3: Initialize and train a Logistic Regression model
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# Step 4: Make predictions on the test set
y_pred = model.predict(X_test)

# Step 5: Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy:.2f}")

# Step 6: Display detailed classification report
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

from sklearn.ensemble import RandomForestClassifier

# Initialize the Random Forest model
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the model
rf_model.fit(X_train, y_train)

# Make predictions
y_pred_rf = rf_model.predict(X_test)

# Evaluate the model
rf_accuracy = accuracy_score(y_test, y_pred_rf)
print(f"Random Forest Model Accuracy: {rf_accuracy:.2f}")

# Detailed classification report
print("\nRandom Forest Classification Report:")
print(classification_report(y_test, y_pred_rf))

import joblib

# Assuming 'model' is your trained model
joblib.dump(model, 'project_folder/model.pkl')

import joblib

# Save the model
joblib.dump(model, 'model.pkl')
