In [16]:
import pandas as pd

# Load the raw job-postings CSV (path is relative to the notebook's working directory).
dataset = pd.read_csv('../ds/job_dataset.csv')
print(f"Original shape: {dataset.shape}")

# Report how many values are missing in each column before anything is removed.
print("Missing values per column:", dataset.isnull().sum(), sep="\n")

# Keep only rows that have a value in every column.
# NOTE(review): this filters on all columns, including ones the later modelling
# cells never read; in the recorded run it reduces 17,880 rows to 774. Consider
# restricting the check with `subset=` -- confirm with the notebook owner.
dataset = dataset.dropna()
print(f"Shape after dropping nulls: {dataset.shape}")

# Report exact duplicate rows, then remove them (first occurrence is kept).
print(f"Duplicates found: {dataset.duplicated().sum()}")
dataset = dataset.drop_duplicates()

# Summarise the result and show the first rows as the cell's output.
print(f"Final shape: {dataset.shape}")
print("\nCleaned dataset preview:")
dataset.head()

Original shape: (17880, 18)
Missing values per column:
job_id                     0
title                      0
location                 346
department             11547
salary_range           15012
company_profile         3308
description                1
requirements            2696
benefits                7212
telecommuting              0
has_company_logo           0
has_questions              0
employment_type         3471
required_experience     7050
required_education      8105
industry                4903
function                6455
fraudulent                 0
dtype: int64
Shape after dropping nulls: (774, 18)
Duplicates found: 0
Final shape: (774, 18)

Cleaned dataset preview:


Unnamed: 0,job_id,title,location,department,salary_range,company_profile,description,requirements,benefits,telecommuting,has_company_logo,has_questions,employment_type,required_experience,required_education,industry,function,fraudulent
6,7,Head of Content (m/f),"DE, BE, Berlin",ANDROIDPIT,20000-28000,"Founded in 2009, the Fonpit AG rose with its i...",Your Responsibilities: Manage the English-spea...,Your Know-How: ...,Your Benefits: Being part of a fast-growing co...,0,1,1,Full-time,Mid-Senior level,Master's Degree,Online Media,Management,0
15,16,VP of Sales - Vault Dragon,"SG, 01, Singapore",Sales,120000-150000,Jungle Ventures is the leading Singapore based...,About Vault Dragon Vault Dragon is Dropbox for...,Key Superpowers3-5 years of high-pressure sale...,"Basic: SGD 120,000Equity negotiable for a rock...",0,1,1,Full-time,Executive,Bachelor's Degree,Facilities Services,Sales,0
23,24,"Vice President, Sales and Sponsorship (Busines...","US, CA, Carlsbad",Businessfriend.com,100000-120000,"WDM Group is an innovative, forward thinking d...",#URL_eda2500ddcedb60957fcd7f5b164e092966f8c4e8...,"Job Requirements:A reputation as a ""go-getter""...",Businessfriend will offer a competitive six fi...,0,1,0,Full-time,Executive,Unspecified,Internet,Sales,0
98,99,IC&E Technician,"US, , Stocton, CA",Oil & Energy,95000-115000,...,"IC&amp;E Technician | Bakersfield, CA Mt. Poso...","QualificationsKnowledge, Skills &amp; Abilitie...",BENEFITSWhat is offered:Competitive compensati...,0,1,1,Full-time,Mid-Senior level,High School or equivalent,Oil & Energy,Other,1
102,103,Marketing Administrator,"GB, WAR, Coventry",Marketplace,15000-18000,Renewable Energy and Environmental Protection ...,The job is to support the growth of the #URL_9...,"Computer literateAble to work with HTML, altho...",For a suitably motivated and success orientate...,1,1,0,Full-time,Entry level,Bachelor's Degree,Internet,Marketing,0


In [17]:
# Persist the cleaned frame for the modelling cells; index=False leaves the
# pandas row index out of the written file.
dataset.to_csv(
    path_or_buf='../ds/job_dataset_cleaned.csv',
    index=False,
)
print("\n✅ Cleaned dataset saved as 'job_dataset_cleaned.csv'")


✅ Cleaned dataset saved as 'job_dataset_cleaned.csv'


In [2]:
import os
import subprocess
import sys
from urllib.parse import quote_plus

# Install SQLAlchemy into the interpreter that is running this kernel.
# `pip.main(...)` is not a supported API (pip itself prints a warning telling
# callers to use `-m pip`), so pip is run as a subprocess instead. As before, a
# failed install does not raise here; the import below surfaces the problem.
subprocess.run([sys.executable, '-m', 'pip', 'install', 'sqlalchemy'], check=False)
from sqlalchemy import create_engine
import pandas as pd

# Database connection parameters.
# Each value can be overridden through an environment variable of the same
# name; the fallbacks are the previous hard-coded development values.
# NOTE(review): the password fallback is a local-development placeholder only --
# set DB_PASSWORD in the environment for any shared or deployed database.
DB_HOST = os.environ.get('DB_HOST', 'localhost')           # or your server IP
DB_PORT = os.environ.get('DB_PORT', '5432')                # default PostgreSQL port
DB_NAME = os.environ.get('DB_NAME', 'job_prediction_db')   # database name
DB_USER = os.environ.get('DB_USER', 'postgres')            # username
DB_PASSWORD = os.environ.get('DB_PASSWORD', 'admin')       # password

# Create connection string.
# User and password are URL-quoted so characters such as '@', ':' or '/' in a
# credential cannot corrupt the URL; for the default values the result is
# identical to the unquoted form.
connection_string = (
    f'postgresql://{quote_plus(DB_USER)}:{quote_plus(DB_PASSWORD)}'
    f'@{DB_HOST}:{DB_PORT}/{DB_NAME}'
)

# # Create engine
# subprocess.run([sys.executable, '-m', 'pip', 'install', 'psycopg2-binary'], check=False)
# engine = create_engine(connection_string)
# # Save dataset to PostgreSQL
# table_name = 'job_dataset_cleaned'  # name for your table

# try:
#     dataset.to_sql(table_name, engine, if_exists='replace', index=False)
#     print(f"✅ Dataset successfully saved to PostgreSQL table: {table_name}")
#     print(f"📊 Saved {len(dataset)} rows and {len(dataset.columns)} columns")
# except Exception as e:
#     print(f"❌ Error saving to database: {e}")

Please see https://github.com/pypa/pip/issues/5599 for advice on fixing the underlying issue.
To avoid this problem you can invoke Python with '-m pip' instead of running pip directly.


In [4]:
%pip install nltk

import pandas as pd
import string
import joblib
import nltk

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import classification_report


# Make sure the NLTK stop-word corpus is available locally (the recorded run
# shows the download is a no-op when the package is already up to date), then
# import the corpus reader that clean_text uses to filter words.
nltk.download('stopwords')
from nltk.corpus import stopwords

# Reload the cleaned postings from the CSV written by the cleaning cell
# (`dataset.to_csv('../ds/job_dataset_cleaned.csv', ...)`), so this section
# works from the file on disk rather than from the in-memory `dataset`.
df = pd.read_csv('../ds/job_dataset_cleaned.csv')  

# Combine important text fields into one feature column
def combine_text_columns(row):
    """Join a posting's free-text fields into one space-separated string.

    Args:
        row: Object indexable by the keys 'title', 'description',
            'requirements' and 'benefits' (a DataFrame row when called via
            ``df.apply(..., axis=1)``).

    Returns:
        str: The four values, each converted with ``str()``, joined by single
        spaces in the order title, description, requirements, benefits.

    Raises:
        KeyError: If ``row`` lacks one of the four keys (raised by the lookup).
    """
    text_fields = ('title', 'description', 'requirements', 'benefits')
    return ' '.join(str(row[field]) for field in text_fields)

# Row-wise apply: each posting's four text fields become one string that the
# cleaning step below consumes.
df['combined_text'] = df.apply(combine_text_columns, axis='columns')

# Text cleaning function
def _english_stopwords():
    """Return NLTK's English stop words as a frozenset, loading them once.

    The list is fetched from ``stopwords.words('english')`` on the first call
    and cached on the function object, so later calls are a plain attribute
    lookup and membership tests are hash-based.
    """
    cached = getattr(_english_stopwords, '_cache', None)
    if cached is None:
        cached = frozenset(stopwords.words('english'))
        _english_stopwords._cache = cached
    return cached


def clean_text(text, stop_words=None):
    """Lower-case ``text``, strip ASCII punctuation and drop stop words.

    Args:
        text (str): Raw text to normalise.
        stop_words: Optional container of words to remove (anything that
            supports ``in``). Defaults to NLTK's English stop-word list, which
            is loaded once and reused across calls.

    Returns:
        str: The remaining whitespace-separated tokens joined by single
        spaces; an empty string if nothing remains.
    """
    if stop_words is None:
        # Previously the stop-word list was re-fetched for every single token,
        # which dominated the runtime of df[...].apply(clean_text).
        stop_words = _english_stopwords()

    # Remove every character listed in string.punctuation in one pass.
    without_punctuation = text.lower().translate(
        str.maketrans('', '', string.punctuation)
    )
    return ' '.join(
        word for word in without_punctuation.split() if word not in stop_words
    )

# Normalise every combined string element-wise; the result is the text column
# the modelling cells read as their feature.
df['cleaned_text'] = df['combined_text'].map(clean_text)



Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.2 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\dadia\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [30]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Feature and target: the cleaned posting text and the `fraudulent` column.
X, y = df['cleaned_text'], df['fraudulent']

# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
# NOTE(review): the split is not stratified on `y` -- confirm that is intended
# given how few positive rows the recorded classification report shows.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Candidate classifiers, keyed by the label printed in the results.
# NOTE(review): the forest's max_features=10000 is larger than the 5000-term
# TF-IDF cap used below -- verify this setting is deliberate.
models = {
    'Random Forest': RandomForestClassifier(
        n_estimators=200, max_depth=20, max_features=10000, random_state=42
    ),
    'Logistic Regression': LogisticRegression(max_iter=1000, random_state=42),
    'SVM': SVC(random_state=42),
}

# Fit one TF-IDF + classifier pipeline per candidate and print its test accuracy.
print("Model Accuracy Results:")
print("=" * 30)

for name, model in models.items():
    pipeline = Pipeline(steps=[
        ('tfidf', TfidfVectorizer(max_features=5000, ngram_range=(1, 2))),
        ('clf', model),
    ])

    # fit() returns the pipeline itself, so prediction can be chained onto it.
    y_pred = pipeline.fit(X_train, y_train).predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)

    print(f"{name}: {accuracy:.4f} ({accuracy*100:.2f}%)")

Model Accuracy Results:
Random Forest: 0.9806 (98.06%)
Logistic Regression: 0.9161 (91.61%)
SVM: 0.9613 (96.13%)


In [35]:
# Track the highest-scoring pipeline seen so far; ties keep the earlier model
# because the comparison below is strict.
best_accuracy, best_model_name, best_pipeline = 0, "", None

# Refit every candidate (this time with an 8000-term TF-IDF vocabulary) and
# remember the one with the highest accuracy on the held-out split.
# NOTE(review): the winner is chosen on X_test and then reported on the same
# X_test, so the printed metrics are likely optimistic -- consider a separate
# validation split or cross-validation.
print("Finding best model...")
for name, model in models.items():
    pipeline = Pipeline(steps=[
        ('tfidf', TfidfVectorizer(max_features=8000, ngram_range=(1, 2))),
        ('clf', model),
    ])

    y_pred = pipeline.fit(X_train, y_train).predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)

    if accuracy > best_accuracy:
        best_accuracy, best_model_name, best_pipeline = accuracy, name, pipeline

# Report which candidate won and its accuracy.
print(f"\nBest Model: {best_model_name}")
print(f"Accuracy: {best_accuracy:.4f} ({best_accuracy*100:.2f}%)")

# Per-class precision/recall/F1 for the winning pipeline on the held-out split.
final_predictions = best_pipeline.predict(X_test)
print(f"\nFinal Model Performance:")
print(classification_report(y_test, final_predictions))

# Serialise the fitted pipeline (vectoriser + classifier) into the working directory.
import joblib
joblib.dump(best_pipeline, 'best_job_fraud_model.pkl')
print(f"\n✅ Best model ({best_model_name}) saved as 'best_job_fraud_model.pkl'")


Finding best model...

Best Model: Random Forest
Accuracy: 0.9806 (98.06%)

Final Model Performance:
              precision    recall  f1-score   support

           0       0.98      1.00      0.99       142
           1       1.00      0.77      0.87        13

    accuracy                           0.98       155
   macro avg       0.99      0.88      0.93       155
weighted avg       0.98      0.98      0.98       155


✅ Best model (Random Forest) saved as 'best_job_fraud_model.pkl'


In [5]:
import numpy as np
# Sequential, zero-padded identifiers (EMP000001, EMP000002, ...) assigned in
# the frame's current row order.
df['emp_id'] = [f"EMP{i:06d}" for i in range(1, len(df) + 1)]

# Put the identifier first and leave every other column in its existing order.
cols = ['emp_id', *(col for col in df.columns if col != 'emp_id')]
df = df[cols]

# Preview the identifier together with the next five columns.
print(df[cols[:6]].head(10))

      emp_id  job_id                                              title  \
0  EMP000001       7                              Head of Content (m/f)   
1  EMP000002      16                         VP of Sales - Vault Dragon   
2  EMP000003      24  Vice President, Sales and Sponsorship (Busines...   
3  EMP000004      99                                    IC&E Technician   
4  EMP000005     103                            Marketing Administrator   
5  EMP000006     135                Senior Business Development Manager   
6  EMP000007     154                             Senior Project Manager   
7  EMP000008     161                          Field Services Supervisor   
8  EMP000009     180                                 Internal Recruiter   
9  EMP000010     198                                 Software developer   

                     location                      department   salary_range  
0              DE, BE, Berlin                      ANDROIDPIT    20000-28000  
1           SG, 

In [6]:
# Write the enriched frame (with emp_id and the derived text columns) to disk.
# The confirmation message is built from the same path that is written, so the
# two cannot disagree; the message used to name 'job_dataset_with_empid.csv'
# while the data went to the path below.
# NOTE(review): this path is also the cleaned CSV that the text-preparation
# cell reads, so the file is overwritten in place -- confirm that is intended,
# or point EMPID_OUTPUT_PATH at a separate file.
EMPID_OUTPUT_PATH = '../ds/job_dataset_cleaned.csv'
df.to_csv(EMPID_OUTPUT_PATH, index=False)
print(f"\n💾 Updated dataset saved as '{EMPID_OUTPUT_PATH}'")


💾 Updated dataset saved as 'job_dataset_with_empid.csv'
