In [2]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from scipy.sparse import csr_matrix

In [3]:
df2 = pd.read_csv('New_fake_jobs_dataset.csv')

In [4]:
df2.head()

Unnamed: 0,job_id,title,location,department,salary_range,company_profile,description,requirements,benefits,telecommuting,has_company_logo,has_questions,employment_type,required_experience,required_education,industry,function,fraudulent
0,1,Marketing Intern,"US, NY, New York",Marketing,515577-812572,"We're Food52, and we've created a groundbreaki...","Food52, a fast-growing, James Beard Award-winn...",Experience with content management systems a m...,,0,1,0,Other,Internship,Not Specified,Other,Marketing,0
1,2,Customer Service - Cloud Video Production,"NZ, , Auckland",Success,515577-812572,"90 Seconds, the worlds Cloud Video Production ...",Organised - Focused - Vibrant - Awesome!Do you...,What we expect from you:Your key responsibilit...,What you will get from usThrough being part of...,0,1,0,Full-time,Not Specified,Not Specified,Marketing and Advertising,Customer Service,0
2,3,Commissioning Machinery Assistant (CMA),"US, IA, Wever",,515577-812572,Valor Services provides Workforce Solutions th...,"Our client, located in Houston, is actively se...",Implement pre-commissioning and commissioning ...,,0,1,0,Other,Not Specified,Not Specified,Other,Other,0
3,4,Account Executive - Washington DC,"US, DC, Washington",Sales,515577-812572,Our passion for improving quality of life thro...,THE COMPANY: ESRI – Environmental Systems Rese...,"EDUCATION: Bachelor’s or Master’s in GIS, busi...",Our culture is anything but corporate—we have ...,0,1,0,Full-time,Mid-Senior level,Bachelor's Degree,Computer Software,Sales,0
4,5,Bill Review Manager,"US, FL, Fort Worth",,515577-812572,SpotSource Solutions LLC is a Global Human Cap...,JOB TITLE: Itemization Review ManagerLOCATION:...,QUALIFICATIONS:RN license in the State of Texa...,Full Benefits Offered,0,1,1,Full-time,Mid-Senior level,Bachelor's Degree,Hospital & Health Care,Health Care Provider,0


In [5]:
df2['department'].isnull().sum()

11547

In [6]:
df2["department"] = df2['department'].fillna("")
df2['department'].isnull().sum()

0

In [7]:
df2['salary_range'] = df2['salary_range'].apply(lambda x: "1328149" if "515577-812572" in str(x) else x)

# Logistic Regression Implementation

In [8]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [9]:
df2.columns

Index(['job_id', 'title', 'location', 'department', 'salary_range',
       'company_profile', 'description', 'requirements', 'benefits',
       'telecommuting', 'has_company_logo', 'has_questions', 'employment_type',
       'required_experience', 'required_education', 'industry', 'function',
       'fraudulent'],
      dtype='object')

In [10]:
X = df2.drop(['fraudulent'], axis=1)
y = df2['fraudulent'] 

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=101) 

In [12]:
categorical_columns = ['job_id', 'title', 'location', 'department', 'salary_range', 
                       'company_profile', 'description', 'requirements', 'benefits', 
                       'telecommuting', 'has_company_logo', 'has_questions', 
                       'employment_type', 'required_experience', 'required_education', 
                       'industry', 'function']

# Use sparse matrix for OneHotEncoder
encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=True)

# Preprocessor step
preprocessor = encoder  # Replace with any other preprocessing steps if needed

# Create pipeline
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('scaler', StandardScaler(with_mean=False)),  # Scaling sparse matrices, without centering
    ('classifier', LogisticRegression(solver='liblinear'))  # 'liblinear' for smaller datasets
])

# Fit the encoder and transform categorical columns into sparse format
encoded_data_sparse = encoder.fit_transform(df2[categorical_columns])

# Convert sparse matrix to a DataFrame (this avoids memory overhead for dense matrices)
encoded_df = pd.DataFrame.sparse.from_spmatrix(encoded_data_sparse, columns=encoder.get_feature_names_out(categorical_columns))

# Drop categorical columns from the original data
data = df2.drop(columns=categorical_columns).reset_index(drop=True)

# Concatenate the encoded features with the original data
data = pd.concat([data, encoded_df], axis=1)

In [13]:
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

In [14]:
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {np.round(accuracy,2)*100} %")

Accuracy: 97.0 %
