In [1]:
import joblib
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from nltk.tokenize import word_tokenize
from wordcloud import WordCloud,STOPWORDS
from collections import defaultdict
from nltk import ngrams
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
import warnings
warnings.filterwarnings('ignore')
import tqdm


In [2]:
loaded_model = joblib.load('fraud.sav')

In [3]:
data_raw=pd.read_csv("fake_job_postings.csv")

In [5]:
def fea(text):
    if text=="":
        return 0
    else:
        return 1


In [6]:
def code(string):
    return string.split(",")[0]

In [7]:
def clean(text):
    
    text=text.lower()
    obj=re.compile(r"<.*?>")                     #removing html tags
    text=obj.sub(r" ",text)
    obj=re.compile(r"https://\S+|http://\S+")    #removing url
    text=obj.sub(r" ",text)
    obj=re.compile(r"[^\w\s]")                   #removing punctuations
    text=obj.sub(r" ",text)
    obj=re.compile(r"\d{1,}")                    #removing digits
    text=obj.sub(r" ",text)
    obj=re.compile(r"_+")                        #removing underscore
    text=obj.sub(r" ",text)
    obj=re.compile(r"\s\w\s")                    #removing single character
    text=obj.sub(r" ",text)
    obj=re.compile(r"\s{2,}")                    #removing multiple spaces
    text=obj.sub(r" ",text)
   
    
    stemmer = SnowballStemmer("english")
    text=[stemmer.stem(word) for word in text.split() if word not in stop]
    
    return " ".join(text)

In [8]:
def generate(text,ngram):
    n_grams=ngrams(word_tokenize(text),ngram)
    grams=[" ".join(val) for val in n_grams]
    return grams

In [10]:

data = data_raw
data.dropna(axis=0)['location'].apply(lambda x: x.split(',')[0])
text_data=data.select_dtypes(include="object")
text_data.drop(["location","salary_range"],axis=1,inplace=True)

text_col=text_data.columns
data[text_col]=data[text_col].replace(np.nan,"")
data["text"]=""
for col in text_data.columns:
    data["text"]=data["text"]+" "+data[col]
for col in text_col:
    data[col]=data[col].apply(fea)
data.drop(["salary_range","job_id"],axis=1,inplace=True)
data.dropna(axis=0,inplace=True)
data["text_len"]=data["text"].str.len()
drop_col=['title','department', 'description', 'requirements',
    'benefits', 'employment_type', 'required_experience',
    'required_education', 'industry', 'function']

data.drop(drop_col,axis=1,inplace=True)
data["country"]=data["location"].apply(code)
p=data.groupby("country")["country"].count().sort_values(ascending=False)
data=data[data["country"]=="US"]
data.drop(columns=["country","location"],axis=1,inplace=True)
data.reset_index(drop=True,inplace=True)

stop=set(stopwords.words("english"))
data["text"]=data["text"].apply(clean)

data_processed = data.copy()


In [13]:
vectorizer=TfidfVectorizer(strip_accents='unicode',
                            analyzer='word',
                            ngram_range=(1, 2),
                            max_features=15000,
                            smooth_idf=True,
                            sublinear_tf=True)
vectorizer.fit(data_processed['text'])

In [46]:
X_text = vectorizer.transform(data_processed[data_processed['fraudulent']==1]['text'][68].split(' '))

In [15]:
X_text.shape

(256, 15000)

In [16]:
X_training_vectorized = vectorizer.fit_transform(data_processed['text'])

In [17]:
X_training_vectorized.shape

(10656, 15000)

In [18]:
pca = PCA(n_components=0.95)

In [19]:
pca.fit(X_training_vectorized.toarray())

In [47]:
X_test_pca = pca.transform(X_text.toarray())

In [48]:
X_test_pca.shape

(501, 3079)

In [49]:
y_test = loaded_model.predict(X_test_pca)

In [25]:
len(y_test)

256

In [26]:
y_test.sum()

2

In [42]:
data_processed

Unnamed: 0,company_profile,telecommuting,has_company_logo,has_questions,fraudulent,text,text_len
0,1,0,1,0,0,market intern market food creat groundbreak aw...,2702
1,1,0,1,0,0,commiss machineri assist cma valor servic prov...,2647
2,1,0,1,0,0,account execut washington dc sale passion impr...,5538
3,1,0,1,1,0,bill review manag spotsourc solut llc global h...,4040
4,0,0,0,0,0,account clerk job overviewapex environment con...,3445
...,...,...,...,...,...,...,...
10651,1,0,1,0,0,product manag product develop flite deliv ad i...,2671
10652,0,0,1,0,0,recruit coordin respons facilit recruit hire p...,2090
10653,0,0,0,0,0,javascript develop sr javascript develop exper...,586
10654,1,0,1,1,0,payrol account account weblinc commerc platfor...,4826


In [44]:
data_processed[data_processed['fraudulent']==1]

Unnamed: 0,company_profile,telecommuting,has_company_logo,has_questions,fraudulent,text,text_len
68,1,0,1,1,1,ic technician oil energi staf amp recruit done...,5258
117,1,0,1,1,1,technician instrument control power plant ener...,6697
144,1,0,1,1,1,ic technician mt poso oil energi staf amp recr...,5311
230,1,0,0,0,1,financ auto car sale hr look adventur peopl jo...,479
315,0,0,0,0,1,admin assist receptionist newli establish comp...,714
...,...,...,...,...,...,...,...
10621,0,0,0,0,1,administr offic assist administr amp offic ass...,1194
10622,0,0,0,0,1,custom servic account clerk account financ cig...,1152
10623,0,0,0,0,1,student posit part time full time student posi...,495
10624,0,0,0,0,1,payrol clerk job descriptionw seek full time p...,842


In [45]:
data_processed[data_processed['fraudulent']==1]['text'][68]

'ic technician oil energi staf amp recruit done right oil amp energi industri repres candid automat grant follow perk expert negoti behalf maxim compens packag impliment ongo increas signific sign bonus refin resourc addit potenti sign bonus client compani offer year access anyperk signific corpor discount cell phone event ticket hous clean everyth inbetween save thousand daili expenditur profession reloc servic town candid candid encourag particip referr bonus program rang anywher success hire candid refer direct refin resourc teampleas submit referr via onlin referr formthank look forward work soon click enlarg imag ic amp technician bakersfield ca mt posoprincip duti respons calibr test maintain troubleshoot instal power plant instrument control system electr equip perform mainten motor control center motor oper valv generat excit equip motor perform prevent predict correct mainten equip coordin work various team member design instal new equip system modif troubleshoot perform maint

In [50]:
probability_fraudulent = ((y_test.sum())/len(y_test))*100

In [52]:
y_test

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [51]:
probability_fraudulent

0.0

In [51]:

for i in range(len(data_processed)):
    try:
        data_temp = data_processed.loc[i:i]
        
        X = vectorizer.transform(data_temp["text"])
        print(type(X))
        X = pca.fit_transform(X.toarray())
        predictions = loaded_model.predict_proba(X)
        print(predictions)
    except Exception as e:
        print(e)

<class 'scipy.sparse._csr.csr_matrix'>
Feature shape mismatch, expected: 3079, got 1
<class 'scipy.sparse._csr.csr_matrix'>
Feature shape mismatch, expected: 3079, got 1
<class 'scipy.sparse._csr.csr_matrix'>
Feature shape mismatch, expected: 3079, got 1
<class 'scipy.sparse._csr.csr_matrix'>
Feature shape mismatch, expected: 3079, got 1
<class 'scipy.sparse._csr.csr_matrix'>
Feature shape mismatch, expected: 3079, got 1
<class 'scipy.sparse._csr.csr_matrix'>
Feature shape mismatch, expected: 3079, got 1
<class 'scipy.sparse._csr.csr_matrix'>
Feature shape mismatch, expected: 3079, got 1
<class 'scipy.sparse._csr.csr_matrix'>
Feature shape mismatch, expected: 3079, got 1
<class 'scipy.sparse._csr.csr_matrix'>
Feature shape mismatch, expected: 3079, got 1
<class 'scipy.sparse._csr.csr_matrix'>
Feature shape mismatch, expected: 3079, got 1
<class 'scipy.sparse._csr.csr_matrix'>
Feature shape mismatch, expected: 3079, got 1
<class 'scipy.sparse._csr.csr_matrix'>
Feature shape mismatch, ex

In [11]:
data = data_raw.loc[2:2]

In [12]:
data_raw

Unnamed: 0,job_id,title,location,department,salary_range,company_profile,description,requirements,benefits,telecommuting,has_company_logo,has_questions,employment_type,required_experience,required_education,industry,function,fraudulent
0,1,Marketing Intern,"US, NY, New York",Marketing,,"We're Food52, and we've created a groundbreaki...","Food52, a fast-growing, James Beard Award-winn...",Experience with content management systems a m...,,0,1,0,Other,Internship,,,Marketing,0
1,2,Customer Service - Cloud Video Production,"NZ, , Auckland",Success,,"90 Seconds, the worlds Cloud Video Production ...",Organised - Focused - Vibrant - Awesome!Do you...,What we expect from you:Your key responsibilit...,What you will get from usThrough being part of...,0,1,0,Full-time,Not Applicable,,Marketing and Advertising,Customer Service,0
2,3,Commissioning Machinery Assistant (CMA),"US, IA, Wever",,,Valor Services provides Workforce Solutions th...,"Our client, located in Houston, is actively se...",Implement pre-commissioning and commissioning ...,,0,1,0,,,,,,0
3,4,Account Executive - Washington DC,"US, DC, Washington",Sales,,Our passion for improving quality of life thro...,THE COMPANY: ESRI – Environmental Systems Rese...,"EDUCATION: Bachelor’s or Master’s in GIS, busi...",Our culture is anything but corporate—we have ...,0,1,0,Full-time,Mid-Senior level,Bachelor's Degree,Computer Software,Sales,0
4,5,Bill Review Manager,"US, FL, Fort Worth",,,SpotSource Solutions LLC is a Global Human Cap...,JOB TITLE: Itemization Review ManagerLOCATION:...,QUALIFICATIONS:RN license in the State of Texa...,Full Benefits Offered,0,1,1,Full-time,Mid-Senior level,Bachelor's Degree,Hospital & Health Care,Health Care Provider,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17875,17876,Account Director - Distribution,"CA, ON, Toronto",Sales,,Vend is looking for some awesome new talent to...,Just in case this is the first time you’ve vis...,To ace this role you:Will eat comprehensive St...,What can you expect from us?We have an open cu...,0,1,1,Full-time,Mid-Senior level,,Computer Software,Sales,0
17876,17877,Payroll Accountant,"US, PA, Philadelphia",Accounting,,WebLinc is the e-commerce platform and service...,The Payroll Accountant will focus primarily on...,- B.A. or B.S. in Accounting- Desire to have f...,Health &amp; WellnessMedical planPrescription ...,0,1,1,Full-time,Mid-Senior level,Bachelor's Degree,Internet,Accounting/Auditing,0
17877,17878,Project Cost Control Staff Engineer - Cost Con...,"US, TX, Houston",,,We Provide Full Time Permanent Positions for m...,Experienced Project Cost Control Staff Enginee...,At least 12 years professional experience.Abil...,,0,0,0,Full-time,,,,,0
17878,17879,Graphic Designer,"NG, LA, Lagos",,,,Nemsia Studios is looking for an experienced v...,1. Must be fluent in the latest versions of Co...,Competitive salary (compensation will be based...,0,0,1,Contract,Not Applicable,Professional,Graphic Design,Design,0


In [13]:
data_raw.columns

Index(['job_id', 'title', 'location', 'department', 'salary_range',
       'company_profile', 'description', 'requirements', 'benefits',
       'telecommuting', 'has_company_logo', 'has_questions', 'employment_type',
       'required_experience', 'required_education', 'industry', 'function',
       'fraudulent'],
      dtype='object')

In [14]:
X.shape

(1, 1)

In [15]:
data = data_raw.loc[:1000].copy()
text_data=data.select_dtypes(include="object")
text_data.drop(["location","salary_range"],axis=1,inplace=True)

text_col=text_data.columns
data[text_col]=data[text_col].replace(np.nan,"")
data["text"]=""
for col in text_data.columns:
    data["text"]=data["text"]+" "+data[col]
for col in text_col:
    data[col]=data[col].apply(fea)
data.drop(["salary_range","job_id"],axis=1,inplace=True)
data.dropna(axis=0,inplace=True)
data["text_len"]=data["text"].str.len()
drop_col=['title','department', 'description', 'requirements',
        'benefits', 'employment_type', 'required_experience',
        'required_education', 'industry', 'function']

data.drop(drop_col,axis=1,inplace=True)
data["country"]=data["location"].apply(code)
p=data.groupby("country")["country"].count().sort_values(ascending=False)
data=data[data["country"]=="US"]
data.drop(columns=["country","location"],axis=1,inplace=True)
data.reset_index(drop=True,inplace=True)

stop=set(stopwords.words("english"))
data["text"]=data["text"].apply(clean)

real_job=data[data["fraudulent"]==1]["text"].values
vectorizer=TfidfVectorizer(strip_accents='unicode',
                        analyzer='word',
                        ngram_range=(1, 2),
                        max_features=15000,
                        smooth_idf=True,
                        sublinear_tf=True)
print('-------------------------------------------------------------------')
print(data)
vectorizer.fit(data["text"])
X = vectorizer.transform(data["text"])
pca = PCA(n_components=0.95)

X = pca.fit_transform(X.toarray())
y=data["fraudulent"]
predictions = loaded_model.predict_proba(X)
print(predictions)


-------------------------------------------------------------------
     company_profile  telecommuting  has_company_logo  has_questions  \
0                  1              0                 1              0   
1                  1              0                 1              0   
2                  1              0                 1              0   
3                  1              0                 1              1   
4                  0              0                 0              0   
..               ...            ...               ...            ...   
636                0              0                 0              0   
637                1              0                 1              1   
638                0              0                 0              1   
639                1              0                 1              1   
640                1              0                 1              1   

     fraudulent                                               text 

ValueError: Feature shape mismatch, expected: 3079, got 364