In [60]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import gensim
import nltk
from gensim.utils import simple_preprocess
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
from sklearn.decomposition import LatentDirichletAllocation as LDA
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from gensim import corpora, models
from pprint import pprint

from dfunc import lemmatize
from dfunc import preprocess

# Plot styling: use seaborn's 'whitegrid' style for figures in this notebook.
sns.set_style('whitegrid')
# IPython magic: render matplotlib figures inline in the cell output.
%matplotlib inline

In [61]:
# Load the job postings, keep only the rows whose 'location' string begins
# with 'US', and narrow the frame to the text column ('description') and the
# label column ('fraudulent').
df = (
    pd.read_csv('fake_job_postings.csv')
    .loc[lambda postings: postings['location'].str.slice(stop=2).eq('US'),
         ['description', 'fraudulent']]
)

In [62]:
# Apply the project helper `preprocess` (imported from dfunc) to every
# 'description' value; the result is a Series sharing df's index.
# NOTE(review): presumably `preprocess` returns a list of cleaned tokens per
# document (the next cell feeds this Series to gensim's Dictionary) -- confirm
# in dfunc, and confirm it tolerates missing (NaN) descriptions.
processed_des = df['description'].map(preprocess)

In [63]:
# Build the token <-> integer-id vocabulary from the tokenized descriptions
dictionary = corpora.Dictionary(documents=processed_des)

In [64]:
# Prune the vocabulary in place (thresholds as described in the gensim docs
# for Dictionary.filter_extremes):
#   no_below=15   -> drop tokens that appear in fewer than 15 documents
#   no_above=0.5  -> drop tokens that appear in more than 50% of documents
#   keep_n=100000 -> after that, keep at most the 100,000 most frequent tokens
# Token ids are reassigned by this call, so bag-of-words vectors must be built
# after it, not before.
dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)

In [65]:
# Convert each tokenized description into its sparse bag-of-words form,
# a list of (token_id, count) pairs; tokens that are not in the pruned
# dictionary are ignored.
bow_corpus = list(map(dictionary.doc2bow, processed_des))

In [76]:
# Spot-check: show the (token_id, count) pairs of the document at position 10
bow_corpus[10]

[(19, 2),
 (47, 1),
 (70, 1),
 (76, 1),
 (77, 1),
 (135, 1),
 (283, 1),
 (375, 1),
 (427, 1),
 (445, 1),
 (461, 1),
 (463, 2),
 (467, 1),
 (480, 1),
 (542, 1),
 (550, 2),
 (555, 1),
 (624, 1),
 (630, 1),
 (634, 1),
 (639, 1),
 (640, 1),
 (675, 2),
 (677, 1),
 (693, 1),
 (709, 2),
 (710, 1),
 (711, 1),
 (712, 2),
 (713, 2),
 (714, 1),
 (715, 1),
 (716, 1),
 (717, 2),
 (718, 1),
 (719, 1),
 (720, 1),
 (721, 2),
 (722, 1),
 (723, 1),
 (724, 2),
 (725, 1),
 (726, 1),
 (727, 1),
 (728, 1),
 (729, 1),
 (730, 1),
 (731, 1),
 (732, 1),
 (733, 1),
 (734, 1),
 (735, 1),
 (736, 1)]

In [66]:
# Fit a TF-IDF weighting on the bag-of-words corpus, then apply that model to
# the same corpus to obtain the TF-IDF-weighted documents.
tfidf = gensim.models.TfidfModel(corpus=bow_corpus)
corpus_tfidf = tfidf[bow_corpus]

In [73]:
# Spot-check: show the (token_id, tf-idf weight) pairs of the first document
corpus_tfidf[0]

[(0, 0.0829669716282516),
 (1, 0.577344689349353),
 (2, 0.05040693248424424),
 (3, 0.09713113465660053),
 (4, 0.10759131466018691),
 (5, 0.13903840458916264),
 (6, 0.08480890963439518),
 (7, 0.07404303918097747),
 (8, 0.15042943778028478),
 (9, 0.16574497812035438),
 (10, 0.14263233563606934),
 (11, 0.06294590841563445),
 (12, 0.17048647407264067),
 (13, 0.06265182601771495),
 (14, 0.16445690286068715),
 (15, 0.0804507537246992),
 (16, 0.0811914922618119),
 (17, 0.10172944438029373),
 (18, 0.061532670617842164),
 (19, 0.05889165019531335),
 (20, 0.07143396045036507),
 (21, 0.38391575382633975),
 (22, 0.041734993416117916),
 (23, 0.10788465441048817),
 (24, 0.10316981825486221),
 (25, 0.1086325884521191),
 (26, 0.08097772343462079),
 (27, 0.16445690286068715),
 (28, 0.06875643671354088),
 (29, 0.03551137016706402),
 (30, 0.041104543897128074),
 (31, 0.08444223282447973),
 (32, 0.0554183099503354),
 (33, 0.0707748339726415),
 (34, 0.064927209426889),
 (35, 0.08372381282271865),
 (36, 0.0

In [53]:
# Fit a 2-topic LDA model on the raw bag-of-words counts (not the TF-IDF
# weights), making two passes over the corpus with two worker processes.
# random_state seeds the model's random initialisation so that
# Restart & Run All yields comparable topics instead of a fresh random
# solution each time.
# NOTE(review): with workers > 1 small run-to-run differences may remain
# because of how chunks are distributed across workers; use
# gensim.models.LdaModel if bit-for-bit reproducibility is required.
lda_model = gensim.models.LdaMulticore(bow_corpus,
                                       num_topics=2,
                                       id2word=dictionary,
                                       passes=2,
                                       workers=2,
                                       random_state=42)