In [6]:
import numpy as np
from nltk.util import ngrams
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

In [7]:
# Obtain (or reuse) the Spark session used by every cell below.
spark = (
    SparkSession.builder
    .master("local")
    .appName("BD-Project-1")
    .config("spark.some.config.option", "some-value")
    .getOrCreate()
)

## Loading the data

In [8]:
# Path of the job-postings CSV, relative to the notebook's working directory.
csv_path = "fake_job_postings.csv"

# The first line holds the column names, column types are inferred from the
# data, and a double quote is used as the escape character inside quoted fields.
df_jobs = spark.read.csv(
    csv_path,
    header=True,
    inferSchema=True,
    escape='"',
)

In [9]:
# Quick look at the loaded frame: row count, column names, and a 10-row preview.
columns = df_jobs.columns
print("rows : ", df_jobs.count())
print("Columns: ", columns)
df_jobs.show(n=10)

rows :  17880
Columns:  ['job_id', 'title', 'location', 'department', 'salary_range', 'company_profile', 'description', 'requirements', 'benefits', 'telecommuting', 'has_company_logo', 'has_questions', 'employment_type', 'required_experience', 'required_education', 'industry', 'function', 'fraudulent']
+------+--------------------+--------------------+----------+------------+--------------------+--------------------+--------------------+--------------------+-------------+----------------+-------------+---------------+-------------------+--------------------+--------------------+--------------------+----------+
|job_id|               title|            location|department|salary_range|     company_profile|         description|        requirements|            benefits|telecommuting|has_company_logo|has_questions|employment_type|required_experience|  required_education|            industry|            function|fraudulent|
+------+--------------------+--------------------+----------+-------

## Splitting the dataset into Real/Fake

In [10]:
# Partition the postings on the `fraudulent` column:
# rows equal to 0 go to real_jobs, rows equal to 1 go to fake_jobs.
real_jobs = df_jobs.where(col("fraudulent") == 0)
fake_jobs = df_jobs.where(col("fraudulent") == 1)

# Report the size of each partition.
print("Real jobs: ", real_jobs.count())
print("Fake jobs: ", fake_jobs.count())

Real jobs:  17014
Fake jobs:  866


### F: Mean and standard deviation of the maximum salary in fake jobs

In [11]:
def _parse_max_salary(salary_range):
    """Return the upper bound of a "min-max" salary string as an int, or None.

    None is returned when the string has no second "-"-separated part or when
    that part cannot be converted by int(), so that a single malformed row can
    be dropped instead of aborting the whole Spark job.
    """
    try:
        return int(salary_range.split("-")[1])
    except (IndexError, ValueError):
        return None


# Salary ranges of the fake postings, skipping rows that have none.
fake_ranges = fake_jobs.select("salary_range").filter(col('salary_range').isNotNull())

# Upper bound of every parseable range; unparseable values are discarded.
max_ranges = fake_ranges.rdd\
    .map(lambda r: _parse_max_salary(r.salary_range))\
    .filter(lambda upper: upper is not None)

# Take both statistics from one stats() result so the data is scanned once
# rather than once per statistic.
fake_salary_stats = max_ranges.stats()
fake_mean = fake_salary_stats.mean()
fake_stdev = fake_salary_stats.stdev()

In [12]:
# Show the mean and standard deviation computed from max_ranges.
print(
    "Fake Max: mean = ", fake_mean,
    " stdev=", fake_stdev,
)

Fake Max: mean =  159218.71300448422  stdev= 633739.2454231088


### G: Median of the minimum salary in real jobs

In [13]:
# Salary ranges of the real postings, skipping rows that have none.
real_range = real_jobs.select("salary_range").where(col("salary_range").isNotNull())

# Keep the part before the first "-" of each range, drop the values that are
# not numeric strings, and convert the rest to int.
min_ranges = (
    real_range.rdd
    .map(lambda row: row.salary_range.split("-")[0])
    .filter(lambda lower: lower.isnumeric())
    .map(lambda lower: int(lower))
)

# The values are collected to the driver and the median is computed with NumPy.
median = np.median(min_ranges.collect())

In [14]:
# Show the median computed from min_ranges.
print(
    "Real median: ",
    median,
)

Real median:  36000.0


# Most common bigrams and trigrams in Real Jobs

In [15]:
# Count every pair of adjacent whitespace-separated tokens in the real job
# descriptions, then order the (bigram, count) pairs by descending count.
bi_gram = (
    real_jobs.select(col("description"))
    .where(col("description").isNotNull())
    .rdd
    .flatMap(lambda row: [" ".join(pair) for pair in ngrams(row.description.split(), 2)])
    .map(lambda gram: (gram, 1))
    .reduceByKey(lambda left, right: left + right)
    .sortBy(lambda kv: kv[1], ascending=False)
)
    
# 

In [16]:
# First 10 entries of the count-sorted bigram RDD.
bi_gram.take(num=10)

[('of the', 10331),
 ('in the', 8958),
 ('will be', 8776),
 ('looking for', 6485),
 ('in a', 5411),
 ('is a', 5349),
 ('to the', 5104),
 ('for the', 4999),
 ('with the', 4943),
 ('for a', 4833)]

In [17]:
# Count every run of three adjacent whitespace-separated tokens in the real job
# descriptions, then order the (trigram, count) pairs by descending count.
tri_gram = (
    real_jobs.select(col("description"))
    .where(col("description").isNotNull())
    .rdd
    .flatMap(lambda row: [" ".join(triple) for triple in ngrams(row.description.split(), 3)])
    .map(lambda gram: (gram, 1))
    .reduceByKey(lambda left, right: left + right)
    .sortBy(lambda kv: kv[1], ascending=False)
)

In [18]:
# First 10 entries of the count-sorted trigram RDD.
tri_gram.take(num=10)

[('are looking for', 3014),
 ('looking for a', 2661),
 ('as well as', 2255),
 ('to join our', 1652),
 ('be responsible for', 1617),
 ('We are looking', 1517),
 ('you will be', 1453),
 ('be able to', 1432),
 ('will be responsible', 1429),
 ('is looking for', 1248)]

# Most common bigrams and trigrams in Fake Jobs

In [19]:
# Same bigram count as for the real jobs, computed over the fake job
# descriptions. Note that this rebinds the name `bi_gram`.
bi_gram = (
    fake_jobs.select(col("description"))
    .where(col("description").isNotNull())
    .rdd
    .flatMap(lambda row: [" ".join(pair) for pair in ngrams(row.description.split(), 2)])
    .map(lambda gram: (gram, 1))
    .reduceByKey(lambda left, right: left + right)
    .sortBy(lambda kv: kv[1], ascending=False)
)
    
# 

In [20]:
# First 10 entries of the count-sorted bigram RDD.
bi_gram.take(num=10)

[('of the', 489),
 ('in the', 406),
 ('looking for', 311),
 ('to the', 308),
 ('to work', 256),
 ('We are', 244),
 ('in a', 231),
 ('are looking', 229),
 ('with the', 228),
 ('is a', 222)]

In [21]:
# Same trigram count as for the real jobs, computed over the fake job
# descriptions. Note that this rebinds the name `tri_gram`.
tri_gram = (
    fake_jobs.select(col("description"))
    .where(col("description").isNotNull())
    .rdd
    .flatMap(lambda row: [" ".join(triple) for triple in ngrams(row.description.split(), 3)])
    .map(lambda gram: (gram, 1))
    .reduceByKey(lambda left, right: left + right)
    .sortBy(lambda kv: kv[1], ascending=False)
)

In [22]:
# First 10 entries of the count-sorted trigram RDD.
tri_gram.take(num=10)

[('are looking for', 182),
 ('oil and gas', 134),
 ('looking for a', 102),
 ('- For the', 84),
 ('be responsible for', 81),
 ('We are looking', 78),
 ('Solutions is a', 71),
 ('as well as', 71),
 ('be able to', 68),
 ('6* Ultra Luxury', 63)]