# Финальный проект: выявление фейковых вакансий

### Установка библиотек для работы с PySpark

In [1]:
import findspark
# Initialise findspark first; the pyspark imports come in the cells that follow.
findspark.init()
# Last expression of the cell, so its return value is shown as the cell output.
findspark.find()

'C:\\Users\\Laptop\\Downloads\\spark-2.4.7-bin-hadoop2.7\\spark-2.4.7-bin-hadoop2.7'

In [2]:
import pyspark
from pyspark.sql import SparkSession
# Session handle used by the rest of the notebook; built with default builder settings.
spark = SparkSession.builder.getOrCreate()

In [3]:
from pyspark.sql import functions as F
from pyspark.sql.types import StructType, StringType, StructField, IntegerType, BooleanType, DoubleType

### Описание датасета

Датасет взят отсюда: https://www.kaggle.com/amruthjithrajvr/recruitment-scam/

В нем содержатся данные о 17880 вакансиях, которые были размещены на сайте Workable (https://www.workable.com/) в период с 2012 по 2014 год. Сотрудники Workable вручную пометили каждую вакансию как фейковую или реальную.

Структура датасета:
- title - название вакансии
- location - расположение офиса компании
- department - отдел, в котором открыта вакансия
- salary_range - зарплата
- company_profile - описание компании
- description - описание вакансии
- requirements - требования
- benefits - условия работы
- telecommuting - возможность удаленной работы
- has_company_logo - присутствует ли логотип компании на сайте
- has_questions - присутствуют ли в описании вакансии отсеивающие вопросы
- employment_type - тип занятости
- required_experience - требуемый опыт
- required_education - требуемое образование
- industry - отрасль компании
- function - профессиональная область
- fraudulent - целевая переменная
- in_balanced_dataset - вакансия отобрана для сбалансированного датасета

In [4]:
# Explicit schema for the CSV: all eighteen columns are declared as nullable
# strings, in the order listed below.
my_schema = StructType([
    StructField(name=column_name, dataType=StringType(), nullable=True)
    for column_name in (
        'title',
        'location',
        'department',
        'salary_range',
        'company_profile',
        'description',
        'requirements',
        'benefits',
        'telecommuting',
        'has_company_logo',
        'has_questions',
        'employment_type',
        'required_experience',
        'required_education',
        'industry',
        'function',
        'fraudulent',
        'in_balanced_dataset',
    )
])


In [5]:
# Load DataSet.csv with the explicit schema. The file has a header row, the
# reader is told that records may span several lines, and a double quote is
# used as the escape character.
data = (
    spark.read
    .format("csv")
    .schema(my_schema)
    .options(header=True, multiline=True, escape='"')
    .load("DataSet.csv")
)

In [6]:
# Check that the DataFrame carries the schema declared in my_schema.
data.printSchema()

root
 |-- title: string (nullable = true)
 |-- location: string (nullable = true)
 |-- department: string (nullable = true)
 |-- salary_range: string (nullable = true)
 |-- company_profile: string (nullable = true)
 |-- description: string (nullable = true)
 |-- requirements: string (nullable = true)
 |-- benefits: string (nullable = true)
 |-- telecommuting: string (nullable = true)
 |-- has_company_logo: string (nullable = true)
 |-- has_questions: string (nullable = true)
 |-- employment_type: string (nullable = true)
 |-- required_experience: string (nullable = true)
 |-- required_education: string (nullable = true)
 |-- industry: string (nullable = true)
 |-- function: string (nullable = true)
 |-- fraudulent: string (nullable = true)
 |-- in_balanced_dataset: string (nullable = true)



In [7]:
# Preview the first two rows of the loaded data.
data.show(2)

+--------------------+----------------+----------+------------+--------------------+--------------------+--------------------+--------------------+-------------+----------------+-------------+---------------+-------------------+------------------+--------------------+----------------+----------+-------------------+
|               title|        location|department|salary_range|     company_profile|         description|        requirements|            benefits|telecommuting|has_company_logo|has_questions|employment_type|required_experience|required_education|            industry|        function|fraudulent|in_balanced_dataset|
+--------------------+----------------+----------+------------+--------------------+--------------------+--------------------+--------------------+-------------+----------------+-------------+---------------+-------------------+------------------+--------------------+----------------+----------+-------------------+
|    Marketing Intern|US, NY, New York| Marketing

In [8]:
# Dataset dimensions, printed as a (row count, column count) tuple.
row_total = data.count()
column_total = len(data.columns)
print((row_total, column_total))

(17880, 18)


In [9]:
# Count both classes with one grouped aggregation instead of running two
# separate filter-and-count passes over the dataset.
class_counts = {
    row['fraudulent']: row['count']
    for row in data.groupBy('fraudulent').count().collect()
}
# Same output as before: the fake ("t") count first, then the real ("f") count.
# A label that does not occur in the data is reported as 0.
print(class_counts.get("t", 0))
print(class_counts.get("f", 0))

866
17014


Датасет несбалансирован, в нем содержатся 17014 реальных вакансий и 866 фейковых.

Поскольку я намереваюсь использовать весь датасет, столбец in_balanced_dataset можно удалить.

In [10]:
# The whole dataset is used from here on, so the in_balanced_dataset column is dropped.
data = data.drop("in_balanced_dataset")

In [11]:
# Print, for every column, how many distinct values it holds.
for column_name in data.columns:
    distinct_total = data.select(column_name).distinct().count()
    print(column_name, "unique values: ", distinct_total)

title unique values:  11231
location unique values:  3106
department unique values:  1338
salary_range unique values:  875
company_profile unique values:  1711
description unique values:  15095
requirements unique values:  12120
benefits unique values:  6511
telecommuting unique values:  2
has_company_logo unique values:  2
has_questions unique values:  2
employment_type unique values:  6
required_experience unique values:  8
required_education unique values:  14
industry unique values:  132
function unique values:  38
fraudulent unique values:  2


In [12]:
# Compute the null count of every column in a single aggregation rather than
# launching one filter-and-count job per column.
# F.when(...) without otherwise() yields NULL for non-matching rows, and
# F.count() skips NULLs, so each aggregate counts exactly the null cells.
null_counts = data.select([
    F.count(F.when(data[name].isNull(), 1)).alias(name)
    for name in data.columns
]).first()

# Report the counts in column order, in the same format as before.
for name in data.columns:
    print(name, "with null values: ", null_counts[name])

title with null values:  0
location with null values:  346
department with null values:  11547
salary_range with null values:  15012
company_profile with null values:  3308
description with null values:  0
requirements with null values:  2689
benefits with null values:  7196
telecommuting with null values:  0
has_company_logo with null values:  0
has_questions with null values:  0
employment_type with null values:  3471
required_experience with null values:  7050
required_education with null values:  8105
industry with null values:  4903
function with null values:  6455
fraudulent with null values:  0


In [13]:
# Replace nulls with empty strings in the columns listed below. These are the
# same columns that are later concatenated into full_description.
data = data.fillna(
    "",
    subset=[
        "location",
        "department",
        "company_profile",
        "description",
        "requirements",
        "benefits",
        "employment_type",
        "required_experience",
        "required_education",
        "industry",
        "function",
    ],
)

In [14]:
# Build one text column, full_description, by joining the columns below with
# single spaces, in the order given.
data = data.withColumn(
    'full_description',
    F.concat_ws(" ", *[
        F.col(text_column)
        for text_column in (
            'location',
            'department',
            'company_profile',
            'description',
            'requirements',
            'benefits',
            'employment_type',
            'required_experience',
            'required_education',
            'industry',
            'function',
        )
    ]),
)

In [15]:
# Preview two rows to check the newly added full_description column.
data.show(2)

+--------------------+----------------+----------+------------+--------------------+--------------------+--------------------+--------------------+-------------+----------------+-------------+---------------+-------------------+------------------+--------------------+----------------+----------+--------------------+
|               title|        location|department|salary_range|     company_profile|         description|        requirements|            benefits|telecommuting|has_company_logo|has_questions|employment_type|required_experience|required_education|            industry|        function|fraudulent|    full_description|
+--------------------+----------------+----------+------------+--------------------+--------------------+--------------------+--------------------+-------------+----------------+-------------+---------------+-------------------+------------------+--------------------+----------------+----------+--------------------+
|    Marketing Intern|US, NY, New York| Market

In [16]:
# Print the schema again after the null fill and the full_description column were added.
data.printSchema()

root
 |-- title: string (nullable = true)
 |-- location: string (nullable = false)
 |-- department: string (nullable = false)
 |-- salary_range: string (nullable = true)
 |-- company_profile: string (nullable = false)
 |-- description: string (nullable = false)
 |-- requirements: string (nullable = false)
 |-- benefits: string (nullable = false)
 |-- telecommuting: string (nullable = true)
 |-- has_company_logo: string (nullable = true)
 |-- has_questions: string (nullable = true)
 |-- employment_type: string (nullable = false)
 |-- required_experience: string (nullable = false)
 |-- required_education: string (nullable = false)
 |-- industry: string (nullable = false)
 |-- function: string (nullable = false)
 |-- fraudulent: string (nullable = true)
 |-- full_description: string (nullable = false)



In [20]:
# Keep every column except the ones listed below, preserving the original
# column order. All of them except salary_range were merged into
# full_description. A listed name that is not present is simply ignored.
new_data = data.select([
    kept_column
    for kept_column in data.columns
    if kept_column not in {
        "location",
        "department",
        "company_profile",
        "salary_range",
        "description",
        "requirements",
        "benefits",
        "employment_type",
        "required_experience",
        "required_education",
        "industry",
        "function",
    }
])

In [21]:
# Preview the reduced DataFrame.
new_data.show()

+--------------------+-------------+----------------+-------------+----------+--------------------+
|               title|telecommuting|has_company_logo|has_questions|fraudulent|    full_description|
+--------------------+-------------+----------------+-------------+----------+--------------------+
|    Marketing Intern|            f|               t|            f|         f|US, NY, New York ...|
|Customer Service ...|            f|               t|            f|         f|NZ, , Auckland Su...|
|Commissioning Mac...|            f|               t|            f|         f|US, IA, Wever  <h...|
|Account Executive...|            f|               t|            f|         f|US, DC, Washingto...|
| Bill Review Manager|            f|               t|            t|         f|US, FL, Fort Wort...|
|    Accounting Clerk|            f|               f|            f|         f|US, MD,    <p><b>...|
|Head of Content (...|            f|               t|            t|         f|DE, BE, Berlin AN...|


In [22]:
# Save the reduced dataset as comma-delimited CSV with a header row under the
# "DataSet" path, replacing any previous output there.
(
    new_data.write
    .format("csv")
    .mode('overwrite')
    .options(header='True', delimiter=',')
    .save("DataSet")
)