# Финальный проект: выявление фейковых вакансий

### Установка библиотек для работы с PySpark

In [1]:
# Point this Python process at the local Spark installation so that
# `pyspark` can be imported by the cells below.
import findspark
findspark.init()
# Echo the Spark home that findspark resolved (shown as the cell output).
findspark.find()

'C:\\Users\\Laptop\\Downloads\\spark-2.4.7-bin-hadoop2.7\\spark-2.4.7-bin-hadoop2.7'

In [2]:
import pyspark
from pyspark.sql import SparkSession
# Obtain the SparkSession used by every later cell; getOrCreate() reuses an
# already running session instead of starting a second one.
spark = SparkSession.builder.getOrCreate()

In [35]:
from pyspark.sql import functions as F
from pyspark.sql.types import StructType, StringType, StructField, IntegerType
from pyspark.ml.feature import StringIndexer,VectorAssembler
from pyspark.ml.stat import Correlation

### Описание датасета

Датасет взят отсюда: https://www.kaggle.com/amruthjithrajvr/recruitment-scam/

В нем содержатся данные о 17880 вакансиях, которые были размещены на сайте Workable (https://www.workable.com/) за период с 2012 по 2014 годы. Сотрудники Workable вручную пометили каждую вакансию как фейковую или реальную.

Структура датасета:
- title - название вакансии
- location - расположение офиса компании
- department - отдел, в котором открыта вакансия
- salary_range - зарплата
- company_profile - описание компании
- description - описание вакансии
- requirements - требования
- benefits - условия работы
- telecommuting - возможность удаленной работы
- has_company_logo - присутствует ли логотип компании на сайте
- has_questions - присутствуют ли в описании вакансии отсеивающие вопросы
- employment_type - тип занятости
- required_experience - требуемый опыт
- required_education - требуемое образование
- industry - отрасль компании
- function - профессиональная область
- fraudulent - целевая переменная
- in_balanced_dataset - вакансия отобрана для сбалансированного датасета

In [4]:
# Explicit schema for the CSV: every column is read as a nullable string.
# The flag columns and the target are kept as the raw "t"/"f" markers.
schema_column_names = [
    'title',
    'location',
    'department',
    'salary_range',
    'company_profile',
    'description',
    'requirements',
    'benefits',
    'telecommuting',
    'has_company_logo',
    'has_questions',
    'employment_type',
    'required_experience',
    'required_education',
    'industry',
    'function',
    'fraudulent',
    'in_balanced_dataset',
]
my_schema = StructType(
    [StructField(column_name, StringType(), True) for column_name in schema_column_names]
)


In [5]:
# Load the raw CSV with the explicit schema. The file has a header row, its
# free-text fields span several physical lines (hence multi-line parsing),
# and '"' is used as the escape character inside quoted values.
csv_reader = spark.read.format("csv").schema(my_schema)
csv_reader = csv_reader.options(header=True, multiline=True, escape='"')
data = csv_reader.load("DataSet.csv")

In [6]:
# Confirm that the explicit schema was applied to the loaded DataFrame.
data.printSchema()

root
 |-- title: string (nullable = true)
 |-- location: string (nullable = true)
 |-- department: string (nullable = true)
 |-- salary_range: string (nullable = true)
 |-- company_profile: string (nullable = true)
 |-- description: string (nullable = true)
 |-- requirements: string (nullable = true)
 |-- benefits: string (nullable = true)
 |-- telecommuting: string (nullable = true)
 |-- has_company_logo: string (nullable = true)
 |-- has_questions: string (nullable = true)
 |-- employment_type: string (nullable = true)
 |-- required_experience: string (nullable = true)
 |-- required_education: string (nullable = true)
 |-- industry: string (nullable = true)
 |-- function: string (nullable = true)
 |-- fraudulent: string (nullable = true)
 |-- in_balanced_dataset: string (nullable = true)



In [7]:
# Preview the first two rows of the raw data.
data.show(2)

+--------------------+----------------+----------+------------+--------------------+--------------------+--------------------+--------------------+-------------+----------------+-------------+---------------+-------------------+------------------+--------------------+----------------+----------+-------------------+
|               title|        location|department|salary_range|     company_profile|         description|        requirements|            benefits|telecommuting|has_company_logo|has_questions|employment_type|required_experience|required_education|            industry|        function|fraudulent|in_balanced_dataset|
+--------------------+----------------+----------+------------+--------------------+--------------------+--------------------+--------------------+-------------+----------------+-------------+---------------+-------------------+------------------+--------------------+----------------+----------+-------------------+
|    Marketing Intern|US, NY, New York| Marketing

In [8]:
# Dataset dimensions, printed as a (rows, columns) tuple.
row_count = data.count()
column_count = len(data.columns)
print((row_count, column_count))

(17880, 18)


In [9]:
# Class balance: number of fraudulent ("t") postings, then genuine ("f") ones.
for fraud_flag in ("t", "f"):
    print(data.filter(data['fraudulent'] == fraud_flag).count())

866
17014


Датасет несбалансирован, в нем содержатся 17014 реальных вакансий и 866 фейковых.

Поскольку я намереваюсь использовать весь датасет, от столбца in_balanced_dataset можно избавиться.

In [10]:
# The whole dataset is used, so the balanced-subset marker is not needed.
data = data.drop("in_balanced_dataset")

In [11]:
# Report the number of distinct values in each column (one Spark job each).
for column_name in data.columns:
    distinct_rows = data.select(column_name).distinct()
    print(column_name, "unique values: ", distinct_rows.count())

title unique values:  11231
location unique values:  3106
department unique values:  1338
salary_range unique values:  875
company_profile unique values:  1711
description unique values:  15095
requirements unique values:  12120
benefits unique values:  6511
telecommuting unique values:  2
has_company_logo unique values:  2
has_questions unique values:  2
employment_type unique values:  6
required_experience unique values:  8
required_education unique values:  14
industry unique values:  132
function unique values:  38
fraudulent unique values:  2


In [12]:
# Count missing values for every column in ONE aggregation instead of
# launching a separate filter-and-count Spark job per column.
# when(...) yields 1 for null cells and null otherwise, and count() only
# counts non-null inputs, so each aggregate is the number of nulls.
null_counts = data.select(
    [F.count(F.when(F.col(column_name).isNull(), 1)).alias(column_name)
     for column_name in data.columns]
).first()

# Same report format as before: "<column> with null values:  <n>".
for column_name in data.columns:
    print(column_name, "with null values: ", null_counts[column_name])

title with null values:  0
location with null values:  346
department with null values:  11547
salary_range with null values:  15012
company_profile with null values:  3308
description with null values:  0
requirements with null values:  2689
benefits with null values:  7196
telecommuting with null values:  0
has_company_logo with null values:  0
has_questions with null values:  0
employment_type with null values:  3471
required_experience with null values:  7050
required_education with null values:  8105
industry with null values:  4903
function with null values:  6455
fraudulent with null values:  0


In [13]:
# Replace nulls with empty strings in the location and descriptive columns.
# salary_range and title are intentionally not in the list.
columns_to_fill = [
    "location",
    "department",
    "company_profile",
    "description",
    "requirements",
    "benefits",
    "employment_type",
    "required_experience",
    "required_education",
    "industry",
    "function",
]
data = data.fillna("", subset=columns_to_fill)

In [14]:
# Merge all descriptive columns into one space-separated text column.
description_parts = [
    'department',
    'company_profile',
    'description',
    'requirements',
    'benefits',
    'employment_type',
    'required_experience',
    'required_education',
    'industry',
    'function',
]
data = data.withColumn(
    'full_description',
    F.concat_ws(" ", *[F.col(part) for part in description_parts]),
)

In [15]:
# Preview the first ten rows, now including the full_description column.
data.show(10)

+--------------------+--------------------+----------+------------+--------------------+--------------------+--------------------+--------------------+-------------+----------------+-------------+---------------+-------------------+--------------------+--------------------+--------------------+----------+--------------------+
|               title|            location|department|salary_range|     company_profile|         description|        requirements|            benefits|telecommuting|has_company_logo|has_questions|employment_type|required_experience|  required_education|            industry|            function|fraudulent|    full_description|
+--------------------+--------------------+----------+------------+--------------------+--------------------+--------------------+--------------------+-------------+----------------+-------------+---------------+-------------------+--------------------+--------------------+--------------------+----------+--------------------+
|    Marketing I

In [16]:
# Re-check the schema after the null fill and the added full_description.
data.printSchema()

root
 |-- title: string (nullable = true)
 |-- location: string (nullable = false)
 |-- department: string (nullable = false)
 |-- salary_range: string (nullable = true)
 |-- company_profile: string (nullable = false)
 |-- description: string (nullable = false)
 |-- requirements: string (nullable = false)
 |-- benefits: string (nullable = false)
 |-- telecommuting: string (nullable = true)
 |-- has_company_logo: string (nullable = true)
 |-- has_questions: string (nullable = true)
 |-- employment_type: string (nullable = false)
 |-- required_experience: string (nullable = false)
 |-- required_education: string (nullable = false)
 |-- industry: string (nullable = false)
 |-- function: string (nullable = false)
 |-- fraudulent: string (nullable = true)
 |-- full_description: string (nullable = false)



In [17]:
# Keep only title, the three flag columns, the target and full_description.
# The dropped descriptive columns are already merged into full_description;
# location and salary_range are discarded outright.
# Only columns that actually exist in `data` are listed: drop() silently
# ignores unknown names, so stray entries would hide typos without any error.
new_data = data.drop(
    "location",
    "department",
    "company_profile",
    "salary_range",
    "description",
    "requirements",
    "benefits",
    "employment_type",
    "required_experience",
    "required_education",
    "industry",
    "function",
)

In [18]:
# Preview the reduced DataFrame.
new_data.show()

+--------------------+-------------+----------------+-------------+----------+--------------------+
|               title|telecommuting|has_company_logo|has_questions|fraudulent|    full_description|
+--------------------+-------------+----------------+-------------+----------+--------------------+
|    Marketing Intern|            f|               t|            f|         f|Marketing <h3>We'...|
|Customer Service ...|            f|               t|            f|         f|Success <h3>90 Se...|
|Commissioning Mac...|            f|               t|            f|         f| <h3></h3>
<p>Val...|
|Account Executive...|            f|               t|            f|         f|Sales <p>Our pass...|
| Bill Review Manager|            f|               t|            t|         f| <p>SpotSource So...|
|    Accounting Clerk|            f|               f|            f|         f|  <p><b>Job Overv...|
|Head of Content (...|            f|               t|            t|         f|ANDROIDPIT <p>Fou...|


In [19]:
# Contingency table: rows are `fraudulent` values, columns are
# `telecommuting` values. f_rp / t_rp are the shares of "f" / "t" within
# each row, rounded to two decimals.
tel_row_total = F.col("f") + F.col("t")
tel_fraud = (
    new_data.stat.crosstab("fraudulent", "telecommuting")
    .withColumn("f_rp", F.round(F.col("f") / tel_row_total, 2))
    .withColumn("t_rp", F.round(F.col("t") / tel_row_total, 2))
)
tel_fraud.show()

+------------------------+-----+---+----+----+
|fraudulent_telecommuting|    f|  t|f_rp|t_rp|
+------------------------+-----+---+----+----+
|                       t|  802| 64|0.93|0.07|
|                       f|16311|703|0.96|0.04|
+------------------------+-----+---+----+----+



In [20]:
# Contingency table: rows are `fraudulent` values, columns are
# `has_company_logo` values. f_rp / t_rp are the shares of "f" / "t" within
# each row, rounded to two decimals.
logo_row_total = F.col("f") + F.col("t")
logo_fraud = (
    new_data.stat.crosstab("fraudulent", "has_company_logo")
    .withColumn("f_rp", F.round(F.col("f") / logo_row_total, 2))
    .withColumn("t_rp", F.round(F.col("t") / logo_row_total, 2))
)
logo_fraud.show()

+---------------------------+----+-----+----+----+
|fraudulent_has_company_logo|   f|    t|f_rp|t_rp|
+---------------------------+----+-----+----+----+
|                          t| 583|  283|0.67|0.33|
|                          f|3077|13937|0.18|0.82|
+---------------------------+----+-----+----+----+



In [21]:
# Contingency table: rows are `fraudulent` values, columns are
# `has_questions` values. f_rp / t_rp are the shares of "f" / "t" within
# each row, rounded to two decimals.
questions_row_total = F.col("f") + F.col("t")
questions_fraud = (
    new_data.stat.crosstab("fraudulent", "has_questions")
    .withColumn("f_rp", F.round(F.col("f") / questions_row_total, 2))
    .withColumn("t_rp", F.round(F.col("t") / questions_row_total, 2))
)
questions_fraud.show()

+------------------------+----+----+----+----+
|fraudulent_has_questions|   f|   t|f_rp|t_rp|
+------------------------+----+----+----+----+
|                       t| 616| 250|0.71|0.29|
|                       f|8472|8542| 0.5| 0.5|
+------------------------+----+----+----+----+



In [22]:
# Contingency table: rows are `has_company_logo` values, columns are
# `has_questions` values. f_rp / t_rp are the shares of "f" / "t" within
# each row, rounded to two decimals.
questions_logo_row_total = F.col("f") + F.col("t")
questions_logo = (
    new_data.stat.crosstab("has_company_logo", "has_questions")
    .withColumn("f_rp", F.round(F.col("f") / questions_logo_row_total, 2))
    .withColumn("t_rp", F.round(F.col("t") / questions_logo_row_total, 2))
)
questions_logo.show()

+------------------------------+----+----+----+----+
|has_company_logo_has_questions|   f|   t|f_rp|t_rp|
+------------------------------+----+----+----+----+
|                             t|6384|7836|0.45|0.55|
|                             f|2704| 956|0.74|0.26|
+------------------------------+----+----+----+----+



In [24]:
# Print the untruncated combined text of the fraudulent postings.
# Filter first and project afterwards, so the predicate refers to a column
# that is still present rather than one that was already projected away.
new_data.where(new_data.fraudulent == "t").select('full_description').show(truncate=False)

+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [26]:
# Append a "<column>_len" character-length column for each text column.
len_data = data
for text_column in ('title', 'company_profile', 'description',
                    'requirements', 'benefits', 'full_description'):
    len_data = len_data.withColumn(text_column + '_len', F.length(text_column))

In [27]:
# Keep only the target and the length features.
len_data = len_data.select(
    "fraudulent",
    "title_len",
    "company_profile_len",
    "description_len",
    "requirements_len",
    "benefits_len",
    "full_description_len",
)

In [28]:
# Check that the length columns were created as integers.
len_data.printSchema()

root
 |-- fraudulent: string (nullable = true)
 |-- title_len: integer (nullable = true)
 |-- company_profile_len: integer (nullable = false)
 |-- description_len: integer (nullable = false)
 |-- requirements_len: integer (nullable = false)
 |-- benefits_len: integer (nullable = false)
 |-- full_description_len: integer (nullable = false)



In [29]:
# Preview the target together with the length features.
len_data.show()

+----------+---------+-------------------+---------------+----------------+------------+--------------------+
|fraudulent|title_len|company_profile_len|description_len|requirements_len|benefits_len|full_description_len|
+----------+---------+-------------------+---------------+----------------+------------+--------------------+
|         f|       16|                918|            993|             972|           0|                2925|
|         f|       41|               1845|           2180|            1842|        1465|                7412|
|         f|       39|                972|            362|            1531|           0|                2874|
|         f|       33|                633|           2849|            1621|         989|                6170|
|         f|       19|               1687|           1929|             954|          28|                4691|
|         f|       16|                  0|           3672|               0|           0|                3681|
|         

In [30]:
# Encode the string target `fraudulent` as a numeric `label` column so it
# can be placed in a feature vector next to the length columns.
label_indexer = StringIndexer(inputCol="fraudulent", outputCol="label")
label_model = label_indexer.fit(len_data)
len_data = label_model.transform(len_data)

In [33]:
# Pack the six length features and the label into one vector column.
# The label is included on purpose so that the correlation matrix also shows
# how each length relates to the target.
correlation_inputs = [
    'title_len',
    "company_profile_len",
    "description_len",
    "requirements_len",
    "benefits_len",
    "full_description_len",
    "label",
]
assembler = VectorAssembler(inputCols=correlation_inputs, outputCol="features")

assembled = assembler.transform(len_data)

In [36]:
# Spearman rank correlation over the assembled feature vector.
# Spark's documentation recommends caching the input before a Spearman run,
# because ranking each column would otherwise recompute the whole lineage
# (CSV parsing, null fill, concatenation, ...). Only the `features` column is
# cached, and it is released again once the matrix has been computed.
correlation_input = assembled.select("features").cache()
try:
    spearman_corr = Correlation.corr(correlation_input, "features", method='spearman')
finally:
    correlation_input.unpersist()

# The result is a one-row DataFrame holding the matrix; unpack it into a
# list of rows. Columns _1.._7 follow the order of the assembler's inputCols.
corr_list = spearman_corr.head()[0].toArray().tolist()
spearman_corr_df = spark.createDataFrame(corr_list)
spearman_corr_df.show(truncate=False)

+---------------------+-------------------+---------------------+--------------------+---------------------+--------------------+---------------------+
|_1                   |_2                 |_3                   |_4                  |_5                   |_6                  |_7                   |
+---------------------+-------------------+---------------------+--------------------+---------------------+--------------------+---------------------+
|1.0                  |0.0309102209042493 |-0.014480930674691132|9.754525983275385E-4|-0.010665237510306621|0.002601430292887335|0.034914748196164816 |
|0.0309102209042493   |1.0                |0.15753004081806135  |0.20362371944492044 |0.1701720541219026   |0.5745361521073139  |-0.1811879475268473  |
|-0.014480930674691132|0.15753004081806135|1.0                  |0.14227017230553085 |0.06765078487150551  |0.6909157794970568  |-0.03919948695096575 |
|9.754525983275385E-4 |0.20362371944492044|0.14227017230553085  |1.0                 |0.