In [1]:
import pandas as pd

In [2]:
# 数据导入和token映射
file_path = './src/dataset/DataSet.csv' 
data = pd.read_csv(file_path)
data['telecommuting']=data['telecommuting'].map({'f':0,'t':1})
data['has_company_logo']=data['has_company_logo'].map({'f':0,'t':1})
data['has_questions']=data['has_questions'].map({'f':0,'t':1})
data['fraudulent']=data['fraudulent'].map({'f':0,'t':1})

data.head()

Unnamed: 0,title,location,department,salary_range,company_profile,description,requirements,benefits,telecommuting,has_company_logo,has_questions,employment_type,required_experience,required_education,industry,function,fraudulent,in_balanced_dataset
0,Marketing Intern,"US, NY, New York",Marketing,,"<h3>We're Food52, and we've created a groundbr...","<p>Food52, a fast-growing, James Beard Award-w...",<ul>\r\n<li>Experience with content management...,,0,1,0,Other,Internship,,,Marketing,0,f
1,Customer Service - Cloud Video Production,"NZ, , Auckland",Success,,"<h3>90 Seconds, the worlds Cloud Video Product...",<p>Organised - Focused - Vibrant - Awesome!<br...,<p><b>What we expect from you:</b></p>\r\n<p>Y...,<h3><b>What you will get from us</b></h3>\r\n<...,0,1,0,Full-time,Not Applicable,,Marketing and Advertising,Customer Service,0,f
2,Commissioning Machinery Assistant (CMA),"US, IA, Wever",,,<h3></h3>\r\n<p>Valor Services provides Workfo...,"<p>Our client, located in Houston, is actively...",<ul>\r\n<li>Implement pre-commissioning and co...,,0,1,0,,,,,,0,f
3,Account Executive - Washington DC,"US, DC, Washington",Sales,,<p>Our passion for improving quality of life t...,<p><b>THE COMPANY: ESRI – Environmental System...,<ul>\r\n<li>\r\n<b>EDUCATION: </b>Bachelor’s o...,<p>Our culture is anything but corporate—we ha...,0,1,0,Full-time,Mid-Senior level,Bachelor's Degree,Computer Software,Sales,0,f
4,Bill Review Manager,"US, FL, Fort Worth",,,<p>SpotSource Solutions LLC is a Global Human ...,<p><b>JOB TITLE:</b> Itemization Review Manage...,<p><b>QUALIFICATIONS:</b></p>\r\n<ul>\r\n<li>R...,<p>Full Benefits Offered</p>,0,1,1,Full-time,Mid-Senior level,Bachelor's Degree,Hospital & Health Care,Health Care Provider,0,f


In [3]:
#删除一些特征由于后续用于预测中文文本，而数据集地区都是美国，故删除地区特征
columns=['in_balanced_dataset', 'telecommuting', 'has_company_logo', 'has_questions', 'salary_range', 'employment_type','location']
for col in columns:
    del data[col]

In [4]:
#空格填充空值
data.fillna(' ', inplace=True)

# 去重
data.drop_duplicates(inplace=True)
data.reset_index(drop=True, inplace=True)

In [5]:
#合并文本数据类型特征，开始数据清洗
data['text']=(data['title']+' '+data['department']
              +' '+data['company_profile']+' '+data['description']+' '+data['requirements']
              +' '+data['benefits']+' '+data['required_experience']+' '+data['required_education']
              +' '+data['industry']+' '+data['function'])
del data['title']
del data['department']
del data['company_profile']
del data['description']
del data['requirements']
del data['benefits']
del data['required_experience']
del data['required_education']
del data['industry']
del data['function']

In [6]:
# 逐行处理HTML文本
from bs4 import BeautifulSoup
for i in range(len(data)):
    html_text = data.loc[i, 'text']
    # 使用BeautifulSoup解析HTML文本
    soup = BeautifulSoup(html_text, "html.parser")
    # 获取纯文本内容
    plain_text = soup.get_text(strip=True)
    # 将处理后的文本保存回数据集
    data.loc[i, 'text'] = plain_text

In [7]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

nltk.download('stopwords')
nltk.download('punkt')

def preprocess_text(text):
    # 去除特殊字符、标点符号和数字
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # 转换为小写
    text = text.lower()
    # 分词
    words = word_tokenize(text)
    # 去除停用词
    stop_words = set(stopwords.words('english'))
    words = [word for word in words if word not in stop_words]
    # 词干提取
    stemmer = PorterStemmer()
    words = [stemmer.stem(word) for word in words]
    # 拼接处理后的单词为字符串
    processed_text = ' '.join(words)
    return processed_text

# 预处理文本特征列
data['clean_text'] = data['text'].apply(preprocess_text)

# 划分训练集和测试集
X_train, X_test, y_train, y_test = train_test_split(data['clean_text'], data['fraudulent'], test_size=0.2, random_state=42)

# TF-IDF向量化
tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\10690\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\10690\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [8]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint
from sklearn.metrics import classification_report, accuracy_score

# 创建梯度提升树分类器
gb_classifier = GradientBoostingClassifier()

# 定义参数空间
param_dist = {
    'n_estimators': randint(50, 200),  # 决策树数量
    'max_depth': randint(3, 10),        # 树的最大深度
    'min_samples_split': randint(2, 20), # 内部节点再划分所需最小样本数
    'min_samples_leaf': randint(1, 10),  # 叶子节点最少样本数
    'learning_rate': [0.01, 0.05, 0.1, 0.2, 0.3] # 学习率
}

# 使用随机搜索进行参数调优
random_search = RandomizedSearchCV(gb_classifier, param_distributions=param_dist, n_iter=50, cv=5, scoring='accuracy', n_jobs=-1)

# 在训练数据上拟合模型
random_search.fit(X_train_tfidf, y_train)

# 输出最佳参数
print("Best parameters found: ", random_search.best_params_)

# 输出最佳得分
print("Best score found: ", random_search.best_score_)

# 使用最佳参数的模型进行预测
best_gb_classifier = random_search.best_estimator_
y_pred = best_gb_classifier.predict(X_test_tfidf)

print("Classification Report:")
print(classification_report(y_test, y_pred))
print("Accuracy:", accuracy_score(y_test, y_pred))                                                                                                                                                 

Best parameters found:  {'learning_rate': 0.2, 'max_depth': 6, 'min_samples_leaf': 7, 'min_samples_split': 6, 'n_estimators': 162}
Best score found:  0.9805002894381392
Classification Report:
              precision    recall  f1-score   support

           0       0.98      1.00      0.99      3110
           1       0.89      0.69      0.78       160

    accuracy                           0.98      3270
   macro avg       0.94      0.84      0.88      3270
weighted avg       0.98      0.98      0.98      3270

Accuracy: 0.9807339449541285
