In [3]:
import re
import numpy as np
import pandas as pd
from collections import Counter
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer as TF
from sklearn.naive_bayes import MultinomialNB as MNB
from sklearn.linear_model import LogisticRegression as LR
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.model_selection import cross_val_score
from conf import config

In [4]:
def _english_stop_words():
    """Return the NLTK English stop words as a frozenset, loading them only once."""
    cached = getattr(_english_stop_words, "_cached", None)
    if cached is None:
        cached = frozenset(stopwords.words("english"))
        _english_stop_words._cached = cached
    return cached


def clean_text(origin_text):
    """Normalise one raw (possibly HTML) text into space-separated lowercase words.

    Steps: strip HTML tags, replace every character outside a-z/A-Z with a
    space, lowercase, split on whitespace, drop English stop words, and join
    the remaining words with single spaces.

    Args:
        origin_text: Raw text, may contain HTML markup.

    Returns:
        The cleaned text as a single string (empty if nothing survives).
    """
    # Strip HTML tags. The parser is named explicitly so the result does not
    # depend on which optional parsers are installed, and bs4 does not warn
    # about having to guess one.
    text = BeautifulSoup(origin_text, "html.parser").get_text()
    # Replace punctuation, digits and any other non-letter character with a space.
    text = re.sub("[^a-zA-Z]", " ", text)
    # Lowercase everything and tokenise on whitespace.
    words = text.lower().split()
    # Drop stop words; the list is loaded once and reused across calls, since
    # this function is applied to every row of the data.
    stop_words = _english_stop_words()
    # Turn the remaining tokens back into a single str.
    return " ".join(w for w in words if w not in stop_words)

In [5]:
# Read the training and test CSV files into DataFrames.
train_df, test_df = (pd.read_csv(csv_path)
                     for csv_path in ('../data/train.csv', '../data/test.csv'))
# Preview the first training rows.
train_df.head()

Unnamed: 0,title,location,department,salary_range,company_profile,description,requirements,benefits,telecommuting,has_company_logo,has_questions,employment_type,required_experience,required_education,industry,function,fraudulent
0,Office Manager,"GR, I, Athens",Administration,,Workable is a venture-backed startup making cl...,We are looking for an experienced office manag...,"Excellent use of English, speaking and writing...",Our goal is to create a company where employee...,0,1,1,,,,,,0
1,Front End Engineer,"US, CA, Emeryville",Engineering,,Abakus is a software platform built on game th...,As Front End Engineer you will be leading the ...,3-5 years of UI/UX development experience with...,Competitive salaryStock optionsComprehensive b...,0,1,1,Full-time,Mid-Senior level,,Computer Software,Engineering,0
2,Software Engineer,"US, MA, Wilmington",,0-130000,,Our client is one of the leading SaaS-based pr...,At least 5 years experience developing large-s...,,0,0,0,Full-time,,Bachelor's Degree,,,0
3,Product Manager,"US, CA, Manhattan Beach",,,The Boston Consulting Group (#URL_45423e1e1670...,We are looking for a Product Manager to be a p...,BASIC JOB REQUIREMENTS:Bachelors Degree form a...,The Boston Consulting Group (BCG) is a global ...,0,1,0,Full-time,Associate,Bachelor's Degree,Computer Software,Product Management,0
4,Sales Intern,"US, NY, New York",Sales,,,"Lean Startup Machine has trained over 25,000 a...",Experience in sales preferredInterest in tech ...,Be part of a growing and global team that has ...,0,0,1,Part-time,Internship,Some College Coursework Completed,Management Consulting,Sales,0


In [18]:
def test(row):
    columns = ['company_profile', 'description', 'requirements', 'benefits']
    ans = ''
    for column in columns:
        if not pd.isnull(row[column]):
                ans += row[column]
    return ans    

In [19]:
# Build the combined 'text' column on both frames by running `test` over every row.
for frame in (train_df, test_df):
    frame['text'] = frame.apply(test, axis=1)

In [21]:
# Replace the raw combined text with its cleaned form on both frames.
for frame in (train_df, test_df):
    frame['text'] = frame['text'].apply(clean_text)

In [24]:
# Spot-check the cleaned text of the first few test rows.
test_df['text'].head()

0    long term well established optometric practice...
1    full service marketing staffing firm serving c...
2    globally connected world forced businesses ret...
3    indicative changing way internet business make...
4    eroad established modernise new zealand paper ...
Name: text, dtype: object

In [28]:
# Shuffle the training rows. The fixed seed keeps the row order reproducible
# across kernel restarts, so anything downstream that depends on row order
# gives the same result on every run.
train_df = train_df.sample(frac=1, random_state=0).reset_index(drop=True)
tfidf = TF(analyzer="word",
           tokenizer=None,
           preprocessor=None,
           stop_words=None,
           max_features=5000)

# Vectorise the text: learn the vocabulary/IDF weights on the training text
# only, then reuse the fitted vectoriser for the test text.
print("Creating the tfidf vector...\n")
# fit_transform is the single-pass equivalent of fit() followed by transform().
# NOTE(review): .toarray() materialises dense matrices, which is memory-heavy
# at this size; kept so x_train/x_test stay ndarrays for the cells that follow.
x_train = tfidf.fit_transform(train_df['text']).toarray()
x_test = tfidf.transform(test_df['text']).toarray()

print(x_train.shape)
print(x_test.shape)

Creating the tfidf vector...

(17680, 5000)
(200, 5000)


In [30]:
# Target vector: the `fraudulent` column of the (shuffled) training frame.
y_train = train_df["fraudulent"]
# Disabled hold-out split, kept for reference.
# NOTE(review): the unpacking targets `fraudulent, y_cal` look like typos for
# `y_train, y_val` — confirm before re-enabling.
# x_train, x_val, fraudulent, y_cal = train_test_split(x_train, y_train, test_size=0.2, random_state=0)

In [31]:
# Class balance of the target: number of training rows per `fraudulent` value.
print(Counter(y_train))

Counter({0: 16914, 1: 766})


In [32]:
# Logistic regression (liblinear solver) fitted on the full training matrix.
model = LR(solver='liblinear').fit(x_train, y_train)
# Display the fitted estimator and its parameters.
model

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='liblinear', tol=0.0001, verbose=0,
                   warm_start=False)

In [36]:
# 10-fold cross-validated accuracy of the classifier on the training data.
# NOTE(review): about 95.7% of the training rows are class 0 (16914 of 17680),
# so accuracy alone is an optimistic metric here; consider also reporting
# recall/F1 for the fraudulent class.
print("10折交叉验证：")
cv_scores = cross_val_score(model, x_train, y_train, cv=10, scoring="accuracy")
print(np.mean(cv_scores))

10折交叉验证：
0.9743212669683258


In [35]:
# Predict a label for every test row and write them out as (id, pred) pairs,
# with ids starting at 1 and no header or index column in the CSV.
preds = model.predict(x_test)
submission = pd.DataFrame({
    'id': range(1, len(preds) + 1),
    'pred': preds,
})
submission.to_csv("../data/ml_submission.csv", index=False, header=False)
# Preview the first few submission rows.
submission.head()

Unnamed: 0,id,pred
0,1,0
1,2,1
2,3,0
3,4,0
4,5,0
