In [1]:
import re
import numpy as np
import pandas as pd
from collections import Counter
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from imblearn.over_sampling import SMOTE
from sklearn.feature_extraction.text import TfidfVectorizer as TF
from sklearn.naive_bayes import MultinomialNB as MNB
from sklearn.linear_model import LogisticRegression as LR
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.model_selection import cross_val_score
from conf import config

Using TensorFlow backend.


In [2]:
def clean_text(origin_text):
    # 去掉html标签
    text = BeautifulSoup(origin_text).get_text()
    # 去掉标点符号和非法字符
    text = re.sub("[^a-zA-Z]", " ", text)
    # 将字符全部转化为小写，并通过空格符进行分词处理
    words = text.lower().split()
    # 去停用词
    stop_words = set(stopwords.words("english"))
    meaningful_words = [w for w in words if w not in stop_words]
    # 将剩下的词还原成str类型
    cleaned_text = " ".join(meaningful_words)
    return cleaned_text

In [3]:
# train = pd.read_csv(config.train_path)
# test = pd.read_csv(config.test_path)
train = pd.read_csv(config.origin_train_path, sep='\t', header=None, names=['label', 'text'])
test = pd.read_csv(config.origin_test_path, sep='\t', header=None, names=['text'])
train.head()

Unnamed: 0,label,text
0,ham,Ü collecting ur laptop then going to configure...
1,ham,"Sorry, I can't text &amp; drive coherently, se..."
2,spam,PRIVATE! Your 2003 Account Statement for shows...
3,ham,What's up. Do you want me to come online?
4,ham,The guy did some bitching but I acted like i'd...


In [4]:
train['text'] = train['text'].apply(lambda x: clean_text(x))
test['text'] = test['text'].apply(lambda x: clean_text(x))

In [5]:
train['text'].head()

0    collecting ur laptop going configure da settin...
1               sorry text drive coherently see twenty
2    private account statement shows un redeemed po...
3                                     want come online
4    guy bitching acted like interested buying some...
Name: text, dtype: object

In [6]:
train = train.sample(frac=1).reset_index(drop=True)
tfidf = TF(analyzer="word",
           tokenizer=None,
           preprocessor=None,
           stop_words=None,
           max_features=5000)

# 数据向量化
print("Creating the tfidf vector...\n")
tfidf.fit(train['text'])
x_train = tfidf.transform(train['text'])
x_train = x_train.toarray()

x_test = tfidf.transform(test['text'])
x_test = x_test.toarray()

print(x_train.shape)
print(x_test.shape)

Creating the tfidf vector...

(3448, 5000)
(1672, 5000)


In [7]:
y_train = train["label"]
# x_train, x_val, y_train, y_cal = train_test_split(x_train, y_train, test_size=0.2, random_state=0)

In [8]:
print(Counter(y_train))

Counter({'ham': 3041, 'spam': 407})


In [9]:
smo = SMOTE(random_state=0)
x_smo, y_smo = smo.fit_sample(x_train, y_train)
print(Counter(y_smo))

Counter({'ham': 3041, 'spam': 3041})


In [10]:
model = LR(solver='liblinear')
# model.fit(x_train, y_train)
model.fit(x_smo, y_smo)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='liblinear', tol=0.0001, verbose=0,
                   warm_start=False)

In [11]:
print("10折交叉验证：")
# print(np.mean(cross_val_score(model, x_train, train["sentiment"], cv=10, scoring="accuracy")))
print(np.mean(cross_val_score(model, x_train, y_train, cv=10, scoring="roc_auc")))

10折交叉验证：
0.9843684943785643


In [12]:
# preds = model.predict(x_test)

In [13]:
preds = model.predict(x_test)
submission = pd.DataFrame({'id': range(len(preds)), 'pred': preds})
submission['id'] = submission['id'] + 1
submission.to_csv("../data/ml_submission.csv", index=False, header=False)
submission.head()

Unnamed: 0,id,pred
0,1,ham
1,2,ham
2,3,ham
3,4,ham
4,5,ham


In [18]:
from sklearn.metrics import accuracy_score
answer = pd.read_csv('../data/origin_data/answer.csv')
print('%.2f' % (accuracy_score(answer.label, submission.pred) * 100))

97.07
