In [1]:
import pandas as pd
import numpy as np
import jieba
import jieba.posseg as pseg
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB
from sklearn import svm
from sklearn.model_selection import train_test_split  
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
import matplotlib.pyplot as plt
from sklearn.model_selection import GridSearchCV


In [2]:
# Load the stop-word file into a single 'stop' column and de-duplicate the entries.
# A `global` statement has no effect at module scope, so the name is simply assigned.
# The segmentation helpers defined below bind this list as their default `stopwords`
# argument at definition time.
stop_list = list(set(pd.read_table('chinese-stopword.txt', sep='\t', names=['stop'])['stop']))

In [3]:
def is_chinese(uchar):
    """Return True when ``uchar`` is made up only of CJK ideographs.

    Every character must lie in the range U+4E00..U+9FA5. Comparing the
    whole string against the range bounds is lexicographic, so for a
    multi-character token only its leading character would decide the
    result; checking character by character avoids that.

    Args:
        uchar: a single character or a multi-character token.

    Returns:
        bool: True if the string is non-empty and every character is in
        the range above; False otherwise (including for empty input).
    """
    if not uchar:
        # An empty token carries no Chinese text.
        return False
    return all(u'\u4e00' <= ch <= u'\u9fa5' for ch in uchar)

In [4]:
def get_data_df(neg_path='neg.txt', pos_path='pos.txt', stop_path='chinese-stopword.txt'):
    """Load the negative/positive sample frames and the stop-word list.

    The sample files are read as tab-separated text without a header row and
    given the column names ``label`` and ``chat``. The stop-word file is read
    into a single ``stop`` column and de-duplicated.

    Args:
        neg_path: path of the negative-sample file (default ``'neg.txt'``).
        pos_path: path of the positive-sample file (default ``'pos.txt'``).
        stop_path: path of the stop-word file (default ``'chinese-stopword.txt'``).

    Returns:
        tuple: ``(neg_df, pos_df, stop_list)`` where the first two items are
        DataFrames with columns ``label`` and ``chat`` and the last is a list
        of unique stop-word entries (order not guaranteed, it comes from a set).
    """
    sample_columns = ['label', 'chat']
    neg_df = pd.read_table(neg_path, sep='\t', names=sample_columns)
    pos_df = pd.read_table(pos_path, sep='\t', names=sample_columns)
    stop_df = pd.read_table(stop_path, sep='\t', names=['stop'])
    stop_list = list(set(stop_df['stop']))
    return neg_df, pos_df, stop_list

In [5]:
def jieba_getdata(str_str,stopwords=stop_list):
    """Segment ``str_str`` with jieba and return the kept tokens joined by spaces.

    A token is dropped when it appears in ``stopwords`` or when
    ``is_chinese`` does not return True for it.

    Args:
        str_str: the text to segment.
        stopwords: collection of tokens to discard; defaults to the
            module-level ``stop_list`` as it was when this function was defined.

    Returns:
        str: surviving tokens in their original order, separated by single
        spaces (an empty string when nothing survives).
    """
    kept_tokens = []
    for token in jieba.lcut(str_str):
        if token in stopwords:
            # stop-word filter
            continue
        if is_chinese(token) is not True:
            # non-Chinese filter
            continue
        kept_tokens.append(token)
    return ' '.join(kept_tokens)

In [6]:
def jieba_getdata_adjective(str_str,stopwords=stop_list):
    """Return the tokens of ``str_str`` tagged ``'a'`` by jieba, joined by spaces.

    The text is run through jieba's part-of-speech tagger; only tokens whose
    flag equals ``'a'`` are considered. Of those, tokens in ``stopwords`` and
    tokens for which ``is_chinese`` does not return True are dropped.

    Args:
        str_str: the text to segment and tag.
        stopwords: collection of tokens to discard; defaults to the
            module-level ``stop_list`` as it was when this function was defined.

    Returns:
        str: surviving tokens in their original order, separated by single spaces.
    """
    kept_tokens = []
    for tagged in pseg.cut(str_str):
        if tagged.flag != 'a':
            # keep only tokens carrying the 'a' part-of-speech flag
            continue
        word = tagged.word
        if word in stopwords or is_chinese(word) is not True:
            continue
        kept_tokens.append(word)
    return ' '.join(kept_tokens)

In [7]:
def get_cut_word(method_select):
    """Load both sample frames and replace their ``chat`` text with segmented text.

    Args:
        method_select: callable applied to each chat string, e.g.
            ``jieba_getdata`` or ``jieba_getdata_adjective``.

    Returns:
        tuple: ``(neg_df, pos_df)``; in each frame the ``chat`` column has been
        removed and a ``cut`` column holds the result of ``method_select``.
    """
    # The stop-word list returned by get_data_df() is not needed here.
    neg_df, pos_df, _ = get_data_df()
    for frame in (neg_df, pos_df):
        # Cast to str first so every value reaches method_select as a string.
        frame['cut'] = frame['chat'].map(str).apply(method_select)
        del frame['chat']
    return neg_df, pos_df

In [8]:
def get_xdata_label(method_select):
    """Collect the non-empty segmented texts and their labels from both classes.

    Args:
        method_select: segmentation callable forwarded to ``get_cut_word``.

    Returns:
        tuple: ``(xdata, ylabel)`` — two parallel lists holding, for every row
        whose segmented text is not the empty string, the text (second column)
        and the label (first column). Negative rows come first, then positive.
    """
    neg_df, pos_df = get_cut_word(method_select)
    combined = pd.concat([neg_df, pos_df])
    texts = combined.iloc[:, 1].values   # second column: segmented text
    labels = combined.iloc[:, 0].values  # first column: label
    # Rows whose text became empty after filtering are skipped.
    kept = [(text, label) for text, label in zip(texts, labels) if text != '']
    xdata = [text for text, _ in kept]
    ylabel = [label for _, label in kept]
    return xdata, ylabel

In [9]:
# Load the negative/positive sample frames and the stop-word list.
# Rebinding stop_list here does not change the default `stopwords` already bound
# in jieba_getdata / jieba_getdata_adjective when they were defined.
neg_df,pos_df,stop_list = get_data_df() # fetch positive and negative sample data

In [10]:
# Build the parallel text/label lists; get_xdata_label reloads the sample files itself.
xdata,ylabel=get_xdata_label(jieba_getdata) # segment with jieba

Building prefix dict from the default dictionary ...
Loading model from cache /var/folders/_t/wschnxms2rlgr_txwwx6p1g40000gn/T/jieba.cache
Loading model cost 1.440 seconds.
Prefix dict has been built succesfully.


In [11]:
# Term counts as features
count_vec = CountVectorizer()  # term-count vectorizer
# The vocabulary is fitted on ALL samples before the split.
# NOTE(review): the held-out rows therefore influence the vocabulary — confirm this is acceptable.
X_count_train = count_vec.fit_transform(xdata)  # document-term count matrix
# A fixed random_state makes the 90/10 split, and every metric computed from it,
# reproducible across kernel restarts.
X_train_count, X_test_count, y_train_count, y_test_count = train_test_split(
    X_count_train, ylabel, test_size=0.1, random_state=42)

In [12]:
# X_count_train.toarray()[0]

In [13]:
# TF-IDF weights as features
tfidf_vec = TfidfVectorizer()  # TF-IDF vectorizer
# The vocabulary and IDF weights are fitted on ALL samples before the split.
X_tfidf_train = tfidf_vec.fit_transform(xdata)
# A fixed random_state makes the 75/25 split reproducible across kernel restarts.
# NOTE(review): test_size here (0.25) differs from the count-feature split (0.1), so the
# two models are evaluated on different held-out rows — confirm this is intended.
X_train_tfidf, X_test_tfidf, y_train_tfidf, y_test_tfidf = train_test_split(
    X_tfidf_train, ylabel, test_size=0.25, random_state=42)

In [14]:
# Create the models: one linear-kernel SVC per feature representation.
# probability=True is required for the predict_proba calls used later for the ROC curves.
SVM_count = svm.SVC(kernel='linear',probability=True)
SVM_tfidf = svm.SVC(kernel='linear',probability=True)

In [None]:
# Fit each SVM on its own training split (raw counts and TF-IDF) ahead of evaluation
SVM_count.fit(X_train_count,y_train_count)
SVM_tfidf.fit(X_train_tfidf,y_train_tfidf)

In [None]:
# Take the second column of predict_proba on the test split for each model.
# NOTE(review): presumably the positive-class probability — confirm against SVM_*.classes_.
y_pred_count_svm = SVM_count.predict_proba(X_test_count)[:,1]
fpr_count_svm, tpr_count_svm, _ = roc_curve(y_test_count, y_pred_count_svm) # ROC curve points

y_pred_tfidf_svm = SVM_tfidf.predict_proba(X_test_tfidf)[:,1]
fpr_tfidf_svm, tpr_tfidf_svm, _ = roc_curve(y_test_tfidf, y_pred_tfidf_svm) # ROC curve points

In [None]:
# ROC curves for both feature representations, computed on the held-out test splits
plt.figure(1)
plt.plot([0, 1], [0, 1], 'k--')  # chance diagonal for reference
plt.plot(fpr_count_svm, tpr_count_svm, label='SVM count')
plt.plot(fpr_tfidf_svm, tpr_tfidf_svm, label='SVM tfidf')
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.title('ROC curve')
plt.legend(loc='best')
plt.show()
# These AUC values are computed from y_test_* and the test-set probabilities,
# so they are reported as test scores.
print("AUC SVM COUNT Score (Test): %f" % roc_auc_score(y_test_count, y_pred_count_svm))
print("AUC SVM TFIDF Score (Test): %f" % roc_auc_score(y_test_tfidf, y_pred_tfidf_svm))


In [15]:
# Training on the full vocabulary is very slow, which also makes hyper-parameter tuning
# impractical, so keep only the k features with the highest chi-squared score.
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
print(X_train_count.shape)
# Each feature matrix gets its OWN selector. fit() returns the estimator itself, so fitting
# one shared SelectKBest twice would bind both names to the same object, fitted last on the
# TF-IDF data, and the count matrices would then be reduced with the TF-IDF feature mask.
select_model_count = SelectKBest(chi2, k=1500).fit(X_train_count, y_train_count)
select_model_tfidf = SelectKBest(chi2, k=1500).fit(X_train_tfidf, y_train_tfidf)

# Selectors are fitted on the training split only and then applied to both splits.
X_train_count=select_model_count.transform(X_train_count)
X_test_count=select_model_count.transform(X_test_count)

X_train_tfidf=select_model_tfidf.transform(X_train_tfidf)
X_test_tfidf=select_model_tfidf.transform(X_test_tfidf)

(8907, 25803)


In [16]:
# Show the count training matrix shape after selection with k=1500
X_train_count.shape

(8907, 1500)

In [17]:
# Model behaviour after feature selection: refit both SVMs on the reduced training matrices
SVM_count.fit(X_train_count,y_train_count)
SVM_tfidf.fit(X_train_tfidf,y_train_tfidf)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=True, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [None]:
# Automatic hyper-parameter tuning with grid search: try three values of C for the
# linear-kernel SVC, using 3-fold cross-validation on the count features.
tuned_parameters = [
    {
        'kernel': ['linear'],
        'C': [0.1, 1, 10],
        'probability': [True],
    }
]
grid_search = GridSearchCV(SVM_count, tuned_parameters, n_jobs=4, cv=3)
grid_search.fit(X_train_count, y_train_count)
# Print every parameter of the best estimator found.
best_parameters = grid_search.best_estimator_.get_params()
for para in list(best_parameters):
    val = best_parameters[para]
    print(para, val)

In [None]:
# Rebuild both models with the new parameter (C=1) and fit them on the
# feature-selected training matrices.
SVM_count = svm.SVC(kernel='linear',probability=True,C=1)
SVM_tfidf = svm.SVC(kernel='linear',probability=True,C=1)
SVM_count.fit(X_train_count,y_train_count)
SVM_tfidf.fit(X_train_tfidf,y_train_tfidf)

In [None]:
# Recompute the test-split probabilities and ROC points with the refitted models.
# NOTE(review): column 1 of predict_proba is presumably the positive class — confirm via classes_.
y_pred_count_svm = SVM_count.predict_proba(X_test_count)[:,1]
fpr_count_svm, tpr_count_svm, _ = roc_curve(y_test_count, y_pred_count_svm) # ROC curve points

y_pred_tfidf_svm = SVM_tfidf.predict_proba(X_test_tfidf)[:,1]
fpr_tfidf_svm, tpr_tfidf_svm, _ = roc_curve(y_test_tfidf, y_pred_tfidf_svm) # ROC curve points

In [None]:
# ROC curves for the refitted models, computed on the held-out test splits
plt.figure(1)
plt.plot([0, 1], [0, 1], 'k--')  # chance diagonal for reference
plt.plot(fpr_count_svm, tpr_count_svm, label='SVM count')
plt.plot(fpr_tfidf_svm, tpr_tfidf_svm, label='SVM tfidf')
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.title('ROC curve')
plt.legend(loc='best')
plt.show()
# These AUC values are computed from y_test_* and the test-set probabilities,
# so they are reported as test scores.
print("AUC SVM COUNT Score (Test): %f" % roc_auc_score(y_test_count, y_pred_count_svm))
print("AUC SVM TFIDF Score (Test): %f" % roc_auc_score(y_test_tfidf, y_pred_tfidf_svm))


In [None]:
# Save the count-based classifier together with the vectorizer and the feature selector.
try:
    # sklearn.externals.joblib was removed from scikit-learn; joblib is a standalone package.
    import joblib
except ImportError:
    # Fallback for old scikit-learn installations that still vendor joblib.
    from sklearn.externals import joblib
model_save=[SVM_count,count_vec,select_model_count]
joblib.dump(model_save, 'SVM_COUNT_CLASSIFICATION.pkl')