In [None]:
import sklearn
from sklearn import datasets
from sklearn import svm
import random
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import metrics
import numpy
import pandas as pd
from sklearn.model_selection import StratifiedShuffleSplit

def inputdata(filename):
    """Load the text inputs and their labels from a CSV file.

    The file is decoded as latin-1 and must contain the columns
    ``x_input`` (the samples) and ``y_label`` (the targets).

    Returns:
        A ``(x, y)`` tuple of the two columns as pandas Series.
    """
    frame = pd.read_csv(filename, encoding='latin-1')
    return frame['x_input'], frame['y_label']

def splitset(x, y, test_size=0.2):
    """Split samples and labels into one stratified train/test partition.

    Args:
        x: Samples (pandas Series/DataFrame, numpy array or plain sequence).
        y: Class labels aligned with ``x``; used for stratification.
        test_size: Fraction (or absolute count) of samples held out for
            testing, forwarded to ``StratifiedShuffleSplit``.

    Returns:
        ``(X_train, y_train, X_test, y_test)``. pandas inputs stay pandas
        objects; anything else is returned as a numpy array.
    """
    def _take(data, positions):
        # The splitter yields *positions*, not index labels, so pandas
        # objects must be sliced with .iloc; plain [] would look the
        # numbers up as labels and fail on a non-default index.
        if hasattr(data, 'iloc'):
            return data.iloc[positions]
        return numpy.asarray(data)[positions]

    # random_state is fixed so the partition is reproducible between runs.
    sss = StratifiedShuffleSplit(n_splits=1, test_size=test_size, random_state=0)

    # Split the arguments that were passed in (the previous code read a
    # module-level ``X`` here and ignored the ``x`` parameter entirely).
    # n_splits=1, so the generator yields exactly one pair of index arrays.
    train_index, test_index = next(sss.split(x, y))
    print("TRAIN:", train_index, "TEST:", test_index)

    return (
        _take(x, train_index),
        _take(y, train_index),
        _take(x, test_index),
        _take(y, test_index),
    )


def tfvectorize(X_train, X_test):
    """Turn raw training and test documents into TF-IDF feature matrices.

    The vectorizer is fitted on ``X_train`` only and then applied to
    ``X_test``, so both matrices share the vocabulary learned from the
    training documents. English stop words are removed and undecodable
    bytes are ignored.

    Returns:
        ``(train_tfidf, test_tfidf)``.
    """
    vectorizer = TfidfVectorizer(stop_words='english', decode_error='ignore')
    train_matrix = vectorizer.fit_transform(X_train)
    return train_matrix, vectorizer.transform(X_test)


# Report precision, recall and accuracy of the predictions.
def evaluate(actual, pred):
    """Print macro-averaged precision and recall, plus accuracy.

    Args:
        actual: Ground-truth labels.
        pred: Predicted labels, aligned with ``actual``.

    Nothing is returned; the three scores are written to stdout in the
    order precision, recall, accuracy.
    """
    accuracy = metrics.accuracy_score(actual, pred)
    precision = metrics.precision_score(actual, pred, average='macro')
    recall = metrics.recall_score(actual, pred, average='macro')

    report = (
        ('precision:{}', precision),
        ('recall:{}', recall),
        ('acc:{}', accuracy),
    )
    for template, score in report:
        print(template.format(score))

# Build and fit the SVM classifier.
def train_clf(train_data, train_tags):
    """Fit an RBF-kernel support vector classifier and return it.

    Args:
        train_data: Feature matrix for the training samples.
        train_tags: Class labels for the training samples.

    Returns:
        The fitted ``svm.SVC`` instance (created with ``probability=True``).
    """
    settings = dict(
        kernel='rbf',
        C=10000,
        gamma='auto',
        cache_size=200,
        shrinking=True,
        probability=True,
    )
    # SVC.fit returns the estimator itself, so the fitted model is returned.
    return svm.SVC(**settings).fit(train_data, train_tags)

# End-to-end run: load data, split, vectorize, train the SVM, evaluate.
if __name__ == '__main__':
    # Load the samples and labels from a hard-coded local CSV path.
    X,y = inputdata(r'F:\post graduate\raw_data\svm_total.csv')

    # Hold out 20% of the data as the test set.
    X_train,y_train,X_test,y_test = splitset(X, y, 0.2)

    # Convert the raw text of both partitions into TF-IDF features.
    train_tfidf,test_tfidf = tfvectorize(X_train,X_test)

    # Fit the classifier on the training features and labels.
    clf = train_clf(train_tfidf,y_train)

    # Predict labels for the held-out test features.
    y_pred =  clf.predict(test_tfidf)

    # Print precision, recall and accuracy for the test predictions.
    evaluate(y_test,y_pred)

TRAIN: [152221  48481  47085 ...  33320 197930  15665] TEST: [ 66135 104912   1633 ... 102487 103605  38552]
