In [1]:
import os
import pandas as pd
from sklearn import svm
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
#: Load the raw data
def load_raw_data(data_dir = '.'):
    """Read the train/test CSV files from ``data_dir`` and drop unused columns.

    Parameters
    ----------
    data_dir : str
        Directory that contains ``train_set.csv`` and ``test_set.csv``.

    Returns
    -------
    (df_train, df_test) : tuple of pandas.DataFrame
        The training frame without its ``article`` and ``id`` columns, and the
        test frame without its ``article`` column (its ``id`` column is kept).

    Raises
    ------
    KeyError
        Propagated from ``DataFrame.drop`` when a column to drop is missing.
    """
    def _read(file_name, unused_columns):
        # Read one CSV from data_dir and return a copy without the unused columns.
        frame = pd.read_csv(os.path.join(data_dir, file_name))
        return frame.drop(columns = unused_columns)

    df_train = _read('train_set.csv', ['article', 'id'])
    df_test = _read('test_set.csv', ['article'])
    return df_train, df_test

In [3]:
#: Feature extraction
def extract_features(raw_train, raw_test):
    """Turn the ``word_seg`` text column into TF-IDF feature matrices.

    The vectorizer is fitted on the training text only and then applied to
    the test text, so both matrices share one vocabulary.

    Parameters
    ----------
    raw_train : DataFrame-like with ``word_seg`` and ``class`` columns.
    raw_test : DataFrame-like with a ``word_seg`` column.

    Returns
    -------
    (x_train, y_train, x_test, y_test)
        ``x_train`` / ``x_test`` are the vectorizer outputs, ``y_train`` is
        ``raw_train['class'] - 1`` and ``y_test`` is always ``None`` (the test
        labels are what the model has to predict).
    """
    vectorizer = TfidfVectorizer(
        ngram_range = (1,2),    # unigrams and bigrams
        min_df = 3,             # ignore terms seen in fewer than 3 documents
        max_df = 0.9,           # ignore terms seen in more than 90% of documents
        max_features = 100000)  # cap the vocabulary size

    # fit_transform learns the vocabulary/idf and vectorizes the training text
    # in one pass, instead of processing the corpus twice with fit() + transform().
    x_train = vectorizer.fit_transform(raw_train['word_seg'])
    x_test = vectorizer.transform(raw_test['word_seg'])

    y_train = raw_train['class'] - 1 # shift labels by one so they start from 0
    y_test = None # to predict

    return x_train, y_train, x_test, y_test

In [None]:
print ('开始训练...')
print ('-[stage I] Loading Data ...')
%time raw_train, raw_test = load_raw_data('.')

print ('-[stage Ⅱ] Extracting Features ...')
%time x_train, y_train, x_test, y_test = extract_features(raw_train, raw_test)
print ('-[stage Ⅲ] Traing and Predicting ...')
%time
clf = svm.SVC(kernel='rbf', C=10, gamma=0.001, verbose=0) # 初始化一个分类器
clf.fit(x_train, y_train) # 训练这个分类器

# 根据上面训练好的分类器对测试集的每个样本进行预测
y_test = clf.predict(x_test)

In [None]:
# Build the submission: add 1 to every predicted label (undoing the -1 shift
# applied to the training labels) and store it as the 'class' column.
raw_test['class'] = [label + 1 for label in y_test.tolist()]

# Keep only the id/class pair and write it to a local CSV file.
raw_result = raw_test[['id', 'class']]
raw_result.to_csv('./result.csv', index = False)