In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer
from sklearn import svm

## 读取数据，只用到了词信息

In [2]:
column = "word_seg"

In [None]:
train_df = pd.read_csv('train_set.csv.zip',usecols=['id',column,'class'], index_col='id')
vec = CountVectorizer(ngram_range=(1, 2), min_df=3, max_df=0.98, binary=True)
vec.fit(train_df[column])
train_feather_names = vec.get_feature_names()
del vec

In [None]:
test_df = pd.read_csv('test_set.csv.zip',usecols=['id','word_seg'], index_col='id')
vec = CountVectorizer(ngram_range=(1, 2), min_df=3, max_df=0.98,  binary=True)
vec.fit(test_df[column])
test_feather_names = vec.get_feature_names()
del vec

## 选出训练集与测试集都存在的特征，防止把分类的“注意力”放在没用的特征上

In [None]:
valide_feather_name = list(set(train_feather_names+test_feather_names))

In [9]:
len(valide_feather_name)

3702540

## 尝试改进TFIDF
### 个人认为TFIDF突出的是相对于整个语料的词特异度，而不能反应类别的特征。所以想根据类别把语料聚合起来，然后再计算idf值。事实证明这样做是有效的。大概在A榜上从0.777+提升到了0.7804。但是值得注意的是在做stack的时候，这里很容易产生leak，也可能就是这个原因，我做的stack效果奇差无比。



In [15]:
join_df = train_df[['word_seg','class']].groupby('class').agg(lambda ele:' '.join(ele))

In [16]:
vec = TfidfVectorizer(ngram_range=(1, 2), use_idf=1, smooth_idf=1, 
                           sublinear_tf=1,vocabulary=valide_feather_name)

In [17]:
vec.fit(join_df['word_seg'])

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 2), norm='l2', preprocessor=None, smooth_idf=1,
        stop_words=None, strip_accents=None, sublinear_tf=1,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=1,
        vocabulary=['816903 502544', '617854 472396', '447986 572782', '1195450 851011', '1226448 126663', '701424 657515', '566705 960615', '591310 886801', '933052 937565', '192766 378766', '905676 35621', '520477 1041972', '530030 328314', '816903 824325', '1171925 916922', '1163183 1259103', '100833 895...8682 342847', '353916 567228', '907326 859563', '912149 716116', '1090389 1164235', '520477 572373'])

In [None]:
train_X = vec.transform(train_df[column])
train_Y = train_df['class']

In [None]:
test_X = vec.transform(test_df[column])

## 因为测试到数据在训练集上严重过拟合，所以增大误差容许范围，尝试降低过拟合，这样使成绩从0.7799提升到0.7804

In [None]:
from sklearn import svm
clf = svm.LinearSVC(tol=1, class_weight='balanced')
clf.fit(train_X,train_Y)
pred = clf.predict(test_X)

In [None]:
out_df = pd.DataFrame(index=test_df.index)

In [None]:
out_df['class']=pred

In [None]:
out_df.to_csv('submit.csv',index=True, header=True)