In [3]:
# -*- coding: utf-8 -*-

import pandas as pd
from sklearn.svm import LinearSVC
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
import time

print("开始......")
t_start = time.time()

# Load the labelled training set and the unlabelled test set.
df_train = pd.read_csv('C:/daguan_text_classification/new_data/train_set.csv')
df_test = pd.read_csv('C:/daguan_text_classification/new_data/test_set.csv')
# Stack both frames so the TF-IDF vocabulary below can be built from all text.
df_all = pd.concat(objs=[df_train, df_test], axis=0, sort=True)
print('data preparation is done')

# TF-IDF features over unigrams and bigrams of the 'word_seg' column.
# The on/off options are passed as real booleans: scikit-learn validates them
# as booleans and newer releases reject integer stand-ins such as 1.
vectorizer = TfidfVectorizer(
    ngram_range=(1, 2),
    min_df=3,
    max_df=0.9,
    use_idf=True,
    smooth_idf=True,
    sublinear_tf=True,
)
# NOTE(review): vocabulary and IDF weights are fitted on train + test text
# together, and before the validation split below, so the validation score is
# computed on rows the vectorizer has already seen.
vectorizer.fit(df_all['word_seg'])
x_train = vectorizer.transform(df_train['word_seg'])
x_test = vectorizer.transform(df_test['word_seg'])
# Shift labels down by one for training; predictions are shifted back up by
# one before they are written out.
y_train = (df_train['class'] - 1).values
print("特征工程结束！")

# Hold out 10% of the training rows for validation (fixed seed).
x_train, x_vali, y_train, y_vali = train_test_split(
    x_train, y_train, test_size=0.1, random_state=0
)

print('training begins')
classifier = LinearSVC(verbose=3)
classifier.fit(x_train, y_train)
print('training done')

# Macro-averaged F1 on the held-out rows.
pre_vali = classifier.predict(x_vali)
score_vali = f1_score(y_vali, pre_vali, average='macro')
print("验证集分数：{}".format(score_vali))

# Predict the test set and restore the original label numbering.
y_test = classifier.predict(x_test)

df_test['class'] = y_test.tolist()
df_test['class'] = df_test['class'] + 1
df_result = df_test.loc[:, ['id', 'class']]
df_result.to_csv('C:/daguan_text_classification/results/linearsvc2.csv', index=False)

t_end = time.time()
print("训练结束，耗时:{}s".format(t_end-t_start))

开始......
data preparation is done
特征工程结束！
training begins
[LibLinear]training done
验证集分数：0.7813188717138635
训练结束，耗时:1376.0823409557343s


In [None]:
# -*- coding: utf-8 -*-
# GridSearchCV over LinearSVC hyperparameters (penalty and C)

import pandas as pd
from sklearn.svm import LinearSVC
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV
import time

print("开始......")
t_start = time.time()

# Load the labelled training set and the unlabelled test set.
df_train = pd.read_csv('C:/daguan_text_classification/new_data/train_set.csv')
df_test = pd.read_csv('C:/daguan_text_classification/new_data/test_set.csv')
# Stack both frames so the TF-IDF vocabulary below can be built from all text.
df_all = pd.concat(objs=[df_train, df_test], axis=0, sort=True)

# TF-IDF features over unigrams and bigrams of the 'word_seg' column.
# The on/off options are passed as real booleans: scikit-learn validates them
# as booleans and newer releases reject integer stand-ins such as 1.
vectorizer = TfidfVectorizer(
    ngram_range=(1, 2),
    min_df=3,
    max_df=0.9,
    use_idf=True,
    smooth_idf=True,
    sublinear_tf=True,
)
vectorizer.fit(df_all['word_seg'])
x_train = vectorizer.transform(df_train['word_seg'])
x_test = vectorizer.transform(df_test['word_seg'])
# Shift labels down by one for training; predictions are shifted back up by
# one before they are written out.
y_train = (df_train['class'] - 1).values
print("特征工程结束！")

# Train the classifier: 5-fold grid search over the penalty type and C,
# scored by macro-averaged F1. dual=False is set on the estimator because the
# grid includes the 'l1' penalty.
params = {'penalty': ['l2', 'l1'], 'C': [1.0, 2.0, 3.0]}
svc = LinearSVC(dual=False)
clf = GridSearchCV(
    estimator=svc,
    param_grid=params,
    scoring='f1_macro',
    n_jobs=1,
    cv=5,
    verbose=3,
)
clf.fit(x_train, y_train)

# Predict every test sample with the classifier trained above.
y_test = clf.predict(x_test)

# Save the test-set predictions locally, restoring the original label numbering.
df_test['class'] = y_test.tolist()
df_test['class'] = df_test['class'] + 1
df_result = df_test.loc[:, ['id', 'class']]
df_result.to_csv('../results/linearSVC_with_reg.csv', index=False)

t_end = time.time()
print("训练结束，耗时:{}s".format(t_end-t_start))

In [None]:
 # -*- coding: utf-8 -*-

# GridSearchCV over both the TF-IDF vectorizer and the LinearSVC
    
import pandas as pd
from sklearn.svm import LinearSVC
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.pipeline import Pipeline
import time

print("开始......")
t_start = time.time()

# Load the labelled training set and the unlabelled test set.
df_train = pd.read_csv('C:/daguan_text_classification/new_data/train_set.csv')
df_test = pd.read_csv('C:/daguan_text_classification/new_data/test_set.csv')

df_train_x = df_train['word_seg']
# Shift labels down by one for training; predictions are shifted back up by
# one before they are written out.
df_train_y = df_train['class'] - 1

# Vectorizer and classifier are wrapped in one pipeline so the grid search
# re-fits the TF-IDF step inside every cross-validation fold.
featurer = TfidfVectorizer(ngram_range=(1, 2), min_df=3)
classifier = LinearSVC()
pipeline = Pipeline([('tfidf', featurer), ('clf', classifier)])

# Keys use the '<step name>__<parameter>' form; the tfidf values listed here
# replace the ngram_range/min_df given to the vectorizer above during search.
parameters = {'tfidf__ngram_range': ((1, 2), (1, 3)),
              'tfidf__min_df': (4, 6, 8),
              'tfidf__max_df': (0.7, 0.9),
              'clf__C': (1.0, 2.0, 3.0)}

# random_state only has an effect when shuffle=True; scikit-learn raises a
# ValueError if a seed is given while shuffling is off, so enable shuffling.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)
gs = GridSearchCV(
    estimator=pipeline,
    param_grid=parameters,
    n_jobs=1,
    scoring='f1_macro',
    cv=skf,
    verbose=3,
)
gs.fit(df_train_x, df_train_y)

# Print the best parameter values found by the search.
print("Best score: %0.3f" % gs.best_score_)
best_parameters = gs.best_estimator_.get_params()
for param_name in sorted(parameters.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))

# Predict every test sample with the refitted best pipeline; the pipeline
# applies its own TF-IDF step and then the classifier.
y_test = gs.best_estimator_.predict(df_test['word_seg'])

# Save the test-set predictions locally, restoring the original label numbering.
df_test['class'] = y_test.tolist()
df_test['class'] = df_test['class'] + 1
df_result = df_test.loc[:, ['id', 'class']]
df_result.to_csv('../results/linearSVC_final.csv', index=False)
t_end = time.time()
print("训练结束，耗时:{}s".format(t_end-t_start))

In [None]:
"""
@简介：tfidf特征/ 决策树算法
@成绩： 
"""

import pandas as pd
from sklearn import tree
from sklearn.feature_extraction.text import TfidfVectorizer

print("开始...............")

# Load the data and drop the columns this cell does not use.
# 'id' is kept on the test frame because it is written to the result file.
df_train = pd.read_csv(
    'C:/daguan_text_classification/new_data/train_set.csv'
).drop(columns=['article', 'id'])
df_test = pd.read_csv(
    'C:/daguan_text_classification/new_data/test_set.csv'
).drop(columns=['article'])

# Feature engineering: TF-IDF over unigrams and bigrams of 'word_seg'.
# This step was previously commented out, which left x_train / y_train /
# x_test undefined on a fresh kernel (or silently reused from whichever
# earlier cell ran last). Building them here keeps the cell self-contained.
vectorizer = TfidfVectorizer(ngram_range=(1, 2), min_df=3, max_df=0.9)
vectorizer.fit(df_train['word_seg'])
x_train = vectorizer.transform(df_train['word_seg'])
x_test = vectorizer.transform(df_test['word_seg'])
# Shift labels down by one for training; predictions are shifted back up by
# one before they are written out.
y_train = df_train['class'] - 1

# Train the decision-tree classifier.
classifier = tree.DecisionTreeClassifier()
classifier.fit(x_train, y_train)

# Predict every test sample with the classifier trained above.
y_test = classifier.predict(x_test)

# Save the test-set predictions locally, restoring the original label numbering.
df_test['class'] = y_test.tolist()
df_test['class'] = df_test['class'] + 1
df_result = df_test.loc[:, ['id', 'class']]
df_result.to_csv('C:/daguan_text_classification/results/tree_model.csv', index=False)

print("完成...............")