In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import precision_recall_fscore_support
from sklearn import svm
from sklearn import preprocessing


In [2]:
stopwords = set(line.strip() for line in open('vietnamese-stopwords.txt',encoding="utf8"))

In [3]:
df = pd.read_csv('data_trainning.csv', encoding = "utf-8")
test = pd.read_csv('data_testing.csv', encoding = "utf-8")
label = pd.read_csv('labels.txt', header=None, encoding = "utf-8")
label.columns=['Label']


In [4]:
le = preprocessing.LabelEncoder()

In [5]:
vectorizer = TfidfVectorizer(min_df = 3,max_features = 17390)
X = vectorizer.fit_transform(df['Content'])
X_testing = vectorizer.fit_transform(test['Content'])

In [6]:
print(X.shape)

(5000, 17390)


In [7]:
print(X_testing.shape)

(5000, 17390)


In [8]:
le.fit(label['Label'])
Y = le.transform(df['Label'])
Y = pd.DataFrame(Y)

In [9]:
Y = Y.values.ravel()

In [10]:
X_train, X_test, y_train, y_true = train_test_split(X, Y, test_size=0.2)

In [11]:
model =  svm.SVC(kernel = 'linear')
model.fit(X_train,y_train)

SVC(kernel='linear')

In [12]:
y_prd = model.predict(X_test)

In [13]:
precision_recall_fscore_support(y_true, y_prd, average='macro')

(0.9014742645381183, 0.8995228661628974, 0.8994684076966151, None)

In [14]:
y_pred = model.predict(X_testing)

In [15]:
y_pred = le.inverse_transform(y_pred)

In [16]:
print(y_pred)

['__CTXH__' '__CTXH__' '__CTXH__' ... '__CTXH__' '__CTXH__' '__CTXH__']


In [18]:
dataset = pd.DataFrame({'Label':y_pred})
dataset.to_csv('predict.txt',index=False, header=False)

In [19]:
rbf_svc = svm.SVC(kernel='rbf')
rbf_svc.fit(X_train, y_train)

SVC()

In [20]:
y_prd_rbf  = rbf_svc.predict(X_test)
precision_recall_fscore_support(y_true, y_prd_rbf, average='macro')

(0.8958788732330685, 0.8915654573698527, 0.891902505470297, None)

In [22]:
scaler = preprocessing.StandardScaler(with_mean=False)
scaler_md = scaler.fit(X_train)
X_train_sd = scaler_md.transform(X_train)
model.fit(X_train_sd,y_train)
X_test_1 = scaler_md.transform(X_test)
y_pred = model.predict(X_test_1)
precision_recall_fscore_support(y_true, y_pred, average='macro')

(0.8722474342267492, 0.870751406125964, 0.8708571104458764, None)