In [1]:
import gensim
import pandas
from gensim import corpora, models
import os

In [2]:
# Vocabulary cap passed to Dictionary.filter_extremes(keep_n=...); also part of
# the output directory name used by the export cells.
DICT_SIZE = 8000
# Number of LDA topics, i.e. the length of every per-document feature vector.
NUM_TOPIC = 50

In [3]:
# Load the training split. 'content' holds the document text; the 'label'
# column of the same frame is used later when the features are exported.
docs_ful = pandas.read_csv('./train.csv')
docs = docs_ful['content']

In [4]:
# Whitespace-tokenise every training document (the column is cast to str first
# so that split() is always called on text).
processed_docs = docs.astype(str).map(str.split)

In [5]:
# Build the token -> integer-id vocabulary from the tokenised training documents.
dictionary = corpora.Dictionary(processed_docs)

In [6]:
# Vocabulary size before pruning.
len(dictionary)

77581

In [7]:
# Prune the vocabulary in place: drop tokens that occur in fewer than 2
# documents or in more than 70% of documents, then keep at most DICT_SIZE of
# the remaining tokens.
dictionary.filter_extremes(no_below=2, no_above=0.7, keep_n=DICT_SIZE)

In [8]:
# Vocabulary size after pruning.
len(dictionary)

8000

In [9]:
# Encode each training document as a bag-of-words over the pruned vocabulary.
bow_corpus = list(map(dictionary.doc2bow, processed_docs))

In [10]:
# Fit the TF-IDF weighting on the training bag-of-words corpus only; the same
# fitted model is reused for the validation and test splits.
tfidf = gensim.models.TfidfModel(bow_corpus)

In [11]:
# Apply the fitted TF-IDF weighting to the training corpus.
corpus_tfidf = tfidf[bow_corpus]


In [12]:
# Train the LDA topic model on the TF-IDF-weighted training corpus.
# random_state pins the model's random initialisation so that a
# Restart & Run All starts from the same state every time (multi-process
# training can still introduce small run-to-run differences).
LDA_SEED = 42
lda_model_tfidf = gensim.models.LdaMulticore(
    corpus_tfidf,
    num_topics=NUM_TOPIC,
    id2word=dictionary,
    passes=2,
    workers=4,
    random_state=LDA_SEED,
)


In [13]:
# After lda_model
# Test: project the test split into LDA topic space and save it as CSV.
docs_ful_val = pandas.read_csv('./test.csv')
docs_val = docs_ful_val['content']

# Reuse the dictionary and TF-IDF model that were fitted on the training data.
bow_corpus_val = [dictionary.doc2bow(doc) for doc in docs_val.astype(str).apply(lambda x: x.split())]
corpus_tfidf_val = tfidf[bow_corpus_val]

data_after_lda = []
for cor in corpus_tfidf_val:
    # Dense topic vector; topics the model does not report for this document stay 0.
    temp = [0] * NUM_TOPIC
    for index_topic, val in lda_model_tfidf[cor]:
        # Store a built-in float. The list is written to CSV as its text form,
        # and NumPy >= 2 renders its scalars as "np.float32(...)" inside a list,
        # which ast.literal_eval cannot parse when the file is read back.
        temp[index_topic] = float(val)
    data_after_lda.append(temp)
data = pandas.DataFrame(data={'data': data_after_lda, 'label': docs_ful_val['label']})
print(len(data_after_lda))

# makedirs(exist_ok=True) also creates a missing './data' parent and keeps the
# cell re-runnable (os.mkdir raised FileExistsError on the second run).
out_dir = './data/v' + str(DICT_SIZE) + '_tp' + str(NUM_TOPIC)
os.makedirs(out_dir, exist_ok=True)
data.to_csv(out_dir + '/after_lda_test.csv')

719


In [14]:
# After lda_model
# Val: project the validation split into LDA topic space and save it as CSV.
docs_ful_val = pandas.read_csv('./val.csv')
docs_val = docs_ful_val['content']

# Reuse the dictionary and TF-IDF model that were fitted on the training data.
bow_corpus_val = [dictionary.doc2bow(doc) for doc in docs_val.astype(str).apply(lambda x: x.split())]
corpus_tfidf_val = tfidf[bow_corpus_val]

data_after_lda = []
for cor in corpus_tfidf_val:
    # Dense topic vector; topics the model does not report for this document stay 0.
    temp = [0] * NUM_TOPIC
    for index_topic, val in lda_model_tfidf[cor]:
        # Store a built-in float. The list is written to CSV as its text form,
        # and NumPy >= 2 renders its scalars as "np.float32(...)" inside a list,
        # which ast.literal_eval cannot parse when the file is read back.
        temp[index_topic] = float(val)
    data_after_lda.append(temp)
data = pandas.DataFrame(data={'data': data_after_lda, 'label': docs_ful_val['label']})
print(len(data_after_lda))

# Create the output directory here as well, so this cell does not depend on
# another cell having created it first.
out_dir = './data/v' + str(DICT_SIZE) + '_tp' + str(NUM_TOPIC)
os.makedirs(out_dir, exist_ok=True)
data.to_csv(out_dir + '/after_lda_val.csv')

2255


In [15]:
# After lda_model
# Train: project the training corpus into LDA topic space and save it as CSV.
data_after_lda = []
for cor in corpus_tfidf:
    # Dense topic vector; topics the model does not report for this document stay 0.
    temp = [0] * NUM_TOPIC
    for index_topic, val in lda_model_tfidf[cor]:
        # Store a built-in float. The list is written to CSV as its text form,
        # and NumPy >= 2 renders its scalars as "np.float32(...)" inside a list,
        # which ast.literal_eval cannot parse when the file is read back.
        temp[index_topic] = float(val)
    data_after_lda.append(temp)
data = pandas.DataFrame(data={'data': data_after_lda, 'label': docs_ful['label']})
print(len(data_after_lda))

# Create the output directory here as well, so this cell does not depend on
# another cell having created it first.
out_dir = './data/v' + str(DICT_SIZE) + '_tp' + str(NUM_TOPIC)
os.makedirs(out_dir, exist_ok=True)
data.to_csv(out_dir + '/after_lda_train.csv')

7893


In [16]:
# Convert data
def convert(s):
    """Map a category name to its integer class id (0-9).

    The class id is the position of the name in the ordered tuple below.
    Returns None when ``s`` matches none of the ten known category names.
    """
    category_names = (
        'giao_duc',
        'the_gioi',
        'phap_luat',
        'giai_tri',
        'kinh_doanh',
        'van_hoa',
        'khoa_hoc',
        'the_thao',
        'suc_khoe',
        'xe',
    )
    for class_id, category in enumerate(category_names):
        if s == category:
            return class_id
    return None

In [17]:
# Replace the training split's category names with integer class ids and write
# a header-less, index-less CSV containing only the 'data' and 'label' columns.
src_dir = './data/v' + str(DICT_SIZE) + '_tp' + str(NUM_TOPIC)
dst_dir = 'pj/data/v' + str(DICT_SIZE) + '_tp' + str(NUM_TOPIC)

temp = pandas.read_csv(src_dir + '/after_lda_train.csv')
temp['label'] = [convert(s) for s in temp['label']]
# Rebuilding the frame drops the index column that the earlier to_csv call saved.
temp = pandas.DataFrame(data={'data': temp['data'], 'label': temp['label']})

# makedirs(exist_ok=True) creates the missing 'pj/data' parents and keeps the
# cell re-runnable (os.mkdir raised FileExistsError on the second run).
os.makedirs(dst_dir, exist_ok=True)
temp.to_csv(dst_dir + '/after_lda_train.csv', index=False, header=False)

In [18]:
# Replace the test split's category names with integer class ids and write a
# header-less, index-less CSV containing only the 'data' and 'label' columns.
lda_dir = './data/v' + str(DICT_SIZE) + '_tp' + str(NUM_TOPIC)
export_dir = 'pj/data/v' + str(DICT_SIZE) + '_tp' + str(NUM_TOPIC)

temp = pandas.read_csv(lda_dir + '/after_lda_test.csv')
temp['label'] = list(map(convert, temp['label']))
# Selecting the two columns drops the saved index column.
temp = temp[['data', 'label']]
temp.to_csv(export_dir + '/after_lda_test.csv', index=False, header=False)

In [19]:
# Replace the validation split's category names with integer class ids and
# write a header-less, index-less CSV containing only 'data' and 'label'.
lda_dir = './data/v' + str(DICT_SIZE) + '_tp' + str(NUM_TOPIC)
export_dir = 'pj/data/v' + str(DICT_SIZE) + '_tp' + str(NUM_TOPIC)

temp = pandas.read_csv(lda_dir + '/after_lda_val.csv')
temp['label'] = list(map(convert, temp['label']))
# Selecting the two columns drops the saved index column.
temp = temp[['data', 'label']]
temp.to_csv(export_dir + '/after_lda_val.csv', index=False, header=False)

In [20]:
# Preprocess: parse the label-encoded CSV files into NumPy arrays (topic features plus a final label column)

In [21]:
import numpy as np
import ast

# Convert each label-encoded CSV into one .npy array: one row per document,
# the topic weights first and the class id as the last column.
split_names = ['test', 'val', 'train']
export_dir = 'pj/data/v' + str(DICT_SIZE) + '_tp' + str(NUM_TOPIC)

for type_set in split_names:
    # The file has no header row: column 0 is the topic vector stored as list
    # text, column 1 is the class id. Letting the CSV parser split the fields
    # replaces fixed character offsets, which only worked for a one-character label.
    frame = pandas.read_csv(
        export_dir + '/after_lda_{}.csv'.format(type_set),
        header=None,
        names=['data', 'label'],
    )
    if frame['label'].isna().any():
        # An empty label means convert() did not recognise the category name.
        raise ValueError('missing label in after_lda_{}.csv'.format(type_set))

    # literal_eval only accepts Python literals, so the list text is parsed
    # without executing anything.
    X = np.array([ast.literal_eval(vec) for vec in frame['data']], dtype=np.float64)
    Y = frame['label'].to_numpy(dtype=np.float64).reshape(-1, 1)
    np.save(export_dir + '/{}_{}.npy'.format(type_set, str(NUM_TOPIC)), np.hstack([X, Y]))

In [22]:
# Run Svm
import numpy as np
from sklearn.svm import SVC

# Every .npy array holds the topic features with the class id in the last column.
npy_dir = 'pj/data/v' + str(DICT_SIZE) + '_tp' + str(NUM_TOPIC)
train_set = np.load(npy_dir + '/train_' + str(NUM_TOPIC) + '.npy')
val_set = np.load(npy_dir + '/val_' + str(NUM_TOPIC) + '.npy')
test_set = np.load(npy_dir + '/test_' + str(NUM_TOPIC) + '.npy')

# Split each array into features (all but the last column) and labels (last column).
X_train, Y_train = train_set[:, :-1], train_set[:, -1]
X_val, Y_val = val_set[:, :-1], val_set[:, -1]
X_test, Y_test = test_set[:, :-1], test_set[:, -1]

# NOTE(review): degree applies to the 'poly' kernel only; with 'rbf' it has no effect.
clf = SVC(kernel='rbf', degree=3, gamma='auto')
clf.fit(X_train, Y_train)

# Mean accuracy on the validation and test splits.
score = clf.score(X_val, Y_val)
print('val', score)
score1 = clf.score(X_test, Y_test)
print('test', score1)

# Keep both scores next to the data they were computed from.
with open(npy_dir + '/result.txt', 'w') as f:
    f.writelines(['val ' + str(score) + '\n', 'test ' + str(score1)])

val 0.5161862527716187
test 0.4603616133518776
