In [2]:
import os
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import random
from sklearn import linear_model
from time import time
from sklearn.model_selection import train_test_split

# I - Loading data

## 1 - Load the documents and their categories

In [3]:
# Build one record per document found under <cwd>/vnexpress.
# Every sub-directory whose name contains no '.' is treated as a category and
# every file inside it as one document of that category.
dir_path = os.path.join(os.getcwd(), 'vnexpress')
categories = list()  # not populated in this cell; kept so the name stays defined

data = list()
# sorted(): os.listdir returns entries in arbitrary order, so sort to make the
# row order of `data` deterministic across machines and runs.
for directory in sorted(os.listdir(dir_path)):
    list_file_path = os.path.join(dir_path, directory)
    # Skip dotted entries (as before) and any stray plain file, which would
    # otherwise make the inner os.listdir raise.
    if '.' in directory or not os.path.isdir(list_file_path):
        continue
    for file_name in sorted(os.listdir(list_file_path)):
        file_path = os.path.join(list_file_path, file_name)
        # Context manager closes each handle immediately instead of leaking
        # one open file descriptor per document.
        # NOTE(review): the platform default encoding is used, as in the
        # original; pass encoding=... explicitly once the corpus encoding is confirmed.
        with open(file_path, 'r') as file:
            data_dict = dict()
            data_dict['category'] = directory
            data_dict['data'] = file.read()
        data.append(data_dict)

## Sample 10,000 items and split them into training and testing sets

In [24]:
# Draw a random subset of the corpus and split it 70/30 into train and test.
# A fixed seed makes the sample, the split and every downstream metric
# reproducible under "Restart Kernel -> Run All".
RANDOM_STATE = 42
data_df = pd.DataFrame(data)
# min(): DataFrame.sample raises when asked for more rows than exist, so cap
# the request at the corpus size for corpora smaller than 10000 documents.
sample_df = data_df.sample(min(10000, len(data_df)), random_state=RANDOM_STATE)
train, test = train_test_split(sample_df, test_size=0.3, random_state=RANDOM_STATE)

In [25]:
# Pull the text column and the category column out of each split.
training_data, training_label = train['data'], train['category']
testing_data, testing_label = test['data'], test['category']

## Fitting tf-idf model

In [26]:
# Fit the tf-idf vectorizer on the training texts only and report how long it took.
t0 = time()
vectorizer = TfidfVectorizer(
    sublinear_tf=True,
    max_df=0.5,
).fit(training_data)  # fit() returns the fitted vectorizer itself
print("tf-idf learning time:", time() - t0)

tf-idf learning time: 2.3387908935546875


In [27]:
# Display the fitted vectorizer's repr to inspect the parameters it was built with.
vectorizer

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0.5, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

## Extracting features for training and testing data

In [28]:
t0 = time()
training_matrix = vectorizer.transform(training_data)
testing_matrix = vectorizer.transform(testing_data)
print("feature extraction time of training and testing dataset:", time() - t0)

feature extraction time of training and testing dataset: 3.980196952819824


In [29]:
# Report the shape of each tf-idf matrix.
# Fix: the 'testing' line previously printed training_matrix.shape again.
print('training', training_matrix.shape)
print('testing', testing_matrix.shape)

training (7000, 56962)
testing (3000, 56962)


In [30]:
# Convert both tf-idf matrices to dense arrays via .toarray().
training_vector, testing_vector = (
    training_matrix.toarray(),
    testing_matrix.toarray(),
)

## Dimensionality Reduction

In [None]:
from sklearn import (
    manifold,
    datasets,
    decomposition,
    ensemble,
    discriminant_analysis,
    random_projection,
)

# Fit a 1000-component factor-analysis model on the dense training features.
# fit() returns the estimator itself, so `reducer` is the fitted model.
reducer = decomposition.FactorAnalysis(n_components=1000).fit(training_vector)

In [None]:
# Apply the fitted reducer to the training features first, then the testing features.
training_vector_reduce, testing_vector_reduce = map(
    reducer.transform, (training_vector, testing_vector)
)

## Classification with scikit-learn's Logistic Regression

In [31]:
t0 = time()
logreg = linear_model.LogisticRegression(C=1e5)
logreg.fit(training_vector,training_label)
print('Logistic Regression time:',time()-t0)

Logistic Regression time: 8.803112030029297


In [32]:
t0 = time()
predicted_result = logreg.predict(testing_vector)
print("testing time:", time()-t0)

testing time: 1.897918939590454


In [33]:
from sklearn import svm, metrics, model_selection

# Per-category precision / recall / f1 of the predictions against the true test labels.
report = metrics.classification_report(testing_label, predicted_result)
print(report)

             precision    recall  f1-score   support

     dulich       0.95      0.94      0.94       250
    giaitri       0.98      0.99      0.98       290
    giaoduc       0.95      0.96      0.95       251
    khoahoc       0.95      0.95      0.95       295
  kinhdoanh       0.94      0.93      0.93       273
   otoxemay       0.98      0.96      0.97       267
   phapluat       0.94      0.94      0.94       296
      sohoa       0.97      0.94      0.95       252
    thegioi       0.95      0.96      0.95       262
    thethao       0.98      1.00      0.99       295
     thoisu       0.87      0.88      0.87       269

avg / total       0.95      0.95      0.95      3000

