Importing modules

In [5]:
import os
import json

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics

Loading data

In [6]:
posts_data_path = os.path.join("data", "cleaned_posts.json")

train = []
test = []

with open(posts_data_path) as file_obj:
    data = json.load(file_obj)
    
    train, test = zip(*data)

Splitting data into train and test sets

In [7]:
x_train, x_test, y_train, y_test = train_test_split(train, test, test_size=0.3)

print(len(x_train), len(x_test))
print(len(y_train), len(y_test))

3193 1369
3193 1369


Naive bayes

In [8]:
pipe = Pipeline([("vect", CountVectorizer(ngram_range=(1, 2))),
                 ("tfidf", TfidfTransformer()),
                 ("clf", MultinomialNB())])

parameters = {"vect__ngram_range": [(1, 1), (1, 2)],
              "tfidf__use_idf": (True, False),
              "clf__alpha": (1e-2, 1e-3)}

grid = GridSearchCV(pipe, cv=3, n_jobs=1, param_grid=parameters)
grid.fit(train, test)

print(grid.best_score_)
print(grid.best_params_)

0.6876370013152127
{'clf__alpha': 0.01, 'tfidf__use_idf': True, 'vect__ngram_range': (1, 1)}


Ranfom forest

In [9]:
pipe = Pipeline([("vect", CountVectorizer()),
                 ("tfidf", TfidfTransformer()),
                 ("clf", RandomForestClassifier())])

parameters = {"vect__ngram_range": [(1, 1), (1, 2)],
              "tfidf__use_idf": (True, False),
              "clf__n_estimators": (5, 10, 20, 50)}

grid = GridSearchCV(pipe, cv=3, n_jobs=1, param_grid=parameters)
grid.fit(train, test)

print(grid.best_score_)
print(grid.best_params_)

0.674923279263481
{'clf__n_estimators': 50, 'tfidf__use_idf': True, 'vect__ngram_range': (1, 1)}


Decision tree

In [10]:
pipe = Pipeline([("vect", CountVectorizer()),
                 ("tfidf", TfidfTransformer()),
                 ("clf", DecisionTreeClassifier())])

parameters = {"vect__ngram_range": [(1, 1), (1, 2)],
              "tfidf__use_idf": (True, False)}

grid = GridSearchCV(pipe, cv=3, n_jobs=1, param_grid=parameters)
grid.fit(train, test)

print(grid.best_score_)
print(grid.best_params_)

0.5640070144673389
{'tfidf__use_idf': True, 'vect__ngram_range': (1, 1)}


Nearest neighbors

In [11]:
pipe = Pipeline([("vect", CountVectorizer()),
                 ("tfidf", TfidfTransformer()),
                 ("clf", KNeighborsClassifier())])

parameters = {"vect__ngram_range": [(1, 1), (1, 2)],
              "tfidf__use_idf": (True, False),
              "clf__n_neighbors": (1, 3, 5, 11, 15, 21, 25, 31)}

grid = GridSearchCV(pipe, cv=3, n_jobs=1, param_grid=parameters)
grid.fit(train, test)

print(grid.best_score_)
print(grid.best_params_)

0.6874177992108724
{'clf__n_neighbors': 21, 'tfidf__use_idf': True, 'vect__ngram_range': (1, 1)}


Train classifier

In [13]:
pipe = Pipeline([("vect", CountVectorizer(ngram_range=(1, 1))),
                 ("tfidf", TfidfTransformer(use_idf=True)),
                 ("clf", KNeighborsClassifier(n_neighbors=21))])

clf = pipe.fit(x_train, y_train)

Test classifier

In [17]:
predicted = clf.predict(x_test)

print(metrics.accuracy_score(y_test, predicted))
print(metrics.classification_report(y_test, predicted))
print(metrics.confusion_matrix(y_test, predicted))

0.7027027027027027
                                      precision    recall  f1-score   support

     Administravimas/sekretoriavimas       0.70      0.46      0.56        41
                             Apsauga       1.00      0.70      0.82        10
           Apskaita/finansai/auditas       0.81      0.83      0.82        69
               Dizainas/architektūra       0.50      0.23      0.32        13
                           Draudimas       0.67      0.83      0.74        12
                           Eksportas       0.00      0.00      0.00         4
              Energetika/elektronika       0.50      0.47      0.49        40
          Informacinės technologijos       0.64      0.91      0.75        87
                Inžinerija/mechanika       0.69      0.54      0.61       104
      Klientų aptarnavimas/paslaugos       0.69      0.48      0.56       147
                       Maisto gamyba       0.79      0.62      0.70        48
                 Marketingas/reklama       0

  'recall', 'true', average, warn_for)
