In [60]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [61]:
import os
import logging

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from hydra import initialize, compose
import hydra

from cgpos.utils.util import import_pkl, export_pkl, get_abs_dir

In [62]:
# Reset hydra
hydra.core.global_hydra.GlobalHydra.instance().clear()
# Load hydra params
initialize("../conf/", version_base=None)
config = compose(config_name='main')
# Init logger
logging.basicConfig(level=logging.INFO) 

In [174]:
uid, features, targets = import_pkl(config.data.cleaned)

INFO:cgpos.utils.util:Importing /home/tejomay/cgpos/data/processed/cleaned.pkl


In [184]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split

In [181]:
X = [" ".join(syllables) for syllables in features]
# y = [pos_class[0] for pos_class in targets]
y = targets
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, random_state=20)

In [185]:
text_clf = Pipeline([
    ('vect', CountVectorizer(lowercase=False, ngram_range=(1,5))),
    ('tfidf', TfidfTransformer()),
    # ('clf', MultinomialNB()),
    ('clf', DecisionTreeClassifier())
    # ('clf', SGDClassifier(loss='hinge', penalty='l2',
    #                       alpha=1e-3, random_state=42,
    #                       max_iter=5, tol=None)),  # SVM
])

In [186]:
text_clf.fit(X_train, y_train)
y_pred = text_clf.predict(X_test)
print(f"Accuracy: {np.mean(y_pred == y_test) * 100:.2f}%")

Accuracy: 94.17%


In [167]:
parameters = {
    'tfidf__use_idf': (True, False),
    'clf__alpha': (1e-2, 1e-3),
}
gs_clf = GridSearchCV(text_clf, parameters, cv=5, n_jobs=-1)

gs_clf.fit(X_train, y_train)
y_pred = gs_clf.predict(X_test)
print(f"Accuracy: {np.mean(y_pred == y_test) * 100:.2f}%")

Accuracy: 75.77%
