In [1]:
import os

from google.colab import drive

# Colab-only setup: mount Google Drive, then make the project's `main` folder
# the working directory so that the relative '../data' and '../vocab' paths
# used by the later cells resolve against this project.
drive.mount('/content/gdrive')
os.chdir('/content/gdrive/My Drive/finch/tensorflow2/text_classification/clue/main')

Mounted at /content/gdrive


In [2]:
import json

from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.metrics import classification_report

In [3]:
def get_vocab(f_path):
  """Build a token -> index mapping from a one-entry-per-line vocabulary file.

  Args:
    f_path: path to a text file holding one vocabulary entry per line.

  Returns:
    dict mapping each line (trailing whitespace stripped) to its 0-based line
    number. If an entry occurs more than once, the last occurrence wins.
  """
  # Decode explicitly as UTF-8 so the result does not depend on the platform's
  # locale default encoding.
  with open(f_path, encoding='utf-8') as f:
    return {line.rstrip(): i for i, line in enumerate(f)}


def _read_examples(f_path, label_map):
  """Read one JSON-lines split and return its (texts, label_ids) lists."""
  texts, label_ids = [], []
  # Decode explicitly as UTF-8 so reading does not depend on the platform's
  # locale default encoding.
  with open(f_path, encoding='utf-8') as f:
    for line in f:
      line = line.strip()
      if not line:
        # Tolerate blank lines (e.g. a trailing newline at the end of the file)
        # instead of failing inside json.loads.
        continue
      record = json.loads(line)
      # ''.join leaves a plain string unchanged and concatenates a sequence of
      # string tokens, matching the previous ''.join(list(text)).
      texts.append(''.join(record['content']))
      label_ids.append(label_map[record['label']])
  return texts, label_ids


def get_data(train_path='../data/train.txt',
             test_path='../data/test.txt',
             label_map=None):
  """Load the train and test splits as parallel text / label-id lists.

  Each non-blank line of a split file is parsed as a JSON object with a
  'content' entry (the text) and a 'label' entry (the label name).

  Args:
    train_path: path of the training split.
    test_path: path of the test split.
    label_map: dict mapping label name -> integer id. When None, the
      module-level `label2idx` is used, so it must be defined before calling.

  Returns:
    ((x_train, y_train), (x_test, y_test)) where each x_* is a list of text
    strings and each y_* is the list of corresponding label ids.

  Raises:
    KeyError: if a record lacks 'content'/'label' or its label is not in the
      label map.
    json.JSONDecodeError: if a non-blank line is not valid JSON.
  """
  if label_map is None:
    label_map = label2idx

  x_train, y_train = _read_examples(train_path, label_map)
  x_test, y_test = _read_examples(test_path, label_map)
  return (x_train, y_train), (x_test, y_test)

In [4]:
# Build the label -> id map first: get_data() looks labels up in `label2idx`.
label2idx = get_vocab('../vocab/label.txt')
(x_train, y_train), (x_test, y_test) = get_data()

# Binary presence features over single characters and adjacent character
# pairs: the tokenizer splits each text into its characters and
# ngram_range=(1, 2) adds the 2-grams of those tokens.
count_model = CountVectorizer(binary = True,
                              ngram_range = (1,2),
                              tokenizer = list)
# fit_transform learns the vocabulary and vectorizes the training texts in a
# single pass, instead of fitting once and re-transforming the corpus twice.
X_train_counts = count_model.fit_transform(x_train)

# IDF weights are learned from the training counts only; the test texts are
# transformed with the already-fitted vectorizer and transformer.
tfidf_model = TfidfTransformer()
X_train_tfidf = tfidf_model.fit_transform(X_train_counts)
X_test_tfidf = tfidf_model.transform(count_model.transform(x_test))



In [5]:
# Fit logistic regression on the TF-IDF training features, then predict the
# label ids of the test set.
lr_model = LogisticRegression(solver='lbfgs', max_iter=1000)
lr_model.fit(X_train_tfidf, y_train)
y_pred = lr_model.predict(X_test_tfidf)

# Element-wise comparison of predictions against the true ids; the mean of
# the matches is the accuracy.
final_acc = (y_pred == y_test).mean()
print("Testing Accuracy: {:.3f}".format(final_acc))

# label2idx's values and keys iterate in the same order, so each id in
# `labels` lines up with its name in `target_names`.
report = classification_report(y_true = y_test,
                               y_pred = y_pred,
                               labels = list(label2idx.values()),
                               target_names = list(label2idx.keys()),
                               digits = 3)
print('\n'+report)

Testing Accuracy: 0.591

              precision    recall  f1-score   support

     sadness      0.566     0.840     0.676      1448
   happiness      0.640     0.715     0.675       978
        like      0.667     0.362     0.469       453
       anger      0.477     0.233     0.313       447
        fear      0.615     0.119     0.200        67
    surprise      0.769     0.097     0.172       103
     disgust      0.605     0.306     0.406       471

    accuracy                          0.591      3967
   macro avg      0.620     0.382     0.416      3967
weighted avg      0.597     0.591     0.558      3967

