In [2]:
pip install ua_datasets

In [3]:
from ua_datasets import NewsClassificationDataset
train_data = NewsClassificationDataset(root = 'data/', split = 'train', return_tags = True)
test_data = NewsClassificationDataset(root = 'data/', split = 'test', return_tags = True)

In [3]:
import pandas as pd
import nltk
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [4]:
def read_text(name_txt):
  with open(name_txt, 'r') as file:
    text = file.readlines()
  return ' '.join(text)

In [5]:
train_data = pd.read_csv('data/train.csv')
test_data = pd.read_csv('data/test.csv')

In [6]:
train_data = train_data[:10000]
test_data = test_data[:2000]

In [7]:
train_x = train_data['text']
train_y = train_data['target']

test_x = test_data['text']
test_y = test_data['target']

print(train_x.head(), '\n')
print(train_y.head())

0    Головний тренер солігорського «Шахтаря» Юрій В...
1    Про це на своїй сторінці у Facebook написав пр...
2    Про це повідомляється в доповіді некомерційної...
3     Легенда НБА Шакіл О’Ніл продав свій маєток у ...
4     Засновник фінансової піраміди B2B Jewelry Мик...
Name: text, dtype: object 

0     спорт
1    новини
2    новини
3     спорт
4    бізнес
Name: target, dtype: object


In [8]:
def tokenize(data):
  tokens = []
  for line in data:
    tokens.append(nltk.word_tokenize(line.lower()))
  return tokens

test_x = tokenize(test_x)
train_x = tokenize(train_x)

In [9]:
def remove_punctuation(data):
  punctuation = [',', '.', '/', '\\', '|', '\'', '\'\'', '\"', '«', '»', '-', '—', '%', '`', '``', '(', ')', '!', '?', ':', '’'] 
  tokens_without_punct = []
  for token_sentence in data:
    tok = [token for token in token_sentence if token not in punctuation]
    tokens_without_punct.append(tok)
  return tokens_without_punct

test_x = remove_punctuation(test_x)
train_x = remove_punctuation(train_x)

In [10]:
def remove_stopwords(data):
  uk_stop_words = set(read_text('stopwords-uk.txt').split())
  tokens_without_sw = []
  for token_sentence in data:
    tok = [token for token in token_sentence if token not in uk_stop_words]
    tokens_without_sw.append(tok)
  return tokens_without_sw

test_x = remove_stopwords(test_x)
train_x = remove_stopwords(train_x)

In [11]:
def list_to_string(data):
  list_text = []
  for text in data:
    list_text.append(' '.join(text))
  return list_text

test_x = list_to_string(test_x)
train_x = list_to_string(train_x)

Classifier


In [12]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics
from sklearn.model_selection import GridSearchCV

In [13]:
sgd_ppl_clf = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('sgd_clf', SGDClassifier(random_state=42))])
knb_ppl_clf = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('knb_clf', KNeighborsClassifier(n_neighbors=10))])

In [14]:
sgd_ppl_clf.fit(train_x, train_y)
knb_ppl_clf.fit(train_x, train_y)

Pipeline(steps=[('tfidf', TfidfVectorizer()),
                ('knb_clf', KNeighborsClassifier(n_neighbors=10))])

In [15]:
predicted_sgd = sgd_ppl_clf.predict(test_x)
print(metrics.classification_report(predicted_sgd, test_y))

              precision    recall  f1-score   support

      бізнес       0.82      0.82      0.82       256
      новини       0.78      0.85      0.81       374
    політика       0.92      0.88      0.90       674
       спорт       0.99      0.99      0.99       484
  технології       0.88      0.84      0.86       212

    accuracy                           0.89      2000
   macro avg       0.88      0.88      0.88      2000
weighted avg       0.89      0.89      0.89      2000



In [16]:
predicted_knb = knb_ppl_clf.predict(test_x)
print(metrics.classification_report(predicted_knb, test_y))

              precision    recall  f1-score   support

      бізнес       0.83      0.66      0.74       323
      новини       0.67      0.79      0.73       348
    політика       0.84      0.87      0.85       630
       спорт       0.99      0.97      0.98       490
  технології       0.80      0.78      0.79       209

    accuracy                           0.84      2000
   macro avg       0.83      0.81      0.82      2000
weighted avg       0.84      0.84      0.84      2000



Clusterer

In [24]:
from sklearn.cluster import KMeans

In [25]:
km_ppl_clr = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('knb_clf', KMeans(n_clusters=5))])

In [26]:
km_ppl_clr.fit(train_x, train_y)

Pipeline(steps=[('tfidf', TfidfVectorizer()),
                ('knb_clf', KMeans(n_clusters=5))])

In [27]:
predicted_km = km_ppl_clr.predict(test_x[:5])
for i in range(5):
  print(test_y[i], predicted_km[i])

спорт 4
політика 1
політика 1
бізнес 2
політика 1
