In [19]:
!pip install pymorphy3

Collecting pymorphy3
  Downloading pymorphy3-2.0.1-py3-none-any.whl (53 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m53.2/53.2 kB[0m [31m792.1 kB/s[0m eta [36m0:00:00[0m
[?25hCollecting dawg-python>=0.7.1 (from pymorphy3)
  Downloading DAWG_Python-0.7.2-py2.py3-none-any.whl (11 kB)
Collecting pymorphy3-dicts-ru (from pymorphy3)
  Downloading pymorphy3_dicts_ru-2.4.417150.4580142-py2.py3-none-any.whl (8.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.4/8.4 MB[0m [31m27.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pymorphy3-dicts-ru, dawg-python, pymorphy3
Successfully installed dawg-python-0.7.2 pymorphy3-2.0.1 pymorphy3-dicts-ru-2.4.417150.4580142


In [20]:
import pandas as pd
import numpy as np
import pymorphy3
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from collections import Counter

In [21]:
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [24]:
max_words = 10000
random_state = 42

In [22]:
import pandas as pd
classes = open('classes.txt', 'r', encoding='utf-8').readlines()
train = pd.read_csv('train.csv', usecols=[0,2])
test = pd.read_csv('test.csv', usecols=[0,2])

In [23]:
def preprocess(text, stop_words, punctuation_marks, morph):
    tokens = word_tokenize(text.lower())
    preprocessed_text = []
    for token in tokens:
        if token not in punctuation_marks:
            lemma = morph.parse(token)[0].normal_form
            if lemma not in stop_words:
                preprocessed_text.append(lemma)
    return preprocessed_text

In [25]:
punctuation_marks = ['!', ',', '(', ')', ':', '-', '?', '.', '..', '...', '«', '»', ';', '–', '--']
stop_words = stopwords.words("russian")
morph = pymorphy3.MorphAnalyzer()

In [26]:
test['Preprocessed_texts'] = test.apply(lambda row: preprocess(row['text'], punctuation_marks, stop_words, morph), axis=1)
train['Preprocessed_texts'] = train.apply(lambda row: preprocess(row['text'], punctuation_marks, stop_words, morph), axis=1)

In [27]:
test

Unnamed: 0,score,text,Preprocessed_texts
0,3,Unions representing workers at Turner Newall...,"[unions, representing, workers, at, turner, ne..."
1,4,"SPACE.com - TORONTO, Canada -- A second\team o...","[space.com, toronto, canada, a, second\team, o..."
2,4,AP - A company founded by a chemistry research...,"[ap, a, company, founded, by, a, chemistry, re..."
3,4,AP - It's barely dawn when Mike Fitzpatrick st...,"[ap, it, 's, barely, dawn, when, mike, fitzpat..."
4,4,AP - Southern California's smog-fighting agenc...,"[ap, southern, california, 's, smog-fighting, ..."
...,...,...,...
7595,1,Ukrainian presidential candidate Viktor Yushch...,"[ukrainian, presidential, candidate, viktor, y..."
7596,2,With the supply of attractive pitching options...,"[with, the, supply, of, attractive, pitching, ..."
7597,2,Like Roger Clemens did almost exactly eight ye...,"[like, roger, clemens, did, almost, exactly, e..."
7598,3,SINGAPORE : Doctors in the United States have ...,"[singapore, doctors, in, the, united, states, ..."


In [28]:
words = Counter()

In [29]:
for txt in test['Preprocessed_texts']:
    words.update(txt)
for txt in train['Preprocessed_texts']:
    words.update(txt)
# Словарь, отображающий слова в коды
word_to_index = dict()
# Словарь, отображающий коды в слова
index_to_word = dict()

In [30]:
for i, word in enumerate(words.most_common(max_words - 2)):
    word_to_index[word[0]] = i + 2
    index_to_word[i + 2] = word[0]

In [31]:
def text_to_sequence(txt, word_to_index):
    seq = []
    for word in txt:
        index = word_to_index.get(word, 1) # 1 означает неизвестное слово
        # Неизвестные слова не добавляем в выходную последовательность
        if index != 1:
            seq.append(index)
    return seq

In [32]:
test['Sequences'] = test.apply(lambda row: text_to_sequence(row['Preprocessed_texts'], word_to_index), axis=1)
train['Sequences'] = train.apply(lambda row: text_to_sequence(row['Preprocessed_texts'], word_to_index), axis=1)

In [33]:
test

Unnamed: 0,score,text,Preprocessed_texts,Sequences
0,3,Unions representing workers at Turner Newall...,"[unions, representing, workers, at, turner, ne...","[1896, 3234, 401, 17, 6401, 222, 61, 37, 264, ..."
1,4,"SPACE.com - TORONTO, Canada -- A second\team o...","[space.com, toronto, canada, a, second\team, o...","[2986, 611, 647, 3, 5, 3345, 9, 2, 10, 407, 23..."
2,4,AP - A company founded by a chemistry research...,"[ap, a, company, founded, by, a, chemistry, re...","[41, 3, 55, 6069, 21, 3, 4726, 17, 2, 481, 5, ..."
3,4,AP - It's barely dawn when Mike Fitzpatrick st...,"[ap, it, 's, barely, dawn, when, mike, fitzpat...","[41, 20, 22, 2894, 4434, 79, 931, 2516, 28, 24..."
4,4,AP - Southern California's smog-fighting agenc...,"[ap, southern, california, 's, smog-fighting, ...","[41, 456, 504, 22, 360, 775, 32, 3774, 5, 2, 4..."
...,...,...,...,...
7595,1,Ukrainian presidential candidate Viktor Yushch...,"[ukrainian, presidential, candidate, viktor, y...","[4391, 326, 1291, 4005, 5066, 29, 9332, 14, 2,..."
7596,2,With the supply of attractive pitching options...,"[with, the, supply, of, attractive, pitching, ...","[14, 2, 896, 5, 7524, 3130, 2131, 1525, 61, 53..."
7597,2,Like Roger Clemens did almost exactly eight ye...,"[like, roger, clemens, did, almost, exactly, e...","[271, 1382, 3167, 421, 566, 3110, 569, 91, 593..."
7598,3,SINGAPORE : Doctors in the United States have ...,"[singapore, doctors, in, the, united, states, ...","[973, 1661, 6, 2, 85, 132, 33, 546, 12, 7, 605..."


In [34]:
mapping = {'World': 1, 'Sports': 2, 'Business': 3, 'Sci/Tech': 4}

In [36]:
x_train_seq = train['Sequences']
y_train = train['score']

In [37]:
x_train_seq

0                 [31, 414, 325, 22, 5, 37, 3518, 806, 423]
1         [31, 837, 720, 327, 94, 23, 3, 3855, 9, 491, 7...
2         [31, 2140, 452, 104, 1658, 2, 362, 7, 2, 999, ...
3         [31, 662, 33, 5242, 83, 27, 2, 734, 3097, 6, 4...
4         [146, 56, 83, 104, 9925, 1713, 7, 8524, 3509, ...
                                ...                        
119995    [6065, 31, 1511, 75, 2114, 2002, 23, 19, 40, 3...
119996    [243, 357, 279, 445, 5172, 9876, 3569, 8489, 9...
119997    [2, 791, 1696, 30, 403, 35, 5, 4319, 218, 3427...
119998    [1381, 17, 2953, 703, 96, 1575, 568, 2658, 21,...
119999    [1676, 3398, 4492, 2874, 29, 2310, 21, 2, 611,...
Name: Sequences, Length: 120000, dtype: object

In [38]:
y_train

0         3
1         3
2         3
3         3
4         3
         ..
119995    1
119996    2
119997    2
119998    2
119999    2
Name: score, Length: 120000, dtype: int64

In [39]:
x_test_seq = test['Sequences']
y_test = test['score']

In [40]:
x_test_seq

0       [1896, 3234, 401, 17, 6401, 222, 61, 37, 264, ...
1       [2986, 611, 647, 3, 5, 3345, 9, 2, 10, 407, 23...
2       [41, 3, 55, 6069, 21, 3, 4726, 17, 2, 481, 5, ...
3       [41, 20, 22, 2894, 4434, 79, 931, 2516, 28, 24...
4       [41, 456, 504, 22, 360, 775, 32, 3774, 5, 2, 4...
                              ...                        
7595    [4391, 326, 1291, 4005, 5066, 29, 9332, 14, 2,...
7596    [14, 2, 896, 5, 7524, 3130, 2131, 1525, 61, 53...
7597    [271, 1382, 3167, 421, 566, 3110, 569, 91, 593...
7598    [973, 1661, 6, 2, 85, 132, 33, 546, 12, 7, 605...
7599    [2023, 164, 4, 363, 2, 5251, 7, 159, 3683, 151...
Name: Sequences, Length: 7600, dtype: object

In [41]:
y_test

0       3
1       4
2       4
3       4
4       4
       ..
7595    1
7596    2
7597    2
7598    3
7599    3
Name: score, Length: 7600, dtype: int64

In [42]:
def vectorize_sequences(sequences, dimension=10000):
    results = np.zeros((len(sequences), dimension))
    for i, sequence in enumerate(sequences):
        for index in sequence:
            results[i, index] += 1.
    return results

In [43]:
x_train = vectorize_sequences(x_train_seq, max_words)

In [44]:
x_test = vectorize_sequences(x_test_seq, max_words)

In [45]:
x_train[0][:100]

array([0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0.,
       0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

In [46]:
len(x_train[0])

10000

In [56]:
lr = LogisticRegression(random_state=random_state, max_iter=200)

In [57]:
lr.fit(x_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [58]:
lr.score(x_test, y_test)

0.895

In [59]:
testStr = 'The third try turned out to be closer to the charm for Elon Musk and SpaceX, as his company’s mammoth Starship rocket launched on Thursday and traveled about halfway around the Earth before it was lost as it re-entered the atmosphere. The test flight achieved several key milestones in the development of the vehicle, which could alter the future of space transportation and help NASA return astronauts to the moon.'

In [62]:
testStr

'The third try turned out to be closer to the charm for Elon Musk and SpaceX, as his company’s mammoth Starship rocket launched on Thursday and traveled about halfway around the Earth before it was lost as it re-entered the atmosphere. The test flight achieved several key milestones in the development of the vehicle, which could alter the future of space transportation and help NASA return astronauts to the moon.'

In [64]:
preprocessed_text = preprocess(testStr, stop_words, punctuation_marks, morph)

In [65]:
preprocessed_text

['the',
 'third',
 'try',
 'turned',
 'out',
 'to',
 'be',
 'closer',
 'to',
 'the',
 'charm',
 'for',
 'elon',
 'musk',
 'and',
 'spacex',
 'as',
 'his',
 'company',
 '’',
 's',
 'mammoth',
 'starship',
 'rocket',
 'launched',
 'on',
 'thursday',
 'and',
 'traveled',
 'about',
 'halfway',
 'around',
 'the',
 'earth',
 'before',
 'it',
 'was',
 'lost',
 'as',
 'it',
 're-entered',
 'the',
 'atmosphere',
 'the',
 'test',
 'flight',
 'achieved',
 'several',
 'key',
 'milestones',
 'in',
 'the',
 'development',
 'of',
 'the',
 'vehicle',
 'which',
 'could',
 'alter',
 'the',
 'future',
 'of',
 'space',
 'transportation',
 'and',
 'help',
 'nasa',
 'return',
 'astronauts',
 'to',
 'the',
 'moon']

In [68]:
seq = text_to_sequence(preprocessed_text, word_to_index)

In [69]:
seq

[2,
 177,
 915,
 935,
 76,
 4,
 34,
 1071,
 4,
 2,
 9,
 7,
 15,
 28,
 55,
 13,
 1232,
 489,
 8,
 57,
 7,
 7906,
 70,
 5574,
 368,
 2,
 1010,
 141,
 20,
 29,
 534,
 15,
 20,
 2,
 3633,
 2,
 309,
 957,
 5872,
 405,
 364,
 6,
 2,
 754,
 5,
 2,
 1910,
 86,
 92,
 7911,
 2,
 510,
 5,
 237,
 3530,
 7,
 216,
 519,
 555,
 2654,
 4,
 2,
 1203]

In [70]:
bow = vectorize_sequences([seq], max_words)

In [71]:
bow

array([[0., 0., 9., ..., 0., 0., 0.]])

In [73]:
result = lr.predict(bow)


In [74]:
result

array([4])

In [76]:
from tensorflow.keras.models import load_model

In [80]:
lr1 = lr

In [83]:
import pickle

In [85]:
pkl_model = 'pickle_model.h5'
with open(pkl_model, 'wb') as file:
  pickle.dump(lr, file)

AttributeError: 'LogisticRegression' object has no attribute 'save'