In [60]:
import time
import os
import re
from wordcloud import WordCloud,STOPWORDS
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
import seaborn as sns
os.chdir(r'c:/users/vasil/desktop/parsing_data')
from nltk.tokenize import word_tokenize
import string
from nltk.corpus import stopwords
import pymorphy2
morph = pymorphy2.MorphAnalyzer()


In [61]:

def clear(txt):
    tokens=word_tokenize(txt)
    tokens=[morph.parse(w)[0].normal_form for w in tokens]
    tbl=str.maketrans('', '',string.punctuation)
    stripped=[w.translate(tbl) for w in tokens]
    stop_words = stopwords.words('russian')                                                               
    words=[word for word in stripped if word.isalpha() and word not in stop_words] 
    return str(words)

#vectorizer = TfidfVectorizer(strip_accents='unicode', analyzer='word', ngram_range=(1,3), norm='l2')

data = pd.read_csv('clean_data_1.csv', sep=';', encoding='windows-1251')
data = data.drop('Unnamed: 0', axis=1)
labels = data.loc[:,'salary':'court_threat'].astype('category')
data['text'] = data['text'].map(lambda x: clear(str(x)))
num_labels = 7




In [62]:
#train, test = train_test_split(data['text'], labels, random_state=42, test_size=0.20, shuffle=True)
train, test = train_test_split(data,random_state=42, test_size=0.2, shuffle=True)

In [63]:

X_train = train.text
X_test = test.text

In [64]:
# using binary relevance
from skmultilearn.problem_transform import BinaryRelevance
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import GridSearchCV

#https://towardsdatascience.com/multi-class-text-classification-model-comparison-and-selection-5eb066197568

SVC_pipeline = Pipeline([
                ('tfidf', TfidfVectorizer(stop_words=stopwords.words('russian'))),
                ('clf', BinaryRelevance(LinearSVC())),
            ])

NB_pipeline = Pipeline([
                ('tfidf', TfidfVectorizer(stop_words=stopwords.words('russian'))),
                ('clf', BinaryRelevance(GaussianNB(
                    ))),
            ])
LogReg_pipeline = Pipeline([
               ('tfidf', TfidfVectorizer(stop_words=stopwords.words('russian'))),
               ('clf', BinaryRelevance(LogisticRegression(C=1500))),
           ])

sgd_pipeline = Pipeline([
                ('tfidf', TfidfVectorizer(stop_words=stopwords.words('russian'))),
                ('clf', BinaryRelevance(SGDClassifier(loss='hinge', penalty='l2',
                    alpha=1e-3, random_state=42, max_iter=5, tol=None))),
               ])

NB_pipeline.fit(X_train, train.loc[:,'salary':'court_threat'])
prediction = NB_pipeline.predict(X_test)
print(classification_report(test.loc[:,'salary':'court_threat'], prediction,target_names=labels))
print('accuracy_score: ', round(accuracy_score(test.loc[:,'salary':'court_threat'], prediction),4))
print('macro_avg f1-score: ', round(f1_score(test.loc[:,'salary':'court_threat'], prediction, average='macro'),4))
print('micro_avg f1-score: ', round(f1_score(test.loc[:,'salary':'court_threat'], prediction, average='micro'),4))


              precision    recall  f1-score   support

      salary       0.10      0.08      0.09        24
        boss       0.26      0.31      0.28        32
  atmosphere       0.41      0.49      0.45        35
 fire_threat       0.21      0.30      0.25        10
   work_cond       0.51      0.56      0.54        34
     neutral       0.88      0.62      0.73        24
court_threat       0.08      0.12      0.10         8

 avg / total       0.40      0.40      0.39       167

accuracy_score:  0.2358
macro_avg f1-score:  0.3481
micro_avg f1-score:  0.3862


  .format(len(labels), len(target_names))


In [69]:
from skmultilearn.problem_transform import LabelPowerset

LogReg_pipeline = Pipeline([
               ('tfidf', TfidfVectorizer(stop_words=stopwords.words('russian'))),
               ('clf', LabelPowerset(LogisticRegression(penalty = 'l2', C = 1500,random_state = 0))),
           ])
NB_pipeline = Pipeline([
                ('tfidf', TfidfVectorizer(stop_words=stopwords.words('russian'))),
                ('clf', LabelPowerset(GaussianNB(
                    ))),
            ])
sgd_pipeline = Pipeline([
                ('tfidf', TfidfVectorizer(stop_words=stopwords.words('russian'))),
                ('clf', BinaryRelevance(SGDClassifier(loss='hinge', penalty='l2',
                    alpha=1e-3, random_state=42, max_iter=5, tol=None))),
               ])


LogReg_pipeline.fit(X_train, train.loc[:,'salary':'court_threat'])
prediction = LogReg_pipeline .predict(X_test)
print(classification_report(test.loc[:,'salary':'court_threat'], prediction,target_names=labels))
print('accuracy_score: ', round(accuracy_score(test.loc[:,'salary':'court_threat'], prediction),4))
print('macro_avg f1-score: ', round(f1_score(test.loc[:,'salary':'court_threat'], prediction, average='macro'),4))
print('micro_avg f1-score: ', round(f1_score(test.loc[:,'salary':'court_threat'], prediction, average='micro'),4))


              precision    recall  f1-score   support

      salary       0.92      0.46      0.61        24
        boss       0.76      0.50      0.60        32
  atmosphere       0.48      0.40      0.44        35
 fire_threat       0.44      0.40      0.42        10
   work_cond       0.67      0.41      0.51        34
     neutral       0.50      0.79      0.61        24
court_threat       0.33      0.12      0.18         8

 avg / total       0.63      0.47      0.52       167

accuracy_score:  0.3821
macro_avg f1-score:  0.4825
micro_avg f1-score:  0.5267


  .format(len(labels), len(target_names))


In [54]:
from skmultilearn.adapt import MLkNN
from scipy.sparse import csr_matrix, lil_matrix

vectorizer = TfidfVectorizer()
vectorizer.fit(train.text)
vectorizer.fit(test.text)
X_train = vectorizer.transform(train.text)
X_test = vectorizer.transform(test.text)


x_train = lil_matrix(X_train).toarray()
y_train = lil_matrix(train.loc[:,'salary':'court_threat']).toarray()
x_test = lil_matrix(X_test).toarray()

"""
from sklearn.model_selection import GridSearchCV
parameters = {'k': [1,2,3,4,5]}
mlknn = MLkNN()
gs_clf_svm = GridSearchCV(mlknn, parameters, scoring='f1_micro')
gs_clf_svm = gs_clf_svm.fit(X_train,y_train)

print(gs_clf_svm.best_score_)
print(gs_clf_svm.best_params_)
"""
mlknn = MLkNN(2)
mlknn.fit(X_train,y_train)
prediction = mlknn.predict(x_test)
print(accuracy_score(test.loc[:,'salary':'court_threat'], prediction))
print(classification_report(test.loc[:,'salary':'court_threat'], prediction,target_names=labels))
print('macro_avg f1-score: ', round(f1_score(test.loc[:,'salary':'court_threat'], prediction, average='macro'),4))
print('micro_avg f1-score: ', round(f1_score(test.loc[:,'salary':'court_threat'], prediction, average='micro'),4))


0.2032520325203252
              precision    recall  f1-score   support

      salary       0.50      0.04      0.08        24
        boss       0.50      0.12      0.20        32
  atmosphere       0.67      0.06      0.11        35
 fire_threat       0.67      0.20      0.31        10
   work_cond       0.33      0.03      0.05        34
     neutral       0.20      0.92      0.33        24
court_threat       0.00      0.00      0.00         8

 avg / total       0.44      0.19      0.15       167

macro_avg f1-score:  0.1535
micro_avg f1-score:  0.2162


  .format(len(labels), len(target_names))


In [49]:
SVC_pipeline = Pipeline([
                ('tfidf', TfidfVectorizer(stop_words=stopwords.words('russian'))),
                ('clf', OneVsRestClassifier(LinearSVC(), n_jobs=1)),
            ])

NB_pipeline = Pipeline([
                ('tfidf', TfidfVectorizer(stop_words=stopwords.words('russian'))),
                ('clf', OneVsRestClassifier(MultinomialNB(
                    fit_prior=True, class_prior=None))),
            ])
SVC_pipeline.fit(X_train,train.loc[:,'salary':'court_threat'])



prediction = SVC_pipeline.predict(X_test)
print(accuracy_score(test.loc[:,'salary':'court_threat'], prediction))

0.2764227642276423


In [33]:
model = Pipeline([('tfidf', TfidfVectorizer(stop_words=stopwords.words('russian'))),
 ('clf', OneVsRestClassifier(LinearSVC(class_weight='balanced')))])
#the class_weight="balanced" option tries to remove the biasedness of model towards majority sample

#paramater selection
    
from sklearn.model_selection import GridSearchCV
parameters = {'vectorizer__ngram_range': [(1, 1), (1, 2),(2,2)],
               'tfidf__use_idf': (True, False)}
gs_clf_svm = GridSearchCV(model, parameters, n_jobs=-1)
gs_clf_svm = gs_clf_svm.fit(data['text'], data.loc[:,'salary':'court_threat'])

print(gs_clf_svm.best_score_)
print(gs_clf_svm.best_params_)

JoblibValueError: JoblibValueError
___________________________________________________________________________
Multiprocessing exception:
...........................................................................
C:\Users\vasil\Anaconda3\envs\dp\lib\runpy.py in _run_module_as_main(mod_name='ipykernel_launcher', alter_argv=1)
    188         sys.exit(msg)
    189     main_globals = sys.modules["__main__"].__dict__
    190     if alter_argv:
    191         sys.argv[0] = mod_spec.origin
    192     return _run_code(code, main_globals, None,
--> 193                      "__main__", mod_spec)
        mod_spec = ModuleSpec(name='ipykernel_launcher', loader=<_f...\\dp\\lib\\site-packages\\ipykernel_launcher.py')
    194 
    195 def run_module(mod_name, init_globals=None,
    196                run_name=None, alter_sys=False):
    197     """Execute a module's code without importing it

...........................................................................
C:\Users\vasil\Anaconda3\envs\dp\lib\runpy.py in _run_code(code=<code object <module> at 0x000001D3881134B0, fil...lib\site-packages\ipykernel_launcher.py", line 5>, run_globals={'__annotations__': {}, '__builtins__': <module 'builtins' (built-in)>, '__cached__': r'C:\Users\vasil\Anaconda3\envs\dp\lib\site-packages\__pycache__\ipykernel_launcher.cpython-36.pyc', '__doc__': 'Entry point for launching an IPython kernel.\n\nTh...orts until\nafter removing the cwd from sys.path.\n', '__file__': r'C:\Users\vasil\Anaconda3\envs\dp\lib\site-packages\ipykernel_launcher.py', '__loader__': <_frozen_importlib_external.SourceFileLoader object>, '__name__': '__main__', '__package__': '', '__spec__': ModuleSpec(name='ipykernel_launcher', loader=<_f...\\dp\\lib\\site-packages\\ipykernel_launcher.py'), 'app': <module 'ipykernel.kernelapp' from 'C:\\Users\\v...dp\\lib\\site-packages\\ipykernel\\kernelapp.py'>, ...}, init_globals=None, mod_name='__main__', mod_spec=ModuleSpec(name='ipykernel_launcher', loader=<_f...\\dp\\lib\\site-packages\\ipykernel_launcher.py'), pkg_name='', script_name=None)
     80                        __cached__ = cached,
     81                        __doc__ = None,
     82                        __loader__ = loader,
     83                        __package__ = pkg_name,
     84                        __spec__ = mod_spec)
---> 85     exec(code, run_globals)
        code = <code object <module> at 0x000001D3881134B0, fil...lib\site-packages\ipykernel_launcher.py", line 5>
        run_globals = {'__annotations__': {}, '__builtins__': <module 'builtins' (built-in)>, '__cached__': r'C:\Users\vasil\Anaconda3\envs\dp\lib\site-packages\__pycache__\ipykernel_launcher.cpython-36.pyc', '__doc__': 'Entry point for launching an IPython kernel.\n\nTh...orts until\nafter removing the cwd from sys.path.\n', '__file__': r'C:\Users\vasil\Anaconda3\envs\dp\lib\site-packages\ipykernel_launcher.py', '__loader__': <_frozen_importlib_external.SourceFileLoader object>, '__name__': '__main__', '__package__': '', '__spec__': ModuleSpec(name='ipykernel_launcher', loader=<_f...\\dp\\lib\\site-packages\\ipykernel_launcher.py'), 'app': <module 'ipykernel.kernelapp' from 'C:\\Users\\v...dp\\lib\\site-packages\\ipykernel\\kernelapp.py'>, ...}
     86     return run_globals
     87 
     88 def _run_module_code(code, init_globals=None,
     89                     mod_name=None, mod_spec=None,

...........................................................................
C:\Users\vasil\Anaconda3\envs\dp\lib\site-packages\ipykernel_launcher.py in <module>()
     11     # This is added back by InteractiveShellApp.init_path()
     12     if sys.path[0] == '':
     13         del sys.path[0]
     14 
     15     from ipykernel import kernelapp as app
---> 16     app.launch_new_instance()

...........................................................................
C:\Users\vasil\Anaconda3\envs\dp\lib\site-packages\traitlets\config\application.py in launch_instance(cls=<class 'ipykernel.kernelapp.IPKernelApp'>, argv=None, **kwargs={})
    653 
    654         If a global instance already exists, this reinitializes and starts it
    655         """
    656         app = cls.instance(**kwargs)
    657         app.initialize(argv)
--> 658         app.start()
        app.start = <bound method IPKernelApp.start of <ipykernel.kernelapp.IPKernelApp object>>
    659 
    660 #-----------------------------------------------------------------------------
    661 # utility functions, for convenience
    662 #-----------------------------------------------------------------------------

...........................................................................
C:\Users\vasil\Anaconda3\envs\dp\lib\site-packages\ipykernel\kernelapp.py in start(self=<ipykernel.kernelapp.IPKernelApp object>)
    500         if self.poller is not None:
    501             self.poller.start()
    502         self.kernel.start()
    503         self.io_loop = ioloop.IOLoop.current()
    504         try:
--> 505             self.io_loop.start()
        self.io_loop.start = <bound method BaseAsyncIOLoop.start of <tornado.platform.asyncio.AsyncIOMainLoop object>>
    506         except KeyboardInterrupt:
    507             pass
    508 
    509 launch_new_instance = IPKernelApp.launch_instance

...........................................................................
C:\Users\vasil\Anaconda3\envs\dp\lib\site-packages\tornado\platform\asyncio.py in start(self=<tornado.platform.asyncio.AsyncIOMainLoop object>)
    127         except (RuntimeError, AssertionError):
    128             old_loop = None
    129         try:
    130             self._setup_logging()
    131             asyncio.set_event_loop(self.asyncio_loop)
--> 132             self.asyncio_loop.run_forever()
        self.asyncio_loop.run_forever = <bound method BaseEventLoop.run_forever of <_Win...EventLoop running=True closed=False debug=False>>
    133         finally:
    134             asyncio.set_event_loop(old_loop)
    135 
    136     def stop(self):

...........................................................................
C:\Users\vasil\Anaconda3\envs\dp\lib\asyncio\base_events.py in run_forever(self=<_WindowsSelectorEventLoop running=True closed=False debug=False>)
    422             sys.set_asyncgen_hooks(firstiter=self._asyncgen_firstiter_hook,
    423                                    finalizer=self._asyncgen_finalizer_hook)
    424         try:
    425             events._set_running_loop(self)
    426             while True:
--> 427                 self._run_once()
        self._run_once = <bound method BaseEventLoop._run_once of <_Windo...EventLoop running=True closed=False debug=False>>
    428                 if self._stopping:
    429                     break
    430         finally:
    431             self._stopping = False

...........................................................................
C:\Users\vasil\Anaconda3\envs\dp\lib\asyncio\base_events.py in _run_once(self=<_WindowsSelectorEventLoop running=True closed=False debug=False>)
   1435                         logger.warning('Executing %s took %.3f seconds',
   1436                                        _format_handle(handle), dt)
   1437                 finally:
   1438                     self._current_handle = None
   1439             else:
-> 1440                 handle._run()
        handle._run = <bound method Handle._run of <Handle IOLoop._run_callback(functools.par...328>, ...]))>))>>
   1441         handle = None  # Needed to break cycles when an exception occurs.
   1442 
   1443     def _set_coroutine_wrapper(self, enabled):
   1444         try:

...........................................................................
C:\Users\vasil\Anaconda3\envs\dp\lib\asyncio\events.py in _run(self=<Handle IOLoop._run_callback(functools.par...328>, ...]))>))>)
    140             self._callback = None
    141             self._args = None
    142 
    143     def _run(self):
    144         try:
--> 145             self._callback(*self._args)
        self._callback = <bound method IOLoop._run_callback of <tornado.platform.asyncio.AsyncIOMainLoop object>>
        self._args = (functools.partial(<function wrap.<locals>.null_w...DFE3E0>, <zmq.sugar.fr...001D397DFE328>, ...]))>),)
    146         except Exception as exc:
    147             cb = _format_callback_source(self._callback, self._args)
    148             msg = 'Exception in callback {}'.format(cb)
    149             context = {

...........................................................................
C:\Users\vasil\Anaconda3\envs\dp\lib\site-packages\tornado\ioloop.py in _run_callback(self=<tornado.platform.asyncio.AsyncIOMainLoop object>, callback=functools.partial(<function wrap.<locals>.null_w...DFE3E0>, <zmq.sugar.fr...001D397DFE328>, ...]))>))
    753         """Runs a callback with error handling.
    754 
    755         For use in subclasses.
    756         """
    757         try:
--> 758             ret = callback()
        ret = undefined
        callback = functools.partial(<function wrap.<locals>.null_w...DFE3E0>, <zmq.sugar.fr...001D397DFE328>, ...]))>)
    759             if ret is not None:
    760                 from tornado import gen
    761                 # Functions that return Futures typically swallow all
    762                 # exceptions and store them in the Future.  If a Future

...........................................................................
C:\Users\vasil\Anaconda3\envs\dp\lib\site-packages\tornado\stack_context.py in null_wrapper(*args=(<Future finished result=(10, 48, <bound method.....7DFE3E0>, <zmq.sugar.fr...001D397DFE328>, ...]))>,), **kwargs={})
    295         # Fast path when there are no active contexts.
    296         def null_wrapper(*args, **kwargs):
    297             try:
    298                 current_state = _state.contexts
    299                 _state.contexts = cap_contexts[0]
--> 300                 return fn(*args, **kwargs)
        args = (<Future finished result=(10, 48, <bound method.....7DFE3E0>, <zmq.sugar.fr...001D397DFE328>, ...]))>,)
        kwargs = {}
    301             finally:
    302                 _state.contexts = current_state
    303         null_wrapper._wrapped = True
    304         return null_wrapper

...........................................................................
C:\Users\vasil\Anaconda3\envs\dp\lib\site-packages\tornado\gen.py in inner(f=None)
   1228             return False
   1229         elif not self.future.done():
   1230             def inner(f):
   1231                 # Break a reference cycle to speed GC.
   1232                 f = None  # noqa
-> 1233                 self.run()
   1234             self.io_loop.add_future(
   1235                 self.future, inner)
   1236             return False
   1237         return True

...........................................................................
C:\Users\vasil\Anaconda3\envs\dp\lib\site-packages\tornado\gen.py in run(self=<tornado.gen.Runner object>)
   1142                         finally:
   1143                             # Break up a reference to itself
   1144                             # for faster GC on CPython.
   1145                             exc_info = None
   1146                     else:
-> 1147                         yielded = self.gen.send(value)
        yielded = undefined
        self.gen.send = <built-in method send of generator object>
        value = (10, 48, <bound method Kernel.dispatch_shell of <ipykernel.ipkernel.IPythonKernel object>>, (<zmq.eventloop.zmqstream.ZMQStream object>, [<zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>]))
   1148 
   1149                     if stack_context._state.contexts is not orig_stack_contexts:
   1150                         self.gen.throw(
   1151                             stack_context.StackContextInconsistentError(

...........................................................................
C:\Users\vasil\Anaconda3\envs\dp\lib\site-packages\ipykernel\kernelbase.py in process_one(self=<ipykernel.ipkernel.IPythonKernel object>, wait=True)
    352         else:
    353             try:
    354                 priority, t, dispatch, args = self.msg_queue.get_nowait()
    355             except QueueEmpty:
    356                 return None
--> 357         yield gen.maybe_future(dispatch(*args))
        dispatch = <bound method Kernel.dispatch_shell of <ipykernel.ipkernel.IPythonKernel object>>
        args = (<zmq.eventloop.zmqstream.ZMQStream object>, [<zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>])
    358 
    359     @gen.coroutine
    360     def dispatch_queue(self):
    361         """Coroutine to preserve order of message handling

...........................................................................
C:\Users\vasil\Anaconda3\envs\dp\lib\site-packages\tornado\gen.py in wrapper(*args=(<ipykernel.ipkernel.IPythonKernel object>, <zmq.eventloop.zmqstream.ZMQStream object>, [<zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>]), **kwargs={})
    321                 # never actually yields, which in turn allows us to
    322                 # use "optional" coroutines in critical path code without
    323                 # performance penalty for the synchronous case.
    324                 try:
    325                     orig_stack_contexts = stack_context._state.contexts
--> 326                     yielded = next(result)
        yielded = undefined
        result = <generator object dispatch_shell>
    327                     if stack_context._state.contexts is not orig_stack_contexts:
    328                         yielded = _create_future()
    329                         yielded.set_exception(
    330                             stack_context.StackContextInconsistentError(

...........................................................................
C:\Users\vasil\Anaconda3\envs\dp\lib\site-packages\ipykernel\kernelbase.py in dispatch_shell(self=<ipykernel.ipkernel.IPythonKernel object>, stream=<zmq.eventloop.zmqstream.ZMQStream object>, msg={'buffers': [], 'content': {'allow_stdin': True, 'code': "model = Pipeline([('tfidf', TfidfVectorizer(stop...f_svm.best_score_)\nprint(gs_clf_svm.best_params_)", 'silent': False, 'stop_on_error': True, 'store_history': True, 'user_expressions': {}}, 'header': {'date': datetime.datetime(2019, 4, 26, 11, 12, 55, 367867, tzinfo=tzutc()), 'msg_id': '5768fbd3da094f2285d1c0938caf2e98', 'msg_type': 'execute_request', 'session': '19361b4e3f56426cb50186b49a66e139', 'username': 'username', 'version': '5.2'}, 'metadata': {}, 'msg_id': '5768fbd3da094f2285d1c0938caf2e98', 'msg_type': 'execute_request', 'parent_header': {}})
    262             try:
    263                 self.pre_handler_hook()
    264             except Exception:
    265                 self.log.debug("Unable to signal in pre_handler_hook:", exc_info=True)
    266             try:
--> 267                 yield gen.maybe_future(handler(stream, idents, msg))
        handler = <bound method Kernel.execute_request of <ipykernel.ipkernel.IPythonKernel object>>
        stream = <zmq.eventloop.zmqstream.ZMQStream object>
        idents = [b'19361b4e3f56426cb50186b49a66e139']
        msg = {'buffers': [], 'content': {'allow_stdin': True, 'code': "model = Pipeline([('tfidf', TfidfVectorizer(stop...f_svm.best_score_)\nprint(gs_clf_svm.best_params_)", 'silent': False, 'stop_on_error': True, 'store_history': True, 'user_expressions': {}}, 'header': {'date': datetime.datetime(2019, 4, 26, 11, 12, 55, 367867, tzinfo=tzutc()), 'msg_id': '5768fbd3da094f2285d1c0938caf2e98', 'msg_type': 'execute_request', 'session': '19361b4e3f56426cb50186b49a66e139', 'username': 'username', 'version': '5.2'}, 'metadata': {}, 'msg_id': '5768fbd3da094f2285d1c0938caf2e98', 'msg_type': 'execute_request', 'parent_header': {}}
    268             except Exception:
    269                 self.log.error("Exception in message handler:", exc_info=True)
    270             finally:
    271                 try:

...........................................................................
C:\Users\vasil\Anaconda3\envs\dp\lib\site-packages\tornado\gen.py in wrapper(*args=(<ipykernel.ipkernel.IPythonKernel object>, <zmq.eventloop.zmqstream.ZMQStream object>, [b'19361b4e3f56426cb50186b49a66e139'], {'buffers': [], 'content': {'allow_stdin': True, 'code': "model = Pipeline([('tfidf', TfidfVectorizer(stop...f_svm.best_score_)\nprint(gs_clf_svm.best_params_)", 'silent': False, 'stop_on_error': True, 'store_history': True, 'user_expressions': {}}, 'header': {'date': datetime.datetime(2019, 4, 26, 11, 12, 55, 367867, tzinfo=tzutc()), 'msg_id': '5768fbd3da094f2285d1c0938caf2e98', 'msg_type': 'execute_request', 'session': '19361b4e3f56426cb50186b49a66e139', 'username': 'username', 'version': '5.2'}, 'metadata': {}, 'msg_id': '5768fbd3da094f2285d1c0938caf2e98', 'msg_type': 'execute_request', 'parent_header': {}}), **kwargs={})
    321                 # never actually yields, which in turn allows us to
    322                 # use "optional" coroutines in critical path code without
    323                 # performance penalty for the synchronous case.
    324                 try:
    325                     orig_stack_contexts = stack_context._state.contexts
--> 326                     yielded = next(result)
        yielded = undefined
        result = <generator object execute_request>
    327                     if stack_context._state.contexts is not orig_stack_contexts:
    328                         yielded = _create_future()
    329                         yielded.set_exception(
    330                             stack_context.StackContextInconsistentError(

...........................................................................
C:\Users\vasil\Anaconda3\envs\dp\lib\site-packages\ipykernel\kernelbase.py in execute_request(self=<ipykernel.ipkernel.IPythonKernel object>, stream=<zmq.eventloop.zmqstream.ZMQStream object>, ident=[b'19361b4e3f56426cb50186b49a66e139'], parent={'buffers': [], 'content': {'allow_stdin': True, 'code': "model = Pipeline([('tfidf', TfidfVectorizer(stop...f_svm.best_score_)\nprint(gs_clf_svm.best_params_)", 'silent': False, 'stop_on_error': True, 'store_history': True, 'user_expressions': {}}, 'header': {'date': datetime.datetime(2019, 4, 26, 11, 12, 55, 367867, tzinfo=tzutc()), 'msg_id': '5768fbd3da094f2285d1c0938caf2e98', 'msg_type': 'execute_request', 'session': '19361b4e3f56426cb50186b49a66e139', 'username': 'username', 'version': '5.2'}, 'metadata': {}, 'msg_id': '5768fbd3da094f2285d1c0938caf2e98', 'msg_type': 'execute_request', 'parent_header': {}})
    529             self._publish_execute_input(code, parent, self.execution_count)
    530 
    531         reply_content = yield gen.maybe_future(
    532             self.do_execute(
    533                 code, silent, store_history,
--> 534                 user_expressions, allow_stdin,
        user_expressions = {}
        allow_stdin = True
    535             )
    536         )
    537 
    538         # Flush output before sending the reply.

...........................................................................
C:\Users\vasil\Anaconda3\envs\dp\lib\site-packages\tornado\gen.py in wrapper(*args=(<ipykernel.ipkernel.IPythonKernel object>, "model = Pipeline([('tfidf', TfidfVectorizer(stop...f_svm.best_score_)\nprint(gs_clf_svm.best_params_)", False, True, {}, True), **kwargs={})
    321                 # never actually yields, which in turn allows us to
    322                 # use "optional" coroutines in critical path code without
    323                 # performance penalty for the synchronous case.
    324                 try:
    325                     orig_stack_contexts = stack_context._state.contexts
--> 326                     yielded = next(result)
        yielded = undefined
        result = <generator object do_execute>
    327                     if stack_context._state.contexts is not orig_stack_contexts:
    328                         yielded = _create_future()
    329                         yielded.set_exception(
    330                             stack_context.StackContextInconsistentError(

...........................................................................
C:\Users\vasil\Anaconda3\envs\dp\lib\site-packages\ipykernel\ipkernel.py in do_execute(self=<ipykernel.ipkernel.IPythonKernel object>, code="model = Pipeline([('tfidf', TfidfVectorizer(stop...f_svm.best_score_)\nprint(gs_clf_svm.best_params_)", silent=False, store_history=True, user_expressions={}, allow_stdin=True)
    289                     res = yield coro_future
    290             else:
    291                 # runner isn't already running,
    292                 # make synchronous call,
    293                 # letting shell dispatch to loop runners
--> 294                 res = shell.run_cell(code, store_history=store_history, silent=silent)
        res = undefined
        code = "model = Pipeline([('tfidf', TfidfVectorizer(stop...f_svm.best_score_)\nprint(gs_clf_svm.best_params_)"
        store_history = True
        silent = False
    295         finally:
    296             self._restore_input()
    297 
    298         if res.error_before_exec is not None:

...........................................................................
C:\Users\vasil\Anaconda3\envs\dp\lib\site-packages\ipykernel\zmqshell.py in run_cell(self=<ipykernel.zmqshell.ZMQInteractiveShell object>, *args=("model = Pipeline([('tfidf', TfidfVectorizer(stop...f_svm.best_score_)\nprint(gs_clf_svm.best_params_)",), **kwargs={'silent': False, 'store_history': True})
    531             )
    532         self.payload_manager.write_payload(payload)
    533 
    534     def run_cell(self, *args, **kwargs):
    535         self._last_traceback = None
--> 536         return super(ZMQInteractiveShell, self).run_cell(*args, **kwargs)
        self.run_cell = <bound method ZMQInteractiveShell.run_cell of <ipykernel.zmqshell.ZMQInteractiveShell object>>
        args = ("model = Pipeline([('tfidf', TfidfVectorizer(stop...f_svm.best_score_)\nprint(gs_clf_svm.best_params_)",)
        kwargs = {'silent': False, 'store_history': True}
    537 
    538     def _showtraceback(self, etype, evalue, stb):
    539         # try to preserve ordering of tracebacks and print statements
    540         sys.stdout.flush()

...........................................................................
C:\Users\vasil\Anaconda3\envs\dp\lib\site-packages\IPython\core\interactiveshell.py in run_cell(self=<ipykernel.zmqshell.ZMQInteractiveShell object>, raw_cell="model = Pipeline([('tfidf', TfidfVectorizer(stop...f_svm.best_score_)\nprint(gs_clf_svm.best_params_)", store_history=True, silent=False, shell_futures=True)
   2814         result : :class:`ExecutionResult`
   2815         """
   2816         result = None
   2817         try:
   2818             result = self._run_cell(
-> 2819                 raw_cell, store_history, silent, shell_futures)
        raw_cell = "model = Pipeline([('tfidf', TfidfVectorizer(stop...f_svm.best_score_)\nprint(gs_clf_svm.best_params_)"
        store_history = True
        silent = False
        shell_futures = True
   2820         finally:
   2821             self.events.trigger('post_execute')
   2822             if not silent:
   2823                 self.events.trigger('post_run_cell', result)

...........................................................................
C:\Users\vasil\Anaconda3\envs\dp\lib\site-packages\IPython\core\interactiveshell.py in _run_cell(self=<ipykernel.zmqshell.ZMQInteractiveShell object>, raw_cell="model = Pipeline([('tfidf', TfidfVectorizer(stop...f_svm.best_score_)\nprint(gs_clf_svm.best_params_)", store_history=True, silent=False, shell_futures=True)
   2840             runner = self.loop_runner
   2841         else:
   2842             runner = _pseudo_sync_runner
   2843 
   2844         try:
-> 2845             return runner(coro)
        runner = <function _pseudo_sync_runner>
        coro = <generator object InteractiveShell.run_cell_async>
   2846         except BaseException as e:
   2847             info = ExecutionInfo(raw_cell, store_history, silent, shell_futures)
   2848             result = ExecutionResult(info)
   2849             result.error_in_exec = e

...........................................................................
C:\Users\vasil\Anaconda3\envs\dp\lib\site-packages\IPython\core\async_helpers.py in _pseudo_sync_runner(coro=<generator object InteractiveShell.run_cell_async>)
     62 
     63     Credit to Nathaniel Smith
     64 
     65     """
     66     try:
---> 67         coro.send(None)
        coro.send = <built-in method send of generator object>
     68     except StopIteration as exc:
     69         return exc.value
     70     else:
     71         # TODO: do not raise but return an execution result with the right info.

...........................................................................
C:\Users\vasil\Anaconda3\envs\dp\lib\site-packages\IPython\core\interactiveshell.py in run_cell_async(self=<ipykernel.zmqshell.ZMQInteractiveShell object>, raw_cell="model = Pipeline([('tfidf', TfidfVectorizer(stop...f_svm.best_score_)\nprint(gs_clf_svm.best_params_)", store_history=True, silent=False, shell_futures=True)
   3015                 interactivity = "none" if silent else self.ast_node_interactivity
   3016                 if _run_async:
   3017                     interactivity = 'async'
   3018 
   3019                 has_raised = yield from self.run_ast_nodes(code_ast.body, cell_name,
-> 3020                        interactivity=interactivity, compiler=compiler, result=result)
        interactivity = 'last_expr'
        compiler = <IPython.core.compilerop.CachingCompiler object>
   3021 
   3022                 self.last_execution_succeeded = not has_raised
   3023                 self.last_execution_result = result
   3024 

...........................................................................
C:\Users\vasil\Anaconda3\envs\dp\lib\site-packages\IPython\core\interactiveshell.py in run_ast_nodes(self=<ipykernel.zmqshell.ZMQInteractiveShell object>, nodelist=[<_ast.Assign object>, <_ast.ImportFrom object>, <_ast.Assign object>, <_ast.Assign object>, <_ast.Assign object>, <_ast.Expr object>, <_ast.Expr object>], cell_name='<ipython-input-33-185ea2a96147>', interactivity='last', compiler=<IPython.core.compilerop.CachingCompiler object>, result=<ExecutionResult object at 1d396c671d0, executio...rue silent=False shell_futures=True> result=None>)
   3180                     return True
   3181             else:
   3182                 for i, node in enumerate(to_run_exec):
   3183                     mod = ast.Module([node])
   3184                     code = compiler(mod, cell_name, "exec")
-> 3185                     if (yield from self.run_code(code, result)):
        self.run_code = <bound method InteractiveShell.run_code of <ipykernel.zmqshell.ZMQInteractiveShell object>>
        code = <code object <module> at 0x000001D399CCB9C0, file "<ipython-input-33-185ea2a96147>", line 11>
        result = <ExecutionResult object at 1d396c671d0, executio...rue silent=False shell_futures=True> result=None>
   3186                         return True
   3187 
   3188                 for i, node in enumerate(to_run_interactive):
   3189                     mod = ast.Interactive([node])

...........................................................................
C:\Users\vasil\Anaconda3\envs\dp\lib\site-packages\IPython\core\interactiveshell.py in run_code(self=<ipykernel.zmqshell.ZMQInteractiveShell object>, code_obj=<code object <module> at 0x000001D399CCB9C0, file "<ipython-input-33-185ea2a96147>", line 11>, result=<ExecutionResult object at 1d396c671d0, executio...rue silent=False shell_futures=True> result=None>, async_=False)
   3262                 if async_:
   3263                     last_expr = (yield from self._async_exec(code_obj, self.user_ns))
   3264                     code = compile('last_expr', 'fake', "single")
   3265                     exec(code, {'last_expr': last_expr})
   3266                 else:
-> 3267                     exec(code_obj, self.user_global_ns, self.user_ns)
        code_obj = <code object <module> at 0x000001D399CCB9C0, file "<ipython-input-33-185ea2a96147>", line 11>
        self.user_global_ns = {'GridSearchCV': <class 'sklearn.model_selection._search.GridSearchCV'>, 'In': ['', 'import time\nimport os\nimport re\nfrom wordcloud i...mport pymorphy2\nmorph = pymorphy2.MorphAnalyzer()', "\ndef clear(txt):\n    tokens=word_tokenize(txt)\n ...ext'].map(lambda x: clear(str(x)))\nnum_labels = 7", 'x_train, x_test,y_train, y_test = train_test_spl...ls, random_state=42, test_size=0.2, shuffle=True)', "LogReg_pipeline = Pipeline([\n                ('t...ccuracy_score(y_test[category], prediction), 4)))", 'import time\nimport os\nimport re\nfrom wordcloud i...mport pymorphy2\nmorph = pymorphy2.MorphAnalyzer()', 'import time\nimport os\nimport re\nfrom wordcloud i...mport pymorphy2\nmorph = pymorphy2.MorphAnalyzer()', "\ndef clear(txt):\n    tokens=word_tokenize(txt)\n ...ext'].map(lambda x: clear(str(x)))\nnum_labels = 7", 'import time\nimport os\nimport re\nfrom wordcloud i...mport pymorphy2\nmorph = pymorphy2.MorphAnalyzer()', 'x_train, x_test,y_train, y_test = train_test_spl...ls, random_state=42, test_size=0.2, shuffle=True)', 'x_train, x_test,y_train, y_test = train_test_spl...ls, random_state=42, test_size=0.2, shuffle=True)', 'LogReg_pipeline = Pipeline([\n                (\'t...acy_score(y_test[category], prediction), 4)))\n"""', 'LogReg_pipeline = Pipeline([\n                (\'t...acy_score(y_test[category], prediction), 4)))\n"""', 'LogReg_pipeline = Pipeline([\n                (\'t...acy_score(y_test[category], prediction), 4)))\n"""', "\ndef clear(txt):\n    tokens=word_tokenize(txt)\n ...ext'].map(lambda x: clear(str(x)))\nnum_labels = 7", 'x_train, x_test,y_train, y_test = train_test_spl...ls, random_state=42, test_size=0.2, shuffle=True)', "model = Pipeline([('vectorizer', CountVectorizer...f_svm.best_score_)\nprint(gs_clf_svm.best_params_)", "train, test = train_test_split(data, data.oc[:,'...ls, random_state=42, test_size=0.2, shuffle=True)", 'train, test = train_test_split(data, data.loc[:,...ls, random_state=42, test_size=0.2, shuffle=True)', 'train, test = train_test_split(data, data.loc[:,...ls, random_state=42, test_size=0.2, shuffle=True)', ...], 'LinearSVC': <class 'sklearn.svm.classes.LinearSVC'>, 'LogReg_pipeline': Pipeline(memory=None,
     steps=[('tfidf', Tfid...bose=0, warm_start=False),
          n_jobs=1))]), 'LogisticRegression': <class 'sklearn.linear_model.logistic.LogisticRegression'>, 'MultinomialNB': <class 'sklearn.naive_bayes.MultinomialNB'>, 'OneVsRestClassifier': <class 'sklearn.multiclass.OneVsRestClassifier'>, 'Out': {30: '\nfrom sklearn.model_selection import GridSearchC...curacy_score(y_test[category], prediction), 4)))\n'}, 'Pipeline': <class 'sklearn.pipeline.Pipeline'>, 'STOPWORDS': {'a', 'about', 'above', 'after', 'again', 'against', ...}, ...}
        self.user_ns = {'GridSearchCV': <class 'sklearn.model_selection._search.GridSearchCV'>, 'In': ['', 'import time\nimport os\nimport re\nfrom wordcloud i...mport pymorphy2\nmorph = pymorphy2.MorphAnalyzer()', "\ndef clear(txt):\n    tokens=word_tokenize(txt)\n ...ext'].map(lambda x: clear(str(x)))\nnum_labels = 7", 'x_train, x_test,y_train, y_test = train_test_spl...ls, random_state=42, test_size=0.2, shuffle=True)', "LogReg_pipeline = Pipeline([\n                ('t...ccuracy_score(y_test[category], prediction), 4)))", 'import time\nimport os\nimport re\nfrom wordcloud i...mport pymorphy2\nmorph = pymorphy2.MorphAnalyzer()', 'import time\nimport os\nimport re\nfrom wordcloud i...mport pymorphy2\nmorph = pymorphy2.MorphAnalyzer()', "\ndef clear(txt):\n    tokens=word_tokenize(txt)\n ...ext'].map(lambda x: clear(str(x)))\nnum_labels = 7", 'import time\nimport os\nimport re\nfrom wordcloud i...mport pymorphy2\nmorph = pymorphy2.MorphAnalyzer()', 'x_train, x_test,y_train, y_test = train_test_spl...ls, random_state=42, test_size=0.2, shuffle=True)', 'x_train, x_test,y_train, y_test = train_test_spl...ls, random_state=42, test_size=0.2, shuffle=True)', 'LogReg_pipeline = Pipeline([\n                (\'t...acy_score(y_test[category], prediction), 4)))\n"""', 'LogReg_pipeline = Pipeline([\n                (\'t...acy_score(y_test[category], prediction), 4)))\n"""', 'LogReg_pipeline = Pipeline([\n                (\'t...acy_score(y_test[category], prediction), 4)))\n"""', "\ndef clear(txt):\n    tokens=word_tokenize(txt)\n ...ext'].map(lambda x: clear(str(x)))\nnum_labels = 7", 'x_train, x_test,y_train, y_test = train_test_spl...ls, random_state=42, test_size=0.2, shuffle=True)', "model = Pipeline([('vectorizer', CountVectorizer...f_svm.best_score_)\nprint(gs_clf_svm.best_params_)", "train, test = train_test_split(data, data.oc[:,'...ls, random_state=42, test_size=0.2, shuffle=True)", 'train, test = train_test_split(data, data.loc[:,...ls, random_state=42, test_size=0.2, shuffle=True)', 'train, test = train_test_split(data, data.loc[:,...ls, random_state=42, test_size=0.2, shuffle=True)', ...], 'LinearSVC': <class 'sklearn.svm.classes.LinearSVC'>, 'LogReg_pipeline': Pipeline(memory=None,
     steps=[('tfidf', Tfid...bose=0, warm_start=False),
          n_jobs=1))]), 'LogisticRegression': <class 'sklearn.linear_model.logistic.LogisticRegression'>, 'MultinomialNB': <class 'sklearn.naive_bayes.MultinomialNB'>, 'OneVsRestClassifier': <class 'sklearn.multiclass.OneVsRestClassifier'>, 'Out': {30: '\nfrom sklearn.model_selection import GridSearchC...curacy_score(y_test[category], prediction), 4)))\n'}, 'Pipeline': <class 'sklearn.pipeline.Pipeline'>, 'STOPWORDS': {'a', 'about', 'above', 'after', 'again', 'against', ...}, ...}
   3268             finally:
   3269                 # Reset our crash handler in place
   3270                 sys.excepthook = old_excepthook
   3271         except SystemExit as e:

...........................................................................
c:\users\vasil\desktop\parsing_data\<ipython-input-33-185ea2a96147> in <module>()
      6     
      7 from sklearn.model_selection import GridSearchCV
      8 parameters = {'vectorizer__ngram_range': [(1, 1), (1, 2),(2,2)],
      9                'tfidf__use_idf': (True, False)}
     10 gs_clf_svm = GridSearchCV(model, parameters, n_jobs=-1)
---> 11 gs_clf_svm = gs_clf_svm.fit(data['text'], data.loc[:,'salary':'court_threat'])
     12 
     13 print(gs_clf_svm.best_score_)
     14 print(gs_clf_svm.best_params_)

...........................................................................
C:\Users\vasil\Anaconda3\envs\dp\lib\site-packages\sklearn\model_selection\_search.py in fit(self=GridSearchCV(cv=None, error_score='raise',
     ...ain_score='warn',
       scoring=None, verbose=0), X=0      ['хотеть', 'получать', 'нормальный', 'зп'..., 'крет...
Name: text, Length: 612, dtype: object, y=     salary  boss  atmosphere  fire_threat  work....0      0.0           0.0

[612 rows x 7 columns], groups=None, **fit_params={})
    634                                   return_train_score=self.return_train_score,
    635                                   return_n_test_samples=True,
    636                                   return_times=True, return_parameters=False,
    637                                   error_score=self.error_score)
    638           for parameters, (train, test) in product(candidate_params,
--> 639                                                    cv.split(X, y, groups)))
        cv.split = <bound method _BaseKFold.split of KFold(n_splits=3, random_state=None, shuffle=False)>
        X = 0      ['хотеть', 'получать', 'нормальный', 'зп'..., 'крет...
Name: text, Length: 612, dtype: object
        y =      salary  boss  atmosphere  fire_threat  work....0      0.0           0.0

[612 rows x 7 columns]
        groups = None
    640 
    641         # if one choose to see train score, "out" will contain train score info
    642         if self.return_train_score:
    643             (train_score_dicts, test_score_dicts, test_sample_counts, fit_time,

...........................................................................
C:\Users\vasil\Anaconda3\envs\dp\lib\site-packages\sklearn\externals\joblib\parallel.py in __call__(self=Parallel(n_jobs=-1), iterable=<generator object BaseSearchCV.fit.<locals>.<genexpr>>)
    784             if pre_dispatch == "all" or n_jobs == 1:
    785                 # The iterable was consumed all at once by the above for loop.
    786                 # No need to wait for async callbacks to trigger to
    787                 # consumption.
    788                 self._iterating = False
--> 789             self.retrieve()
        self.retrieve = <bound method Parallel.retrieve of Parallel(n_jobs=-1)>
    790             # Make sure that we get a last message telling us we are done
    791             elapsed_time = time.time() - self._start_time
    792             self._print('Done %3i out of %3i | elapsed: %s finished',
    793                         (len(self._output), len(self._output),

---------------------------------------------------------------------------
Sub-process traceback:
---------------------------------------------------------------------------
ValueError                                         Fri Apr 26 14:12:57 2019
PID: 17152        Python 3.6.7: C:\Users\vasil\Anaconda3\envs\dp\python.exe
...........................................................................
C:\Users\vasil\Anaconda3\envs\dp\lib\site-packages\sklearn\externals\joblib\parallel.py in __call__(self=<sklearn.externals.joblib.parallel.BatchedCalls object>)
    126     def __init__(self, iterator_slice):
    127         self.items = list(iterator_slice)
    128         self._size = len(self.items)
    129 
    130     def __call__(self):
--> 131         return [func(*args, **kwargs) for func, args, kwargs in self.items]
        self.items = [(<function _fit_and_score>, (Pipeline(memory=None,
     steps=[('tfidf', Tfid...l=0.0001,
     verbose=0),
          n_jobs=1))]), 0      ['хотеть', 'получать', 'нормальный', 'зп'..., 'крет...
Name: text, Length: 612, dtype: object,      salary  boss  atmosphere  fire_threat  work....0      0.0           0.0

[612 rows x 7 columns], {'score': <function _passthrough_scorer>}, array([204, 205, 206, 207, 208, 209, 210, 211, 2..., 604, 605, 606,
       607, 608, 609, 610, 611]), array([  0,   1,   2,   3,   4,   5,   6,   7,  ...    195, 196, 197, 198, 199, 200, 201, 202, 203]), 0, {'tfidf__use_idf': True, 'vectorizer__ngram_range': (1, 1)}), {'error_score': 'raise', 'fit_params': {}, 'return_n_test_samples': True, 'return_parameters': False, 'return_times': True, 'return_train_score': 'warn'})]
    132 
    133     def __len__(self):
    134         return self._size
    135 

...........................................................................
C:\Users\vasil\Anaconda3\envs\dp\lib\site-packages\sklearn\externals\joblib\parallel.py in <listcomp>(.0=<list_iterator object>)
    126     def __init__(self, iterator_slice):
    127         self.items = list(iterator_slice)
    128         self._size = len(self.items)
    129 
    130     def __call__(self):
--> 131         return [func(*args, **kwargs) for func, args, kwargs in self.items]
        func = <function _fit_and_score>
        args = (Pipeline(memory=None,
     steps=[('tfidf', Tfid...l=0.0001,
     verbose=0),
          n_jobs=1))]), 0      ['хотеть', 'получать', 'нормальный', 'зп'..., 'крет...
Name: text, Length: 612, dtype: object,      salary  boss  atmosphere  fire_threat  work....0      0.0           0.0

[612 rows x 7 columns], {'score': <function _passthrough_scorer>}, array([204, 205, 206, 207, 208, 209, 210, 211, 2..., 604, 605, 606,
       607, 608, 609, 610, 611]), array([  0,   1,   2,   3,   4,   5,   6,   7,  ...    195, 196, 197, 198, 199, 200, 201, 202, 203]), 0, {'tfidf__use_idf': True, 'vectorizer__ngram_range': (1, 1)})
        kwargs = {'error_score': 'raise', 'fit_params': {}, 'return_n_test_samples': True, 'return_parameters': False, 'return_times': True, 'return_train_score': 'warn'}
    132 
    133     def __len__(self):
    134         return self._size
    135 

...........................................................................
C:\Users\vasil\Anaconda3\envs\dp\lib\site-packages\sklearn\model_selection\_validation.py in _fit_and_score(estimator=Pipeline(memory=None,
     steps=[('tfidf', Tfid...l=0.0001,
     verbose=0),
          n_jobs=1))]), X=0      ['хотеть', 'получать', 'нормальный', 'зп'..., 'крет...
Name: text, Length: 612, dtype: object, y=     salary  boss  atmosphere  fire_threat  work....0      0.0           0.0

[612 rows x 7 columns], scorer={'score': <function _passthrough_scorer>}, train=array([204, 205, 206, 207, 208, 209, 210, 211, 2..., 604, 605, 606,
       607, 608, 609, 610, 611]), test=array([  0,   1,   2,   3,   4,   5,   6,   7,  ...    195, 196, 197, 198, 199, 200, 201, 202, 203]), verbose=0, parameters={'tfidf__use_idf': True, 'vectorizer__ngram_range': (1, 1)}, fit_params={}, return_train_score='warn', return_parameters=False, return_n_test_samples=True, return_times=True, error_score='raise')
    439                       for k, v in fit_params.items()])
    440 
    441     test_scores = {}
    442     train_scores = {}
    443     if parameters is not None:
--> 444         estimator.set_params(**parameters)
        estimator.set_params = <bound method Pipeline.set_params of Pipeline(me...=0.0001,
     verbose=0),
          n_jobs=1))])>
        parameters = {'tfidf__use_idf': True, 'vectorizer__ngram_range': (1, 1)}
    445 
    446     start_time = time.time()
    447 
    448     X_train, y_train = _safe_split(estimator, X, y, train)

...........................................................................
C:\Users\vasil\Anaconda3\envs\dp\lib\site-packages\sklearn\pipeline.py in set_params(self=Pipeline(memory=None,
     steps=[('tfidf', Tfid...l=0.0001,
     verbose=0),
          n_jobs=1))]), **kwargs={'tfidf__use_idf': True, 'vectorizer__ngram_range': (1, 1)})
    137 
    138         Returns
    139         -------
    140         self
    141         """
--> 142         self._set_params('steps', **kwargs)
        self._set_params = <bound method _BaseComposition._set_params of Pi...=0.0001,
     verbose=0),
          n_jobs=1))])>
        kwargs = {'tfidf__use_idf': True, 'vectorizer__ngram_range': (1, 1)}
    143         return self
    144 
    145     def _validate_steps(self):
    146         names, estimators = zip(*self.steps)

...........................................................................
C:\Users\vasil\Anaconda3\envs\dp\lib\site-packages\sklearn\utils\metaestimators.py in _set_params(self=Pipeline(memory=None,
     steps=[('tfidf', Tfid...l=0.0001,
     verbose=0),
          n_jobs=1))]), attr='steps', **params={'tfidf__use_idf': True, 'vectorizer__ngram_range': (1, 1)})
     44         names, _ = zip(*getattr(self, attr))
     45         for name in list(six.iterkeys(params)):
     46             if '__' not in name and name in names:
     47                 self._replace_estimator(attr, name, params.pop(name))
     48         # 3. Step parameters and other initilisation arguments
---> 49         super(_BaseComposition, self).set_params(**params)
        self.set_params = <bound method Pipeline.set_params of Pipeline(me...=0.0001,
     verbose=0),
          n_jobs=1))])>
        params = {'tfidf__use_idf': True, 'vectorizer__ngram_range': (1, 1)}
     50         return self
     51 
     52     def _replace_estimator(self, attr, name, new_val):
     53         # assumes `name` is a valid estimator name

...........................................................................
C:\Users\vasil\Anaconda3\envs\dp\lib\site-packages\sklearn\base.py in set_params(self=Pipeline(memory=None,
     steps=[('tfidf', Tfid...l=0.0001,
     verbose=0),
          n_jobs=1))]), **params={'tfidf__use_idf': True, 'vectorizer__ngram_range': (1, 1)})
    269             key, delim, sub_key = key.partition('__')
    270             if key not in valid_params:
    271                 raise ValueError('Invalid parameter %s for estimator %s. '
    272                                  'Check the list of available parameters '
    273                                  'with `estimator.get_params().keys()`.' %
--> 274                                  (key, self))
        key = 'vectorizer'
        self = Pipeline(memory=None,
     steps=[('tfidf', Tfid...l=0.0001,
     verbose=0),
          n_jobs=1))])
    275 
    276             if delim:
    277                 nested_params[key][sub_key] = value
    278             else:

ValueError: Invalid parameter vectorizer for estimator Pipeline(memory=None,
     steps=[('tfidf', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
 ...lti_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0),
          n_jobs=1))]). Check the list of available parameters with `estimator.get_params().keys()`.
___________________________________________________________________________

In [43]:
#LogReg_pipeline = Pipeline([
  #              ('tfidf', TfidfVectorizer(stop_words=stopwords.words('russian'))),
  #              ('clf', OneVsRestClassifier(LogisticRegression(solver='sag'), n_jobs=1)),
   #         ])
SVC_pipeline = Pipeline([
                ('tfidf', TfidfVectorizer(stop_words=stopwords.words('russian'))),
                ('clf', OneVsRestClassifier(LinearSVC(), n_jobs=1)),
            ])
NB_pipeline = Pipeline([
                ('tfidf', TfidfVectorizer(stop_words=stopwords.words('russian'))),
                ('clf', OneVsRestClassifier(MultinomialNB(
                    fit_prior=True, class_prior=None))),
            ])
"""
from sklearn.model_selection import GridSearchCV
parameters = {'vectorizer__ngram_range': [(1, 1), (1, 2),(2,2)],
               'tfidf__use_idf': (True, False)}

gs_log_reg = GridSearchCV(LogReg_pipeline, parameters, n_jobs=-1)
gs_log_reg = gs_log_reg.fit(data['text'], data.loc[:,'salary':'court_threat'])
print(gs_log_reg.best_score_)
print(gs_log_reg.best_params_)

for category in labels:
    print('Категория: {}'.format(category))
    LogReg_pipeline.fit(x_train, y_train[category])
    prediction = LogReg_pipeline.predict(x_test)
    print('Точность на тестовых данных: {}'.format(round(accuracy_score(y_test[category], prediction), 4)))
"""


"\nfrom sklearn.model_selection import GridSearchCV\nparameters = {'vectorizer__ngram_range': [(1, 1), (1, 2),(2,2)],\n               'tfidf__use_idf': (True, False)}\n\ngs_log_reg = GridSearchCV(LogReg_pipeline, parameters, n_jobs=-1)\ngs_log_reg = gs_log_reg.fit(data['text'], data.loc[:,'salary':'court_threat'])\nprint(gs_log_reg.best_score_)\nprint(gs_log_reg.best_params_)\n\nfor category in labels:\n    print('Категория: {}'.format(category))\n    LogReg_pipeline.fit(x_train, y_train[category])\n    prediction = LogReg_pipeline.predict(x_test)\n    print('Точность на тестовых данных: {}'.format(round(accuracy_score(y_test[category], prediction), 4)))\n"

In [44]:
for topic in labels:
    print('Категория: {}'.format(topic))
    NB_pipeline.fit(x_train, y_train[topic])
    prediction = NB_pipeline.predict(x_test)
    print('Точность на тестовых данных: {}'.format(
    round(accuracy_score(y_test[topic], prediction), 4)))

Категория: salary
Точность на тестовых данных: 0.8049
Категория: boss
Точность на тестовых данных: 0.7642
Категория: atmosphere
Точность на тестовых данных: 0.7154
Категория: fire_threat
Точность на тестовых данных: 0.9187
Категория: work_cond
Точность на тестовых данных: 0.7236
Категория: neutral
Точность на тестовых данных: 0.8049
Категория: court_threat
Точность на тестовых данных: 0.935
