In [1]:
import string
from keras.preprocessing.sequence import pad_sequences
import pandas as pd
import numpy as np
import os
import re
from datetime import date
from fastnumbers import isfloat, isint
import tensorflow as tf
from keras.models import Sequential, load_model
from keras.layers import Embedding
from keras.layers import BatchNormalization, SpatialDropout1D, Conv1D, Dense, Dropout, GlobalMaxPooling1D
from keras.callbacks import ModelCheckpoint, TensorBoard, EarlyStopping
from keras.utils import to_categorical


# The 33 lowercase letters of the Russian alphabet, in alphabetical order.
rus_alphabet = list('абвгдеёжзийклмнопрстуфхцчшщъыьэюя')

def create_vocab_set():
    """Build the character-level vocabulary used to encode messages.

    The alphabet is the Russian letters from ``rus_alphabet`` followed by the
    lowercase ASCII letters, the digits, the ASCII punctuation characters,
    a space and a newline.

    Returns:
        tuple: ``(vocab, vocab_size)`` where ``vocab`` maps every character to
        a 1-based integer id and ``vocab_size`` is the number of characters.
    """
    symbols = rus_alphabet + list(string.ascii_lowercase + string.digits + string.punctuation) + [' ', '\n']
    # Ids start at 1 so that 0 never refers to a real character.
    vocab = {symbol: position for position, symbol in enumerate(symbols, start=1)}
    return vocab, len(symbols)

def text2sequence(text, vocab):
    """Encode each string of ``text`` as a list of character ids.

    Args:
        text: iterable of strings.
        vocab: mapping from a single character to its integer id.

    Returns:
        list[list[int]]: one list of ids per input string. Characters that are
        missing from ``vocab`` (or mapped to 0) are dropped.
    """
    encoded = []
    for review in text:
        ids = (vocab.get(symbol, 0) for symbol in review)
        # 0 marks "no id for this character", so such characters are skipped.
        encoded.append([idx for idx in ids if idx != 0])
    return encoded

# Directory that holds train_set.csv
dir_train = '../data'

# Channel name -> integer class id (12 classes, ids 0..11).
mappings = dict(zip(
    ['career', 'theory_and_practice', 'deep_learning', 'lang_python',
     '_meetings', 'kaggle_crackers', 'big_data', 'lang_r',
     'nlp', 'welcome', 'datasets', 'bayesian'],
    range(12),
))


# experiment parameters
VALIDATION_SPLIT = 0.1
RANDOM_SEED = 42

# length every encoded message is padded to (passed as `maxlen` to pad_sequences)
MAX_SEQUENCE_LENGTH = 150

# Load the raw messages. Only CSV columns 1..10 are read (column 0 is skipped),
# and both timestamp columns are parsed into datetimes.
data = pd.read_csv(
    os.path.join(dir_train, 'train_set.csv'),
    usecols=range(1, 11),
    parse_dates=['timestamp', 'thread_timestamp'],
)

# Keep rows flagged by `main_msg` whose channel is one of the classes in `mappings`.
# Reusing the mapping keys guarantees that every kept channel has a class id.
# NOTE(review): assumes `main_msg` is a boolean column - confirm against the CSV.
data = data[data.channel.isin(list(mappings)) & data.main_msg]

# Time-based split: messages up to the cut-off are used for training, later ones
# for validation. The cut-off is a pd.Timestamp because comparing a datetime64
# column with a plain datetime.date raises TypeError on current pandas.
date_before = pd.Timestamp(2017, 4, 1)
train = data[data['timestamp'] <= date_before]
val = data[data['timestamp'] > date_before]

# Tokens replaced by a space before modelling: anything in angle brackets
# (optionally followed by ':'), colon-delimited tokens such as :emoji: codes,
# the HTML entity "&gt;", and tokens containing '@'.
# Raw strings keep the regex escapes (\S, \s, \w) out of Python's own
# string-escape processing, which warns on them in non-raw literals.
_MARKUP_RE = re.compile(r'(<\S+>:?)|(\s?:\S+:\s?)|(&gt;)|([\w\.]*@[\w\.]*)')
_WHITESPACE_RE = re.compile(r'\s+')


def _prepare_messages(frame):
    """Return a cleaned ``channel``/``text`` frame built from ``frame``.

    Steps: keep the two columns, map channel names to class ids via
    ``mappings``, sort by class id, strip markup tokens from the text and
    collapse whitespace runs to a single space, then drop rows whose text
    parses as a number or is shorter than 20 characters.
    """
    prepared = frame[['channel', 'text']].reset_index(drop=True)
    prepared['channel'] = prepared.channel.map(mappings)
    prepared = prepared.sort_values('channel').reset_index(drop=True)
    prepared['text'] = prepared['text'].astype(str).apply(
        lambda x: _WHITESPACE_RE.sub(' ', _MARKUP_RE.sub(' ', x))
    )
    uninformative = prepared['text'].apply(lambda x: isfloat(x) or isint(x) or len(x) < 20)
    return prepared[~uninformative]


# The same preparation is applied to both splits so they cannot drift apart.
train_data = _prepare_messages(train)
val_data = _prepare_messages(val)

# Lower-case the text and pull the class ids out as compact integer arrays.
train_text = train_data['text'].astype(str).map(str.lower)
train_labels = np.asarray(train_data['channel'], dtype='int8')

val_text = val_data['text'].astype(str).map(str.lower)
val_labels = np.asarray(val_data['channel'], dtype='int8')

vocab, vocab_size = create_vocab_set()

# Encode characters as ids, then bring every message to MAX_SEQUENCE_LENGTH ids,
# using 0 as the padding value.
X_train = pad_sequences(text2sequence(train_text, vocab), maxlen=MAX_SEQUENCE_LENGTH, value=0)
X_val = pad_sequences(text2sequence(val_text, vocab), maxlen=MAX_SEQUENCE_LENGTH, value=0)

# One-hot encode the 12 class ids for the categorical cross-entropy loss.
train_labels = to_categorical(train_labels, num_classes=12)
val_labels = to_categorical(val_labels, num_classes=12)

# Load the previously trained network from disk.
model = load_model("../models/model_simple ohe lstm.hdf5")

# Compile with the loss/metric needed by model.evaluate below.
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

model.summary()

Using TensorFlow backend.


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 150, 150)          15600     
_________________________________________________________________
spatial_dropout1d_1 (Spatial (None, 150, 150)          0         
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 148, 150)          67650     
_________________________________________________________________
spatial_dropout1d_2 (Spatial (None, 148, 150)          0         
_________________________________________________________________
batch_normalization_1 (Batch (None, 148, 150)          600       
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 146, 150)          67650     
_________________________________________________________________
spatial_dropout1d_3 (Spatial (None, 146, 150)          0         
__________

In [107]:
# Score the loaded network on the validation split; with the compile settings
# used above, the result lists the loss followed by the accuracy.
# NOTE(review): this needs the one-hot `val_labels` from the first cell; a later
# cell rebinds `val_labels` to integer class ids, so execution order matters.
model.evaluate(X_val, val_labels)



[1.4525082519565289, 0.53860080176351388]

In [108]:
# Network outputs for every validation message; blended with the linear
# model's scores further down to form the ensemble prediction.
nn_predictions = model.predict(X_val)

In [2]:
import pymorphy2
import nltk
import stop_words
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC, SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, f1_score
from sklearn.model_selection import KFold, GridSearchCV, train_test_split, cross_val_score, StratifiedKFold, ParameterGrid
from sklearn import preprocessing
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import MultiLabelBinarizer
from datetime import date
import fastnumbers
import re
from fastnumbers import isfloat, isint

# Rebuild the lower-cased text and the integer class ids. The labels were
# one-hot encoded for the network earlier; the models below take plain ids.
train_text = train_data['text'].astype(str).map(str.lower)
train_labels = np.asarray(train_data['channel'], dtype='int8')

val_text = val_data['text'].astype(str).map(str.lower)
val_labels = np.asarray(val_data['channel'], dtype='int8')

In [110]:
# library for lemmatization and stop_words
# NOTE(review): `morph` is not used by the pipeline below; it is kept because
# other cells may rely on it.
morph = pymorphy2.MorphAnalyzer()
# without tuning accuracy_score = 50.81%
# accuracy_score = 55.33%
# Character 1-4-gram TF-IDF features fed to one-vs-rest logistic regression.
classifier = Pipeline([
    ('vectorizer', TfidfVectorizer(analyzer='char', max_features=1000000, ngram_range=(1, 4))),
    ('clf', OneVsRestClassifier(LogisticRegression(), n_jobs=-1)),
])

classifier.fit(train_text, train_labels)
decision_boundary = classifier.decision_function(val_text)

# Softmax over the per-class decision scores so they can be blended with the
# network's outputs. Subtracting the row maximum first leaves the result
# mathematically unchanged but keeps np.exp from overflowing on large scores.
shifted_scores = decision_boundary - decision_boundary.max(axis=1, keepdims=True)
exp_scores = np.exp(shifted_scores)
linear_predictions = exp_scores / exp_scores.sum(axis=1, keepdims=True)

In [111]:
# Ensemble: weighted blend of both models' class scores (0.63 linear, the rest
# to the network), then the highest-scoring class per validation message.
ansamble_pred = (0.63 * linear_predictions + (1 - 0.63) * nn_predictions).argmax(axis=1)

In [198]:
def classify_text(text, linear_weight=0.63):
    """Predict the channel name for a single message with the blended ensemble.

    Args:
        text: the message to classify.
        linear_weight: weight given to the linear model's softmax scores; the
            network's outputs get ``1 - linear_weight``. Defaults to the 0.63
            used for the validation ensemble.

    Returns:
        The channel name whose class id has the highest blended score, or
        ``None`` if that id is not present in ``mappings``.
    """
    # Training text is lower-cased and the character vocabulary only contains
    # lowercase letters, so without this step uppercase characters would be
    # silently dropped from the network's input.
    # NOTE(review): the markup stripping applied to the training text is not
    # repeated here - confirm whether raw messages can reach this function.
    text = text.lower()

    decision_boundary = classifier.decision_function([text])
    # Softmax shifted by the row maximum: same result, no overflow in np.exp.
    shifted_scores = decision_boundary - decision_boundary.max(axis=1, keepdims=True)
    exp_scores = np.exp(shifted_scores)
    linear_class = exp_scores / exp_scores.sum(axis=1, keepdims=True)

    text_val = pad_sequences(text2sequence([text], vocab), maxlen=MAX_SEQUENCE_LENGTH, value=0)
    nn_class = model.predict(text_val)

    v = np.argmax(linear_class * linear_weight + nn_class * (1 - linear_weight), axis=1)[0]
    # Reverse lookup: class id -> channel name.
    return next((name for name, class_id in mappings.items() if class_id == v), None)

In [224]:
# Smoke test of the ensemble on a Russian query (roughly "where to find a syntax tree").
classify_text('где найти синтаксис дерево')

'nlp'

In [3]:
import xgboost as xgb



In [4]:
# Character 1-4-gram TF-IDF features for the gradient-boosting models.
vectorizer = TfidfVectorizer(analyzer='char', max_features=1000000, ngram_range=(1, 4))
train_matrix = vectorizer.fit_transform(train_text)
# Only transform the validation text: calling fit_transform here would refit the
# vocabulary and IDF weights on validation data, so its columns would no longer
# line up with the features the models are trained on.
val_matrix = vectorizer.transform(val_text)

In [5]:
# Wrap the TF-IDF matrices and integer class ids in xgboost's DMatrix container.
xgb_train = xgb.DMatrix(train_matrix, label=train_labels)
xgb_val = xgb.DMatrix(val_matrix, label=val_labels)

# 12-class booster settings; `merror` is the metric tracked on the eval set.
params = dict(
    eta=0.1,
    seed=42,
    subsample=0.7,
    colsample_bytree=0.7,
    objective='multi:softmax',
    max_depth=7,
    min_child_weight=1,
    num_class=12,
    eval_metric='merror',
)

eval_matrix = [(xgb_val, 'xgb_val')]

# Train for at most 1000 rounds, stopping once the validation metric has not
# improved for 20 rounds; progress is reported every 5 rounds.
final_gb = xgb.train(
    params,
    xgb_train,
    num_boost_round=1000,
    evals=eval_matrix,
    early_stopping_rounds=20,
    verbose_eval=5,
)

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.


In [1]:
# Predict on the validation DMatrix and print the predictions as integers.
predictions_xgb = final_gb.predict(xgb_val)
print(np.round(predictions_xgb).astype(int))

NameError: name 'final_gb' is not defined

In [None]:
import lightgbm as lgb

In [None]:
# LightGBM datasets built from the same TF-IDF matrices; the validation set is
# created with the training set as its reference.
lgb_train = lgb.Dataset(train_matrix, label=train_labels)
lgb_val = lgb.Dataset(val_matrix, label=val_labels, reference=lgb_train)