# Data loading

In [338]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
pd.options.display.max_columns = 200
plt.style.use('ggplot')
np.warnings.filterwarnings('ignore')

In [339]:
from google.colab import drive
drive.mount('/content/drive')
%cd /content/drive/My Drive/DL project

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/My Drive/DL project


In [340]:
#загружаем тренировочную, тестовую выборки, а также пример предсказаний
train = pd.read_csv('train.csv')
test = pd.read_csv('valid.csv')

In [341]:
train.tail(2)

Unnamed: 0,Id,Title,Body,Tags,CreationDate,Y
44998,60465318,how to implement fill in the blank in Swift,"<p>""I _____ any questions.""</p>\n\n<p>I want t...",<ios><swift>,2020-02-29 12:50:43,LQ_CLOSE
44999,60468018,How can I make a c# application outside of vis...,<p>I'm very new to programming and I'm teachin...,<c#><visual-studio>,2020-02-29 17:55:56,LQ_CLOSE


In [342]:
X_train, y_train = train['Title'].values, train['Tags'].values
X_test, y_test = test['Title'].values, test['Tags'].values

# Data preprocessing

In [343]:
import nltk
from nltk.corpus import stopwords
import re
nltk.download('stopwords')

STOPWORDS = set(stopwords.words('english'))
REPLACE_BY_SPACE_RE = re.compile('[/(){}\[\]\|@,;]')
BAD_SYMBOLS_RE = re.compile('[^0-9a-z #+_]')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Уберем лишнее из исходних текстов:

In [344]:
def text_prepare(text):
    """
        text: a string
        return: modified initial string
    """
    text = text.lower()
    text = REPLACE_BY_SPACE_RE.sub(" ", text)
    text = BAD_SYMBOLS_RE.sub("", text)
    text = [word for word in text.split() if not word in STOPWORDS]
    
    
    return (' '.join(text))

In [345]:
X_train = [text_prepare(x) for x in X_train]
X_test = [text_prepare(x) for x in X_test]

In [346]:
X_train[:5]

['java repeat task every random seconds',
 'java optionals immutable',
 'text overlay image darkened opacity react native',
 'ternary operator swift picky',
 'hide show fab scale animation']

Преобразуем тэги в список:

In [347]:
def tags_prepare(tags):
    """
        tags: a string
        return: list of tags
    """
    tags = tags.replace("><", ",")
    tags = tags.replace("<", "")
    tags = tags.replace(">", "")
    tags = tags.split(',')
    
    return(tags)

In [348]:
y_train = [tags_prepare(x) for x in y_train]
y_test = [tags_prepare(x) for x in y_test]

In [349]:
y_train[:5]

[['java', 'repeat'],
 ['java', 'optional'],
 ['javascript', 'image', 'overlay', 'react-native', 'opacity'],
 ['swift', 'operators', 'whitespace', 'ternary-operator', 'optional'],
 ['android', 'material-design', 'floating-action-button']]

Создадим словари, а также посмотрим на самые популярные слова в текстах и теги:

In [350]:
tokenizer = nltk.tokenize.TreebankWordTokenizer()

train_X = [tokenizer.tokenize(text) for text in X_train]

In [358]:
# Dictionary of all tags from train corpus with their counts.
tags_counts = {}
# Dictionary of all words from train corpus with their counts.
words_counts = {}

for text in y_train:   
    for word in text:
        if word in tags_counts:
            tags_counts[word] += 1
        else:
            tags_counts[word] = 1

for text in train_X:    
    for word in text:
        if word in words_counts:
            words_counts[word] += 1
        else:
            words_counts[word] = 1

In [359]:
len(tags_counts)

9336

Так как в словаре имеется слишком много редких тегов(усложняются вычисления + зачастую они не имеют смысла), то для дальнейших предсказаний отсавим те, которые встречаются чаще всего(более n раз):

In [360]:
n = 4
tags_counts = {k: v for k, v in tags_counts.items() if v > n}
len(tags_counts)

2229

In [310]:
most_common_tags = sorted(tags_counts.items(), key=lambda x: x[1], reverse=True)[:3]
most_common_words = sorted(words_counts.items(), key=lambda x: x[1], reverse=True)[:3]

In [229]:
most_common_tags, most_common_words

([('javascript', 5385), ('python', 5041), ('java', 4677)],
 [('using', 3045), ('python', 2383), ('error', 2273)])

# MultiLogisticRegression

## Data vectorization

Векторизуем предобработанные текста с помощью TfidfVectorizer для дальнейше подачи на вход модели:

In [20]:
from sklearn.feature_extraction.text import TfidfVectorizer


tfidf_vectorizer = TfidfVectorizer(min_df=5, max_df=0.9, ngram_range=(1,2), token_pattern='(\S+)', max_features=5000)
tfidf_vectorizer.fit(X_train)

X_train_tfidf = tfidf_vectorizer.transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

Бинаризуем наши теги для подачи на вход модели:

In [361]:
from sklearn.preprocessing import MultiLabelBinarizer

mlb = MultiLabelBinarizer(classes=sorted(tags_counts.keys()))
y_train_mlb = mlb.fit_transform(y_train)
y_test_mlb = mlb.fit_transform(y_test)

In [231]:
len(y_test)

15000

## Classifier

Создадим и обучим классификатор на основе подхода OneVsRest, внутри которого будем строить k штук LogisticRegression, где k - число различных тэгов:

In [None]:
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression, RidgeClassifier

In [None]:
%%time

multi_lr = OneVsRestClassifier(LogisticRegression(random_state=42))
multi_lr.fit(X_train_tfidf, y_train_mlb)

CPU times: user 5min 48s, sys: 3.84 s, total: 5min 52s
Wall time: 5min 53s


In [None]:
y_train_predicted = multi_lr.predict(X_train_tfidf)
y_test_predicted = multi_lr.predict(X_test_tfidf)

In [None]:
y_test_predicted_inv = mlb.inverse_transform(y_test_predicted)
y_test_inv = mlb.inverse_transform(y_test_mlb)

for i in range(10, 15):
    print('Title:\t{}\nTrue labels:\t{}\nPredicted labels:\t{}\n\n'.format(
        X_test[i],
        ','.join(y_test_inv[i]),
        ','.join(y_test_predicted_inv[i])
    ))

In [377]:
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score

def print_evaluation_scores(y_val, predicted):
  
    print('accuracy ' + str(accuracy_score(y_val, predicted)),
          'precision ' + str(precision_score(y_val, predicted, average='micro')),
          'recall ' + str(recall_score(y_val, predicted, average='micro')),
          'f1 ' + str(f1_score(y_val, predicted, average='micro')))

In [None]:
print_evaluation_scores(y_test_mlb, y_test_predicted)

accuracy 0.12633333333333333 precision 0.8603867988068893 recall 0.24806502621577384 f1 0.3369575634475602


Протеститируем работу для нового текста:

In [None]:
text = "I have a problem with python and html"
text = text_prepare(text)

text = tfidf_vectorizer.transform([text])
test_predictions = classifier_tfidf.predict(text)
test_pred_inversed = mlb.inverse_transform(test_predictions)
test_pred_inversed

[('html', 'python')]

# Neural network

## Without Pretrained Word Embeddings

In [363]:
from numpy import array
from keras.preprocessing.text import one_hot
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers.core import Activation, Dropout, Dense
from keras.layers import Flatten, LSTM
from keras.layers import GlobalMaxPooling1D
from keras.models import Model
from keras.layers.embeddings import Embedding
from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer
from keras.layers import Input
from keras.layers.merge import Concatenate
import matplotlib.pyplot as plt
from tensorflow import keras
from tensorflow.keras.layers import Dense, Flatten, Input
from keras.callbacks import EarlyStopping

In [364]:
from nltk.tokenize import word_tokenize

all_words = []
for sent in X_train:
    tokenize_word = word_tokenize(sent)
    for word in tokenize_word:
        all_words.append(word)

In [365]:
unique_words = set(all_words)
print(len(unique_words))

23501


### Data vectorization

In [366]:
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(X_train)

vocab_size = len(tokenizer.word_index) + 1

X_train_num = tokenizer.texts_to_sequences(X_train)
X_test_num = tokenizer.texts_to_sequences(X_test)

In [367]:
vocab_size

22839

In [368]:
from nltk.tokenize import word_tokenize


word_count = lambda sentence: len(word_tokenize(sentence))
longest_sentence = max(X_train, key=word_count)
length_long_sentence = len(word_tokenize(longest_sentence))

In [369]:
length_long_sentence

29

In [370]:
X_train_num = pad_sequences(X_train_num, length_long_sentence, padding='post')
X_test_num = pad_sequences(X_test_num, length_long_sentence, padding='post')

In [182]:
X_train_num[3]

array([1519,  253,   61,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0], dtype=int32)

In [83]:
len(y_train[:3])

3

### Model

In [371]:
import keras.backend as K

def get_f1(y_true, y_pred): #taken from old keras source code
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    precision = true_positives / (predicted_positives + K.epsilon())
    recall = true_positives / (possible_positives + K.epsilon())
    f1_val = 2*(precision*recall)/(precision+recall+K.epsilon())
    return f1_val

In [387]:
model = Sequential()
model.add(Embedding(vocab_size, 20, input_length=length_long_sentence))
model.add(Flatten())
model.add(Dense(600, activation='relu'))
model.add(Dense(2229, activation='sigmoid'))


model.compile(optimizer='adam',
             loss='binary_crossentropy',
             metrics=[get_f1])


print(model.summary())      # вывод структуры НС в консоль

Model: "sequential_18"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_18 (Embedding)     (None, 29, 20)            456780    
_________________________________________________________________
flatten_17 (Flatten)         (None, 580)               0         
_________________________________________________________________
dense_25 (Dense)             (None, 600)               348600    
_________________________________________________________________
dense_26 (Dense)             (None, 2229)              1339629   
Total params: 2,145,009
Trainable params: 2,145,009
Non-trainable params: 0
_________________________________________________________________
None


In [388]:
model.fit(X_train_num, y_train_mlb, batch_size=30, epochs=100, 
          validation_split=0.2, callbacks=EarlyStopping(monitor='val_get_f1', mode='max', patience = 5))

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100


<tensorflow.python.keras.callbacks.History at 0x7fc6e03bcd90>

In [389]:
model.evaluate(X_test_num, y_test_mlb)



[0.005706890486180782, 0.42215412855148315]

In [390]:
y_test_predicted = model.predict(X_test_num)
y_test_predicted[y_test_predicted>=0.5] = 1
y_test_predicted[y_test_predicted<0.5] = 0

In [391]:
print_evaluation_scores(y_test_mlb, y_test_predicted)

accuracy 0.11026666666666667 precision 0.5828382195383267 recall 0.33201098565761367 f1 0.42303953624008056


In [393]:
text = "I have a problem with python and html"
text = text_prepare(text)

text = tokenizer.texts_to_sequences([text])
text = pad_sequences(text, length_long_sentence, padding='post')

test_predictions = model.predict(text)
test_predictions[test_predictions>=0.5] = 1
test_predictions[test_predictions<0.5] = 0
test_pred_inversed = mlb.inverse_transform(test_predictions)
test_pred_inversed

[('python',)]

## With Pretrained Word Embeddings