# <u>StackOverflow Tag Predictor
StackOverflow lets us post your queries and the other user can help you with answers. The site uses tags for managing the questions effectively. Here we will be predicting tags for a given question. Tags like C, C++, Python are widely used.

In [66]:
import nltk
from nltk.corpus import stopwords
from ast import literal_eval
import pandas as pd
import numpy as np
from tqdm import tqdm
import re
from collections import defaultdict
from scipy import sparse
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer

### <u>Data loading

In [2]:
# for reading the data
def load_data(dirname):
    # laod the data file
    data = pd.read_csv(dirname, sep='\t')
    # convert string charcter to language syntactic characters if any
    data['tags'] = data['tags'].apply(literal_eval)
    return data

In [5]:
# load training and validation data
train_data = load_data('dataset/train.tsv')
val_data = load_data('dataset/validation.tsv')

In [6]:
# test data
test_data = pd.read_csv('dataset/test.tsv', sep='\t')

In [8]:
train_data.head()

Unnamed: 0,title,tags
0,How to draw a stacked dotplot in R?,[r]
1,mysql select all records where a datetime fiel...,"[php, mysql]"
2,How to terminate windows phone 8.1 app,[c#]
3,get current time in a specific country via jquery,"[javascript, jquery]"
4,Configuring Tomcat to Use SSL,[java]


In [9]:
# training data
X_train = train_data['title'].values 
y_train = train_data['tags'].values
# validation data
X_val = val_data['title'].values
y_val = val_data['tags'].values

In [10]:
print(X_train.shape)
print(y_train.shape)
print(X_val.shape)
print(y_val.shape)

(100000,)
(100000,)
(30000,)
(30000,)


### <u>Text Preprocessing
We remove the punctuations, unecessary whitespaces and some other characters

In [11]:
# preprocess text
def preprocess_data(text):
    STOPWORDS = set(stopwords.words('english'))
    # convert to lowercase
    text = text.lower()
    # replace whitespaces and punctuations
    text = re.sub('[/(){}\[\]\|@,;]', ' ', text)
    text = re.sub('[^0-9a-z #+_]', '', text)
    text = ' '.join(word for word in text.split() if word not in STOPWORDS)
    return text

In [16]:
# preprocess the data
X_train = [preprocess_data(text) for text in X_train]
X_val = [preprocess_data(text) for text in X_val]

Find word and tag frequencies

In [18]:
def compute_frequency(X_train, y_train):
    # dictionary of all tags with their frequency.
    tag_counts = defaultdict(int)
    # dictionary of all words with their frequency.
    word_counts = defaultdict(int)

    # find tag counts
    for _,tags in tqdm(enumerate(y_train)):
        for tag in tags:
            #print(tag)
            tag_counts[tag] += 1

    # for words
    for _,senten in tqdm(enumerate(X_train)):
        for word in senten.split():
            word_counts[word] += 1
    
    return word_counts, tag_counts

In [19]:
word_counts, tag_counts = compute_frequency(X_train, y_train)

100000it [00:00, 1279796.91it/s]
100000it [00:00, 399972.54it/s]


We will create vocabulary dictionary of top **N** words from the training data. We need two mappings:<br>
1) Words to index<br>
2) Index to words

In [56]:
# for creating word to index and vice versa mappings
def create_vocabulary_mappings(X_train, word_counts, DICT_SIZE=4500):
    # word to index mapping
    word_to_idx = {word:idx for idx,(word,f) in enumerate(
                sorted(word_counts.items(), key=lambda v:v[1], reverse=True)[:DICT_SIZE])}
    # reverse index to word mapping
    idx_to_word= {word_to_idx[word]:word for word in word_to_idx.keys()}
    
    return word_to_idx, idx_to_word

In [57]:
DICT_SIZE=4500
word_to_idx, idx_to_word = create_vocabulary_mappings(X_train, word_counts, DICT_SIZE=4500)

Now we will be trying two feature representations : Bag of Words(BOW) and TF-IDF. First we will create a function for **BOW**. For BOW we will use most commonly used 4500 words.

### Bag of Words

In [50]:
# for creating BOW representation
def create_bag_of_words(text, word_to_idx, DICT_SIZE):
    # Intial Matrix for holding the features
    feature_vector = np.zeros(DICT_SIZE)
    
    # update the word frequencies
    for word in text.split():
        if word in word_to_idx.keys():
            feature_vector[word_to_idx[word]] += 1 
    
    return feature_vector

In [59]:
# create the bag of words feature vector
# we will use a sparse representation , here we will be using csr matrix representation
# for storing it
X_train_bow = sparse.vstack([sparse.csr_matrix(create_bag_of_words(text, word_to_idx, DICT_SIZE)) for text in X_train])
X_val_bow = sparse.vstack([sparse.csr_matrix(create_bag_of_words(text, word_to_idx, DICT_SIZE)) for text in X_val])

print('X_train shape ', X_train_bow.shape)
print('X_val shape ', X_val_bow.shape)

X_train shape  (100000, 4500)
X_val shape  (30000, 4500)


### TF-IDF

In [63]:
# creates tf-idf feature vector
def create_tfidf_features(X_train, X_val):
    # fit for training data
    tfidf = TfidfVectorizer(ngram_range=(1,2), max_df=0.9, min_df=5, token_pattern='(\S+)')####### YOUR CODE HERE #######
    # apply for training and validation set
    X_train = tfidf.fit_transform(X_train)
    X_val = tfidf.transform(X_val)
    
    return X_train, X_val, tfidf.vocabulary_

In [64]:
X_train_tfidf, X_val_tfidf, tfidf_vocab = create_tfidf_features(X_train, X_val)
tfidf_reverse_vocab = {i:word for word,i in tfidf_vocab.items()}

## <u> Classifier
Since there can be multiple tags for the query question, so we will represent the output as either 0 or 1, where 1 means that tag is present and 0 means it is absent. So will use MultiLabelBinarizer from scikit-learn for this.

In [68]:
# create an instance
mlb_object = MultiLabelBinarizer(classes=sorted(tag_counts.keys()))
# transform the tags 
y_train = mlb_object.fit_transform(y_train)
y_val = mlb_object.fit_transform(y_val)

## <u>Training
We will experiment with different classifiers. We will use One vs All approach here. 

In [88]:
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier

In [91]:
import pickle

In [89]:
# define the classifier and fit it to the training data
def train_classifier(X_train, y_train, inner_clf):
    # define the classifier
    clf = OneVsRestClassifier(inner_clf)
    # train it
    clf.fit(X_train, y_train)
    return clf

In [103]:
# classifer for one vs all
ridge_clf = RidgeClassifier()
rf_clf = RandomForestClassifier(n_estimators = 100, max_depth=10, n_jobs=-1, verbose=0)
lr_clf = LogisticRegression(n_jobs=-1)
nb_clf = GaussianNB()
svm_clf = LinearSVC()
knn_clf = KNeighborsClassifier(n_jobs=-1)

In [104]:
# for bag of words
clf_bow_rf = train_classifier(X_train_bow, y_train, rf_clf)
# for ifidf
clf_tfidf_rf = train_classifier(X_train_tfidf, y_train, rf_clf)

filename_bow = 'models/rf_bow.sav'
filename_tfidf = 'models/rf_tfidf.sav'
pickle.dump(clf_bow_rf, open(filename_bow, 'wb'))
pickle.dump(clf_tfidf_rf, open(filename_tfidf, 'wb'))

In [105]:
# for bag of words
clf_bow_lr = train_classifier(X_train_bow, y_train, lr_clf)
# for ifidf
clf_tfidf_lr = train_classifier(X_train_tfidf, y_train, lr_clf)

filename_bow = 'models/lr_bow.sav'
filename_tfidf = 'models/lr_tfidf.sav'
pickle.dump(clf_bow_lr, open(filename_bow, 'wb'))
pickle.dump(clf_tfidf_lr, open(filename_tfidf, 'wb'))

  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}

  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}

  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}

  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}

  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}

  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))


In [None]:
# for bag of words
clf_bow_nb = train_classifier(X_train_bow.toarray(), y_train, nb_clf)
# for ifidf
clf_tfidf_nb = train_classifier(X_train_tfidf.toarray(), y_train, nb_clf)

filename_bow = 'models/nb_bow.sav'
filename_tfidf = 'models/nb_tfidf.sav'
pickle.dump(clf_bow_nb, open(filename_bow, 'wb'))
pickle.dump(clf_tfidf_nb, open(filename_tfidf, 'wb'))

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "C:\Users\SEEKER\Anaconda3\envs\tensorflow\lib\site-packages\IPython\core\interactiveshell.py", line 2963, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-107-97f9d1a8d302>", line 2, in <module>
    clf_bow_nb = train_classifier(X_train_bow.toarray(), y_train, nb_clf)
  File "<ipython-input-89-97b98067bc45>", line 6, in train_classifier
    clf.fit(X_train, y_train)
  File "C:\Users\SEEKER\Anaconda3\envs\tensorflow\lib\site-packages\sklearn\multiclass.py", line 215, in fit
    for i, column in enumerate(columns))
  File "C:\Users\SEEKER\Anaconda3\envs\tensorflow\lib\site-packages\sklearn\externals\joblib\parallel.py", line 779, in __call__
    while self.dispatch_one_batch(iterator):
  File "C:\Users\SEEKER\Anaconda3\envs\tensorflow\lib\site-packages\sklearn\externals\joblib\parallel.py", line 625, in dispatch_one_batch
    self._dispatch(tasks)
  File "C:\Users\SEEKER\Anaconda3\envs\tensorflow\lib\site-

  File "C:\Users\SEEKER\Anaconda3\envs\tensorflow\lib\linecache.py", line 74, in checkcache
    stat = os.stat(fullname)
KeyboardInterrupt

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "C:\Users\SEEKER\Anaconda3\envs\tensorflow\lib\site-packages\IPython\core\interactiveshell.py", line 1863, in showtraceback
    stb = value._render_traceback_()
AttributeError: 'KeyboardInterrupt' object has no attribute '_render_traceback_'

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "C:\Users\SEEKER\Anaconda3\envs\tensorflow\lib\site-packages\IPython\core\ultratb.py", line 1095, in get_records
    return _fixed_getinnerframes(etb, number_of_lines_of_context, tb_offset)
  File "C:\Users\SEEKER\Anaconda3\envs\tensorflow\lib\site-packages\IPython\core\ultratb.py", line 311, in wrapped
    return f(*args, **kwargs)
  File "C:\Users\SEEKER\Anaconda3\envs\tensorflow\lib\site-p

In [110]:
# for bag of words
clf_bow_knn = train_classifier(X_train_bow, y_train, knn_clf)
# for ifidf
clf_tfidf_knn = train_classifier(X_train_tfidf, y_train, knn_clf)

filename_bow = 'models/knn_bow.sav'
filename_tfidf = 'models/knn_tfidf.sav'
pickle.dump(clf_bow_knn, open(filename_bow, 'wb'))
pickle.dump(clf_tfidf_knn, open(filename_tfidf, 'wb'))

In [108]:
# for bag of words
clf_bow_svm = train_classifier(X_train_bow, y_train, rf_clf)
# for ifidf
clf_tfidf_svm = train_classifier(X_train_tfidf, y_train, rf_clf)

filename_bow = 'models/rf_bow.sav'
filename_tfidf = 'models/rf_tfidf.sav'
pickle.dump(clf_bow_rf, open(filename_bow, 'wb'))
pickle.dump(clf_tfidf_rf, open(filename_tfidf, 'wb'))

In [109]:
# for bag of words
clf_bow_ridge = train_classifier(X_train_bow, y_train, ridge_clf)
# for ifidf
clf_tfidf_ridge = train_classifier(X_train_tfidf, y_train, ridge_clf)

filename_bow = 'models/ridge_bow.sav'
filename_tfidf = 'models/ridge_tfidf.sav'
pickle.dump(clf_bow_ridge, open(filename_bow, 'wb'))
pickle.dump(clf_tfidf_ridge, open(filename_tfidf, 'wb'))

## <u>Evaluation metrics

In [1]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score 
from sklearn.metrics import average_precision_score
from sklearn.metrics import recall_score

In [81]:
# gives evaluation statistics
def evaluate_classifiers(y_val, predicted):
    print('Accuracy: '+ str(accuracy_score(y_val, predicted)*100))
    print(average_precision_score(y_val, predicted))

In [112]:
# make predictions 
pred_val_bow_ridge = clf_bow_ridge.predict(X_val_bow)
pred_val_tfidf_ridge = clf_tfidf_ridge.predict(X_val_tfidf)

pred_val_bow_rf = clf_bow_rf.predict(X_val_bow)
pred_val_tfidf_rf = clf_tfidf_rf.predict(X_val_tfidf)

pred_val_bow_lr = clf_bow_lr.predict(X_val_bow)
pred_val_tfidf_lr = clf_tfidf_lr.predict(X_val_tfidf)

#pred_val_bow_nb = clf_bow_nb.predict(X_val_bow)
#pred_val_tfidf_nb = clf_tfidf_nb.predict(X_val_tfidf)

pred_val_bow_svm = clf_bow_svm.predict(X_val_bow)
pred_val_tfidf_svm = clf_tfidf_svm.predict(X_val_tfidf)

pred_val_bow_knn = clf_bow_knn.predict(X_val_bow)
pred_val_tfidf_knn = clf_tfidf_knn.predict(X_val_tfidf)

NameError: name 'clf_bow_svm' is not defined

In [82]:
print('Ridge')
print('Bag of words')
evaluate_classifiers(y_val, pred_val_bow_ridge)
print('Tf-IDF')
evaluate_classifiers(y_val, pred_val_tfidf_ridge)

Bag-of-words
0.34696666666666665


  'precision', 'predicted', average, warn_for)


0.6266790565285131
0.3470088612540947
Tfidf
0.3620333333333333
0.6449542259718493
0.3586114787710488


In [None]:
print('Ridge')
print('Bag of words')
evaluate_classifiers(y_val, pred_val_bow_ridge)
print('Tf-IDF')
evaluate_classifiers(y_val, pred_val_tfidf_ridge)

In [None]:
print('Random Forest')
print('Bag of words')
evaluate_classifiers(y_val, pred_val_bow_rf)
print('Tf-IDF')
evaluate_classifiers(y_val, pred_val_tfidf_rf)

In [None]:
print('Logistic Regression')
print('Bag of words')
evaluate_classifiers(y_val, pred_val_bow_lr)
print('Tf-IDF')
evaluate_classifiers(y_val, pred_val_tfidf_lr)

In [None]:
print('KNN')
print('Bag of words')
evaluate_classifiers(y_val, pred_val_bow_knn)
print('Tf-IDF')
evaluate_classifiers(y_val, pred_val_tfidf_knn)

In [None]:
print('SVM')
print('Bag of words')
evaluate_classifiers(y_val, pred_val_bow_svm)
print('Tf-IDF')
evaluate_classifiers(y_val, pred_val_tfidf_svm)

In [None]:
print('Naive Bayes')
print('Bag of words')
evaluate_classifiers(y_val, pred_val_bow_nb)
print('Tf-IDF')
evaluate_classifiers(y_val, pred_val_tfidf_nb)

In [79]:
# convert back the predictions to the original tags they are suppose to
pred_val_inverse = mlb_object.inverse_transform(pred_val_tfidf)
# convert the original tag labels
y_val_inverse = mlb_object.inverse_transform(y_val)

for i in range(3):
    print('Query:\t{}\nTrue labels:\t{}\nPredicted labels:\t{}\n\n'.format(
        X_val[i],
        ','.join(y_val_inverse[i]),
        ','.join(pred_val_inverse[i])
    ))

Title:	odbc_exec always fail
True labels:	php,sql
Predicted labels:	


Title:	access base classes variable within child class
True labels:	javascript
Predicted labels:	


Title:	contenttype application json required rails
True labels:	ruby,ruby-on-rails
Predicted labels:	ruby-on-rails


