# <u>StackOverflow Tag Predictor</u>
StackOverflow lets users post questions that other users can answer. The site uses tags to organize questions effectively. Here we will predict the tags for a given question. Tags such as C, C++, and Python are widely used.

In [60]:
import nltk
from nltk.corpus import stopwords
from ast import literal_eval
import pandas as pd
import numpy as np
from tqdm import tqdm
import re
from collections import defaultdict
from scipy import sparse
from sklearn.feature_extraction.text import TfidfVectorizer

### <u>Data loading</u>

In [2]:
# for reading the data
def load_data(dirname):
    """Read a tab-separated question file and parse its ``tags`` column.

    Args:
        dirname: path of the TSV file to read (despite the name, this is
            handed straight to ``pd.read_csv`` as the file to open).

    Returns:
        The loaded DataFrame, with every ``tags`` cell converted from its
        string form into the Python literal it spells out.
    """
    frame = pd.read_csv(dirname, sep='\t')
    # Tags are stored as text such as "['php', 'mysql']"; literal_eval turns
    # that text back into a real Python object without executing code.
    parsed_tags = frame['tags'].apply(literal_eval)
    frame['tags'] = parsed_tags
    return frame

In [5]:
# Load the labelled training and validation splits; load_data also parses each
# row's `tags` string with literal_eval.
train_data = load_data('dataset/train.tsv')
val_data = load_data('dataset/validation.tsv')

In [6]:
# Test split: read with plain read_csv instead of load_data, so no `tags`
# parsing is attempted here.
# NOTE(review): presumably the test file has no `tags` column — confirm.
test_data = pd.read_csv('dataset/test.tsv', sep='\t')

In [8]:
# Preview the first rows of the training frame to check the loaded columns.
train_data.head()

Unnamed: 0,title,tags
0,How to draw a stacked dotplot in R?,[r]
1,mysql select all records where a datetime fiel...,"[php, mysql]"
2,How to terminate windows phone 8.1 app,[c#]
3,get current time in a specific country via jquery,"[javascript, jquery]"
4,Configuring Tomcat to Use SSL,[java]


In [9]:
# Split each labelled frame into model inputs (question titles) and
# targets (the parsed tags), as NumPy arrays.
X_train, y_train = train_data['title'].values, train_data['tags'].values
X_val, y_val = val_data['title'].values, val_data['tags'].values

In [10]:
# Sanity check: inputs and targets of each split must have matching lengths.
print(X_train.shape, y_train.shape, X_val.shape, y_val.shape, sep='\n')

(100000,)
(100000,)
(30000,)
(30000,)


### <u>Text Preprocessing</u>
We lowercase the text, remove punctuation, unnecessary whitespace and some other characters, and drop English stopwords.

In [11]:
# preprocess text
# Patterns are compiled once at definition time. Raw strings keep the
# backslashes literal; the non-raw form relies on invalid escape sequences
# (`\[`, `\]`, `\|`) that newer Python versions warn about.
_SEPARATORS_RE = re.compile(r'[/(){}\[\]\|@,;]')
_DISALLOWED_RE = re.compile(r'[^0-9a-z #+_]')
# Lazily filled cache so the NLTK stopword list is read once, not once per text.
_STOPWORDS_CACHE = None


def preprocess_data(text, stop_words=None):
    """Normalise one question title for feature extraction.

    Steps, in order:
      1. lowercase the text;
      2. replace separator characters (``/ ( ) { } [ ] | @ , ;``) with a space;
      3. delete every character that is not a digit, a lowercase ASCII
         letter, a space, ``#``, ``+`` or ``_``;
      4. collapse whitespace and drop stopwords.

    Args:
        text: the raw string to clean.
        stop_words: optional collection of words to drop. When ``None``
            (the default) the NLTK English stopword list is used, loaded on
            first use and cached for later calls.

    Returns:
        The cleaned text as a single space-separated string.
    """
    global _STOPWORDS_CACHE
    if stop_words is None:
        if _STOPWORDS_CACHE is None:
            _STOPWORDS_CACHE = frozenset(stopwords.words('english'))
        stop_words = _STOPWORDS_CACHE

    text = text.lower()
    # Separators become spaces so that "a/b" splits into two tokens ...
    text = _SEPARATORS_RE.sub(' ', text)
    # ... while any other unwanted symbol is simply removed ("r?" -> "r").
    text = _DISALLOWED_RE.sub('', text)
    return ' '.join(word for word in text.split() if word not in stop_words)

In [16]:
# Clean every title in both splits with preprocess_data.
X_train = list(map(preprocess_data, X_train))
X_val = list(map(preprocess_data, X_val))

Find word and tag frequencies

In [18]:
def compute_frequency(X_train, y_train, show_progress=True):
    """Count how often each word and each tag occurs in the training data.

    Args:
        X_train: iterable of text strings; each is split on whitespace and
            every resulting token is counted.
        y_train: iterable whose items are iterables of tags (one per text).
        show_progress: when True (the default) both passes are wrapped in a
            tqdm progress bar; pass False to run silently.

    Returns:
        ``(word_counts, tag_counts)``: two ``defaultdict(int)`` objects
        mapping word -> occurrences and tag -> occurrences.
    """
    def _track(items, label):
        # Hand tqdm the collection itself rather than an enumerate() wrapper:
        # enumerate has no len(), which leaves the bar without a total or ETA.
        return tqdm(items, desc=label) if show_progress else items

    # dictionary of all tags with their frequency.
    tag_counts = defaultdict(int)
    # dictionary of all words with their frequency.
    word_counts = defaultdict(int)

    for tags in _track(y_train, 'tags'):
        for tag in tags:
            tag_counts[tag] += 1

    for sentence in _track(X_train, 'words'):
        for word in sentence.split():
            word_counts[word] += 1

    return word_counts, tag_counts

In [19]:
# Corpus-wide word and tag frequencies, computed from the training split only.
word_counts, tag_counts = compute_frequency(X_train, y_train)

100000it [00:00, 1279796.91it/s]
100000it [00:00, 399972.54it/s]


We will create a vocabulary dictionary of the top **N** words from the training data. We need two mappings:<br>
1) Words to index<br>
2) Index to words

In [56]:
# for creating word to index and vice versa mappings
def create_vocabulary_mappings(X_train, word_counts, DICT_SIZE=4500):
    """Build word <-> index lookups for the most frequent words.

    Args:
        X_train: not used by this function; kept so existing callers that
            pass it keep working.
        word_counts: mapping of word -> frequency.
        DICT_SIZE: maximum number of words to keep.

    Returns:
        ``(word_to_idx, idx_to_word)``. Index 0 is the most frequent word;
        words with equal counts keep the order they have in ``word_counts``.
    """
    # Stable sort, highest count first; ties stay in their original order.
    ranked = sorted(word_counts.items(), key=lambda item: item[1], reverse=True)

    word_to_idx = {}
    idx_to_word = {}
    # Fill both directions in a single pass over the kept words.
    for position, (word, _count) in enumerate(ranked[:DICT_SIZE]):
        word_to_idx[word] = position
        idx_to_word[position] = word

    return word_to_idx, idx_to_word

In [57]:
# Vocabulary size. The same constant sizes the bag-of-words vectors further
# down, so pass it through instead of repeating the literal; otherwise editing
# DICT_SIZE would leave the vocabulary and the vector width out of sync.
DICT_SIZE = 4500
word_to_idx, idx_to_word = create_vocabulary_mappings(X_train, word_counts, DICT_SIZE=DICT_SIZE)

Now we will try two feature representations: Bag of Words (BOW) and TF-IDF. First we will create a function for **BOW**. For BOW we will use the 4500 most commonly used words.

### Bag of Words

In [50]:
# for creating BOW representation
def create_bag_of_words(text, word_to_idx, DICT_SIZE):
    """Turn one text into a dense bag-of-words count vector.

    Args:
        text: string that is split on whitespace into tokens.
        word_to_idx: mapping of known word -> position in the vector.
        DICT_SIZE: length of the returned vector.

    Returns:
        A 1-D NumPy array of length ``DICT_SIZE`` in which entry ``i`` is the
        number of times the word mapped to ``i`` occurs in ``text``. Tokens
        missing from ``word_to_idx`` are ignored.
    """
    counts = np.zeros(DICT_SIZE)

    # Resolve in-vocabulary tokens to their slots, then bump each slot once
    # per occurrence.
    known_positions = (word_to_idx[token] for token in text.split() if token in word_to_idx)
    for position in known_positions:
        counts[position] += 1

    return counts

In [59]:
# Build the bag-of-words feature matrices in sparse (CSR) form: each text
# becomes one 1 x DICT_SIZE CSR row and the rows are stacked per split.
def _stack_bow_rows(texts):
    """Return the stacked sparse bag-of-words matrix for an iterable of texts."""
    rows = [sparse.csr_matrix(create_bag_of_words(text, word_to_idx, DICT_SIZE)) for text in texts]
    return sparse.vstack(rows)


X_train_bow = _stack_bow_rows(X_train)
X_val_bow = _stack_bow_rows(X_val)

print('X_train shape ', X_train_bow.shape)
print('X_val shape ', X_val_bow.shape)

X_train shape  (100000, 4500)
X_val shape  (30000, 4500)


### TF-IDF

In [63]:
# creates tf-idf feature vector
def create_tfidf_features(X_train, X_val):
    """Build TF-IDF features for the training and validation texts.

    The vectorizer is fitted on ``X_train`` only and then applied unchanged
    to ``X_val``, so the validation texts do not influence the vocabulary or
    the IDF weights.

    Args:
        X_train: iterable of training text strings.
        X_val: iterable of validation text strings.

    Returns:
        ``(train_features, val_features, vocabulary)`` where the first two
        are the matrices produced by the vectorizer and ``vocabulary`` is its
        fitted ``vocabulary_`` mapping (term -> column index).
    """
    # Settings: unigrams and bigrams; terms are filtered by document frequency
    # (max_df=0.9, min_df=5); a token is any run of non-whitespace characters,
    # so tags such as "c++" or "c#" survive tokenisation.
    # The pattern is a raw string because '\S' in a plain literal is an
    # invalid escape sequence that newer Python versions warn about.
    tfidf = TfidfVectorizer(ngram_range=(1, 2), max_df=0.9, min_df=5, token_pattern=r'(\S+)')

    # Use fresh names instead of rebinding the parameters to the outputs.
    train_features = tfidf.fit_transform(X_train)
    val_features = tfidf.transform(X_val)

    return train_features, val_features, tfidf.vocabulary_

In [64]:
# Compute TF-IDF features for both splits, then invert the term -> column
# vocabulary so a column index can be mapped back to its term.
X_train_tfidf, X_val_tfidf, tfidf_vocab = create_tfidf_features(X_train, X_val)
tfidf_reverse_vocab = {column: term for term, column in tfidf_vocab.items()}

## <u>Classifier</u>

## <u>Training</u>

## <u>Evaluation metrics</u>