# Multiclass Text Tagging with Doc2Vec 

In [2]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from gensim.models.doc2vec import TaggedDocument
from tqdm import tqdm
import time
import numba
import multiprocessing as mp

## Preprocess text

In [2]:
def tokenize_text(text):
    """Split raw post text into lower-cased word tokens.

    Closing ``</p>`` tags are replaced by a space, the text is split into
    sentences and then words with NLTK, and tokens shorter than two
    characters are dropped.

    Args:
        text: Raw text to tokenize. Non-string values (``None``, or the
            float NaN pandas produces for an empty CSV cell) are treated
            as empty text instead of raising ``AttributeError``.

    Returns:
        A list of lower-cased token strings; an empty list when there is
        nothing to tokenize.
    """
    # Missing cells arrive as float NaN / None, which have no ``.replace``.
    if not isinstance(text, str):
        return []

    text = text.replace("</p>", " ")
    # Blank text yields no tokens anyway; skip the NLTK calls entirely.
    if not text.strip():
        return []

    tokens = []
    for sent in nltk.sent_tokenize(text):
        for word in nltk.word_tokenize(sent):
            # Single characters (punctuation, stray tag brackets) are noise.
            if len(word) >= 2:
                tokens.append(word.lower())

    return tokens

In [10]:
# Open the training CSV as an iterator of 60000-row chunks, indexed by "Id",
# rather than loading the whole file at once.
traindf = pd.read_csv(
    "data/Train.csv",
    index_col="Id",
    chunksize=60000,
)

In [None]:
docs = []  # never filled in this cell; name kept in case later cells use it


def proc_func(x):
    """Build a TaggedDocument from one row of the training data.

    Args:
        x: A row exposing ``Title``, ``Body`` and ``Tags`` attributes.

    Returns:
        A ``TaggedDocument`` whose words are the tokens of the title followed
        by the tokens of the body, and whose tags are the whitespace-separated
        entries of ``Tags``.
    """
    return TaggedDocument(
        words=tokenize_text(x.Title) + tokenize_text(x.Body),
        tags=str(x.Tags).split(),
    )


# The row function only manipulates Python strings and gensim objects, which
# numba cannot compile to native code: wrapping it in ``numba.jit`` gives no
# speedup and fails outright where nopython mode is the default. The plain
# function is used instead; ``jitted`` is kept as an alias for compatibility.
jitted = proc_func

for i, chunk in enumerate(traindf):
    print("Processing chunk %s" % i)
    tagged = chunk.apply(proc_func, axis=1)
    # ``len`` prints the same text as the 1-tuple shape did, without relying
    # on %-formatting unpacking a single-element tuple.
    print(" - chunk shape: %s" % len(tagged))
    # Each chunk is stored under its own key in one HDF file.
    tagged.to_hdf("data/tagged.hdf", key="chunk_%s" % i)

Processing chunk 0
 - chunk shape: 60000
Processing chunk 1
 - chunk shape: 60000
Processing chunk 2
 - chunk shape: 60000
Processing chunk 3
 - chunk shape: 60000
Processing chunk 4
 - chunk shape: 60000
Processing chunk 5
 - chunk shape: 60000
Processing chunk 6
 - chunk shape: 60000
Processing chunk 7
 - chunk shape: 60000
Processing chunk 8
 - chunk shape: 60000
Processing chunk 9
 - chunk shape: 60000
Processing chunk 10
 - chunk shape: 60000
Processing chunk 11
 - chunk shape: 60000
Processing chunk 12
 - chunk shape: 60000
Processing chunk 13
 - chunk shape: 60000
Processing chunk 14
 - chunk shape: 60000
Processing chunk 15
 - chunk shape: 60000
Processing chunk 16
 - chunk shape: 60000
Processing chunk 17
 - chunk shape: 60000
Processing chunk 18
 - chunk shape: 60000
Processing chunk 19
 - chunk shape: 60000
Processing chunk 20
 - chunk shape: 60000
Processing chunk 21
 - chunk shape: 60000
Processing chunk 22
 - chunk shape: 60000
Processing chunk 23
 - chunk shape: 60000
Pr

## Read a tagged chunk back from the HDF file

In [9]:
# Load one of the stored chunks (key "chunk_2") back from the HDF file.
df = pd.read_hdf(
    "data/tagged.hdf",
    key="chunk_2",
)

In [10]:
# Display the dimensions of the loaded chunk.
df.shape

(60000,)

In [13]:
# Inspect the `words` attribute of the first entry in the loaded chunk.
df.iloc[0].words

['function',
 'declaration',
 'parameter',
 'naming',
 'best',
 'practices',
 'c++',
 'in',
 'function',
 'declaration',
 'while',
 'the',
 'parameters',
 'do',
 'not',
 'have',
 'to',
 'be',
 'named',
 'is',
 'it',
 'preferable',
 'to',
 'have',
 'them',
 'named',
 'what',
 'are',
 'the',
 'advantages',
 'and',
 'disadvantages',
 'of',
 'this']