# Tokenize Data
Extract questions directly from MongoDB and dump them to a CSV file, then perform tokenization and tf-idf analysis.

`mongoexport -h localhost -d stackoverflow -c posts --type=csv --fields Id,Title,Tags,Score -q '{"PostTypeId":"1"}' --out questions_with_scores.csv`

In [1]:
%matplotlib inline

In [2]:
import pandas as pd
import numpy as np
import re
import spacy

In [3]:
from spacy.lang.en.stop_words import STOP_WORDS
# Load spaCy's large English pipeline ('en_core_web_lg'); the resulting `nlp`
# callable is applied to the question text further down.
nlp = spacy.load('en_core_web_lg')

In [22]:
def mkstr_tags(tags):
    """
    Convert a serialized tag list into a space-separated string.

    Every double quote and square bracket is removed from ``tags`` and each
    comma is replaced by a space, e.g. ``'["php","mysql"]'`` becomes
    ``'php mysql'``.

    Parameters
    ----------
    tags : str
        Tag list as stored in the CSV, e.g. ``'["java","foreach"]'``.

    Returns
    -------
    str
        The input with quotes/brackets stripped and commas turned into spaces.
    """
    # Raw string: in a normal literal '\[' and '\]' are invalid escape
    # sequences, which Python warns about. '+' instead of '*' avoids
    # pointless zero-width matches; the result is the same.
    val = re.sub(r'["\[\]]+', '', tags).replace(',', ' ')
    return val

In [20]:
def split_tags(tags):
    """
    Split a space-separated tag string into a list of tags.

    Parameters
    ----------
    tags : str
        Tags separated by whitespace, e.g. ``'php mysql pdo'``.

    Returns
    -------
    list of str
        One entry per tag; an empty or all-whitespace string yields ``[]``.
    """
    # split() with no argument discards empty pieces, so '' -> [] and
    # repeated spaces never produce '' entries that would later be counted
    # as if they were a tag.
    val = tags.split()
    return val

## Read in the question titles and tags and create strings
Here we convert each question's tag list into a space-separated string and then concatenate it with the title.

In [23]:
# CSV of questions; the columns used below are Title, Tags and Score.
FILE = 'questions_with_scores.csv'
df = pd.read_csv(FILE)

In [27]:
# Strip brackets/quotes from the serialized tag lists, leaving
# space-separated tags.
processed_tags = df['Tags'].apply(mkstr_tags)
df['Tags'] = processed_tags
# Cast the scores to integers.
df['Score'] = df['Score'].astype(int)
# 'Data' is the title followed by the cleaned tags, joined by one space;
# this is the text passed to spaCy further down.
df['Data'] = df['Title'] + ' ' + df['Tags']

In [26]:
# Preview the first ten rows. Run this after the processing cell so that
# 'Tags' shows the cleaned strings and the 'Data' column is present.
df.head(10)

Unnamed: 0,Id,Title,Tags,Score
0,33679237,Elasticsearch include field in result set of a...,"[""elasticsearch""]",0
1,33679239,find array item in PHP,"[""php""]",1
2,33679246,Best Way to View my TukeyHSD Output in R,"[""r""]",0
3,33679249,PDOStatement not returning result in Drupal views,"[""php"",""mysql"",""drupal"",""pdo"",""drupal-7""]",1
4,33679252,Getting images dynamically and placing at thei...,"[""android"",""html"",""xml"",""image"",""layout""]",0
5,33679254,Query Parameter Use in Valence,"[""api"",""desire2learn"",""valence""]",0
6,33679257,Cannot initialize the indexer process Magento 1.9,"[""sql"",""database"",""magento"",""catalog""]",0
7,33679260,how to remove bootstrap tags input Events from...,"[""javascript"",""jquery"",""twitter-bootstrap""]",0
8,33679261,create a class that changes uppercase to lower...,"[""java"",""stream""]",0
9,33679265,Iterable.forEach(Consumer),"[""java"",""foreach""]",0


The question title and tags are concatenated into the `Data` column above to form strings for parsing. Below, we count how often each tag occurs.

In [21]:
from collections import Counter
# How many of the most frequent tags to display. The complete ranking stays
# available in `tags_counted`.
TOP_N_TAGS = 50

# One list of tags per question.
tags_list = df['Tags'].apply(split_tags).tolist()
# Flatten into a single list so that every tag occurrence is counted.
flat_list = [tag for question_tags in tags_list for tag in question_tags]
tags_counted = Counter(flat_list)
# Show only the head of the ranking rather than dumping every distinct tag
# into the cell output.
tags_counted.most_common(TOP_N_TAGS)

[('javascript', 220026),
 ('java', 187044),
 ('php', 155471),
 ('android', 151764),
 ('c#', 149508),
 ('python', 131868),
 ('jquery', 119539),
 ('html', 101248),
 ('ios', 78798),
 ('c++', 73186),
 ('css', 72179),
 ('mysql', 67640),
 ('sql', 58053),
 ('asp.net', 37773),
 ('ruby-on-rails', 36610),
 ('arrays', 36309),
 ('c', 36107),
 ('objective-c', 34773),
 ('angularjs', 33940),
 ('r', 33516),
 ('node.js', 33063),
 ('json', 32258),
 ('sql-server', 29255),
 ('.net', 27238),
 ('swift', 25844),
 ('ruby', 24446),
 ('regex', 24394),
 ('ajax', 24181),
 ('django', 22321),
 ('xml', 21662),
 ('linux', 21413),
 ('iphone', 20579),
 ('asp.net-mvc', 20368),
 ('excel', 19497),
 ('database', 18061),
 ('spring', 18039),
 ('wordpress', 17971),
 ('angular', 17304),
 ('string', 16562),
 ('wpf', 16300),
 ('html5', 15745),
 ('python-3.x', 15448),
 ('xcode', 14857),
 ('eclipse', 14459),
 ('vb.net', 14350),
 ('windows', 14324),
 ('mongodb', 14292),
 ('multithreading', 13656),
 ('bash', 13355),
 ('vba', 13248),

## Use `spacy` to tokenize and lemmatize data
- Remove stop words
- Calculate tf-idf vectors

`spaCy` could probably lemmatize the list of tags for me as well.

In [31]:
# Run the spaCy pipeline on the 'Data' string of the second row (position 1).
doc = nlp(df['Data'].iloc[1])

In [32]:
# Token attributes to show, in display order.
token_fields = ('text', 'lemma_', 'pos_', 'tag_', 'dep_', 'shape_', 'is_alpha', 'is_stop')
# One line per token, attribute values separated by single spaces.
for token in doc:
    print(*(getattr(token, field) for field in token_fields))

find find VERB VB ROOT xxxx True False
array array NOUN NN compound xxxx True False
item item NOUN NN dobj xxxx True False
in in ADP IN prep xx True False
PHP php PROPN NNP compound XXX True False
php php NOUN NN pobj xxx True False
