# process data & feature extract

In [72]:
%matplotlib inline
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
import pandas as pd
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from string import punctuation
from collections import Counter
import re
import numpy as np
import itertools
from sklearn.feature_extraction.text import TfidfVectorizer

In [60]:
# read sample data
df = pd.read_csv('input/stack_ds_4_9_2017 .csv',sep=',',quotechar='|',header=None)
df.columns = ['title','body','tags']
df.head(2)

Unnamed: 0,title,body,tags
0,How is viewport size typically passed as an ar...,I'm looking at the docs for a command line too...,<html><web><command-line>
1,mysqli_query insert success and throws error,This issue has baffled me. I have a simple mys...,<php><mysqli>


**notice that the data is already filtered with code and images etc.**

In [61]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4533 entries, 0 to 4532
Data columns (total 3 columns):
title    4533 non-null object
body     4533 non-null object
tags     4533 non-null object
dtypes: object(3)
memory usage: 106.3+ KB


In [62]:
# merge title and body so that we only have one feature to consider
merged = [ title + ' ' + body for title, body in zip(df.title,df.body)]
df_merged = pd.DataFrame({'content':merged,'tags':df.tags})
df_merged.head(2)

Unnamed: 0,content,tags
0,How is viewport size typically passed as an ar...,<html><web><command-line>
1,mysqli_query insert success and throws error T...,<php><mysqli>


In [63]:
# how many tags we have?
all_tags = df.tags.apply(lambda x: x.replace('<','').split('>'))
all_tags = [x for x in list(itertools.chain(*all_tags)) if x ]
ct = Counter(all_tags)
len(ct),len(df)

(3591, 4533)

tokenize content:
- lowercase 
- remove stopwords and punctuation

In [64]:
#take a look at word_tokenize
word_tokenize(df_merged.content[0]);
# seems not bad

In [66]:
# create tokenizer for tf-idf function
to_be_removed = list(punctuation) + stopwords.words('english')
tokenizer = lambda x : [ word.lower() for word in word_tokenize(x) if word not in to_be_removed]
# df_merged.content = df_merged.content.apply(tokenizer)

**apply tf-idf**

In [78]:
%%time
# para to be tweaked, we start from a simple one
tfidf = TfidfVectorizer(min_df=0.01,max_df=0.95, max_features=None, tokenizer= tokenizer, ngram_range=(1,2))
tfidf_trained = tfidf.fit_transform(list(df_merged.content))

CPU times: user 4.7 s, sys: 12 ms, total: 4.71 s
Wall time: 4.71 s


In [79]:
tfidf_transformed.shape
df_tfidf = pd.DataFrame({'token':tfidf.get_feature_names(),'tfidf_value':tfidf.idf_})

(4533, 704)

In [80]:
df_tfidf.sort_values('tfidf_value',ascending=False)

Unnamed: 0,tfidf_value,token
511,5.569212,public
520,5.569212,range
732,5.569212,whenever
641,5.569212,style
409,5.569212,messages
274,5.569212,game
242,5.569212,extract
480,5.569212,perfectly
236,5.569212,executed
19,5.569212,/div
