# Predict Stack Overflow tags

Baseline model: each post is encoded as a binary bag-of-words vector (vocabulary capped at 1,000 words) and classified by a network with a single hidden dense layer.

In [1]:
import numpy as np
import pandas as pd
import os
import tensorflow as tf

from sklearn.preprocessing import LabelBinarizer, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout, LSTM
from keras.preprocessing import text, sequence
from keras.utils import to_categorical
import matplotlib.pyplot as plt
%matplotlib inline

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
# Read every file under data/ as CSV and stack the results into one frame.
# The pieces are collected in a list and concatenated once: DataFrame.append
# re-copied the whole accumulated frame on each iteration (quadratic) and was
# removed in pandas 2.0.
frames = []
for x in os.listdir('data/'):
    loc = 'data/{0}'.format(x)
    query_results = pd.read_csv(loc)
    frames.append(query_results)
# ignore_index=True yields a fresh 0..n-1 index, which is what the former
# reset_index(drop=True) produced.
df = pd.concat(frames, ignore_index=True)
df.head()

Unnamed: 0,post,tags
0,c++ this to write object into binary file i v...,c++
1,move constructor call optimization on explicit...,c++
2,how to print the multimap elements having vect...,c++
3,forward declaration of class in a.h i have th...,c++
4,why is this legal c++ typedef func i did thi...,c++


In [3]:
#df = pd.read_csv('stack_overflow_tags.csv')
#df.head()

In [4]:
# Size of the combined frame as (rows, columns).
df.shape

(40000, 2)

#### Mean post word count

In [5]:
def word_count(x):
    """Return the number of whitespace-separated tokens in the string ``x``."""
    tokens = x.split()
    return len(tokens)

In [6]:
# Average number of whitespace-separated tokens per post.
df['post'].apply(word_count).mean()

141.23445

#### Inspecting posts

In [7]:
# Raw text of the post stored under index label 1.
df.loc[1, 'post']

'move constructor call optimization on explicitly created rvalue <pre><code>class c { public:     c()     {         std::cout &lt;&lt;  c()  &lt;&lt; std::endl;     }     c(c &amp;&amp;c)     {         std::cout &lt;&lt;  c(c &amp;&amp;)  &lt;&lt; std::endl;     } };  int main() {     c c = c(); } </code></pre>   i ve assumed this would print<br> <code>c()</code><br> <code>c(c &amp;&amp;)</code> <br>since c() creates a temporary object (rvalue)  but this actually prints only<br> <code>c()</code><br>in msvc. is this because of some kind of optimization'

In [8]:
# Raw text of the post stored under index label 10.
df.loc[10, 'post']



In [9]:
#df['post'].loc[1000]

In [10]:
# Raw text of the post stored under index label 10000.
df.loc[10000, 'post']

'how to make a responsive step bar in html css  i want to make a responsive step bar with year on top side of step circle and text in bottom of the circle like this one:  <a href= https://i.stack.imgur.com/y4zws.png  rel= nofollow noreferrer ><img src= https://i.stack.imgur.com/y4zws.png  alt= enter image description here ></a>    here is my html code.   <pre><code>&lt;div class= container &gt;       &lt;ul class= progressbar &gt;         &lt;li&gt;visionet was founded lorem ipsum dolor sit amet new york usa by arshad masood&lt;/li&gt;           &lt;li&gt;visionet was founded lorem ipsum dolor sit amet new york usa by arshad masood&lt;/li&gt;           &lt;li&gt;visionet was founded lorem ipsum dolor sit amet new york usa by arshad masood&lt;/li&gt;           &lt;li&gt;visionet was founded lorem ipsum dolor sit amet new york usa by arshad masood&lt;/li&gt;           &lt;li&gt;visionet was founded lorem ipsum dolor sit amet new york usa by arshad masood&lt;/li&gt;           &lt;li id= l

In [11]:
# Raw text of the post stored under index label 30000.
df.loc[30000, 'post']

'can anyone tell me how to display images from database without using gridview or repeater controls   please tell  how to display images from local folder in div without using gridview and other displaying controls and please see <a href= http://www.google.co.in/imgres biw=1366&amp;bih=600&amp;tbm=isch&amp;tbnid=j5q6cvwn3lo0km%3a&amp;imgrefurl=http://www.techalam.com/olx-in-a-great-place-to-buy-sell-stuff-online/&amp;docid=ieirb6zbxkzcom&amp;imgurl=http://www.techalam.com/wp-content/uploads/2013/09/olx-3.jpg&amp;w=350&amp;h=217&amp;ei=m20ku-o_dm7graf8tygicq&amp;zoom=1&amp;ved=0coubeiqcmc8&amp;iact=rc&amp;dur=1752&amp;page=3&amp;start=46&amp;ndsp=19  rel= nofollow >this example</a> because i want to display the images like that.'

In [12]:
# Number of posts per tag, to check how balanced the classes are.
df['tags'].value_counts()

ruby-on-rails    2000
objective-c      2000
android          2000
css              2000
c                2000
.net             2000
c#               2000
angularjs        2000
jquery           2000
sql              2000
mysql            2000
html             2000
iphone           2000
ios              2000
python           2000
java             2000
c++              2000
javascript       2000
php              2000
asp.net          2000
Name: tags, dtype: int64

In [13]:
# Hold out 20% of the posts for the final evaluation.
# random_state pins the shuffle so the split -- and every metric derived from
# it -- is identical after Restart & Run All; stratify keeps the per-tag
# proportions the same in the train and test partitions.
X_train, X_test, y_train, y_test = train_test_split(
    df['post'],
    df['tags'],
    test_size=0.2,
    random_state=42,
    stratify=df['tags'],
)

### Text preprocessing

Tokenize

In [14]:
# Width of the feature vector / vocabulary cap passed to the tokenizer.
max_words = 1000
# Word-level (not character-level) tokenizer limited to max_words entries.
tokenize = text.Tokenizer(char_level=False, num_words=max_words)

In [15]:
# Build the vocabulary from the training posts only, so nothing about the
# held-out posts influences the features.
tokenize.fit_on_texts(X_train)
# Encode both splits with that vocabulary. 'binary' is the Keras default for
# texts_to_matrix and is spelled out here for the reader.
X_train, X_test = (
    tokenize.texts_to_matrix(posts, mode='binary')
    for posts in (X_train, X_test)
)

In [16]:
# Expect (number of training posts, max_words).
X_train.shape

(32000, 1000)

In [17]:
# Map each tag string to an integer class id. The mapping is learned from the
# training labels and then reused unchanged for the test labels.
encoder = LabelEncoder()
y_train = encoder.fit_transform(y_train)
y_test = encoder.transform(y_test)

In [18]:
# Peek at the encoded feature matrix (numpy abbreviates the display).
X_train

array([[0., 1., 1., ..., 0., 0., 0.],
       [0., 1., 1., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       ...,
       [0., 1., 1., ..., 0., 0., 0.],
       [0., 1., 1., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.]])

One-hot encode the labels

In [19]:
# Class ids run from 0 to max, so the class count is the largest id plus one.
num_classes = np.max(y_train) + 1
# Expand the integer ids of both splits into one-hot rows of that width.
y_train, y_test = (
    to_categorical(labels, num_classes)
    for labels in (y_train, y_test)
)

### Model

In [20]:
# Feed-forward baseline: one 512-unit ReLU hidden layer with dropout,
# followed by a softmax over the tag classes.
model = Sequential([
    Dense(512, input_shape=(max_words,), activation='relu'),
    Dropout(0.5),
    Dense(num_classes),
    Activation('softmax'),
])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 512)               512512    
_________________________________________________________________
dropout_1 (Dropout)          (None, 512)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 20)                10260     
_________________________________________________________________
activation_1 (Activation)    (None, 20)                0         
Total params: 522,772
Trainable params: 522,772
Non-trainable params: 0
_________________________________________________________________


In [21]:
# Categorical cross-entropy matches the one-hot labels and softmax output.
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

Instructions for updating:
keep_dims is deprecated, use keepdims instead
Instructions for updating:
keep_dims is deprecated, use keepdims instead


In [22]:
# Train for 3 epochs, setting aside 10% of the training data for validation.
history = model.fit(
    X_train,
    y_train,
    validation_split=0.1,
    epochs=3,
    batch_size=32,
    verbose=1,
)

Train on 28800 samples, validate on 3200 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


In [23]:
# Score the trained model on the held-out test split.
score = model.evaluate(X_test, y_test, batch_size=32, verbose=1)

# evaluate() returns the loss followed by the metrics given to compile(),
# which here is accuracy only.
loss, accuracy = score
print("Loss: {}".format(loss))
print("Accuracy: {}".format(accuracy))

Loss: 0.5989817411899566
Accuracy: 0.804625
