In [18]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

In [19]:
import itertools
import os

%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf

from sklearn.preprocessing import LabelBinarizer, LabelEncoder
from sklearn.metrics import confusion_matrix

# Michael change
#from tensorflow import keras
import keras

from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout
from keras.preprocessing import text, sequence
from keras import utils

# This code was tested with TensorFlow v1.4
print("You have TensorFlow version", tf.__version__)

You have TensorFlow version 1.5.0


# Get data


In [20]:
data_stack = pd.read_csv('DB/stack-overflow-data.csv')
read = open('DB/stack-overflow-data-tags')
tags_stack = read.readline().split('|')
read.close()

In [21]:
data_stack.head()

Unnamed: 0,post,tags
0,what is causing this behavior in our c# datet...,c#
1,have dynamic html load as if it was in an ifra...,asp.net
2,how to convert a float value in to min:sec i ...,objective-c
3,.net framework 4 redistributable just wonderi...,.net
4,trying to calculate and print the mean and its...,python


In [22]:
data_stack.rename(columns={'post' : 'Body', 'tags': 'Tag'}, inplace=True)
data_stack.head()

Unnamed: 0,Body,Tag
0,what is causing this behavior in our c# datet...,c#
1,have dynamic html load as if it was in an ifra...,asp.net
2,how to convert a float value in to min:sec i ...,objective-c
3,.net framework 4 redistributable just wonderi...,.net
4,trying to calculate and print the mean and its...,python


In [23]:
data_free = pd.read_csv('DB/freelance_data.csv', index_col=0)
read = open('DB/freelance_data_tags')
tags_free = read.readline().split('|')
read.close()

In [24]:
data_free.head()

Unnamed: 0,Body,Tag
0,I am a freelance web developer. I have built ...,"['contracts', 'changes', 'design']"
1,I recently started freelancing and found mysel...,"['attracting-clients', 'project', 'difficult-c..."
2,"When working through the day, I may work on ma...","['time-management', 'time-tracking']"
3,I have a project where the client initially re...,['project-management']
4,I started a project given by client and it wen...,"['time-management', 'project-management', 'com..."


In [25]:
data_start = pd.read_csv('DB/startup_data.csv', index_col=0)
read = open('DB/startup_data_tags')
tags_start = read.readline().split('|')
read.close()

In [26]:
data_start.head()

Unnamed: 0,Body,Tag
0,"After registering my small LLC, I signed up fo...","['llc', 'new-hampshire', 'united-states']"
1,"If I have a startup that produces apps, is the...","['tech-company', 'mobile-apps', 'equity']"
2,"In short, for my very first business plan whic...",['business-plan']
3,We know that venture capitalists keep a keen e...,"['intellectual-property', 'investment', 'ventu..."
4,"I'm thinking of doing some freelance work, and...","['united-kingdom', 'freelancing']"


In [27]:
all_tags = tags_stack.copy()
all_tags.extend(tags_free.copy())
all_tags.extend(tags_start.copy())

In [28]:
all_data = pd.DataFrame(columns=['Body', 'Tag'])

In [29]:
all_data = all_data.append(data_free)
all_data = all_data.append(data_start)
all_data = all_data.append(data_stack)

In [30]:
all_data.head()

Unnamed: 0,Body,Tag
0,I am a freelance web developer. I have built ...,"['contracts', 'changes', 'design']"
1,I recently started freelancing and found mysel...,"['attracting-clients', 'project', 'difficult-c..."
2,"When working through the day, I may work on ma...","['time-management', 'time-tracking']"
3,I have a project where the client initially re...,['project-management']
4,I started a project given by client and it wen...,"['time-management', 'project-management', 'com..."


In [31]:
from sklearn.cross_validation import train_test_split

In [32]:
def process_tags(tags):
    tmp = list(tags)
    for i in range(len(tmp)):
        if tmp[i][0] == '[' and tmp[i][-1] == ']':
            tmp[i] = tmp[i][1:-1].split(',')
            for j in range(len(tmp[i])):
                tmp[i][j] = tmp[i][j].strip()
                tmp[i][j] = tmp[i][j][1:-1]
        else:
            tmp[i] = [tmp[i]]
    return tmp

In [33]:
train_posts, test_posts, train_tags, test_tags = train_test_split(all_data.Body, all_data.Tag, 
                                                                  random_state=42, test_size=0.2)

test_tags = process_tags(test_tags)
test_tags[:10]

train_tags = process_tags(train_tags)

test_posts = list(test_posts)
train_posts = list(train_posts)

# Process data

In [34]:
# Prosess posts to vectors

max_words = 2500
all_tags = list(set(all_tags))
num_classes = len(all_tags)

tokenize = text.Tokenizer(num_words=max_words, char_level=False)
tokenize.fit_on_texts(train_posts)

x_train = tokenize.texts_to_matrix(train_posts)
x_test = tokenize.texts_to_matrix(test_posts)

In [35]:
# Prosess tags to vectors
tokenize_y = text.Tokenizer(num_words=num_classes, char_level=False, filters='')
tokenize_y.fit_on_texts(all_tags)

In [36]:
len(set(all_tags)), len(all_tags)

(381, 381)

In [37]:
num_classes

381

In [38]:
tokenize_y.word_index['objective-c']

74

In [39]:
y_train = []
for tag in train_tags:
    y_train.append(tokenize_y.texts_to_matrix(tag).sum(axis=0))
    
y_test = []
for tag in test_tags:
    y_test.append(tokenize_y.texts_to_matrix(tag).sum(axis=0))

In [40]:
y_train[0].sum()

1.0

In [41]:
y_train = np.array(y_train)
y_test = np.array(y_test)

# NN Part

In [42]:
label_from_prediction = list(np.zeros(num_classes))
for t in all_tags:
    label_from_prediction[tokenize_y.word_index[t] - 1] = t
label_from_prediction = np.array(label_from_prediction)

In [43]:
weights = {i : 1200 for i in range(num_classes)}

model = Sequential()
model.add(Dense(1024, input_shape=(max_words,)))
model.add(Activation('relu'))
model.add(Dropout(0.5))
model.add(Dense(512))
model.add(Activation('relu'))
model.add(Dropout(0.6))

model.add(Dense(num_classes))
model.add(Activation('sigmoid'))

model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy', 'binary_accuracy'])

In [44]:
history = model.fit(x_train, y_train, batch_size=32, epochs=3, verbose=2, validation_split=0.1, 
                    class_weight=weights)

Train on 32045 samples, validate on 3561 samples
Epoch 1/3
 - 72s - loss: 17.6124 - acc: 0.9961 - binary_accuracy: 0.9961 - val_loss: 6.9021 - val_acc: 0.9986 - val_binary_accuracy: 0.9986
Epoch 2/3
 - 69s - loss: 6.8442 - acc: 0.9986 - binary_accuracy: 0.9986 - val_loss: 6.3512 - val_acc: 0.9986 - val_binary_accuracy: 0.9986
Epoch 3/3
 - 66s - loss: 5.8104 - acc: 0.9987 - binary_accuracy: 0.9987 - val_loss: 6.3760 - val_acc: 0.9986 - val_binary_accuracy: 0.9986


In [45]:
score = model.evaluate(x_test, y_test, batch_size=128, verbose=2)
print('Test score:', score[0])
print('Test accuracy:', score[1])
print('Test bin_accuracy:', score[2])

Test score: 0.0054718005676822655
Test accuracy: 0.998562440995928
Test bin_accuracy: 0.998562440995928


In [36]:
ind = 1256

pred = model.predict(np.array([x_test[ind]]))
print('Prediction: ', label_from_prediction[pred.ravel()>0.1])
print('Real: ', label_from_prediction[y_test.astype('bool')[ind]])
y_test.shape, pred.shape
#pred[pred>0.4]

Prediction:  ['intellectual-property' 'user-engagement' 'growth'
 'iterative-development']
Real:  ['price' 'billing']


((8902, 381), (1, 381))

In [47]:
model.save('models/primitive_model')

In [None]:
keras.models.load_model

In [38]:
tokens = tokenize.texts_to_matrix(['I love java'])
pred = model.predict(tokens)
pred_tags = label_from_prediction[pred.ravel() > 0.1]
pred_tags

array(['acquisition'], dtype='<U25')