In [None]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

In [None]:
import itertools
import os

%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf

from sklearn.preprocessing import LabelBinarizer, LabelEncoder
from sklearn.metrics import confusion_matrix

# Michael change
#from tensorflow import keras
import keras

from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout
from keras.preprocessing import text, sequence
from keras import utils

# Report the installed TensorFlow release (this code was tested with TensorFlow v1.4).
tf_version = tf.__version__
print("You have TensorFlow version", tf_version)

# Get data


In [None]:
# Stack Overflow data set (per the file name) and its '|'-separated tag list.
data_stack = pd.read_csv('DB/stack-overflow-data.csv')
# `with` guarantees the handle is closed even if reading fails.
with open('DB/stack-overflow-data-tags') as tags_file:
    # Only the first line is read; strip its line terminator so the last tag
    # does not end in '\n'.
    tags_stack = tags_file.readline().rstrip('\r\n').split('|')

In [None]:
# Preview the first rows of the freshly loaded frame.
data_stack.head()

In [None]:
# Rename 'post' -> 'Body' and 'tags' -> 'Tag' so the columns match the names
# the combined frame uses further down.
column_names = {'post': 'Body', 'tags': 'Tag'}
data_stack = data_stack.rename(columns=column_names)
data_stack.head()

In [None]:
# Freelance data set (per the file name); the first CSV column becomes the index.
data_free = pd.read_csv('DB/freelance_data.csv', index_col=0)
# `with` guarantees the handle is closed even if reading fails.
with open('DB/freelance_data_tags') as tags_file:
    # Only the first line is read; strip its line terminator so the last tag
    # does not end in '\n'.
    tags_free = tags_file.readline().rstrip('\r\n').split('|')

In [None]:
# Preview the first rows of the freshly loaded frame.
data_free.head()

In [None]:
# Startup data set (per the file name); the first CSV column becomes the index.
data_start = pd.read_csv('DB/startup_data.csv', index_col=0)
# `with` guarantees the handle is closed even if reading fails.
with open('DB/startup_data_tags') as tags_file:
    # Only the first line is read; strip its line terminator so the last tag
    # does not end in '\n'.
    tags_start = tags_file.readline().rstrip('\r\n').split('|')

In [None]:
# Preview the first rows of the freshly loaded frame.
data_start.head()

In [None]:
# Concatenate the three per-source tag lists into one new list
# (duplicates are kept here; the source lists stay untouched).
all_tags = tags_stack + tags_free + tags_start

In [None]:
# Empty frame with the two columns ('Body', 'Tag') that the per-source frames are appended to.
all_data = pd.DataFrame(columns=['Body', 'Tag'])

In [None]:
# Stack the three sources below an empty ('Body', 'Tag') frame in one call.
# - `DataFrame.append` was removed in pandas 2.0; `pd.concat` works on old and
#   new versions alike.
# - Seeding with a fresh empty frame (instead of the current `all_data`) keeps
#   the column order of the original code and makes the cell idempotent:
#   re-running it no longer duplicates every row.
all_data = pd.concat(
    [pd.DataFrame(columns=['Body', 'Tag']), data_free, data_start, data_stack]
)

In [None]:
# Preview the first rows of the combined frame.
all_data.head()

In [None]:
# `sklearn.cross_validation` was deprecated in scikit-learn 0.18 and removed in
# 0.20; `train_test_split` lives in `sklearn.model_selection` since then.
try:
    from sklearn.model_selection import train_test_split
except ImportError:  # scikit-learn < 0.18
    from sklearn.cross_validation import train_test_split

In [None]:
def _strip_quotes(item):
    """Trim whitespace and one pair of matching surrounding quotes from ``item``."""
    item = item.strip()
    if len(item) >= 2 and item[0] == item[-1] and item[0] in ('"', "'"):
        return item[1:-1]
    return item


def process_tags(tags):
    """Convert raw tag cells into lists of tag names.

    Each entry of ``tags`` is either a single tag (``"python"``) or the string
    form of a list (``"['python', 'c#']"``).  List entries are split on commas
    and every item loses its surrounding whitespace and quotes.

    Args:
        tags: Iterable of strings (e.g. a pandas Series); it is not modified.

    Returns:
        list[list[str]]: One list of tag names per entry, in input order.
        Empty tags are dropped, so ``""`` and ``"[]"`` both yield ``[]``.

    Raises:
        TypeError: If an entry is not a string (for example a NaN cell).
    """
    processed = []
    for raw in tags:
        if not isinstance(raw, str):
            raise TypeError('tag entry must be a string, got %r' % (raw,))
        # startswith/endswith are safe on '' where indexing raw[0] would raise.
        if raw.startswith('[') and raw.endswith(']'):
            # Quotes are removed only when actually present, so unquoted items
            # such as "[a, b]" are no longer truncated.
            items = [_strip_quotes(item) for item in raw[1:-1].split(',')]
        else:
            items = [raw]
        processed.append([item for item in items if item])
    return processed

In [None]:
# Hold out 20 % of the rows for testing; the fixed `random_state` makes the
# split repeatable.
train_posts, test_posts, train_tags, test_tags = train_test_split(
    all_data.Body, all_data.Tag, test_size=0.2, random_state=42)

# Turn every raw tag cell into a list of tag strings.
# (A stray `test_tags[:10]` expression used to sit between these two calls; it
# was not the cell's last statement, so it displayed nothing and was removed.)
train_tags = process_tags(train_tags)
test_tags = process_tags(test_tags)

# Convert the post columns to plain Python lists.
train_posts = list(train_posts)
test_posts = list(test_posts)

# Process data

In [None]:
# Process posts to vectors

max_words = 2500
# De-duplicate AND sort the tags.  Iterating a set of strings yields a different
# order in every interpreter session (string hash randomisation); the tag
# tokenizer is fitted on this list, so an unsorted list would change which
# output column belongs to which tag from run to run — which breaks any model
# that is saved and reloaded in another session.
all_tags = sorted(set(all_tags))
num_classes = len(all_tags)

# Word-level tokenizer capped by `num_words=max_words`, fitted on the training posts only.
tokenize = text.Tokenizer(num_words=max_words, char_level=False)
tokenize.fit_on_texts(train_posts)

# One row per post, `max_words` columns (default `texts_to_matrix` mode).
x_train = tokenize.texts_to_matrix(train_posts)
x_test = tokenize.texts_to_matrix(test_posts)

In [None]:
# Process tags to vectors: a second tokenizer that treats each tag as one token
# (`filters=''` disables the default punctuation stripping).
# NOTE(review): with `num_words=num_classes` Keras only keeps token indices
# 1 .. num_classes - 1, so the tag that receives the highest index never shows
# up in the target matrix — confirm whether `num_classes + 1` (together with
# the model's output width) was intended.
tokenize_y = text.Tokenizer(filters='', char_level=False, num_words=num_classes)
tokenize_y.fit_on_texts(all_tags)

In [None]:
# Sanity check: number of distinct tags vs. list length; the two match when
# `all_tags` holds no duplicates (it was passed through `set` in the cell above).
len(set(all_tags)), len(all_tags)

In [None]:
# Number of tags (`len(all_tags)`), used as the width of the target vectors and of the output layer.
num_classes

In [None]:
# Spot check: integer index the tag tokenizer assigned to 'objective-c'
# (raises KeyError if that tag is not in the vocabulary).
tokenize_y.word_index['objective-c']

In [None]:
# Build one target vector per post: every tag of the post is encoded as its own
# row by `texts_to_matrix`, and the rows are summed into a single vector.
y_train = [tokenize_y.texts_to_matrix(tags).sum(axis=0) for tags in train_tags]
y_test = [tokenize_y.texts_to_matrix(tags).sum(axis=0) for tags in test_tags]

In [None]:
# Sum of the first training target vector — presumably the number of
# recognised tags on that post (verify against `train_tags[0]`).
y_train[0].sum()

In [None]:
# Stack the per-post target vectors into 2-D arrays for Keras.
y_train, y_test = np.array(y_train), np.array(y_test)

# NN Part

In [None]:
# Map every output column of the network to its tag name.
#
# `texts_to_matrix` writes the token with `word_index == k` into column k:
# column 0 is never set and tokens with an index >= num_words (here
# num_classes) are dropped.  Labels therefore belong at position k — storing
# them at k - 1 shifted every printed prediction by one tag.  Iterating
# `word_index` (instead of `all_tags`) also avoids a KeyError for tags the
# tokenizer normalised, e.g. lower-cased.
labels_by_column = [''] * num_classes  # '' marks columns without a tag
for tag, column in tokenize_y.word_index.items():
    if column < num_classes:
        labels_by_column[column] = tag
label_from_prediction = np.array(labels_by_column)

In [None]:
# The same weight (1200) for every class index; handed to `fit` as `class_weight`.
weights = dict.fromkeys(range(num_classes), 1200)

# Two hidden ReLU layers with dropout, followed by one sigmoid unit per tag.
model = Sequential([
    Dense(1024, input_shape=(max_words,)),
    Activation('relu'),
    Dropout(0.5),
    Dense(512),
    Activation('relu'),
    Dropout(0.6),
    Dense(num_classes),
    Activation('sigmoid'),
])

model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy', 'binary_accuracy'],
)

In [None]:
# Train for 3 epochs; `validation_split=0.1` reserves 10 % of the training data
# for validation.
history = model.fit(
    x_train,
    y_train,
    epochs=3,
    batch_size=32,
    validation_split=0.1,
    class_weight=weights,
    verbose=2,
)

In [None]:
# Evaluate on the held-out set; `score` holds the loss followed by the two
# metrics requested in `compile`.
score = model.evaluate(x_test, y_test, batch_size=128, verbose=2)
test_loss, test_accuracy, test_bin_accuracy = score[0], score[1], score[2]
print('Test score:', test_loss)
print('Test accuracy:', test_accuracy)
print('Test bin_accuracy:', test_bin_accuracy)

In [None]:
# Compare predicted and true tags for one test post.
ind = 34

# Predict on a batch of size one built from the chosen row.
pred = model.predict(np.expand_dims(x_test[ind], axis=0))
predicted_mask = pred.ravel() > 0.2  # outputs above 0.2 count as predicted
actual_mask = y_test[ind].astype('bool')
print('Prediction: ', label_from_prediction[predicted_mask])
print('Real: ', label_from_prediction[actual_mask])
(y_test.shape, pred.shape)

In [None]:
# Create the target directory first: saving fails if 'models/' does not exist.
os.makedirs('models', exist_ok=True)
model.save('models/primitive_model')