In [None]:
# Download the StackSample dataset (stacksample.zip) from: https://www.dropbox.com/s/5721wcs2guuykzl/stacksample.zip?dl=0

In [0]:
# Install the PyDrive wrapper & import libraries.
# This only needs to be done once per notebook.
!pip install -U -q PyDrive
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

# Authenticate and create the PyDrive client.
# This only needs to be done once per notebook.
# Sign the current Colab user in (auth is google.colab's auth module imported above).
auth.authenticate_user()
gauth = GoogleAuth()
# Hand PyDrive the application-default Google credentials.
gauth.credentials = GoogleCredentials.get_application_default()
# `drive` is the client used by the following cells to list and download files.
drive = GoogleDrive(gauth)

In [0]:
# ID of the Drive folder that holds the uploaded data (fill in before running).
folder_id = '********************'
# Drive query selecting every file whose parent is that folder.
folder_query = {'q': "'%s' in parents" % folder_id}
list2 = drive.ListFile(folder_query).GetList()
# Print each file's title and ID so the archive's ID can be copied into the download cell.
for file in list2:
    title, file_ident = file['title'], file['id']
    print('title {}, id {}'.format(title, file_ident))

title stacksample.zip, id 1diUyt9AB2Q2BSYHggZpauRFD-oZYvFml
title mfastboot.zip, id 1vm8KEE5r-2p-YrLINHL3HBbkOz6N4SMh
title Get Started with Dropbox.pdf, id 10z-qVhySQvlOqNwFN99fmcDNE0iF5zHZ


In [0]:
# Download a file based on its file ID.
#
# A file ID looks like: laggVyWshwcyP6kEI-y_W3P8D26sz
file_id = '*******************'  # paste the ID printed for stacksample.zip by the previous cell
# Create a Drive file handle for that ID ...
downloaded = drive.CreateFile({'id': file_id})
# ... and write its content to the local file 'stacksample.zip'.
downloaded.GetContentFile('stacksample.zip')

In [0]:
!unzip stacksample.zip

Archive:  stacksample.zip
  inflating: Answers.csv             
  inflating: Questions.csv           
  inflating: Tags.csv                


In [0]:
import os
os.listdir()

['.config',
 'Answers.csv',
 'Tags.csv',
 'adc.json',
 'stacksample.zip',
 'Questions.csv',
 'sample_data']

In [0]:
import pandas as pd

In [0]:
# All three CSV files are read with the same text encoding.
csv_encoding = 'iso-8859-1'
questions = pd.read_csv('Questions.csv', encoding=csv_encoding)
answers = pd.read_csv('Answers.csv', encoding=csv_encoding)
tags = pd.read_csv('Tags.csv', encoding=csv_encoding)

In [0]:
questions.head()

Unnamed: 0,Id,OwnerUserId,CreationDate,ClosedDate,Score,Title,Body
0,80,26.0,2008-08-01T13:57:07Z,,26,SQLStatement.execute() - multiple queries in o...,<p>I've written a database generation script i...
1,90,58.0,2008-08-01T14:41:24Z,2012-12-26T03:45:49Z,144,Good branching and merging tutorials for Torto...,<p>Are there any really good tutorials explain...
2,120,83.0,2008-08-01T15:50:08Z,,21,ASP.NET Site Maps,<p>Has anyone got experience creating <strong>...
3,180,2089740.0,2008-08-01T18:42:19Z,,53,Function for creating color wheels,<p>This is something I've pseudo-solved many t...
4,260,91.0,2008-08-01T23:22:08Z,,49,Adding scripting functionality to .NET applica...,<p>I have a little game written in C#. It uses...


In [0]:
# Report missing values per column, then the frame's (rows, columns) shape.
divider = "----------"
null_counts = questions.isnull().sum()
print("null values:", null_counts, divider, sep="\n")
print("shape", questions.shape)
print(divider)


null values:
Id                    0
OwnerUserId       14454
CreationDate          0
ClosedDate      1208257
Score                 0
Title                 0
Body                  0
dtype: int64
----------
shape (1264216, 7)
----------


In [0]:
# Count distinct values per column to help decide which columns to drop.
for column in questions.columns:
    n_distinct = len(questions[column].unique())
    print(column, ":", n_distinct)

Id : 1264216
OwnerUserId : 630910
CreationDate : 1264207
ClosedDate : 55950
Score : 532
Title : 1263995
Body : 1264204


In [0]:
import re 
def rem_html_tags(body):
    """Strip HTML-style tags from a string.

    Every shortest run of the form ``<...>`` that sits on a single line is
    replaced with the empty string; all other text is returned unchanged.

    Parameters
    ----------
    body : str
        Text that may contain markup such as ``<p>`` or ``<a href="...">``.

    Returns
    -------
    str
        `body` with the matched tags removed.
    """
    # Non-greedy match, so each tag is removed on its own rather than
    # everything between the first '<' and the last '>'.
    return re.sub('<.*?>', '', body)

In [0]:
questions['Body'] = questions['Body'].apply(rem_html_tags)
questions.head()

Unnamed: 0,Id,OwnerUserId,CreationDate,ClosedDate,Score,Title,Body
0,80,26.0,2008-08-01T13:57:07Z,,26,SQLStatement.execute() - multiple queries in o...,I've written a database generation script in S...
1,90,58.0,2008-08-01T14:41:24Z,2012-12-26T03:45:49Z,144,Good branching and merging tutorials for Torto...,Are there any really good tutorials explaining...
2,120,83.0,2008-08-01T15:50:08Z,,21,ASP.NET Site Maps,Has anyone got experience creating SQL-based A...
3,180,2089740.0,2008-08-01T18:42:19Z,,53,Function for creating color wheels,This is something I've pseudo-solved many time...
4,260,91.0,2008-08-01T23:22:08Z,,49,Adding scripting functionality to .NET applica...,I have a little game written in C#. It uses a ...


In [0]:
# Keep only Id, Title and Body. OwnerUserId, CreationDate, ClosedDate and Score are
# not used by the later cells (ClosedDate is null for almost every row, and
# CreationDate is close to unique per row).
# errors='ignore' lets this cell be re-run after the columns are already gone
# instead of raising KeyError.
questions.drop(columns=['CreationDate', 'ClosedDate', 'OwnerUserId', 'Score'],
               inplace=True, errors='ignore')

In [0]:
# NOTE(review): this head() is not the last expression in the cell, so its result is not displayed.
questions.head()
# Binds a second name to the same DataFrame object; no copy is made.
questions_final = questions

In [0]:
import os
os.listdir()

['.config',
 'Answers.csv',
 'Tags.csv',
 'adc.json',
 'stacksample.zip',
 'Questions.csv',
 'sample_data']

In [0]:
answers.head()

Unnamed: 0,Id,OwnerUserId,CreationDate,ParentId,Score,Body
0,92,61.0,2008-08-01T14:45:37Z,90,13,"<p><a href=""http://svnbook.red-bean.com/"">Vers..."
1,124,26.0,2008-08-01T16:09:47Z,80,12,<p>I wound up using this. It is a kind of a ha...
2,199,50.0,2008-08-01T19:36:46Z,180,1,<p>I've read somewhere the human eye can't dis...
3,269,91.0,2008-08-01T23:49:57Z,260,4,"<p>Yes, I thought about that, but I soon figur..."
4,307,49.0,2008-08-02T01:49:46Z,260,28,"<p><a href=""http://www.codeproject.com/Article..."


In [0]:
# Report missing values per column, then the frame's (rows, columns) shape.
divider = "----------"
null_counts = answers.isnull().sum()
print("null values:", null_counts, divider, sep="\n")
print("shape", answers.shape)
print(divider)


null values:
Id                  0
OwnerUserId     13200
CreationDate        0
ParentId            0
Score               0
Body                0
dtype: int64
----------
shape (2014516, 6)
----------


In [0]:
tags.head()

Unnamed: 0,Id,Tag
0,80,flex
1,80,actionscript-3
2,80,air
3,90,svn
4,90,tortoisesvn


In [0]:
# Report missing values per column, then the frame's (rows, columns) shape.
divider = "----------"
null_counts = tags.isnull().sum()
print("null values:", null_counts, divider, sep="\n")
print("shape", tags.shape)
print(divider)

null values:
Id        0
Tag    1113
dtype: int64
----------
shape (3750994, 2)
----------


In [0]:
# value_counts() sorts by frequency (most common first); keep the first ten tag names.
tag_frequencies = tags['Tag'].value_counts()
top10_tags = list(tag_frequencies.index[:10])

In [0]:
top10_tags

['javascript',
 'java',
 'c#',
 'php',
 'android',
 'jquery',
 'python',
 'html',
 'c++',
 'ios']

In [0]:
# Keep only the (question Id, tag) rows whose tag is one of the ten most frequent.
is_top10 = tags['Tag'].isin(top10_tags)
tags_final = tags.loc[is_top10]
tags_final.head(), tags_final.shape

(      Id  Tag
 14   260   c#
 18   330  c++
 28   650   c#
 35   930   c#
 39  1010   c#, (826739, 2))

In [0]:
def add_tags(question_id):
    """Return the tags recorded in `tags_final` for one question.

    `question_id` is indexed with ``['Id']``, so despite its name it must be a
    row-like object (for example one row of the questions frame), not a bare
    ID. Rows of the global `tags_final` whose ``Id`` equals that value are
    selected and the ``.values`` of their ``Tag`` column is returned.
    """
    matches_question = tags_final['Id'] == question_id['Id']
    return tags_final.loc[matches_question, 'Tag'].values


In [0]:
# Collapse the one-row-per-(question, tag) table into one Python list of tags per question Id.
x = tags_final.groupby('Id')['Tag'].apply(list)

In [0]:
# Wrap the per-question tag lists in a one-column DataFrame (column 'Tag', indexed by Id)
# so it can be merged with the questions on 'Id'.
tags_10_final = pd.DataFrame(x)
tags_10_final.head()

Unnamed: 0_level_0,Tag
Id,Unnamed: 1_level_1
260,[c#]
330,[c++]
650,[c#]
930,[c#]
1010,[c#]


In [0]:
tags_10_final.shape

(706336, 1)

In [0]:
# Join every question with its list of top-10 tags on 'Id' (pd.merge's default join type).
final_data = pd.merge(questions_final, tags_10_final, on='Id')
# `total` is a second name for the same frame; the train/test split cell slices it.
total = final_data

In [0]:
final_data.head()

Unnamed: 0,Id,Title,Body,Tag
0,260,Adding scripting functionality to .NET applica...,I have a little game written in C#. It uses a ...,[c#]
1,330,Should I use nested classes in this case?,I am working on a collection of classes used f...,[c++]
2,650,Automatically update version number,I would like the version property of my applic...,[c#]
3,930,How do I connect to a database and loop over a...,What's the simplest way to connect and query a...,[c#]
4,1010,"How to get the value of built, encoded ViewState?",I need to grab the base64-encoded representati...,[c#]


In [0]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from nltk import word_tokenize
from keras.preprocessing.text import Tokenizer
from keras.preprocessing import sequence
from keras.layers import LSTM, Activation, Dense, Dropout, Input, Embedding, BatchNormalization, GRU ,concatenate
from keras.models import Model

Using TensorFlow backend.


In [0]:
from tensorflow.keras.callbacks import TensorBoard, EarlyStopping
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from nltk import word_tokenize
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.layers import LSTM, Activation, Dense, Dropout, Input, Embedding, BatchNormalization, GRU ,concatenate
from tensorflow.keras.models import Model

In [0]:
from sklearn.preprocessing import OneHotEncoder

In [0]:
# Learn the set of tag classes; each tag list later becomes a 0/1 indicator row.
multilabel_binarizer = MultiLabelBinarizer().fit(final_data['Tag'])
labels = multilabel_binarizer.classes_
labels

array(['android', 'c#', 'c++', 'html', 'ios', 'java', 'javascript',
       'jquery', 'php', 'python'], dtype=object)

In [0]:
# Only the first 550,000 merged rows are used, because running the model on the whole set takes too long.
subset = total.iloc[:550000]
train, test = train_test_split(subset, test_size=0.25, random_state=24)

In [0]:
train.shape,test.shape

((412500, 4), (137500, 4))

In [0]:
# Text features: titles and bodies are kept apart because they feed separate model inputs.
X_train_t, X_train_b = train['Title'], train['Body']
X_test_t, X_test_b = test['Title'], test['Body']
# Targets: one 0/1 indicator column per tag class learned by the binarizer.
y_train = multilabel_binarizer.transform(train['Tag'])
y_test = multilabel_binarizer.transform(test['Tag'])

In [0]:
y_test

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 1, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 1, 0, ..., 0, 0, 0],
       [0, 1, 0, ..., 0, 0, 0],
       [0, 1, 0, ..., 0, 0, 0]])

In [0]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [0]:
# Number of NLTK word tokens in each training title; the cell displays the longest.
sent_lens_t = [len(word_tokenize(title)) for title in train['Title']]
max(sent_lens_t)

59

In [0]:
import numpy as np
np.quantile(sent_lens_t,0.97)

18.0

In [0]:
# Title sequences are capped at 18 tokens: the 0.97 quantile of the NLTK word_tokenize
# lengths computed above. NOTE(review): the Keras Tokenizer below may split text
# differently from word_tokenize, so the quantile is only approximate for it.
max_len_t = 18
# Word-level tokenizer whose vocabulary is learned from the training titles only.
tok = Tokenizer(char_level=False,split=' ')
tok.fit_on_texts(X_train_t)
# Convert each training title into its list of integer word indices.
sequences_train_t = tok.texts_to_sequences(X_train_t)

In [0]:
vocab_len_t=len(tok.index_word.keys())
vocab_len_t

68969

In [0]:

sequences_matrix_train_t = sequence.pad_sequences(sequences_train_t,maxlen=max_len_t)
sequences_matrix_train_t

array([[    0,     0,     0, ...,     1,   957,   197],
       [    0,     0,     0, ...,  9081,    45,   533],
       [    0,     0,     0, ...,   147,     8,   230],
       ...,
       [    0,     0,     0, ...,    10,    71,  2985],
       [    0,     0,     0, ...,     2,    18,    75],
       [    0,     0,     0, ..., 11009,   809,   267]], dtype=int32)

In [0]:
sequences_test_t = tok.texts_to_sequences(X_test_t)
sequences_matrix_test_t = sequence.pad_sequences(sequences_test_t,maxlen=max_len_t)

In [0]:
sequences_matrix_train_t.shape,sequences_matrix_test_t.shape,y_train.shape,y_test.shape

((412500, 18), (137500, 18), (412500, 10), (137500, 10))

In [0]:
# Number of NLTK word tokens in each training body; the cell displays the longest.
sent_lens_b = [len(word_tokenize(body)) for body in train['Body']]
max(sent_lens_b)

20853

In [0]:
np.quantile(sent_lens_b,0.90)

575.0

In [0]:
# Body sequences are capped at 600 tokens, a little above the 0.90 quantile (575)
# of the NLTK word_tokenize lengths computed above.
max_len_b = 600
# NOTE(review): this rebinds `tok`, so the tokenizer fitted on titles is no longer
# reachable. Title text must be converted to sequences before this cell runs;
# re-running a title cell afterwards would silently use the body vocabulary.
tok = Tokenizer(char_level=False,split=' ')
tok.fit_on_texts(X_train_b)
# Convert each training body into its list of integer word indices.
sequences_train_b = tok.texts_to_sequences(X_train_b)

In [0]:

vocab_len_b =len(tok.index_word.keys())
vocab_len_b

1292018

In [0]:
sequences_matrix_train_b = sequence.pad_sequences(sequences_train_b,maxlen=max_len_b)
sequences_matrix_train_b

array([[   0,    0,    0, ...,   51, 2082,   91],
       [   0,    0,    0, ..., 1408,  203,  825],
       [   0,    0,    0, ...,   34,   51,   83],
       ...,
       [   0,    0,    0, ...,   20,   68,  687],
       [   0,    0,    0, ...,  187,   58,   10],
       [   0,    0,    0, ...,  194,  197,   10]], dtype=int32)

In [0]:
sequences_test_b = tok.texts_to_sequences(X_test_b)
sequences_matrix_test_b = sequence.pad_sequences(sequences_test_b,maxlen=max_len_b)

In [0]:
sequences_matrix_train_t.shape,sequences_matrix_train_b.shape,y_train.shape

((412500, 18), (412500, 600), (412500, 10))

In [0]:
sequences_matrix_test_t.shape,sequences_matrix_test_b.shape,y_test.shape

((137500, 18), (137500, 600), (137500, 10))

In [0]:
def RNN():
    """Build the two-input, two-output GRU tag classifier.

    Inputs
        ``title_input`` : integer sequences of length ``max_len_t``.
        ``body_input``  : integer sequences of length ``max_len_b``.

    Outputs (in this order)
        ``main_output`` : 10 sigmoid units computed from both branches.
        ``aux_output``  : 10 sigmoid units computed from the title branch only.

    Reads the notebook globals ``max_len_t``, ``max_len_b``, ``vocab_len_t``
    and ``vocab_len_b``. The model is returned uncompiled.
    """
    # ---- Title branch -------------------------------------------------
    title_in = Input(shape=(max_len_t,), name='title_input')
    # vocab size + 1: index 0 is the padding value that mask_zero=True masks out.
    title_vectors = Embedding(
        input_dim=vocab_len_t + 1,
        output_dim=2000,
        input_length=max_len_t,
        mask_zero=True,
        name='title_Embed',
    )(title_in)
    title_state = GRU(300)(title_vectors)
    # Auxiliary head attached directly to the title GRU.
    aux_out = Dense(10, activation='sigmoid', name='aux_output')(title_state)

    # ---- Body branch --------------------------------------------------
    body_in = Input(shape=(max_len_b,), name='body_input')
    body_vectors = Embedding(
        input_dim=vocab_len_b + 1,
        output_dim=170,
        input_length=max_len_b,
        mask_zero=True,
        name='body_Embed',
    )(body_in)
    body_state = GRU(200)(body_vectors)

    # ---- Joint head ---------------------------------------------------
    merged = concatenate([title_state, body_state])
    hidden = Dense(100, activation='relu')(merged)
    hidden = Dropout(0.5)(hidden)
    hidden = BatchNormalization()(hidden)
    hidden = Dense(75, activation='relu')(hidden)
    main_out = Dense(10, activation='sigmoid', name='main_output')(hidden)

    return Model(inputs=[title_in, body_in], outputs=[main_out, aux_out])

In [0]:
model = RNN()
model.summary()

Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
If using Keras pass *_constraint arguments to layers.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
title_input (InputLayer)        [(None, 18)]         0                                            
__________________________________________________________________________________________________
body_input (InputLayer)         [(None, 600)]        0                                            
__________________________________________________________________________________________________
title_Embed (Embedding)         (None, 18, 2000)     137940000   title_input[0][0]                

In [0]:
# A question can carry several of the 10 tags at once (targets come from a
# MultiLabelBinarizer) and both heads end in sigmoid units, so each tag is an
# independent yes/no decision: train with binary cross-entropy.
# Categorical cross-entropy assumes exactly one class per sample and is not
# appropriate for these multi-hot targets.
model.compile(
    optimizer='adam',
    loss={'main_output': 'binary_crossentropy', 'aux_output': 'binary_crossentropy'},
    metrics=['accuracy'],
)

In [0]:
# Train on the title/body sequences; the same multi-hot targets supervise both heads.
# validation_data is passed as an (inputs, targets) tuple, the form Keras documents;
# a list is not reliably unpacked into inputs and targets by every tf.keras version.
results = model.fit(
    {'title_input': sequences_matrix_train_t, 'body_input': sequences_matrix_train_b},
    {'main_output': y_train, 'aux_output': y_train},
    validation_data=(
        {'title_input': sequences_matrix_test_t, 'body_input': sequences_matrix_test_b},
        {'main_output': y_test, 'aux_output': y_test},
    ),
    epochs=1,
    batch_size=100,
)

Train on 412500 samples, validate on 137500 samples


In [0]:
# Predict on the held-out set; outputs come back in the model's output order (main, then auxiliary).
test_inputs = {
    'title_input': sequences_matrix_test_t,
    'body_input': sequences_matrix_test_b,
}
predicted_main, predicted_aux = model.predict(test_inputs, verbose=1)



In [None]:
predicted_main