In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split

from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC

from sklearn.multiclass import OneVsRestClassifier

In [2]:
from google.colab import drive 
drive.mount('colab', force_remount=True)

Mounted at colab


In [3]:
%cd "colab/MyDrive/NE-INT6940"
!pwd

/content/colab/MyDrive/NE-INT6940
/content/colab/MyDrive/NE-INT6940


In [4]:
df_stackoverflow = pd.read_csv('./repurpost/datasets/stackoverflow.csv', encoding="ISO-8859-1", engine = 'python', index_col=0)
df_stackoverflow.head()

Unnamed: 0,Text,Tags
2,aspnet site maps has anyone got experience cre...,"['sql', 'asp.net']"
4,adding scripting functionality to net applicat...,"['c#', '.net']"
5,should i use nested classes in this case i am ...,['c++']
6,homegrown consumption of web services i have b...,['.net']
8,automatically update version number i would li...,['c#']


In [5]:
print('Type of Tags value', type(df_stackoverflow['Tags'].iloc[0]))
df_stackoverflow['Tags'].iloc[0]

Type of Tags value <class 'str'>


"['sql', 'asp.net']"

## Tags column transformation
Even though the 'Tags' column show as a list of Tags, the column is treated as a string.  We need to process the tags column to create a list of string literals. 

### Abstract Syntax Trees
The ast python module allows us to evaluate the string and extract string literals that are individual tags.

In [6]:
import ast
ast.literal_eval(df_stackoverflow['Tags'].iloc[0])

['sql', 'asp.net']

In [7]:
df_stackoverflow['Tags'] = df_stackoverflow['Tags'].apply(lambda x: ast.literal_eval(x))
df_stackoverflow.head()

Unnamed: 0,Text,Tags
2,aspnet site maps has anyone got experience cre...,"[sql, asp.net]"
4,adding scripting functionality to net applicat...,"[c#, .net]"
5,should i use nested classes in this case i am ...,[c++]
6,homegrown consumption of web services i have b...,[.net]
8,automatically update version number i would li...,[c#]


# Encoding Tags
Tags in this dataset is a categorical feature and since all ML algorithms need data in numerical form, we would also need to convert the 'Tags' column in categorical numerical values.  But since the column contains multiple values a.k.a 'Tags', we cannot use any of the traditional categorical encoding like Label Encoder or One Hot Encoder or Pandas get_dummies.

## Multilabel Binarizer
This allows to encode more than one label at the same time.  This is particularly used when a categorical column has more than 1 value, like we have in our dataset.  

In [8]:
multilabelBinarizer = MultiLabelBinarizer()
tagsEncodedArray = multilabelBinarizer.fit_transform(df_stackoverflow['Tags'])
tagsEncodedArray

array([[0, 0, 1, ..., 0, 0, 1],
       [1, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [9]:
multilabelBinarizer.classes_

array(['.net', 'android', 'asp.net', 'c', 'c#', 'c++', 'css', 'html',
       'ios', 'iphone', 'java', 'javascript', 'jquery', 'mysql',
       'objective-c', 'php', 'python', 'ruby', 'ruby-on-rails', 'sql'],
      dtype=object)

# Vectorize the Text

"Word Embeddings or Word vectorization is a methodology in NLP to map words or phrases from vocabulary to a corresponding vector of real numbers which used to find word predictions, word similarities/semantics."

"The process of converting words into numbers are called Vectorization."

Out of the many ways to do vectorization like Count Vectorizer, Term Frequency Vectorizer or Hashing Vectorizer, we will use the Term Frequency Inverse Document Frequency Vectorizer (TfidfVectorizer).

"TF-IDF is a statistical measure that evaluates how relevant a word is to a document in a collection of documents."



References: https://towardsdatascience.com/understanding-nlp-word-embeddings-text-vectorization-1a23744f7223

In [21]:
termFrequencyVectorizer = TfidfVectorizer(analyzer='word', max_features=10000, ngram_range=(1,3), stop_words='english', max_df=.5)
contentVectorArray = termFrequencyVectorizer.fit_transform(df_stackoverflow['Text'])

In [22]:
contentVectorArray.shape, tagsEncodedArray.shape

((48976, 10000), (48976, 20))

# Split dataset
The tfidf vector and encoded tags will be the input to the ML models.  The dataset needs to be split to use for training and use some for testing.  

In [23]:
content_X_train, content_X_test, tags_y_train, tags_y_test = train_test_split(contentVectorArray, tagsEncodedArray, test_size = 0.2, random_state = 0)

# Multi-Label Model Building

In [24]:
sgd = SGDClassifier()
lr = LogisticRegression(solver='lbfgs')
svc = LinearSVC()

The LinearSVC , SGDClassifier or Logistic Regression are basically binary classification models.  Multi-label classification problems are basically solved by splitting the classification into multiple binary classifications and then comparing the results of each classification against the others.  So for multi-label classification scenario, we need classifier models that fit one class(tag) against the other classes in the dataset.

The OneVsRestClassifier allows us to do multi-label classification using classifiers that natively support only binary classification.

In [25]:
def j_score(y_true, tags_y_pred):
  jaccard = np.minimum(y_true, tags_y_pred).sum(axis = 1)/np.maximum(y_true, tags_y_pred).sum(axis = 1)
  return jaccard.mean()*100

def hamming_score(tags_y_test, tags_y_pred):

  acc_list = []
  for i in range(tags_y_test.shape[0]):
    true_set = set( np.where(tags_y_test[i])[0] )
    prediction_set = set( np.where(tags_y_pred[i])[0] )
    tmp_a = None
    if len(true_set) == 0 and len(prediction_set) == 0:
      tmp_a = 1
    else:
      tmp_a = len(true_set.intersection(prediction_set))/\
              float( len(true_set.union(prediction_set)) )
    acc_list.append(tmp_a)

  return np.mean(acc_list)

def print_score(tags_y_pred, clf):
  print("Clf: ", clf.__class__.__name__)
  print('Jacard score: {}'.format(j_score(tags_y_test, tags_y_pred)))
  print('Hamming loss: {}'.format(hamming_score(tags_y_test, tags_y_pred)))
  print('----')

In [26]:
for classifier in [LinearSVC(C=1.5, penalty = 'l1', dual=False)]:
  clf = OneVsRestClassifier(classifier)
  clf.fit(content_X_train, tags_y_train)
  tags_y_pred = clf.predict(content_X_test)
  print_score(tags_y_pred, classifier)

for classifier in [sgd, lr, svc]:
  clf = OneVsRestClassifier(classifier)
  clf.fit(content_X_train, tags_y_train)
  tags_y_pred = clf.predict(content_X_test)
  print_score(tags_y_pred, classifier)

Clf:  LinearSVC
Jacard score: 63.876071866067775
Hamming loss: 0.6387607186606777
----
Clf:  SGDClassifier
Jacard score: 52.71148087654826
Hamming loss: 0.5271148087654826
----


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Clf:  LogisticRegression
Jacard score: 51.1014699877501
Hamming loss: 0.511014699877501
----
Clf:  LinearSVC
Jacard score: 62.42105621342044
Hamming loss: 0.6242105621342044
----


In [34]:
x = ['adding scripting functionality to net applications i have a little game written in c it uses a database as backend it is a trading card game and i wanted to implement the function of the cards as a scriptwhat i mean is that i essentially have an interface icard which a card class implements public class card056 icard and which contains function that are called by the gamenow to make the thing maintainablemoddable i would like to have the class for each card as source code in the database and essentially compile it on first use so when i have to addchange a card i will just add it to the database and tell my application to refresh without needing any assembly deployment especially since we would be talking about 1 assembly per card which means hundreds of assembliesis that possible register a class from a source file and then instantiate it etcicard cardscurrent new mygamecardlibrarycard056cardscurrentonenterplayref currentgamestatethe language is c but extra bonus if it is possible to write the script in any net language']
x = ['mysqlapache error in php mysql query i am getting the following erroraccess denied for user apachelocalhost using password nowhen using the following codephpincludeincludesconnectphpquery select from storyresult mysql queryquery or diemysql errorecho h1delete storyh1if mysql num rowsresult 0 whilerow mysql fetch rowresult echo brow1bspan alignrighta hrefprocessdelete storyphpidrow0deleteaspan echo br irow2i else echo no stories availablethe connectphp file contains my mysql connect calls that are working fine with my insert queries in another portion of the software if i comment out the result mysql query line then it goes through to the else statement so it is that line or the content in the ifi have been searching the net for any solutions and most seem to be related to too many mysql connections or that the user i am logging into mysql as does not have permission i have checked both i can still perform my other queries elsewhere in the software and i have verified that the account has the correct permissions']
x = ['how should i unit test a codegenerator this is a difficult and openended question i know but i thought i would throw it to the floor and see if anyone had any interesting suggestionsi have developed a codegenerator that takes our python interface to our c code generated via swig and generates code needed to expose this as webservices when i developed this code i did it using tdd but i have found my tests to be brittle as hell because each test essentially wanted to verify that for a given bit of input code which happens to be a c header i would get a given bit of outputted code i wrote a small engine that reads test definitions from xml input files and generates test cases from these expectationsthe problem is i dread going in to modify the code at all that and the fact that the unit tests themselves are a complex and b brittleso i am trying to think of alternative approaches to this problem and it strikes me i am perhaps tackling it the wrong way maybe i need to focus more on the outcome ie does the code i generate actually run and do what i want it to rather than does the code look the way i want it tohas anyone got any experiences of something similar to this they would care to share']
xt = termFrequencyVectorizer.transform(x)
clf.predict(xt)
multilabelBinarizer.inverse_transform(clf.predict(xt))

[('c#', 'python')]

In [35]:
import gzip
import pickle

model_pickle_out = open("clf.pkl", "wb")
pickle.dump(clf, model_pickle_out)
model_pickle_out.close() 

In [37]:
tfidf_pickle_out = gzip.open("tfidf.pkl", "wb")
pickle.dump(termFrequencyVectorizer, tfidf_pickle_out)
tfidf_pickle_out.close() 

In [38]:
mlBinarizer_pickle_out = open("multiLabelBinarizer.pkl", "wb")
pickle.dump(multilabelBinarizer, mlBinarizer_pickle_out)
mlBinarizer_pickle_out.close() 

In [39]:
model = pickle.load(open('clf.pkl', 'rb'))
fp = gzip.open('tfidf.pkl', 'rb')
tfidf = pickle.load(fp)
mlB = pickle.load(open('multiLabelBinarizer.pkl', 'rb'))

In [40]:
x = [ 'homegrown consumption of web services i have microsoft ssdprogramming to use I dont know what this may not be working in net']
xt = tfidf.transform(x)

prediction = model.predict(xt)
mlB.inverse_transform(prediction)

In [41]:
type(x)

list

In [42]:
model_objects = []
model_objects.append(clf)
model_objects.append(termFrequencyVectorizer)
model_objects.append(multilabelBinarizer)

model_all_pickle = gzip.open("model_all.pkl", "wb")
pickle.dump(model_objects, model_all_pickle)
model_all_pickle.close()

In [20]:
fp = gzip.open('model_all.pkl', 'rb')
model = pickle.load(fp)

In [21]:
type(model)

list

In [22]:
print(model)

[OneVsRestClassifier(estimator=LinearSVC()), TfidfVectorizer(max_features=5000, ngram_range=(1, 3), stop_words='english'), MultiLabelBinarizer()]


In [23]:
m = model[0]

In [24]:
t = model[1]
ml = model[2]

In [51]:
x = [ 'C .net']
xt = t.transform(x)

prediction = m.predict(xt)
tags = ml.inverse_transform(prediction)

In [52]:
print(tags[0])

('.net', 'c#')


In [35]:
tags.append('a')

In [38]:
print(tags[0])

('.net', 'c#')


In [41]:
list(tags[0])

['.net', 'c#']