In [6]:
from datetime import date
from joblib import dump, load
import numpy as np
import pandas as pd
import scipy.stats

#from imblearn.over_sampling import SMOTE
#from sklearn.decomposition import IncrementalPCA
#from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.feature_extraction.text import TfidfVectorizer 
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (accuracy_score, f1_score, 
                             average_precision_score, make_scorer, 
                             multilabel_confusion_matrix,
                             classification_report)
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from sklearn.multioutput import ClassifierChain
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer, MultiLabelBinarizer, StandardScaler

from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import re

import warnings
warnings.filterwarnings("ignore", message=r"unknown class.*")

### Params

In [23]:
# 100 top tags to predict
TAGS_TO_KEEP = ['python', 'javascript', 'java', 'android', 'c#', 'html', 'reactjs',
       'php', 'python3x', 'nodejs', 'r', 'c++', 'css', 'sql', 'flutter',
       'angular', 'ios', 'pandas', 'jquery', 'mysql', 'swift', 'django', 'c',
       'typescript', 'arrays', 'json', 'laravel', 'reactnative', 'sqlserver',
       'amazonwebservices', 'springboot', 'firebase', 'docker', 'azure',
       'dart', 'dataframe', 'excel', 'kotlin', 'linux', 'spring', 'vuejs',
       'postgresql', 'wordpress', 'numpy', 'string', 'tensorflow', 'mongodb',
       'windows', 'net', 'vba', 'regex', 'aspnetcore', 'bash', 'androidstudio',
       'api', 'git', 'database', 'powershell', 'xcode', 'aspnet', 'list',
       'selenium', 'kubernetes', 'machinelearning', 'visualstudiocode',
       'express', 'rubyonrails', 'xml', 'netcore', 'macos', 'apachespark',
       'function', 'oracle', 'csv', 'ajax', 'algorithm', 'matplotlib',
       'unity3d', 'image', 'googlecloudfirestore', 'swiftui', 'keras', 'wpf',
       'visualstudio', 'woocommerce', 'dictionary', 'amazons3', 'datetime',
       'maven', 'loops', 'shell', 'npm', 'flask', 'apache', 'ruby',
       'googlechrome', 'googlecloudplatform', 'pyspark', 'opencv', 'ubuntu']

REPLACE_BY_SPACE_RE = re.compile('[/(){}\[\]\|@,;]')
BAD_SYMBOLS_RE = re.compile('[^0-9a-z #+_]')
STOPWORDS = set(stopwords.words('english'))

## Read data

In [37]:
df = pd.read_csv("full_body_clean.csv")
#df = df.sample(5000).reset_index(drop=True)
X_train, X_test, y_train, y_test = train_test_split(df['text'].values, 
                                                    df['tags'].values, 
                                                    test_size=0.2, 
                                                    random_state=42)
df.head()

Unnamed: 0,text,tags
0,remove message.author reaction I wanted to rem...,"['javascript', 'discord', 'discord.js']"
1,Node.js - Native HTTP how to delete cookies In...,"['javascript', 'node.js', 'cookies']"
2,C# error: error CS1525: Unexpected symbol `Con...,['c#']
3,Use NGinx reverse proxy for create-react-app I...,"['reactjs', 'create-react-app', 'nginx-reverse..."
4,How to hide a specific element inside .map fun...,"['javascript', 'reactjs', 'html-rendering']"


## Text pre-processing

In [16]:
def text_prepare(arr, join_symbol):

    def process(text, join_symbol):
        text = text.lower() 
        text = re.sub(REPLACE_BY_SPACE_RE,"",text)

        text = re.sub(BAD_SYMBOLS_RE,"",text)
        text = re.sub(r'\s+'," ",text)

        return f'{join_symbol}'.join([i for i in text.split() if i not in STOPWORDS])
    retval = [process(text, join_symbol) for text in arr]
    
    return retval

In [17]:
def transform_y(arr, join_symbol=','):
    arr = text_prepare(arr, join_symbol=join_symbol)
    arr = [set(i.split(',')) for i in arr]
    return arr 

In [40]:
binarizer = MultiLabelBinarizer(classes=TAGS_TO_KEEP)
y_train_binarized = binarizer.fit_transform(transform_y(y_train))
y_test_binarized = binarizer.transform(transform_y(y_test))

In [18]:
def tokenize_and_stem(text):
    tokenized_list = word_tokenize(text) 
    snowball = SnowballStemmer(language='english')
    return [snowball.stem(word) for word in tokenized_list]

In [19]:
def to_array(sparse_matrix):
    return sparse_matrix.toarray()

## TDIDF + Linear Regression

In [54]:
estimators = [('preprocessor', FunctionTransformer(text_prepare, kw_args={'join_symbol': ' '})), 
              ('tfidf', TfidfVectorizer(tokenizer=tokenize_and_stem, 
                                        ngram_range=(1, 3), 
                                        max_df=0.9, 
                                        min_df=5,
                                        norm='l2')),
#                ('to_array', FunctionTransformer(to_array)),
#                ('scaler', StandardScaler(with_mean=True, 
#                                          with_std=False,
#                                          copy=False)),
#               ('pca', IncrementalPCA(n_components=16, batch_size=64)),
              ('clf', ClassifierChain(LogisticRegression(C=1.0, 
                                                         penalty='l1', 
                                                         dual=False, 
                                                         solver='liblinear'), 
                                      cv=3))
             ]

training_pipe = Pipeline(estimators, verbose=True)

In [44]:
%time training_pipe.fit(X_train, y_train_binarized)

[Pipeline] ...... (step 1 of 3) Processing preprocessor, total=   2.7s
[Pipeline] ............. (step 2 of 3) Processing tfidf, total=  54.8s
[Pipeline] ............... (step 3 of 3) Processing clf, total= 2.3min
CPU times: user 3min 4s, sys: 6.71 s, total: 3min 11s
Wall time: 3min 16s


Pipeline(steps=[('preprocessor',
                 FunctionTransformer(func=<function text_prepare at 0x7fd6e4744dd0>,
                                     kw_args={'join_symbol': ' '})),
                ('tfidf',
                 TfidfVectorizer(max_df=0.9, min_df=5, ngram_range=(1, 3),
                                 tokenizer=<function tokenize_and_stem at 0x7fd6f3cd0200>)),
                ('clf',
                 ClassifierChain(base_estimator=LogisticRegression(penalty='l1',
                                                                   solver='liblinear'),
                                 cv=3))],
         verbose=True)

In [45]:
y_test_predicted_labels_tfidf = training_pipe.predict(X_test)

print(classification_report(y_test_binarized, 
                            y_test_predicted_labels_tfidf, 
                            target_names=binarizer.classes_, 
                            zero_division=1))

                      precision    recall  f1-score   support

              python       0.78      0.59      0.67       513
          javascript       0.77      0.35      0.48       403
                java       0.81      0.38      0.51       300
             android       0.80      0.45      0.58       187
                  c#       0.81      0.43      0.56       197
                html       0.62      0.26      0.37       154
             reactjs       0.81      0.61      0.69       148
                 php       0.90      0.42      0.57       142
            python3x       0.67      0.03      0.07       115
              nodejs       0.85      0.30      0.44       118
                   r       0.83      0.42      0.55        91
                 c++       0.75      0.47      0.58        85
                 css       0.81      0.42      0.55       101
                 sql       0.58      0.21      0.31        85
             flutter       0.98      0.72      0.83        82
       

In [46]:
accuracy_score(y_test_binarized, y_test_predicted_labels_tfidf)

0.36575

In [48]:
y_test_predicted_labels_tfidf = training_pipe.predict(X_test)
y_test_pred_inversed = binarizer.inverse_transform(y_test_predicted_labels_tfidf)
y_test_inversed = binarizer.inverse_transform(y_test_binarized)
for i in [500, 501, 502]:
    print('Text:\t{}\nTrue labels:\t{}\nPredicted labels:\t{}\n\n'.format(
        X_test[i],
        ','.join(y_test_inversed[i]),
        ','.join(y_test_pred_inversed[i])
    ))

Text:	CMSampleBufferCreate causes Thread 1: EXC_BAD_ACCESS (code=1, address=0x0) I am trying to make a CMSampleBuffer out of a CMBlockBuffer and I am using CMSampleBufferCreate but whatever I try I always get Thread 1: EXC_BAD_ACCESS (code=1, address=0x0)
let status = CMSampleBufferCreate(kCFAllocatorDefault,
                                              buffer, //CMBlockBuffer cannot be nil
                                              true,
                                              nil,
                                              nil,
                                              nil,
                                              1,
                                              0,
                                              nil,
                                              1,
                                              nil,
                                              sampleBuffer //sampleBuffer is nil
)


I do not know how to do this so if there is another way or if I am

In [52]:
pred = training_pipe.predict(['regex to remove parenthesis where use of braces causes stasis'])
binarizer.inverse_transform(pred)

[('regex',)]

## Randomized search CV

In [73]:
search_space = {"tfidf__min_df": np.arange(5, 100),
                "tfidf__max_df": np.arange(0.01, 0.98, step=0.01),
                "clf": [ClassifierChain(LogisticRegression(random_state=42,
                                                           dual=False, 
                                                           solver="liblinear", 
                                                           max_iter=1000), 
                                             cv=3)], 
                "clf__estimator__C": np.arange(0.000001, 1000, step=0.00001),
                "clf__estimator__penalty": ['l1', 'l2']}

In [57]:
scoring = {'f1': make_scorer(f1_score, average= 'weighted'), 
           'average_precision': 'average_precision'}

rs = RandomizedSearchCV(training_pipe, 
                        param_distributions=tfidf_params, 
                        scoring=scoring, 
                        refit='f1', 
                        return_train_score=True, 
                        n_iter=50, 
                        cv=3, 
                        verbose=10, 
                        n_jobs=-1)

In [58]:
%time rs.fit(X_train, y_train_binarized)

Fitting 3 folds for each of 50 candidates, totalling 150 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed: 12.1min
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed: 18.3min
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed: 30.5min
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed: 37.0min
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed: 53.2min
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed: 65.5min
[Parallel(n_jobs=-1)]: Done  53 tasks      | elapsed: 83.2min
[Parallel(n_jobs=-1)]: Done  64 tasks      | elapsed: 96.7min
[Parallel(n_jobs=-1)]: Done  77 tasks      | elapsed: 120.3min
[Parallel(n_jobs=-1)]: Done  90 tasks      | elapsed: 137.6min
[Parallel(n_jobs=-1)]: Done 105 tasks      | elapsed: 161.0min
[Parallel(n_jobs=-1)]: Done 120 tasks      | elapsed: 180.4min
[Parallel(n_jobs=-1)]: Done 137 tasks      | elapsed: 207.2min
[Parallel(n_jobs=-1)]: Done 150 out of 150 | elapsed: 223.1min finished


[Pipeline] ...... (step 1 of 3) Processing preprocessor, total=   2.7s
[Pipeline] ............. (step 2 of 3) Processing tfidf, total=  56.6s
[Pipeline] ............... (step 3 of 3) Processing clf, total= 1.6min
CPU times: user 2min 35s, sys: 11.4 s, total: 2min 46s
Wall time: 3h 45min 42s


RandomizedSearchCV(cv=3,
                   estimator=Pipeline(steps=[('preprocessor',
                                              FunctionTransformer(func=<function text_prepare at 0x7fd6e4744dd0>,
                                                                  kw_args={'join_symbol': ' '})),
                                             ('tfidf',
                                              TfidfVectorizer(max_df=0.9,
                                                              min_df=5,
                                                              ngram_range=(1,
                                                                           3),
                                                              tokenizer=<function tokenize_and_stem at 0x7fd6f3cd0200>)),
                                             ('clf',
                                              ClassifierChain(base_estimator=LogisticRegression(penalty='l1',
                                                          

In [62]:
y_test_predicted_labels_tfidf = rs.predict(X_test)
y_test_pred_inversed = binarizer.inverse_transform(y_test_predicted_labels_tfidf)
y_test_inversed = binarizer.inverse_transform(y_test_binarized)
for i in range(301, 302, 303):
    print('Text:\t{}\nTrue labels:\t{}\nPredicted labels:\t{}\n\n'.format(
        X_test[i],
        ','.join(y_test_inversed[i]),
        ','.join(y_test_pred_inversed[i])
    ))

Text:	Nuxtjs Vuex not saving changes I have data between sessions that is saved in window.localStorage. When a new session starts a plugin will grab the data and add it to the store.
// ./store/data.js

export const state = () = ({
  data: []
})

export const mutations = {
  addItemToData (state, item) {
    state.data = state.data.push(item)
  },
  setData (state, data) {
    state.data = data
  },
}


// ./store/index.js

import localStorage from '../plugins/localStorage'

export const plugins = [localStorage]


// plugins/localStorage.js

const localStorage = store = {

  store.subscribe((mutation, state) = {

    if (mutation.type === 'data/addItemToData') {
      console.log('saving added item to storage')
      window.localStorage.setItem('data', JSON.stringify(state.data.data))
    }

  })

  // called when the store is initialized
  if (typeof window !== 'undefined') {
    if (window.localStorage.data) {
      store.commit('data/setData', JSON.parse(window.localStorage.getItem(

In [60]:
print(classification_report(y_test_binarized, 
                            y_test_predicted_labels_tfidf, 
                            target_names=binarizer.classes_, 
                            zero_division=1))

                      precision    recall  f1-score   support

              python       0.80      0.59      0.68       513
          javascript       0.80      0.36      0.50       403
                java       0.82      0.40      0.54       300
             android       0.80      0.51      0.63       187
                  c#       0.80      0.41      0.54       197
                html       0.60      0.26      0.36       154
             reactjs       0.80      0.59      0.68       148
                 php       0.85      0.41      0.55       142
            python3x       0.60      0.03      0.05       115
              nodejs       0.87      0.33      0.48       118
                   r       0.79      0.41      0.54        91
                 c++       0.76      0.45      0.56        85
                 css       0.80      0.43      0.55       101
                 sql       0.62      0.25      0.35        85
             flutter       0.97      0.73      0.83        82
       

### Save the RandomizedSearchCV model

Saving RandomizedSearchCV best predictor along with the binarizer transformer. 

In [20]:
fname = "models/" + str(date.today()).replace("-", "_") + "_rs_model_and_mlb.pkl"
joblib.dump((rs.best_estimator_, binarizer), fname)

NameError: name 'rs' is not defined

## Making predictions

In [21]:
fname = "../models/2020_11_08_rs_model_and_mlb.pkl"
model, binarizer = load(fname)

In [67]:
def get_preds(model, binarizer, sentence: str) -> set:
    """Get labels of prediction for a single sentence (hence the 0)"""
    raw_preds = model.predict([sentence])
    return binarizer.inverse_transform(raw_pred)[0]
    
def get_probability_preds(model, sentence: str) -> dict:
    """Get probabilities for all tags possible for a single sentence (hence the 0)"""
    global TAGS_TO_KEEP
    probs = model.predict_proba([sentence])[0]
    return dict(zip(TAGS_TO_KEEP, probs))    

def predict(model, binarizer, sentence) -> dict:
    preds = get_preds(model, binarizer, sentence)
    probs_dict = get_probability_preds(model, sentence)
    return dict(zip(preds, [probs_dict.get(key) for key in preds]))

In [70]:
sentence = "This Tensorflow Python thing is turning out really tricky "
predict(model=model, binarizer=binarizer, sentence=sentence)

{'python': 0.9990015485606143, 'tensorflow': 0.9999104637304294}