In [4]:
import logging
import pandas as pd
import numpy as np
from numpy import random

In [6]:
import matplotlib.pyplot as plt
import re
from bs4 import BeautifulSoup
%matplotlib inline

In [28]:
import spacy
from spacy.lang.en.stop_words import STOP_WORDS

In [7]:
df = pd.read_csv('stack-overflow-data.csv')

In [8]:
df.head()

Unnamed: 0,post,tags
0,what is causing this behavior in our c# datet...,c#
1,have dynamic html load as if it was in an ifra...,asp.net
2,how to convert a float value in to min:sec i ...,objective-c
3,.net framework 4 redistributable just wonderi...,.net
4,trying to calculate and print the mean and its...,python


In [13]:
# Total number of pieces obtained by splitting every raw post on a single space.
# NOTE(review): split(' ') yields an empty string for each extra consecutive
# space, and the raw posts contain runs of spaces, so this total overstates the
# real word count; splitting on arbitrary whitespace would count tokens instead.
df['post'].str.split(' ').str.len().sum()

10286120

In [14]:
df['tags'].value_counts()

iphone           2000
jquery           2000
sql              2000
ruby-on-rails    2000
css              2000
ios              2000
c#               2000
html             2000
php              2000
java             2000
asp.net          2000
objective-c      2000
javascript       2000
python           2000
c                2000
mysql            2000
android          2000
c++              2000
.net             2000
angularjs        2000
Name: tags, dtype: int64

In [17]:
df['post'][0]

'what is causing this behavior  in our c# datetime type  <pre><code>[test] public void sadness() {    var datetime = datetime.utcnow;    assert.that(datetime  is.equalto(datetime.parse(datetime.tostring()))); } </code></pre>   failed :   <pre><code> expected: 2011-10-31 06:12:44.000  but was:  2011-10-31 06:12:44.350 </code></pre>   i wish to know what is happening behind the scenes in tostring() etc to cause this behavior.    edit after seeing jon s answer :   <pre><code>[test] public void newsadness() {     var datetime = datetime.utcnow;     assert.that(datetime  is.equalto(datetime.parse(datetime.tostring( o )))); } </code></pre>   result :   <pre><code>expected: 2011-10-31 12:03:04.161 but was:  2011-10-31 06:33:04.161 </code></pre>   same result with capital and small  o  . i m reading up the docs  but still unclear.'

In [18]:
df['tags'][0]

'c#'

### Data Preprocessing - Text Cleaning 

In [42]:
post = df['post'][0]

In [51]:
data = BeautifulSoup(post, 'lxml')

In [52]:
data.text

'what is causing this behavior  in our c# datetime type  [test] public void sadness() {    var datetime = datetime.utcnow;    assert.that(datetime  is.equalto(datetime.parse(datetime.tostring()))); }    failed :    expected: 2011-10-31 06:12:44.000  but was:  2011-10-31 06:12:44.350    i wish to know what is happening behind the scenes in tostring() etc to cause this behavior.    edit after seeing jon s answer :   [test] public void newsadness() {     var datetime = datetime.utcnow;     assert.that(datetime  is.equalto(datetime.parse(datetime.tostring( o )))); }    result :   expected: 2011-10-31 12:03:04.161 but was:  2011-10-31 06:33:04.161    same result with capital and small  o  . i m reading up the docs  but still unclear.'

In [53]:
# str.lower() returns a new string, so its result has to be assigned for the
# lowercasing to reach the later cleaning steps (the a-z filter further down
# deletes any character that is still uppercase).
# Rebuilding from `post` instead of rebinding `data` in place keeps this cell
# re-runnable: it no longer depends on `data` still being a soup object.
data = BeautifulSoup(post, 'lxml').text.lower()
data

'what is causing this behavior  in our c# datetime type  [test] public void sadness() {    var datetime = datetime.utcnow;    assert.that(datetime  is.equalto(datetime.parse(datetime.tostring()))); }    failed :    expected: 2011-10-31 06:12:44.000  but was:  2011-10-31 06:12:44.350    i wish to know what is happening behind the scenes in tostring() etc to cause this behavior.    edit after seeing jon s answer :   [test] public void newsadness() {     var datetime = datetime.utcnow;     assert.that(datetime  is.equalto(datetime.parse(datetime.tostring( o )))); }    result :   expected: 2011-10-31 12:03:04.161 but was:  2011-10-31 06:33:04.161    same result with capital and small  o  . i m reading up the docs  but still unclear.'

In [54]:
# Replace separators, brackets and digits with a space so adjacent words stay apart.
# The pattern is a raw string: in a normal literal, \[ \] \| and \d are invalid
# string escapes (DeprecationWarning, SyntaxWarning from Python 3.12). The regex
# itself is unchanged.
data = re.sub(r'[/(){}\[\]\|@,;\d-]', ' ', data)
data

'what is causing this behavior  in our c# datetime type   test  public void sadness        var datetime = datetime.utcnow     assert.that datetime  is.equalto datetime.parse datetime.tostring            failed :    expected:              :  :  .     but was:               :  :  .       i wish to know what is happening behind the scenes in tostring   etc to cause this behavior.    edit after seeing jon s answer :    test  public void newsadness         var datetime = datetime.utcnow      assert.that datetime  is.equalto datetime.parse datetime.tostring  o            result :   expected:              :  :  .    but was:               :  :  .       same result with capital and small  o  . i m reading up the docs  but still unclear.'

In [55]:
# Delete every character that is not a digit, a lowercase ASCII letter, a space,
# '#', '+' or '_'. Unlike the separator step, matches are removed outright rather
# than replaced by a space, so "datetime.utcnow" collapses to "datetimeutcnow".
data = re.sub('[^0-9a-z #+_]', '', data)
data

'what is causing this behavior  in our c# datetime type   test  public void sadness        var datetime  datetimeutcnow     assertthat datetime  isequalto datetimeparse datetimetostring            failed     expected                       but was                          i wish to know what is happening behind the scenes in tostring   etc to cause this behavior    edit after seeing jon s answer     test  public void newsadness         var datetime  datetimeutcnow      assertthat datetime  isequalto datetimeparse datetimetostring  o            result    expected                      but was                          same result with capital and small  o   i m reading up the docs  but still unclear'

In [56]:
# Split on whitespace, discard tokens listed in spaCy's STOP_WORDS, and re-join
# the survivors with single spaces.
data = ' '.join(filter(lambda word: word not in STOP_WORDS, data.split()))

In [57]:
data

'causing behavior c# datetime type test public void sadness var datetime datetimeutcnow assertthat datetime isequalto datetimeparse datetimetostring failed expected wish know happening scenes tostring etc cause behavior edit seeing jon s answer test public void newsadness var datetime datetimeutcnow assertthat datetime isequalto datetimeparse datetimetostring o result expected result capital small o m reading docs unclear'

In [58]:
# Compiled once when the cell runs instead of on every call. Raw strings keep the
# regex escapes (\[ \] \| \d) from being read as invalid string escapes.
_SEPARATORS_RE = re.compile(r'[/(){}\[\]\|@,;\d-]')
_DISALLOWED_RE = re.compile(r'[^0-9a-z #+_]')


def text_clean(post):
    """Turn one raw post into a space-joined string of cleaned tokens.

    Steps, in order:
      1. Parse the post with BeautifulSoup ('lxml') and keep only its text.
      2. Lowercase the text.
      3. Replace separators, brackets and digits with a space.
      4. Delete every character outside ``0-9a-z #+_``.
      5. Split on whitespace and drop tokens found in ``STOP_WORDS``.

    Parameters
    ----------
    post : str
        Raw post text, possibly containing HTML markup. ``None`` or NaN
        (a missing cell) is treated as an empty post.

    Returns
    -------
    str
        Remaining tokens joined by single spaces; ``''`` when nothing is left.
    """
    # Missing cells arrive as None or float NaN (NaN is the only value that is
    # not equal to itself); there is nothing to clean in either case.
    if post is None or post != post:
        return ''

    # Lowercase BEFORE filtering: the a-z whitelist below would otherwise
    # delete every uppercase letter instead of normalising it.
    text = BeautifulSoup(post, 'lxml').text.lower()
    text = _SEPARATORS_RE.sub(' ', text)
    text = _DISALLOWED_RE.sub('', text)
    return ' '.join(token for token in text.split() if token not in STOP_WORDS)

In [59]:
# Run text_clean over every raw post and store the result in a new column.
df['cleaned_post'] = df['post'].map(text_clean)

In [60]:
df['cleaned_post'][0]

'causing behavior c# datetime type test public void sadness var datetime datetimeutcnow assertthat datetime isequalto datetimeparse datetimetostring failed expected wish know happening scenes tostring etc cause behavior edit seeing jon s answer test public void newsadness var datetime datetimeutcnow assertthat datetime isequalto datetimeparse datetimetostring o result expected result capital small o m reading docs unclear'

In [61]:
df['cleaned_post'][1]

'dynamic html load iframe aspnet site users save entire html page backend database want load dynamic content div existing page content area couple things happen want css affect outside div trying loading badly formed html images divs outside content area lot html pages use base tag images links want base tag respected inside div solution going try use iframe set url child page loads dynamic html page entirely wondering better solution'

In [62]:
# Total token count after cleaning. text_clean joins tokens with single spaces,
# so splitting on ' ' recovers them; an empty cleaned post still counts as one
# piece because ''.split(' ') returns [''].
df['cleaned_post'].str.split(' ').str.len().sum()

3180726

### TF-IDF Model with SVM

In [78]:
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier

In [152]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.preprocessing import LabelBinarizer, LabelEncoder


In [154]:
# Independent copy holding only the model input (cleaned text) and the label.
# NOTE(review): this rebinds `data`, which held a plain string in the cleaning
# walkthrough above; a distinct name would avoid the type change.
data = df.loc[:, ['cleaned_post', 'tags']].copy()

In [155]:
data.head()

Unnamed: 0,cleaned_post,tags
0,causing behavior c# datetime type test public ...,c#
1,dynamic html load iframe aspnet site users sav...,asp.net
2,convert float value minsec m trying convert se...,objective-c
3,net framework redistributable wondering net fr...,.net
4,trying calculate print mean returning number p...,python


In [156]:
# NOTE(review): the default token_pattern only emits runs of two or more word
# characters, so the tokens "c", "c#" and "c++" that the cleaning step preserves
# are dropped before they reach the classifier. Consider
# TfidfVectorizer(token_pattern=r'\S+') and re-running the cells below; it is
# left unchanged here because the recorded outputs were produced with the default.
tfidf = TfidfVectorizer()
# Pin the seed of the solver's internal shuffling so the fit is repeatable.
# The scores recorded below predate the seed and may differ slightly on re-run.
classifier = LinearSVC(random_state=42)

In [158]:
# Features are the cleaned post text; targets are the tag labels.
X, y = data['cleaned_post'], data['tags']

In [159]:
# Hold out 20% of the posts for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [160]:
X_train.shape, X_test.shape

((32000,), (8000,))

In [161]:
# Chain vectorisation and classification so both are fitted on training data only.
clf = Pipeline(steps=[
    ('tfidf', tfidf),
    ('clf', classifier),
])

In [162]:
clf.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('tfidf',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=False,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, use_idf=True,
                                 vocabulary=None)),
                ('clf',
                 LinearSVC(C=1.0, class_weight=None, dual=True,
                           fit_intercept=True, intercept_scaling=1,
               

In [163]:
y_pred = clf.predict(X_test)

In [164]:
print(accuracy_score(y_test, y_pred))

0.805375


In [165]:
print(classification_report(y_test, y_pred))

               precision    recall  f1-score   support

         .net       0.77      0.67      0.71       408
      android       0.92      0.90      0.91       396
    angularjs       0.97      0.95      0.96       398
      asp.net       0.83      0.78      0.81       395
            c       0.79      0.86      0.82       395
           c#       0.66      0.62      0.64       376
          c++       0.80      0.77      0.78       402
          css       0.82      0.87      0.85       407
         html       0.70      0.71      0.71       424
          ios       0.66      0.66      0.66       383
       iphone       0.66      0.67      0.66       398
         java       0.84      0.85      0.84       400
   javascript       0.79      0.81      0.80       405
       jquery       0.84      0.87      0.85       375
        mysql       0.84      0.82      0.83       383
  objective-c       0.72      0.67      0.69       395
          php       0.82      0.85      0.84       397
       py

### Random Forest 

In [169]:
tfidf = TfidfVectorizer()
# Seed the forest: bootstrap sampling and feature selection are random, so an
# unseeded run gives a different model and score every time.
# The scores recorded below predate the seed and may differ slightly on re-run.
classifier = RandomForestClassifier(n_estimators=100, n_jobs=-1, random_state=42)

In [170]:
# Same two-stage pipeline as before, now ending in the random forest.
# NOTE(review): this rebinds `clf`, so the fitted SVM pipeline is no longer reachable.
clf = Pipeline(steps=[
    ('tfidf', tfidf),
    ('clf', classifier),
])

In [171]:
clf.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('tfidf',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=False,
                                 token_pattern='...
                 RandomForestClassifier(bootstrap=True, class_weight=None,
                                        criterion='gini', max_depth=None,
                                        max_features='auto',
                                        max_leaf_nodes=None,
                               

In [172]:
y_pred = clf.predict(X_test)

In [173]:
print(accuracy_score(y_test, y_pred))

0.79025


In [174]:
print(classification_report(y_test, y_pred))

               precision    recall  f1-score   support

         .net       0.67      0.69      0.68       408
      android       0.96      0.87      0.91       396
    angularjs       0.97      0.96      0.97       398
      asp.net       0.80      0.77      0.78       395
            c       0.76      0.88      0.81       395
           c#       0.68      0.54      0.60       376
          c++       0.87      0.68      0.76       402
          css       0.68      0.91      0.78       407
         html       0.70      0.62      0.66       424
          ios       0.72      0.61      0.66       383
       iphone       0.61      0.63      0.62       398
         java       0.86      0.85      0.86       400
   javascript       0.79      0.79      0.79       405
       jquery       0.82      0.85      0.83       375
        mysql       0.80      0.82      0.81       383
  objective-c       0.68      0.71      0.70       395
          php       0.88      0.85      0.86       397
       py

### Word2Vec 

In [175]:
!python -m spacy download en_core_web_lg

[+] Download and installation successful
You can now load the model via spacy.load('en_core_web_lg')


In [176]:
nlp = spacy.load('en_core_web_lg')

In [178]:
doc = nlp('today I am going to learn word2doc')

In [180]:
doc[0].vector

array([-1.5657e-01,  5.9489e-01, -3.1445e-02, -7.7586e-02,  2.7863e-01,
       -5.0921e-01, -6.6350e-02, -8.1890e-02, -4.7986e-02,  2.8036e+00,
       -1.3182e-01, -6.2710e-01,  2.5388e-01, -4.7291e-01, -4.2875e-01,
        1.4352e-01, -1.1818e-01,  6.7974e-01, -6.2287e-03, -1.5272e-03,
       -1.6379e-01, -3.4067e-02,  2.7102e-01,  5.4245e-02, -1.1752e-02,
        3.2447e-01, -3.2140e-01, -3.7041e-03, -2.0539e-01,  5.7008e-02,
        1.7746e-01, -1.6603e-01,  5.0532e-01, -2.1520e-01,  1.0503e-02,
        5.9839e-01,  2.1347e-02,  1.0365e-01,  2.3387e-02,  9.7183e-02,
       -1.7079e-01, -7.6186e-02,  2.0963e-01, -2.0728e-02,  4.9396e-02,
        4.2684e-01, -3.3965e-01, -6.7623e-02,  1.1842e-01, -1.3857e-01,
       -1.6147e-01,  4.0594e-01, -2.7631e-01,  1.3817e-01,  1.5902e-01,
        2.0709e-02, -2.5881e-01, -1.4681e-01,  1.7239e-01, -3.9357e-01,
       -3.2685e-01, -7.6166e-02, -1.3211e-01,  1.3168e-01, -1.3224e-02,
       -5.3735e-03,  4.2536e-02,  1.2877e-01, -2.6833e-02,  1.56

In [181]:
doc.vector

array([-6.22242922e-03,  3.66661400e-01, -2.40593582e-01,  3.08350027e-02,
        1.12123422e-01, -2.34057065e-02,  6.26979023e-02, -8.27725604e-02,
       -2.62314244e-03,  2.06082869e+00, -4.01948601e-01, -1.98807418e-02,
        4.94028553e-02, -6.53694198e-02, -9.82314274e-02,  2.87915673e-02,
       -1.76654294e-01,  9.29194331e-01, -1.93387792e-01,  7.47183934e-02,
        1.20282851e-01, -6.67640045e-02,  9.39718559e-02,  9.89337116e-02,
       -8.08691457e-02,  1.39777288e-01, -7.30607659e-02, -1.60455853e-01,
        1.35614559e-01, -1.35344282e-01, -1.49561718e-01,  1.70657150e-02,
        1.05809011e-01,  4.92172875e-02,  7.65613979e-03,  6.04450032e-02,
        9.90117192e-02,  1.93372846e-01, -1.04064859e-01, -5.76275587e-02,
       -1.77936569e-01, -1.44178584e-01, -4.68965694e-02,  1.88559983e-02,
        8.85578543e-02,  3.65736634e-01, -5.02628498e-02, -3.01755704e-02,
        4.43435721e-02, -8.17852765e-02, -7.66167119e-02,  1.23787716e-01,
       -2.89720036e-02,  

In [183]:
doc[0].similarity(doc[1])

0.33404082