In [8]:
import itertools
import lxml.html
import requests
import tqdm
import re

## Data crawling

In [2]:
# Template for a Stack Overflow tag listing page (newest first, 50 questions
# per page); `{tag}` and `{page}` are filled in with str.format.
url_base = (
    'http://stackoverflow.com/questions/tagged/{tag}'
    '?page={page}&sort=newest&pagesize=50'
)

In [3]:
# Stack Overflow tags to crawl; each tag is also used as the class label of
# the titles collected under it.
tags = [
    'rstudio',
    'dplyr',
    'ggplot2',
]

In [4]:
# Preview of the (tag, page) grid for the first two listing pages of each tag.
[(tag, page) for tag in tags for page in range(1, 3)]

[('rstudio', 1),
 ('rstudio', 2),
 ('dplyr', 1),
 ('dplyr', 2),
 ('ggplot2', 1),
 ('ggplot2', 2)]

In [11]:
# Every (tag, page) pair to fetch: listing pages 1 through 20 for each tag.
tag_page = [(tag, page) for tag in tags for page in range(1, 21)]

In [9]:
# Crawl every (tag, page) listing and collect the question titles.
titles = []      # question titles with the tag word masked out
title_tag = []   # tag each title was listed under (parallel to `titles`)
for tag, page in tqdm.tqdm_notebook(tag_page):
    url = url_base.format(tag=tag, page=page)
    # Bound the wait so one stalled connection cannot hang the whole crawl.
    res = requests.get(url, timeout=30)
    # Fail loudly on error responses (e.g. rate limiting) instead of silently
    # collecting zero titles from an error page.
    res.raise_for_status()
    root = lxml.html.fromstring(res.text)
    # Escape the tag so it is matched literally even when it contains regex
    # metacharacters (e.g. 'c++'); compiled once per page, not once per title.
    tag_pattern = re.compile(re.escape(tag), flags=re.IGNORECASE)
    for link in root.cssselect('h3 a.question-hyperlink'):
        # `link.text` is None when the anchor starts with a child element,
        # which would make the substitution raise TypeError; text_content()
        # always returns the full text of the link as a string.
        title = link.text_content()
        # Blank out the tag word itself -- presumably so the label does not
        # leak into the features used for classification.
        title = tag_pattern.sub(' ', title)
        titles.append(title)
        title_tag.append(tag)




In [12]:
# Sanity check: 3 tags x 20 pages x 50 questions per page should give 3000.
len(titles)

3000

In [13]:
# One label is appended per title, so this must equal len(titles).
len(title_tag)

3000

## TDM (term-document matrix)

In [14]:
from sklearn.feature_extraction.text import CountVectorizer

In [15]:
# Bag-of-words counts over the titles: English stop words are dropped and the
# vocabulary is limited to at most 1000 terms.
# NOTE(review): the vocabulary is learned from every title, before the
# train/test split is made, so it also sees the titles that end up in the
# test set.
cv = CountVectorizer(
    max_features=1000,
    stop_words='english',
)
tdm = cv.fit_transform(titles)

In [16]:
from sklearn.model_selection import train_test_split

In [17]:
# Hold out 20% of the titles for evaluation.
# random_state pins the shuffle so that a Restart & Run All reproduces the
# same split; stratify keeps the tag proportions equal in both splits.
X_train, X_test, y_train, y_test = train_test_split(
    tdm, title_tag, test_size=0.2, random_state=42, stratify=title_tag)

## Logistic Regression

In [19]:
from sklearn.linear_model import LogisticRegressionCV

In [20]:
# Logistic regression that picks its regularization strength C by built-in
# cross-validation; every hyperparameter is left at the library default.
logreg = LogisticRegressionCV()

In [21]:
# Fit on the training split only; the test split stays held out.
logreg.fit(X_train, y_train)

LogisticRegressionCV(Cs=10, class_weight=None, cv=None, dual=False,
           fit_intercept=True, intercept_scaling=1.0, max_iter=100,
           multi_class='ovr', n_jobs=1, penalty='l2', random_state=None,
           refit=True, scoring=None, solver='lbfgs', tol=0.0001, verbose=0)

In [23]:
# Value of C chosen for each class by the cross-validation.
logreg.C_  # C is the inverse of alpha.

array([ 2.7825594,  2.7825594,  2.7825594])

In [24]:
# Class labels known to the fitted model (the three crawled tags).
logreg.classes_

array(['dplyr', 'ggplot2', 'rstudio'], 
      dtype='<U7')