In [1]:
import itertools
import re
import lxml.html
import requests
import tqdm

## 데이터 준비

In [2]:
url_base = 'http://stackoverflow.com/questions/tagged/{tag}?page={page}&sort=newest&pagesize=50'
tags = ['python', 'r']

In [3]:
list(itertools.product(tags, range(1, 3)))

[('python', 1), ('python', 2), ('r', 1), ('r', 2)]

In [4]:
tag_page = list(itertools.product(tags, range(1, 21)))

In [5]:
titles = []
title_tag = []
for tag, page in tqdm.tqdm_notebook(tag_page):
    url = url_base.format(tag=tag, page=page)
    res = requests.get(url)
    root = lxml.html.fromstring(res.text)
    for link in root.cssselect('h3 a.question-hyperlink'):
        title = link.text
        title = re.sub(tag, ' ', title, flags=re.IGNORECASE)
        titles.append(title)
        title_tag.append(tag)




In [6]:
len(titles)

2000

## TDM

In [10]:
from sklearn.feature_extraction.text import CountVectorizer

In [11]:
cv = CountVectorizer(stop_words='english', max_features=1000)

In [12]:
tdm = cv.fit_transform(titles)

## data split

In [13]:
from sklearn.model_selection import train_test_split

In [14]:
X_train, X_test, y_train, y_test = train_test_split(tdm, title_tag, test_size=0.2)

## Logistic Regression

In [15]:
from sklearn.linear_model import LogisticRegressionCV

In [16]:
logreg = LogisticRegressionCV()
logreg.fit(X_train, y_train)

LogisticRegressionCV(Cs=10, class_weight=None, cv=None, dual=False,
           fit_intercept=True, intercept_scaling=1.0, max_iter=100,
           multi_class='ovr', n_jobs=1, penalty='l2', random_state=None,
           refit=True, scoring=None, solver='lbfgs', tol=0.0001, verbose=0)

In [17]:
logreg.C_

array([ 2.7825594])

In [18]:
logreg.classes_

array(['python', 'r'], 
      dtype='<U6')

In [19]:
logreg.score(X_train, y_train) 

0.98375000000000001

In [20]:
y_logreg = logreg.predict(X_test)

In [21]:
from sklearn import metrics

In [22]:
metrics.confusion_matrix(y_test, y_logreg)  # row: true, col: predicted

array([[177,   9],
       [ 25, 189]])

In [23]:
metrics.accuracy_score(y_test, y_logreg)

0.91500000000000004

In [24]:
# Imbalanced data로 만들기

In [25]:
import numpy

In [26]:
len(y_train)

1600

In [27]:
imb_id = []
count = 0
for i, tag in enumerate(y_train):
    if tag == 'python':
        imb_id.append(i)
    elif count < 200:
        imb_id.append(i)
        count += 1

In [28]:
# Imbalnced data 트레이닝

In [29]:
X_imb = X_train[imb_id, :]
y_imb = numpy.array(y_train)[imb_id]

In [30]:
model_imb = LogisticRegressionCV()
model_imb.fit(X_imb, y_imb)

LogisticRegressionCV(Cs=10, class_weight=None, cv=None, dual=False,
           fit_intercept=True, intercept_scaling=1.0, max_iter=100,
           multi_class='ovr', n_jobs=1, penalty='l2', random_state=None,
           refit=True, scoring=None, solver='lbfgs', tol=0.0001, verbose=0)

In [31]:
y_pred_imb = model_imb.predict(X_test)

In [33]:
metrics.confusion_matrix(y_test, y_pred_imb)  # row: true, col: predicted

array([[182,   4],
       [ 68, 146]])

In [34]:
metrics.accuracy_score(y_test, y_pred_imb)

0.81999999999999995

# SMOTE

In [56]:
from imblearn.over_sampling import SMOTE

In [57]:
sm = SMOTE(kind='svm')  # kind = ['regular', 'borderline1', 'borderline2', 'svm]

In [58]:
X_sm, y_sm = sm.fit_sample(X_imb.toarray(), y_imb)

In [59]:
X_sm.shape

(1627, 1000)

In [60]:
model_sm = LogisticRegressionCV()
model_sm.fit(X_sm, y_sm)

LogisticRegressionCV(Cs=10, class_weight=None, cv=None, dual=False,
           fit_intercept=True, intercept_scaling=1.0, max_iter=100,
           multi_class='ovr', n_jobs=1, penalty='l2', random_state=None,
           refit=True, scoring=None, solver='lbfgs', tol=0.0001, verbose=0)

In [61]:
y_pred_sm = model_sm.predict(X_test)

In [62]:
metrics.confusion_matrix(y_test, y_pred_sm)  # row: true, col: predicted

array([[176,  10],
       [ 62, 152]])

In [63]:
metrics.accuracy_score(y_test, y_pred_sm)

0.81999999999999995

# ADASYN

In [64]:
from imblearn.over_sampling import ADASYN

In [65]:
ada = ADASYN()

In [66]:
X_ada, y_ada = ada.fit_sample(X_imb.toarray(), y_imb)

In [67]:
model_ada = LogisticRegressionCV()
model_ada.fit(X_ada, y_ada)

LogisticRegressionCV(Cs=10, class_weight=None, cv=None, dual=False,
           fit_intercept=True, intercept_scaling=1.0, max_iter=100,
           multi_class='ovr', n_jobs=1, penalty='l2', random_state=None,
           refit=True, scoring=None, solver='lbfgs', tol=0.0001, verbose=0)

In [68]:
y_pred_ada = model_ada.predict(X_test)

In [69]:
metrics.confusion_matrix(y_test, y_pred_ada)  # row: true, col: predicted

array([[180,   6],
       [ 67, 147]])

In [70]:
metrics.accuracy_score(y_test, y_pred_ada)

0.8175

# Tomek link + SMOTE

In [71]:
from imblearn.combine import SMOTETomek

In [72]:
tomek = SMOTETomek(kind_smote='regular')

In [73]:
X_tomek, y_tomek = tomek.fit_sample(X_imb.toarray(), y_imb)

In [74]:
model_tomek = LogisticRegressionCV()
model_tomek.fit(X_tomek, y_tomek)

LogisticRegressionCV(Cs=10, class_weight=None, cv=None, dual=False,
           fit_intercept=True, intercept_scaling=1.0, max_iter=100,
           multi_class='ovr', n_jobs=1, penalty='l2', random_state=None,
           refit=True, scoring=None, solver='lbfgs', tol=0.0001, verbose=0)

In [75]:
y_pred_tomek = model_tomek.predict(X_test)

In [76]:
metrics.confusion_matrix(y_test, y_pred_tomek)  # row: true, col: predicted

array([[176,  10],
       [ 56, 158]])

In [77]:
metrics.accuracy_score(y_test, y_pred_tomek)

0.83499999999999996