# Environment

In [1]:
import google.cloud.bigquery as bigquery, pandas as pd, matplotlib.pyplot as plt, seaborn as snsn, numpy as np, \
importlib

In [2]:
import src.utils.evaluation as evaluation

In [3]:
from sklearn.preprocessing import LabelBinarizer, MultiLabelBinarizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from skmultilearn.problem_transform import LabelPowerset
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    accuracy_score,
    balanced_accuracy_score,
    cohen_kappa_score,
    roc_auc_score,
)

# Data

In [None]:
# Create the BigQuery client (constructed with default arguments) that is used
# to run the query below.
client = bigquery.Client()

In [None]:
# SQL for the working sample: every column of the preprocessed posts table,
# keeping a row only when RAND() <= 0.25 (roughly a quarter of the rows).
# NOTE(review): RAND() is unseeded, so the sample presumably differs between
# runs — confirm whether reproducibility matters here.
query = """
SELECT
  *
FROM
  stackoverflow.posts_preprocessed_selection
WHERE RAND() <= 0.25
"""

In [None]:
# Run the query and materialise the result as a pandas DataFrame.
df = client.query(query).to_dataframe()

In [None]:
# Preview the first rows of the sample.
df.head()

In [None]:
# Column summary; memory_usage="deep" asks pandas to measure the real memory
# consumption of object columns instead of estimating it.
df.info(memory_usage="deep")

# Naive Bayes (Single-Label Classification)

In [None]:
# Model input: code and title joined into one text per post. The explicit
# space keeps the last code token and the first title token from being fused
# into a single token by the whitespace tokenizer used downstream.
x = df["code_body"] + " " + df["title"]
# Target: each post's tags joined into one space-separated string, so every
# distinct tag combination becomes a single class label.
y = df["tags"].apply(lambda row: " ".join(row))

In [None]:
# Hold out 20% of the posts for testing. Stratifying on y keeps the class
# proportions the same in both splits; random_state fixes the shuffle.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, stratify=y, random_state=30
)

In [None]:
# Pipeline steps: TF-IDF features over whitespace-separated tokens, followed by
# a multinomial naive Bayes classifier.
estimators = [
    ("tfidf", TfidfVectorizer(tokenizer=lambda text: text.split())),
    ("clf", MultinomialNB()),
]
# Candidate values for the randomized search, keyed as <step>__<parameter>.
# NOTE(review): np.arange with a float step may or may not include the stop
# value because of rounding — check the generated max_df candidates.
parameters = dict(
    tfidf__min_df=np.arange(10, 30, 10),
    tfidf__max_df=np.arange(0.75, 0.9, 0.05),
    tfidf__ngram_range=[(1, 1), (2, 2), (3, 3)],
    clf__alpha=np.arange(0.2, 1, 0.2),
)

In [None]:
# Assemble the naive Bayes pipeline and wrap it in a randomized search that
# evaluates a single sampled candidate with 3-fold cross-validation, scored by
# balanced accuracy, on one worker.
p = Pipeline(steps=estimators)
grid = RandomizedSearchCV(
    estimator=p,
    param_distributions=parameters,
    n_iter=1,
    cv=3,
    scoring="balanced_accuracy",
    n_jobs=1,
    pre_dispatch="2*n_jobs",
)

In [None]:
# Run the randomized search on the training split.
# NOTE(review): in scikit-learn `fit` returns the fitted search object, so
# `score` presumably aliases `grid` rather than holding a metric — confirm.
score = grid.fit(x_train, y_train)

In [None]:
# Show the hyper-parameters of the best candidate found by the search.
print(grid.best_params_)

In [None]:
# Predict labels for the held-out test split with the fitted search object.
y_pred = grid.predict(x_test)

In [None]:
# Per-class precision, recall and F1 on the test split.
print(classification_report(y_test, y_pred))

In [None]:
# Confusion matrix of true vs. predicted labels on the test split.
cm = confusion_matrix(y_test, y_pred)
# NOTE(review): the labels are joined tag strings, yet the plot receives
# range(1, 6) as class names — this assumes exactly five classes and does not
# show which tag each row/column is. Confirm against
# evaluation.plot_confusion_matrix.
evaluation.plot_confusion_matrix(cm=cm, classes=range(1, 6), normalize=True)

In [None]:
# Summary metrics on the test split: plain accuracy, balanced accuracy, and
# Cohen's kappa (the value returned by cohen_kappa_score).
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Balanced accuracy:", balanced_accuracy_score(y_test, y_pred))
print("Cohen's kappa coefficient:", cohen_kappa_score(y_test, y_pred))

# Support Vector Machine (Single-Label Classification)

In [None]:
# Model input: code and title joined into one text per post. The explicit
# space keeps the last code token and the first title token from being fused
# into a single token by the whitespace tokenizer used downstream.
x2 = df["code_body"] + " " + df["title"]
# Target: each post's tags joined into one space-separated string, so every
# distinct tag combination becomes a single class label.
y2 = df["tags"].apply(lambda row: " ".join(row))

In [None]:
# Hold out 20% of the posts for testing. Stratifying on y2 keeps the class
# proportions the same in both splits; random_state fixes the shuffle.
x2_train, x2_test, y2_train, y2_test = train_test_split(
    x2, y2, test_size=0.2, stratify=y2, random_state=30
)

In [None]:
# Pipeline steps: TF-IDF features over whitespace-separated tokens, followed by
# a linear classifier trained with SGD (balanced class weights, early stopping).
estimators2 = [
    ('tfidf', TfidfVectorizer(tokenizer=lambda string: string.split())),
    ('clf', SGDClassifier(n_jobs=1, early_stopping=True, class_weight='balanced')),
]
# Candidate values for the randomized search, keyed as <step>__<parameter>.
# NOTE(review): np.arange with a float step may or may not include the stop
# value because of rounding — check the generated max_df candidates.
parameters2 = {
    'tfidf__min_df': np.arange(10, 30, 10),
    'tfidf__max_df': np.arange(0.75, 0.9, 0.05),
    'tfidf__ngram_range': [(1, 1), (2, 2), (3, 3)],
    'tfidf__norm': ['l1', 'l2', None],
    'clf__alpha': [1e-2, 1e-3],
}

In [None]:
# Assemble the SGD pipeline and wrap it in a randomized search: 20 sampled
# candidates, 3-fold cross-validation, balanced-accuracy scoring, 13 workers.
p2 = Pipeline(steps=estimators2)
grid2 = RandomizedSearchCV(
    estimator=p2,
    param_distributions=parameters2,
    n_iter=20,
    cv=3,
    scoring="balanced_accuracy",
    n_jobs=13,
    pre_dispatch="n_jobs",
)

In [None]:
# Run the randomized search on the training split.
# NOTE(review): in scikit-learn `fit` returns the fitted search object, so
# `score2` presumably aliases `grid2` rather than holding a metric — confirm.
score2 = grid2.fit(x2_train, y2_train)

In [None]:
# Show the hyper-parameters of the best candidate found by the search.
print(grid2.best_params_)

In [None]:
# Predict labels for the held-out test split with the fitted search object.
y2_pred = grid2.predict(x2_test)

In [None]:
# Per-class precision, recall and F1 on the test split.
print(classification_report(y2_test, y2_pred))

In [None]:
# Confusion matrix of true vs. predicted labels on the test split.
cm2 = confusion_matrix(y2_test, y2_pred)
# NOTE(review): the labels are joined tag strings, yet the plot receives
# range(1, 6) as class names — this assumes exactly five classes and does not
# show which tag each row/column is. Confirm against
# evaluation.plot_confusion_matrix.
evaluation.plot_confusion_matrix(cm=cm2, classes=range(1, 6), normalize=True)

In [None]:
# Summary metrics on the test split: plain accuracy, balanced accuracy, and
# Cohen's kappa (the value returned by cohen_kappa_score).
print("Accuracy:", accuracy_score(y2_test, y2_pred))
print("Balanced accuracy:", balanced_accuracy_score(y2_test, y2_pred))
print("Cohen's kappa coefficient:", cohen_kappa_score(y2_test, y2_pred))

# Random Forest

In [11]:
## Working

In [None]:
# Model input: code and title joined into one text per post. The explicit
# space keeps the last code token and the first title token from being fused
# into a single token by the whitespace tokenizer used downstream.
x3 = df["code_body"] + " " + df["title"]
# Target: each post's tags joined into one space-separated string, so every
# distinct tag combination becomes a single class label.
y3 = df["tags"].apply(lambda row: " ".join(row))

In [None]:
# Hold out 20% of the posts for testing. Stratifying on y3 keeps the class
# proportions the same in both splits; random_state fixes the shuffle.
x3_train, x3_test, y3_train, y3_test = train_test_split(
    x3, y3, test_size=0.2, stratify=y3, random_state=30
)

In [15]:
##Pipeline

In [16]:
# Pipeline steps with fixed hyper-parameters (no search): unigram TF-IDF over
# whitespace-separated tokens, then a 100-tree random forest with balanced
# class weights using all available cores.
estimators3 = [
    (
        "tfidf",
        TfidfVectorizer(
            tokenizer=lambda text: text.split(),
            ngram_range=(1, 1),
            min_df=20,
            max_df=0.75,
        ),
    ),
    (
        "clf",
        RandomForestClassifier(
            class_weight="balanced",
            n_estimators=100,
            n_jobs=-1,
        ),
    ),
]

In [17]:
# Chain the TF-IDF and random-forest steps into a single estimator.
p3 = Pipeline(estimators3)

In [18]:
# Fit the random-forest pipeline on the training split produced by
# train_test_split above (x3_train / y3_train), matching the x3_test / y3_test
# pair used for prediction and evaluation afterwards.
score3 = p3.fit(x3_train, y3_train)

KeyboardInterrupt: 

In [None]:
# Predict labels for the held-out test split with the fitted pipeline.
y3_pred = p3.predict(x3_test)

In [None]:
# Per-class precision, recall and F1 on the test split.
print(classification_report(y3_test, y3_pred))

# Model Problems

<ul>
    <li>Some tags are very similar to each other</li>
    <li>Are there better features?</li>
    <li>Class imbalances</li>
    <li>Hyper-parameter space must be checked and an optimization procedure better than random search should be applied</li>