In [6]:
import string
import re
import nltk
nltk.download(["stopwords"])
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

def preprocess(text):
    cleaned_text = text.translate(str.maketrans('', '', '!"#$%&\'()*+,.<=>?@[]^`{|}~' + u'\xa0'))
    cleaned_text = cleaned_text.lower()
    cleaned_text = cleaned_text.translate(str.maketrans(string.whitespace, ' ' * len(string.whitespace), ''))
    cleaned_text = ' '.join(['_variable_with_underscore' if '_' in t else t for t in cleaned_text.split()])
    cleaned_text = ' '.join(['_variable_with_dash' if '-' in t else t for t in cleaned_text.split()])
    cleaned_text = ' '.join(['_long_variable_name' if len(t) > 15 and t[0] != '#' else t for t in cleaned_text.split()])
    cleaned_text = ' '.join(['_weburl' if t.startswith('http') and '/' in t else t for t in cleaned_text.split()])
    cleaned_text = ' '.join(['_number' if re.sub('[\\/;:_-]', '', t).isdigit() else t for t in cleaned_text.split()])
    cleaned_text = ' '.join(['_variable_with_address' if re.match('.*0x[0-9a-f].*', t) else t for t in cleaned_text.split()])
    cleaned_text = ' '.join(['_name_with_number' if re.match('.*[a-f]*:[0-9]*', t) else t for t in cleaned_text.split()])
    cleaned_text = ' '.join(['_number_starts_with_one_character' if re.match('[a-f][0-9].*', t) else t for t in cleaned_text.split()])
    cleaned_text = ' '.join(['_number_starts_with_three_characters' if re.match('[a-f]{3}[0-9].*', t) else t for t in cleaned_text.split()])
    cleaned_text = ' '.join(['_version' if any(i.isdigit() for i in t) and t.startswith('v') else t for t in cleaned_text.split()])
    cleaned_text = ' '.join(['_localpath' if ('\\' in t or '/' in t) and ':' not in t else t for t in cleaned_text.split()])
    cleaned_text = ' '.join(['_image_size' if t.endswith('px') else t for t in cleaned_text.split()])
    tokenized_text = word_tokenize(cleaned_text)
    sw_removed_text = [word for word in tokenized_text if word not in stopword_set]
    sw_removed_text = [word for word in sw_removed_text if len(word) > 2]
    stemmed_text = ' '.join([stemmer.stem(w) for w in sw_removed_text])
    return stemmed_text

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/thanatornkanthala/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Preprocessing Data

In [7]:
from multiprocessing.pool import ThreadPool as Pool
import pandas as pd

def initialize_pool(stopwords, ps):
    global stopword_set
    global stemmer
    stopword_set = stopwords
    stemmer = ps

dataset = pd.read_json('data/embold_train.json')
dataset.loc[dataset['label'] > 0, 'label'] = -1
dataset.loc[dataset['label'] == 0, 'label'] = 1
dataset.loc[dataset['label'] == -1, 'label'] = 0
stopwords = set(stopwords.words('english'))
ps = PorterStemmer()
pool = Pool(8, initializer=initialize_pool, initargs=(stopwords, ps, ))

cleaned_title = pool.map(preprocess, dataset.title)
cleaned_body = pool.map(preprocess, dataset.body)

In [8]:
df = pd.DataFrame([cleaned_title, cleaned_body], index=["title", "body"]).T
df.head()

Unnamed: 0,title,body
0,_long_variable_nam piano roll,_long_variable_nam piano roll would use
1,buggi behavior select,screenshot _long_variable_nam _number _number ...
2,auto updat featur,_localpath _localpath great job far saenzramir...
3,filter noisi endpoint log,think stop log request _name_with_numb _long_v...
4,enabl pid _localpath pid alarm action _localpath,expect _localpath alarm action pid pid enabl _...


## Extract with TF-IDF Vectorizer

In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vectorizer = TfidfVectorizer(ngram_range=(1, 1))
tfidf_vectorizer.fit(cleaned_title + cleaned_body)

In [11]:
from sklearn.model_selection import train_test_split

label = dataset["label"]
data_fit, data_test, label_fit, label_test = train_test_split(df, label, test_size=0.1)

x_tfidf_fit = tfidf_vectorizer.transform(data_fit["title"] + data_fit["body"])
x_tfidf_test = tfidf_vectorizer.transform(data_test["title"] + data_test["body"])

print(f"fit data: {x_tfidf_fit.shape}")
print(f"test data: {x_tfidf_test.shape}")

fit data: (135000, 164632)
test data: (15000, 164632)


## Extract with LSA
- LSA Scoring: `[ p:0.7575 r:0.7570 f:0.7572 ]` <br>
- TFIDF + LSA Scoring: `[ p:0.7839 r:0.7823 f:0.7830 ]`

In [13]:
from sklearn.decomposition import TruncatedSVD

lsa = TruncatedSVD(n_components=500, n_iter=100, random_state=0)
lsa.fit(x_tfidf_fit)
x_lsa_fit = lsa.transform(x_tfidf_fit)
x_lsa_test = lsa.transform(x_tfidf_test)
print(f"LSA fit data: {x_lsa_fit.shape}")
print(f"LSA test data: {x_lsa_test.shape}")

LSA fit data: (135000, 500)


In [14]:
from lightgbm import LGBMClassifier
from sklearn.model_selection import cross_val_score

model_with_lsa = LGBMClassifier(verbose=-1)

precision_cv_score = cross_val_score(
    model_with_lsa,
    x_lsa_fit,
    label_fit,
    cv=5,
    n_jobs=-2,
    scoring="precision_macro",
    verbose=0,
).mean()

recall_cv_score = cross_val_score(
    model_with_lsa,
    x_lsa_fit,
    label_fit,
    cv=5,
    n_jobs=-2,
    scoring="recall_macro",
    verbose=0,
).mean()

f1_cv_score = cross_val_score(
    model_with_lsa,
    x_lsa_fit,
    label_fit,
    cv=5,
    n_jobs=-2,
    scoring="f1_macro",
    verbose=0,
).mean()

print('fit: p:{0:.4f} r:{1:.4f} f:{2:.4f}'.format(precision_cv_score, recall_cv_score, f1_cv_score))

fit: p:0.7575 r:0.7570 f:0.7572


In [16]:
from scipy.sparse import hstack

x_tfidf_lsa_fit = hstack([x_tfidf_fit, x_lsa_fit]).tocsr()
print(f"TF-IDF + LSA fit: {x_tfidf_lsa_fit.shape}")

precision_cv_score = cross_val_score(
    model_with_lsa,
    x_tfidf_lsa_fit,
    label_fit,
    cv=5,
    n_jobs=-2,
    scoring="precision_macro",
    verbose=0,
).mean()

recall_cv_score = cross_val_score(
    model_with_lsa,
    x_tfidf_lsa_fit,
    label_fit,
    cv=5,
    n_jobs=-2,
    scoring="recall_macro",
    verbose=0,
).mean()

f1_cv_score = cross_val_score(
    model_with_lsa,
    x_tfidf_lsa_fit,
    label_fit,
    cv=5,
    n_jobs=-2,
    scoring="f1_macro",
    verbose=0,
).mean()

print('fit: p:{0:.4f} r:{1:.4f} f:{2:.4f}'.format(precision_cv_score, recall_cv_score, f1_cv_score))

TF-IDF + LSA fit: (135000, 165132)
fit: p:0.7839 r:0.7823 f:0.7830


## Extract with LDA

In [18]:
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer

count_vectorizer = CountVectorizer(ngram_range=(1, 1))
count_vectorizer.fit(cleaned_title + cleaned_body)
x_tf_fit = count_vectorizer.transform(data_fit["title"] + data_fit["body"])
x_tf_test = count_vectorizer.transform(data_test["title"] + data_test["body"])
print(f"TF fit: {x_tf_fit.shape}")

TF fit: (135000, 164632)


In [19]:
lda = LatentDirichletAllocation(n_components=500, random_state=0)
lda.fit(x_tf_fit)
x_lda_fit = lda.transform(x_tf_fit)
x_lda_test = lda.transform(x_tf_test)
x_tfidf_lsa_lda_fit = hstack([x_tfidf_fit, x_lsa_fit, x_lda_fit]).tocsr()
x_tfidf_lsa_lda_test = hstack([x_tfidf_test, x_lsa_test, x_lda_test]).tocsr()

print(f"TF-IDF + LSA + LDA fit: {x_tfidf_lsa_lda_fit.shape}")
print(f"TF-IDF + LSA + LDA test: {x_tfidf_lsa_lda_test.shape}")

TF-IDF + LSA + LDA fit: (135000, 165632)


In [20]:
import optuna
import numpy as np
import lightgbm as lgb
from sklearn import metrics

x_train, x_val, y_train, y_val = train_test_split(x_tfidf_lsa_lda_fit, label_fit, test_size=0.3)

def objective(trial):
    dtrain = lgb.Dataset(x_train, label=y_train)

    param = {
        "objective": "binary",
        "metric": "binary_logloss",
        "verbosity": -1,
        "boosting_type": "gbdt",
        "lambda_l1": trial.suggest_float("lambda_l1", 1e-8, 10.0, log=True),
        "lambda_l2": trial.suggest_float("lambda_l2", 1e-8, 10.0, log=True),
        "num_leaves": trial.suggest_int("num_leaves", 2, 256),
        "feature_fraction": trial.suggest_float("feature_fraction", 0.4, 1.0),
        "bagging_fraction": trial.suggest_float("bagging_fraction", 0.4, 1.0),
        "bagging_freq": trial.suggest_int("bagging_freq", 1, 7),
        "min_child_samples": trial.suggest_int("min_child_samples", 5, 100),
    }

    model = lgb.train(param, dtrain)
    preds = model.predict(x_val)
    pred_labels = np.rint(preds)
    accuracy = metrics.roc_auc_score(y_val, pred_labels)
    return accuracy

study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=30)

  from .autonotebook import tqdm as notebook_tqdm
[I 2024-03-09 18:52:43,219] A new study created in memory with name: no-name-04205741-b916-4a46-ac5b-e691004fceca
[I 2024-03-09 18:52:56,499] Trial 0 finished with value: 0.7826610671197397 and parameters: {'lambda_l1': 0.03694470109043198, 'lambda_l2': 0.04496115891365337, 'num_leaves': 61, 'feature_fraction': 0.9494260836782721, 'bagging_fraction': 0.7205310948944763, 'bagging_freq': 4, 'min_child_samples': 99}. Best is trial 0 with value: 0.7826610671197397.
[I 2024-03-09 18:53:26,576] Trial 1 finished with value: 0.7824650218066858 and parameters: {'lambda_l1': 2.1164879103238917e-05, 'lambda_l2': 0.0691145353710854, 'num_leaves': 246, 'feature_fraction': 0.9305669722775954, 'bagging_fraction': 0.8010736408171835, 'bagging_freq': 1, 'min_child_samples': 77}. Best is trial 0 with value: 0.7826610671197397.
[I 2024-03-09 18:53:58,014] Trial 2 finished with value: 0.7843497843238797 and parameters: {'lambda_l1': 0.00038625329111759827,

In [21]:
best_params = study.best_params
model = LGBMClassifier(**best_params, verbose=-1)

precision_cv_score = cross_val_score(model, x_tfidf_lsa_lda_fit, label_fit, cv=5, n_jobs=-2, scoring='precision_macro').mean()
recall_cv_score = cross_val_score(model, x_tfidf_lsa_lda_fit, label_fit, cv=5, n_jobs=-2, scoring='recall_macro').mean()
f1_cv_score = cross_val_score(model, x_tfidf_lsa_lda_fit, label_fit, cv=5, n_jobs=-2, scoring='f1_macro').mean()

print('CV: p:{0:.4f} r:{1:.4f} f:{2:.4f}'.format(precision_cv_score, recall_cv_score, f1_cv_score))

CV: p:0.7879 r:0.7864 f:0.7870


In [29]:
model.fit(x_tfidf_lsa_lda_fit, label_fit)

In [51]:
x_tfidf_test = tfidf_vectorizer.transform(data_test["title"]+data_test["body"])
x_tf_test = count_vectorizer.transform(data_test["title"]+data_test["body"])

x_lsa_test = lsa.transform(x_tfidf_test)
x_lda_test = lda.transform(x_tf_test)

x_test = hstack([x_tfidf_test, x_lsa_test, x_lda_test]).tocsr()
preds = model.predict(x_test)

precision_test_score = metrics.precision_score(preds, label_test, average='macro')
recall_test_score = metrics.recall_score(preds, label_test, average='macro')
f1_test_score = metrics.f1_score(preds, label_test, average='macro')

print('test: p:{0:.4f} r:{1:.4f} f:{2:.4f}'.format(precision_test_score, recall_test_score, f1_test_score))

test: p:0.7891 r:0.7907 f:0.7897


In [52]:
import joblib

joblib.dump(model, "3combo_model.pkl")
joblib.dump(tfidf_vectorizer, 'tfidf_vectorizer.pkl')
joblib.dump(count_vectorizer, 'count_vectorizer.pkl')
joblib.dump(lsa, 'lsa.pkl')
joblib.dump(lda, 'lda.pkl')

['lda.pkl']