# Hand on 7.1

---
Page `22-27` of Handout #7

In [25]:
# Page 22 : (but optimized?)

import re
import string
from nltk.tokenize import word_tokenize
# Precompile regex patterns (compiled only once)
_var_addr_pattern = re.compile(r'.*0x[0-9a-f].*')
_name_with_number_pattern = re.compile(r'.*[a-f]*:[0-9]*')
_number_start_one_char_pattern = re.compile(r'[a-f][0-9].*')
_number_start_three_char_pattern = re.compile(r'[a-f]{3}[0-9].*')
_number_sub_pattern = re.compile(r'[\\/;:_-]')

def preprocess(text, stopword_set, stemmer):
    # Remove punctuation and unwanted characters, then lowercase the text
    translation_table = str.maketrans('', '', '!"#$%&\'()*+,.<=>?@[]^`{|}~' + u'\xa0')
    cleaned_text = text.translate(translation_table).lower()
    
    # Replace all whitespace characters with a single space
    cleaned_text = cleaned_text.translate(str.maketrans(string.whitespace, ' ' * len(string.whitespace), ''))
    
    # Split the text only once
    tokens = cleaned_text.split()
    new_tokens = []
    
    # Process each token with all transformation rules in one pass
    for token in tokens:
        if '_' in token:
            new_tokens.append('_variable_with_underscore')
        elif '-' in token:
            new_tokens.append('_variable_with_dash')
        elif len(token) > 15 and token[0] != '#':
            new_tokens.append('_long_variable_name')
        elif token.startswith('http') and '/' in token:
            new_tokens.append('_weburl')
        elif _number_sub_pattern.sub('', token).isdigit():
            new_tokens.append('_number')
        elif _var_addr_pattern.match(token):
            new_tokens.append('_variable_with_address')
        elif _name_with_number_pattern.match(token):
            new_tokens.append('_name_with_number')
        elif _number_start_one_char_pattern.match(token):
            new_tokens.append('_number_starts_with_one_character')
        elif _number_start_three_char_pattern.match(token):
            new_tokens.append('_number_starts_with_three_characters')
        elif any(c.isdigit() for c in token) and token.startswith('v'):
            new_tokens.append('_version')
        elif ('\\' in token or '/' in token) and ':' not in token:
            new_tokens.append('_localpath')
        elif token.endswith('px'):
            new_tokens.append('_image_size')
        else:
            new_tokens.append(token)
    
    # Remove stopwords and tokens shorter than 3 characters, then perform stemming
    final_tokens = [stemmer.stem(tok) for tok in new_tokens if tok not in stopword_set and len(tok) > 2]
    return ' '.join(final_tokens)



In [None]:
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from multiprocessing import Pool

# Function to initialize global variables in worker processes
def initialize_pool(stopword_set_arg, stemmer_arg):
    global stopword_set, stemmer
    stopword_set = stopword_set_arg
    stemmer = stemmer_arg

# Load dataset
dataset = pd.read_json('resource/embold_train.json')

# Label transformations
dataset.loc[dataset['label'] > 0, 'label'] = -1
dataset.loc[dataset['label'] == 0, 'label'] = 1
dataset.loc[dataset['label'] == -1, 'label'] = 0

# Define stopwords and stemmer
stopwords_set = set(stopwords.words('English'))
ps = PorterStemmer()

# Initialize the pool of workers with the optimized preprocess globals
pool = Pool(8, initializer=initialize_pool, initargs=(stopwords_set, ps))

# Preprocess the dataset using multiprocessing
cleaned_title = pool.map(preprocess, dataset['title'])
cleaned_body = pool.map(preprocess, dataset['body'])

# Combine the cleaned texts into a DataFrame
data_texts = pd.DataFrame({'title': cleaned_title, 'body': cleaned_body})

# Labels
y = dataset['label']

# Close the pool
pool.close()
pool.join()


Pickle is now given, Skipping first 2 pages

In [6]:
# Skipped page 23
from nltk.corpus import stopwords
import pickle
from nltk.stem import PorterStemmer
from multiprocessing import Pool
# Read pickle
with open('resource/data_texts.pickle', 'rb') as f:
    data_texts = pickle.load(f)
with open('resource/embold_train_y.pickle', 'rb') as f:
    y = pickle.load(f)

In [7]:
# Page 25 : Walkthroughs – cross validation
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
import lightgbm as lgb
from sklearn import model_selection

# Split the dataset into training and blindtest (testing) sets
data_fit, data_blindtest, y_fit, y_blindtest = train_test_split(data_texts, y, test_size=0.1)

# Initialize the TF-IDF Vectorizer with unigrams
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1, 1))

# Concatenate the title and body columns
# Assuming 'data_texts' contains both 'title' and 'body' columns
data_texts_combined = data_texts['title'] + ' ' + data_texts['body']

# Fit the TF-IDF vectorizer on the concatenated text (title + body)
tfidf_vectorizer.fit(data_texts_combined)

# Transform the training and blindtest data
X_tfidf_fit = tfidf_vectorizer.transform(data_fit['title'] + ' ' + data_fit['body'])
X_tfidf_blindtest = tfidf_vectorizer.transform(data_blindtest['title'] + ' ' + data_blindtest['body'])

# Initialize the model
gbm_model = lgb.LGBMClassifier()

# Cross-validation for precision, recall, and f1 score
precision_cv_score = model_selection.cross_val_score(gbm_model, X_tfidf_fit, y_fit, cv=5, n_jobs=-2, scoring='precision_macro').mean()
recall_cv_score = model_selection.cross_val_score(gbm_model, X_tfidf_fit, y_fit, cv=5, n_jobs=-2, scoring='recall_macro').mean()
f1_cv_score = model_selection.cross_val_score(gbm_model, X_tfidf_fit, y_fit, cv=5, n_jobs=-2, scoring='f1_macro').mean()

# Output the results
print('CV: p:{0:.4f} r:{1:.4f} f:{2:.4f}'.format(precision_cv_score, recall_cv_score, f1_cv_score))




CV: p:0.7956 r:0.7932 f:0.7942


In [8]:
# Page 26 : Modelling
from sklearn import metrics

data_fit, data_blindtest, y_fit, y_blindtest = model_selection.train_test_split(data_texts, y, test_size=0.3)

data_fit_train, data_fit_test, y_fit_train, y_fit_test = model_selection.train_test_split(data_fit, y_fit, test_size=0.3)
X_tfidf_fit_train = tfidf_vectorizer.transform(data_fit_train['title'])
X_tfidf_fit_test = tfidf_vectorizer.transform(data_fit_test['title'])
X_tfidf_blindtest = tfidf_vectorizer.transform(data_blindtest['title'])

gbm_model.fit(X_tfidf_fit_train, y_fit_train, eval_set=[(X_tfidf_fit_test, y_fit_test)], eval_metric='AUC')

precision_test_score = metrics.precision_score(gbm_model.predict(X_tfidf_blindtest), y_blindtest, average='macro')
recall_test_score = metrics.recall_score(gbm_model.predict(X_tfidf_blindtest), y_blindtest, average='macro')
f1_test_score = metrics.f1_score(gbm_model.predict(X_tfidf_blindtest), y_blindtest, average='macro')

print('test: p:{0:.4f} r:{1:.4f} f:{2:.4f}'.format(precision_test_score, recall_test_score, f1_test_score))

[LightGBM] [Info] Number of positive: 32696, number of negative: 40804
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.093978 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 85032
[LightGBM] [Info] Number of data points in the train set: 73500, number of used features: 1863
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.444844 -> initscore=-0.221527
[LightGBM] [Info] Start training from score -0.221527
test: p:0.7432 r:0.7648 f:0.7461


In [9]:
import pickle

# Page 27 : Save the models
pickle.dump(tfidf_vectorizer, open('resource/github_bug_prediction_tfidf_vectorizer.pkl', 'wb'))
pickle.dump(gbm_model, open('resource/github_bug_prediction_basic_model.pkl', 'wb'))

---
Page `28-29` of Handout #7


In [None]:
from flask import Flask, request
from scipy.sparse import hstack
import pickle
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer  # Using PorterStemmer

# Initialize the Flask app
app = Flask(__name__)

# Load resources
app.tfidf_vectorizer = pickle.load(open('resource/github_bug_prediction_tfidf_vectorizer.pkl', 'rb'))
app.basic_model = pickle.load(open('resource/github_bug_prediction_basic_model.pkl', 'rb'))
app.stopword_set = set(stopwords.words('english'))
app.stemmer = PorterStemmer()  # Correctly initialize PorterStemmer

@app.route('/predict_basic', methods=['GET'])
def predict_basic_get():
    response_object = {'status': 'success'}
    
    # Get query parameters
    argList = request.args.to_dict(flat=False)
    title = argList.get('title', [None])[0]  # Safely get title
    body = argList.get('body', [None])[0]  # Safely get body

    if not title or not body:  # Validate input
        response_object['status'] = 'error'
        response_object['message'] = 'Missing title or body'
        return response_object

    # Apply preprocessing to the title
    processed_title = preprocess(title)  # Using preprocess function on the title

    # Predict bug likelihood
    predict = app.basic_model.predict_proba(hstack([app.tfidf_vectorizer.transform([processed_title])]))

    response_object['predict_as'] = 'bug' if predict[0][1] > 0.5 else 'not bug'
    response_object['bug_prob'] = predict[0][1]
    
    return response_object

@app.route('/predict_basic', methods=['POST'])
def predict_basic_post():
    response_object = {'status': 'success'}
    
    # Get data from JSON body
    data = request.get_json()  # Parse JSON body
    title = data.get('title')  
    body = data.get('body')    

    if not title or not body:  # Validate if title or body are missing
        response_object['status'] = 'error'
        response_object['message'] = 'Missing title or body'
        return response_object

    # Initialize stopword_set and stemmer
    stopword_set = set(stopwords.words('english'))  # Assuming you are using NLTK stopwords
    stemmer = PorterStemmer()  # Using PorterStemmer
    
    # Process title using preprocess with stopword_set and stemmer
    processed_title = preprocess(title, stopword_set, stemmer)

    # Predict bug likelihood
    predict = app.basic_model.predict_proba(hstack([app.tfidf_vectorizer.transform([processed_title])]))

    response_object['predict_as'] = 'bug' if predict[0][1] > 0.5 else 'not bug'
    response_object['bug_prob'] = predict[0][1]
    
    return response_object


In [None]:
app.run(debug=False)

 * Serving Flask app '__main__'
 * Debug mode: off


 * Running on http://127.0.0.1:5000
Press CTRL+C to quit
127.0.0.1 - - [17/Feb/2025 17:38:15] "POST /predict_basic HTTP/1.1" 200 -


---
Page `39` of Handout #7