# Hand on 7.1

---
Page `22-27` of Handout #7

In [2]:
# Page 22 : (but optimized?)

import re
import string
from nltk.tokenize import word_tokenize
# Precompile regex patterns (compiled only once)
_var_addr_pattern = re.compile(r'.*0x[0-9a-f].*')
_name_with_number_pattern = re.compile(r'.*[a-f]*:[0-9]*')
_number_start_one_char_pattern = re.compile(r'[a-f][0-9].*')
_number_start_three_char_pattern = re.compile(r'[a-f]{3}[0-9].*')
_number_sub_pattern = re.compile(r'[\\/;:_-]')

def preprocess(text, stopword_set, stemmer):
    # Remove punctuation and unwanted characters, then lowercase the text
    translation_table = str.maketrans('', '', '!"#$%&\'()*+,.<=>?@[]^`{|}~' + u'\xa0')
    cleaned_text = text.translate(translation_table).lower()
    
    # Replace all whitespace characters with a single space
    cleaned_text = cleaned_text.translate(str.maketrans(string.whitespace, ' ' * len(string.whitespace), ''))
    
    # Split the text only once
    tokens = cleaned_text.split()
    new_tokens = []
    
    # Process each token with all transformation rules in one pass
    for token in tokens:
        if '_' in token:
            new_tokens.append('_variable_with_underscore')
        elif '-' in token:
            new_tokens.append('_variable_with_dash')
        elif len(token) > 15 and token[0] != '#':
            new_tokens.append('_long_variable_name')
        elif token.startswith('http') and '/' in token:
            new_tokens.append('_weburl')
        elif _number_sub_pattern.sub('', token).isdigit():
            new_tokens.append('_number')
        elif _var_addr_pattern.match(token):
            new_tokens.append('_variable_with_address')
        elif _name_with_number_pattern.match(token):
            new_tokens.append('_name_with_number')
        elif _number_start_one_char_pattern.match(token):
            new_tokens.append('_number_starts_with_one_character')
        elif _number_start_three_char_pattern.match(token):
            new_tokens.append('_number_starts_with_three_characters')
        elif any(c.isdigit() for c in token) and token.startswith('v'):
            new_tokens.append('_version')
        elif ('\\' in token or '/' in token) and ':' not in token:
            new_tokens.append('_localpath')
        elif token.endswith('px'):
            new_tokens.append('_image_size')
        else:
            new_tokens.append(token)
    
    # Remove stopwords and tokens shorter than 3 characters, then perform stemming
    final_tokens = [stemmer.stem(tok) for tok in new_tokens if tok not in stopword_set and len(tok) > 2]
    return ' '.join(final_tokens)



In [None]:
# Running this takes too long
# # Page 23
# import pandas as pd
# from nltk.corpus import stopwords
# from nltk.stem import PorterStemmer
# from multiprocessing import Pool
# 
# # Function to initialize global variables in worker processes
# def initialize_pool(stopword_set_arg, stemmer_arg):
#     global stopword_set, stemmer
#     stopword_set = stopword_set_arg
#     stemmer = stemmer_arg
# 
# # Load dataset
# dataset = pd.read_json('../Week 10/resource/embold_train.json')
# 
# # Label transformations
# dataset.loc[dataset['label'] > 0, 'label'] = -1
# dataset.loc[dataset['label'] == 0, 'label'] = 1
# dataset.loc[dataset['label'] == -1, 'label'] = 0
# 
# # Define stopwords and stemmer
# stopwords_set = set(stopwords.words('English'))
# ps = PorterStemmer()
# 
# # Initialize the pool of workers with the optimized preprocess globals
# pool = Pool(8, initializer=initialize_pool, initargs=(stopwords_set, ps))
# 
# # Preprocess the dataset using multiprocessing
# cleaned_title = pool.map(preprocess, dataset['title'])
# cleaned_body = pool.map(preprocess, dataset['body'])
# 
# 
# # Combine the cleaned texts into a DataFrame
# data_texts = pd.DataFrame({'title': cleaned_title, 'body': cleaned_body})
# 
# # Labels
# y = dataset['label']
# 
# # Close the pool
# pool.close()
# pool.join()

Pickle is now given, Skipping first 2 pages

In [3]:
# Skipping page 23
from nltk.corpus import stopwords
import pickle
from nltk.stem import PorterStemmer
from multiprocessing import Pool
# Read pickle
with open('../Week 10/resource/data_texts.pickle', 'rb') as f:
    data_texts = pickle.load(f)
with open('../Week 10/resource/embold_train_y.pickle', 'rb') as f:
    y = pickle.load(f)

In [4]:
# Page 25 : Walkthroughs – cross validation
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
import lightgbm as lgb
from sklearn import model_selection

# Split the dataset into training and blindtest (testing) sets
data_fit, data_blindtest, y_fit, y_blindtest = train_test_split(data_texts, y, test_size=0.1)

# Initialize the TF-IDF Vectorizer with unigrams
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1, 1))

# Concatenate the title and body columns
# Assuming 'data_texts' contains both 'title' and 'body' columns
data_texts_combined = data_texts['title'] + ' ' + data_texts['body']

# Fit the TF-IDF vectorizer on the concatenated text (title + body)
tfidf_vectorizer.fit(data_texts_combined)

# Transform the training and blindtest data
X_tfidf_fit = tfidf_vectorizer.transform(data_fit['title'] + ' ' + data_fit['body'])
X_tfidf_blindtest = tfidf_vectorizer.transform(data_blindtest['title'] + ' ' + data_blindtest['body'])

# Initialize the model
gbm_model = lgb.LGBMClassifier()

# Cross-validation for precision, recall, and f1 score
precision_cv_score = model_selection.cross_val_score(gbm_model, X_tfidf_fit, y_fit, cv=5, n_jobs=-2, scoring='precision_macro').mean()
recall_cv_score = model_selection.cross_val_score(gbm_model, X_tfidf_fit, y_fit, cv=5, n_jobs=-2, scoring='recall_macro').mean()
f1_cv_score = model_selection.cross_val_score(gbm_model, X_tfidf_fit, y_fit, cv=5, n_jobs=-2, scoring='f1_macro').mean()

# Output the results
print('CV: p:{0:.4f} r:{1:.4f} f:{2:.4f}'.format(precision_cv_score, recall_cv_score, f1_cv_score))




CV: p:0.7962 r:0.7935 f:0.7946


In [5]:
# Page 26 : Modelling
from sklearn import metrics

data_fit, data_blindtest, y_fit, y_blindtest = model_selection.train_test_split(data_texts, y, test_size=0.3)

data_fit_train, data_fit_test, y_fit_train, y_fit_test = model_selection.train_test_split(data_fit, y_fit, test_size=0.3)
X_tfidf_fit_train = tfidf_vectorizer.transform(data_fit_train['title'])
X_tfidf_fit_test = tfidf_vectorizer.transform(data_fit_test['title'])
X_tfidf_blindtest = tfidf_vectorizer.transform(data_blindtest['title'])

gbm_model.fit(X_tfidf_fit_train, y_fit_train, eval_set=[(X_tfidf_fit_test, y_fit_test)], eval_metric='AUC')

precision_test_score = metrics.precision_score(gbm_model.predict(X_tfidf_blindtest), y_blindtest, average='macro')
recall_test_score = metrics.recall_score(gbm_model.predict(X_tfidf_blindtest), y_blindtest, average='macro')
f1_test_score = metrics.f1_score(gbm_model.predict(X_tfidf_blindtest), y_blindtest, average='macro')

print('test: p:{0:.4f} r:{1:.4f} f:{2:.4f}'.format(precision_test_score, recall_test_score, f1_test_score))

[LightGBM] [Info] Number of positive: 32720, number of negative: 40780
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.090742 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 85138
[LightGBM] [Info] Number of data points in the train set: 73500, number of used features: 1864
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.445170 -> initscore=-0.220205
[LightGBM] [Info] Start training from score -0.220205
test: p:0.7465 r:0.7704 f:0.7494


In [6]:
import pickle

# Page 27 : Save the models
pickle.dump(tfidf_vectorizer, open('../Week 10/resource/github_bug_prediction_tfidf_vectorizer.pkl', 'wb'))
pickle.dump(gbm_model, open('../Week 10/resource/github_bug_prediction_basic_model.pkl', 'wb'))

---
Page `28-29` of Handout #7


In [7]:
from flask import Flask, request
from scipy.sparse import hstack
import pickle
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer  # Using PorterStemmer

# Initialize the Flask app
app = Flask(__name__)

# Load resources
app.tfidf_vectorizer = pickle.load(open('../Week 10/resource/github_bug_prediction_tfidf_vectorizer.pkl', 'rb'))
app.basic_model = pickle.load(open('../Week 10/resource/github_bug_prediction_basic_model.pkl', 'rb'))
app.stopword_set = set(stopwords.words('english'))
app.stemmer = PorterStemmer()  # Correctly initialize PorterStemmer

@app.route('/predict_basic', methods=['GET'])
def predict_basic_get():
    response_object = {'status': 'success'}
    
    # Get query parameters
    argList = request.args.to_dict(flat=False)
    title = argList.get('title', [None])[0]  # Safely get title
    body = argList.get('body', [None])[0]  # Safely get body

    if not title or not body:  # Validate input
        response_object['status'] = 'error'
        response_object['message'] = 'Missing title or body'
        return response_object

    # Apply preprocessing to the title
    processed_title = preprocess(title)  # Using preprocess function on the title

    # Predict bug likelihood
    predict = app.basic_model.predict_proba(hstack([app.tfidf_vectorizer.transform([processed_title])]))

    response_object['predict_as'] = 'bug' if predict[0][1] > 0.5 else 'not bug'
    response_object['bug_prob'] = predict[0][1]
    
    return response_object

@app.route('/predict_basic', methods=['POST'])
def predict_basic_post():
    response_object = {'status': 'success'}
    
    # Get data from JSON body
    data = request.get_json()  # Parse JSON body
    title = data.get('title')  
    body = data.get('body')    

    if not title or not body:  # Validate if title or body are missing
        response_object['status'] = 'error'
        response_object['message'] = 'Missing title or body'
        return response_object

    # Initialize stopword_set and stemmer
    stopword_set = set(stopwords.words('english'))  # Assuming you are using NLTK stopwords
    stemmer = PorterStemmer()  # Using PorterStemmer
    
    # Process title using preprocess with stopword_set and stemmer
    processed_title = preprocess(title, stopword_set, stemmer)

    # Predict bug likelihood
    predict = app.basic_model.predict_proba(hstack([app.tfidf_vectorizer.transform([processed_title])]))

    response_object['predict_as'] = 'bug' if predict[0][1] > 0.5 else 'not bug'
    response_object['bug_prob'] = predict[0][1]
    
    return response_object


In [None]:
app.run(debug=False)

 * Serving Flask app '__main__'
 * Debug mode: off


 * Running on http://127.0.0.1:5000
Press CTRL+C to quit
127.0.0.1 - - [17/Feb/2025 17:38:15] "POST /predict_basic HTTP/1.1" 200 -


---
# Hand on 7.2
Page `39` of Handout #7

In [None]:
print(f'x: {X_lsa_fit.shape}')
print(f'y: {y_fit.shape}')

x: (135000, 500)
y: (105000,)


In [8]:
from sklearn.decomposition import TruncatedSVD
from sklearn import model_selection
import lightgbm as lgb
from scipy.sparse import hstack

lsa = TruncatedSVD(n_components=500, n_iter=100, random_state=0)
lsa.fit(X_tfidf_fit)
X_lsa_fit = lsa.transform(X_tfidf_fit)

gbm_model_with_lsa = lgb.LGBMClassifier()

In [9]:
X_lsa_fit = X_lsa_fit[:len(y_fit)]  # Trim X_lsa_fit to match the length of y_fit
X_tfidf_fit = X_tfidf_fit[:len(y_fit)]  # Trim X_tfidf_fit to match the length of y_fit

# Cross-validation using only LSA features
precision_cv_score = model_selection.cross_val_score(
    gbm_model_with_lsa, X_lsa_fit, y_fit, cv=5, n_jobs=-2, scoring='precision_macro'
).mean()

recall_cv_score = model_selection.cross_val_score(
    gbm_model_with_lsa, X_lsa_fit, y_fit, cv=5, n_jobs=-2, scoring='recall_macro'
).mean()

f1_cv_score = model_selection.cross_val_score(
    gbm_model_with_lsa, X_lsa_fit, y_fit, cv=5, n_jobs=-2, scoring='f1_macro'
).mean()

print('LSA fit: p:{0:.4f} r:{1:.4f} f:{2:.4f}'.format(precision_cv_score, recall_cv_score, f1_cv_score))


LSA fit: p:0.4972 r:0.4993 f:0.4077


In [10]:
# Stack LSA features with original features
X_fit_with_lsa = hstack([X_tfidf_fit, X_lsa_fit]).tocsr()

# Cross-validation using both LSA and original features
precision_cv_score = model_selection.cross_val_score(
    gbm_model_with_lsa, X_fit_with_lsa, y_fit, cv=5, n_jobs=-2, scoring='precision_macro'
).mean()

recall_cv_score = model_selection.cross_val_score(
    gbm_model_with_lsa, X_fit_with_lsa, y_fit, cv=5, n_jobs=-2, scoring='recall_macro'
).mean()

f1_cv_score = model_selection.cross_val_score(
    gbm_model_with_lsa, X_fit_with_lsa, y_fit, cv=5, n_jobs=-2, scoring='f1_macro'
).mean()

print('With LSA and original features: p:{0:.4f} r:{1:.4f} f:{2:.4f}'.format(precision_cv_score, recall_cv_score, f1_cv_score))

With LSA and original features: p:0.5030 r:0.5009 f:0.4119


---
Page `43` of handout 7

In [11]:
import pickle
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn import model_selection
import lightgbm as lgb
from scipy.sparse import hstack

# Load 'title' and 'body' columns
cleaned_title = data_texts['title']
cleaned_body = data_texts['body']

# Initialize and fit CountVectorizer
count_vectorizer = CountVectorizer(ngram_range=(1,1))
count_vectorizer.fit(cleaned_title + cleaned_body)

# Perform transformation
X_tf_fit = count_vectorizer.transform(data_fit['title'] + data_fit['body'])
X_tf_blindtest = count_vectorizer.transform(data_blindtest['title'] + data_blindtest['body'])

# Perform Latent Dirichlet Allocation (LDA)
lda = LatentDirichletAllocation(n_components=500, random_state=0)
lda.fit(X_tf_fit)
X_lda_fit = lda.transform(X_tf_fit)

# Initialize LightGBM classifier
gbm_model_with_lda = lgb.LGBMClassifier()

# Perform cross-validation with LDA-transformed data
precision_cv_score = model_selection.cross_val_score(
    gbm_model_with_lda, X_lda_fit, y_fit, cv=5, n_jobs=-2, scoring='precision_macro'
).mean()
recall_cv_score = model_selection.cross_val_score(
    gbm_model_with_lda, X_lda_fit, y_fit, cv=5, n_jobs=-2, scoring='recall_macro'
).mean()
f1_cv_score = model_selection.cross_val_score(
    gbm_model_with_lda, X_lda_fit, y_fit, cv=5, n_jobs=-2, scoring='f1_macro'
).mean()

print('fit: p:{0:.4f} r:{1:.4f} f:{2:.4f}'.format(precision_cv_score, recall_cv_score, f1_cv_score))

# Combine original TF-IDF features with LDA-transformed features
X_fit_with_lda = hstack([X_tfidf_fit, X_lda_fit]).tocsr()

# Perform cross-validation with combined features
precision_cv_score = model_selection.cross_val_score(
    gbm_model_with_lda, X_fit_with_lda, y_fit, cv=5, n_jobs=-2, scoring='precision_macro'
).mean()
recall_cv_score = model_selection.cross_val_score(
    gbm_model_with_lda, X_fit_with_lda, y_fit, cv=5, n_jobs=-2, scoring='recall_macro'
).mean()
f1_cv_score = model_selection.cross_val_score(
    gbm_model_with_lda, X_fit_with_lda, y_fit, cv=5, n_jobs=-2, scoring='f1_macro'
).mean()

print('fit: p:{0:.4f} r:{1:.4f} f:{2:.4f}'.format(precision_cv_score, recall_cv_score, f1_cv_score))


fit: p:0.6833 r:0.6724 f:0.6734
fit: p:0.6812 r:0.6704 f:0.6714


---
Page `45` of Handout 7

In class activity
- Make a TF-IDF + LSA + LDA version
- Please carefully design the dataflow
- Raw data -> TF-IDF vectorizer -> basic … (1)
- Raw data -> TF-IDF vectorizer -> LSA … (2)
- Raw data -> TF vectorizer -> LDA … (3)
- GBM( 1 + 2 + 3 ) -> predicted probability
- Make this TF-IDF + LSA + LDA a flask application

Firstly we build the Basic & LSA Pipelines

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import GradientBoostingClassifier
from scipy.sparse import hstack, csr_matrix
import numpy as np

# Assume data_texts is a DataFrame with 'title' and 'body' columns and y are the labels.
# Combine title and body
data_texts_combined = data_texts['title'] + ' ' + data_texts['body']

# Pipeline (1): TF-IDF representation
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1,1))
X_tfidf = tfidf_vectorizer.fit_transform(data_texts_combined)

# Pipeline (2): LSA on TF-IDF features
# We use TruncatedSVD to reduce dimensionality (i.e. perform LSA)
lsa = TruncatedSVD(n_components=500, n_iter=100, random_state=0)
# Fit LSA on the TF-IDF matrix and transform it:
X_lsa = lsa.fit_transform(X_tfidf)  # This gives a dense matrix

# Convert LSA output to a sparse format so it can be hstacked with X_tfidf:
X_lsa_sparse = csr_matrix(X_lsa)

# At this point, pipelines (1) and (2) are built.

The we build the TF + LDA Pipeline

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

# Pipeline (3): Use a CountVectorizer (TF) for raw counts
count_vectorizer = CountVectorizer(ngram_range=(1,1))
X_tf = count_vectorizer.fit_transform(data_texts_combined)

# Apply LDA on the count matrix to extract topics
lda = LatentDirichletAllocation(n_components=500, random_state=0)
# Fit LDA and transform the count matrix:
X_lda = lda.fit_transform(X_tf)  # This is dense (each row = topic distribution)

# Convert LDA output to sparse format for stacking:
X_lda_sparse = csr_matrix(X_lda)

Combine Features and Train the Combined GBM Model

In [None]:
# Combine features from all three pipelines:
# (1) TF-IDF (basic), (2) LSA features, (3) LDA topic distribution
X_combined = hstack([X_tfidf, X_lsa_sparse, X_lda_sparse]).tocsr()

# Optionally, ensure X_combined and y have matching dimensions (e.g., slicing if needed)
X_combined = X_combined[:len(y)]

# Train/test split for evaluation (or cross-validation)
X_train, X_test, y_train, y_test = train_test_split(X_combined, y, test_size=0.3, random_state=42)

# Train a Gradient Boosting Model (or LightGBM, as preferred) on the combined features:
gbm_combined = GradientBoostingClassifier()
gbm_combined.fit(X_train, y_train)

# Evaluate using cross-validation (example: F1 macro score)
cv_score = cross_val_score(gbm_combined, X_combined, y, cv=5, scoring='f1_macro').mean()
print("Combined model cross-validation F1 score:", cv_score)

 Save the Trained Components

In [None]:
import pickle

pickle.dump(tfidf_vectorizer, open('tfidf_vectorizer.pkl', 'wb'))
pickle.dump(lsa, open('lsa.pkl', 'wb'))
pickle.dump(count_vectorizer, open('count_vectorizer.pkl', 'wb'))
pickle.dump(lda, open('lda.pkl', 'wb'))
pickle.dump(gbm_combined, open('gbm_combined.pkl', 'wb'))


Now we can make Flask application for it

In [None]:
from flask import Flask, request, jsonify
from scipy.sparse import hstack, csr_matrix
import pickle
import numpy as np

app = Flask(__name__)

# Load the pre-trained components
tfidf_vectorizer = pickle.load(open('tfidf_vectorizer.pkl', 'rb'))
lsa = pickle.load(open('lsa.pkl', 'rb'))
count_vectorizer = pickle.load(open('count_vectorizer.pkl', 'rb'))
lda = pickle.load(open('lda.pkl', 'rb'))
gbm_combined = pickle.load(open('gbm_combined.pkl', 'rb'))

@app.route('/predict/combined', methods=['POST'])
def predict_combined():
    # Expect input as JSON: {"text": "raw text here"}
    data = request.get_json()
    raw_text = data['text']
    
    # Prepare the combined input:
    # 1. Use TF-IDF vectorizer for pipeline (1)
    X_tfidf_input = tfidf_vectorizer.transform([raw_text])
    
    # 2. Get LSA features from TF-IDF
    X_lsa_input = lsa.transform(X_tfidf_input)
    X_lsa_input_sparse = csr_matrix(X_lsa_input)
    
    # 3. For LDA, use CountVectorizer (TF)
    X_tf_input = count_vectorizer.transform([raw_text])
    X_lda_input = lda.transform(X_tf_input)
    X_lda_input_sparse = csr_matrix(X_lda_input)
    
    # Combine the three representations
    X_combined_input = hstack([X_tfidf_input, X_lsa_input_sparse, X_lda_input_sparse]).tocsr()
    
    # Get predicted probability from the combined GBM model
    predicted_probability = gbm_combined.predict_proba(X_combined_input)[0]
    
    return jsonify({'predicted_probability': predicted_probability.tolist()})


In [None]:
# Run application here
app.run(debug=False)