### CODE TAKEN FROM Test-Classification.ipynb :

In [1]:
### Use a lot of packages from sklearn

import matplotlib.pyplot as plt
import html 
import numpy as np
import re
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.svm import LinearSVC

In [2]:
# df setup: load the bug reports, keep complete rows, and merge title + description

file = '../blueprints-text/data/jdt-bugs-dataset/eclipse_jdt.csv.gz'  # real location
# Read the CSV, keep the three columns of interest and drop incomplete rows
df = pd.read_csv(file).loc[:, ['Title', 'Description', 'Priority']].dropna()
# Build the combined text field, then discard the two source columns
df = (
    df.assign(text=df['Title'] + ' ' + df['Description'])
      .drop(columns=['Title', 'Description'])
)

In [3]:
# cleaning text
def clean(text):
    """Strip markup noise from a raw bug-report string.

    HTML entities are decoded first; then tags, markdown links, bracketed
    snippets, free-standing runs of special characters and free-standing
    runs of hyphens/equals/plus signs are removed in that order. Finally
    whitespace is collapsed and the result is trimmed.
    """
    # (pattern, replacement) pairs, applied strictly top to bottom
    substitutions = (
        # tags like <tab>
        (r'<[^<>]*>', ' '),
        # markdown URLs like [Some text](https://....) -> keep only the link text
        (r'\[([^\[\]]*)\]\([^\(\)]*\)', r'\1'),
        # text or code in brackets like [0]
        (r'\[[^\[\]]*\]', ' '),
        # standalone sequences of specials, matches &# but not #cool
        (r'(?:^|\s)[&#<>{}\[\]+|\\:-]{1,}(?:\s|$)', ' '),
        # standalone sequences of hyphens like --- or ==
        (r'(?:^|\s)[\-=\+]{2,}(?:\s|$)', ' '),
        # sequences of white spaces
        (r'\s+', ' '),
    )

    # convert html escapes like &amp; to characters before any pattern runs
    cleaned = html.unescape(text)
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned.strip()

# Clean every report, then keep only those with more than 50 characters of text
df['text'] = df['text'].map(clean)
df = df.loc[df['text'].str.len().gt(50)]

# 80-20 train-test split, stratified on the priority label
X_train, X_test, y_train, y_test = train_test_split(
    df['text'],
    df['Priority'],
    test_size=0.2,
    random_state=42,
    stratify=df['Priority'],
)

In [4]:
# Down-sample the P3 class: a reproducible random draw of 4000 P3 bug reports
df_P3 = df.loc[df['Priority'].eq('P3')].sample(n=4000, random_state=123)

# Every bug report whose priority is anything other than P3
df_Rest = df.loc[df['Priority'].ne('P3')]

# Stack both parts (rest first, then the P3 sample) into the balanced dataset
df_balanced = pd.concat([df_Rest, df_P3])

# Keep the two modelling columns and drop incomplete rows
df_new = df_balanced[['text', 'Priority']].dropna()

# Step 1 - Data Preparation
df_new['text'] = df_new['text'].map(clean)

# Step 2 - Train-Test Split
X_train_new, X_test_new, y_train_new, y_test_new = train_test_split(
    df_new['text'],
    df_new['Priority'],
    test_size=0.2,
    random_state=42,
    stratify=df_new['Priority'],
)


### MY CODE:

In [5]:
# Priority labels of the unbalanced data: the whole frame and its training split
unbalanced_all, unbalanced_train = df['Priority'], y_train

# Priority labels of the balanced data: the whole frame and its training split
balanced_all, balanced_train = df_new['Priority'], y_train_new

# calculates distribution of priority level
def calculate_distribution(datasets):
    """Return the relative frequency of each priority level per dataset.

    Parameters
    ----------
    datasets : iterable of sized iterables of str
        Each dataset holds priority labels whose last character is a single
        digit (e.g. ``'P3'``); that digit is used as the level.

    Returns
    -------
    list of numpy.ndarray
        One float array per dataset, in input order. Entry ``i`` is the share
        of labels with level ``i`` (index 0 is kept, so it is 0 whenever no
        label ends in ``0``). All arrays have the same length, namely the
        highest level found in any dataset plus one, so they can be compared
        elementwise.

    Raises
    ------
    ValueError
        If one of the datasets is empty (its distribution is undefined).
    """
    # Parse every dataset up front so a common number of bins can be derived.
    level_codes = []
    for data in datasets:
        # last char of the label is the level (there are < 10 priorities)
        codes = np.array([int(val[-1]) for val in data], dtype=np.intp)
        if codes.size == 0:
            raise ValueError("cannot compute a distribution for an empty dataset")
        level_codes.append(codes)

    if not level_codes:
        return []

    # A shared bin count keeps the vectors aligned even when a dataset does
    # not contain the highest priority level.
    n_bins = max(int(codes.max()) for codes in level_codes) + 1

    # counts / number of labels -> relative frequencies
    return [np.bincount(codes, minlength=n_bins) / codes.size
            for codes in level_codes]

# calculate kl-divergence
def kl_divergence(p, q):
    """Kullback-Leibler divergence ``sum(p * log(p / q))`` (natural log).

    Parameters
    ----------
    p, q : array-like of float
        Discrete distributions with broadcast-compatible shapes.

    Returns
    -------
    numpy.float64
        The divergence of ``q`` from ``p``. Entries where ``p == 0``
        contribute nothing. If ``q`` is 0 where ``p`` is not, the result is
        ``inf`` (NumPy emits its usual divide-by-zero warning in that case).
    """
    p, q = np.broadcast_arrays(np.asarray(p, dtype=float),
                               np.asarray(q, dtype=float))
    # Evaluate the log only where p is non-zero: np.where would compute both
    # branches and emit log(0) / 0*inf warnings for the masked-out entries.
    support = p != 0
    return np.sum(p[support] * np.log(p[support] / q[support]))


In [6]:
# the four label collections, in the order: unbalanced (all, train), balanced (all, train)
datasets = [unbalanced_all, unbalanced_train, balanced_all, balanced_train]

# compute each distribution and drop its first entry, as it is 0
distributions = [dist[1:] for dist in calculate_distribution(datasets)]

In [7]:
# KL divergence between the entire unbalanced df (p) and its training split (q)
q_1 = kl_divergence(distributions[0], distributions[1])

# KL divergence between the entire balanced df (p) and its training split (q)
q_2 = kl_divergence(distributions[2], distributions[3])

In [8]:
# Q1: divergence for the unbalanced data; Q2: divergence for the balanced data
print("Q1: ", q_1)
print("Q2: ", q_2)

Q1:  1.6866039547917605e-08
Q2:  7.944202426577905e-08


*Q1 and Q2 are the KL-divergence values that correspond to questions 1 and 2, respectively.*

1. Yes, the distribution of the training data is close to that of `df`: the KL divergence (Q1) is on the order of 1e-8, i.e. practically zero. This is because the `stratify` argument is used in `train_test_split`.

2. I assume that `df_balanced` in this question is meant to be `df_new`, as it does not make much sense to use `df_balanced` itself here. If this is the case, then yes, the distributions are similar.

3. Each selection is proportional to its own underlying full dataset; however, the two selections are not close to each other. If we compare the unbalanced dataset with the balanced one, we get a very different result:

In [9]:
# entire unbalanced df (p) against the balanced training split (q)
kl_divergence(p=distributions[0], q=distributions[3])

0.45650157100814154