In [1]:
import math, re, string, os, glob
import pandas as pd
import numpy as np
from collections import Counter
from nltk.corpus import stopwords
from nltk.tokenize import wordpunct_tokenize, RegexpTokenizer
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler, MinMaxScaler, MaxAbsScaler
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import NearestNeighbors
from sklearn.cluster import KMeans
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import VotingClassifier, RandomForestClassifier
from sklearn.decomposition import NMF, LatentDirichletAllocation
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_score, cross_validate
# import matplotlib.pyplot as plt
from scipy.sparse import hstack, vstack, csr_matrix

In [2]:
# Widen pandas' console display so wide frames are not truncated when printed.
for _option, _value in (('display.max_columns', 500), ('display.width', 1000)):
    pd.set_option(_option, _value)

In [3]:
# Directory holding the preprocessed input files, named SE0.txt ... SE9.txt.
sto_datapath = "/Users/saurabh/Downloads/ncsu/study/thesis/datasets/Classified_SE_preprocessed/"
# NOTE(review): presumably name_classes[k] is the site stored in SE<k>.txt — confirm.
name_classes = ["academia","cs","diy","expressionengine","judaism","photo","rpg","scifi","ux","webmasters"]
num_classes = 10
# One path per class index: <sto_datapath>SE<k>.txt
fnames = ["{}SE{}.txt".format(sto_datapath, str(k)) for k in range(num_classes)]
print(fnames)

['/Users/saurabh/Downloads/ncsu/study/thesis/datasets/Classified_SE_preprocessed/SE0.txt', '/Users/saurabh/Downloads/ncsu/study/thesis/datasets/Classified_SE_preprocessed/SE1.txt', '/Users/saurabh/Downloads/ncsu/study/thesis/datasets/Classified_SE_preprocessed/SE2.txt', '/Users/saurabh/Downloads/ncsu/study/thesis/datasets/Classified_SE_preprocessed/SE3.txt', '/Users/saurabh/Downloads/ncsu/study/thesis/datasets/Classified_SE_preprocessed/SE4.txt', '/Users/saurabh/Downloads/ncsu/study/thesis/datasets/Classified_SE_preprocessed/SE5.txt', '/Users/saurabh/Downloads/ncsu/study/thesis/datasets/Classified_SE_preprocessed/SE6.txt', '/Users/saurabh/Downloads/ncsu/study/thesis/datasets/Classified_SE_preprocessed/SE7.txt', '/Users/saurabh/Downloads/ncsu/study/thesis/datasets/Classified_SE_preprocessed/SE8.txt', '/Users/saurabh/Downloads/ncsu/study/thesis/datasets/Classified_SE_preprocessed/SE9.txt']


In [10]:
# Position in `fnames` of the file this experiment works on.
exp_class = 0
fname = fnames[exp_class]

In [4]:
def load_labeled_posts(path):
    """Read one '>>>'-delimited file into a frame with content/label columns.

    Rows with a missing field are dropped, labels are whitespace-stripped and
    ``label_code`` is 1.0 where the label is "yes" and 0.0 otherwise.  ``idx``
    keeps the row position from the file; ``reset_index()`` additionally leaves
    that same position in an ``index`` column.
    """
    # A multi-character separator is not handled by pandas' C parser; naming the
    # python engine explicitly avoids the warning emitted by the silent fallback.
    posts = pd.read_csv(path, delimiter='>>>', header=None, index_col=None, engine='python')
    posts.columns = ["content", "label"]
    posts['idx'] = posts.index
    posts = posts.dropna().reset_index()
    posts["label"] = posts["label"].apply(lambda x: x.strip())
    posts["label_code"] = posts["label"].apply(lambda val: 1. if "yes" == val else 0.)
    return posts


# Report the class balance of every file.  The loop variable is deliberately not
# called `fname`, so the experiment file chosen via `exp_class` is not clobbered.
for path in fnames:
    print(path)
    posts = load_labeled_posts(path)
    label_counts = posts.groupby("label")["idx"].count()
    label_code_counts = posts.groupby("label_code")["idx"].count()
    print(label_counts)
    print(label_code_counts)
    # Keep the frame of the selected experiment file for the cells below instead
    # of whatever file happened to be read last.
    if path == fname:
        raw_df = posts

/Users/saurabh/Downloads/ncsu/study/thesis/datasets/Classified_SE_preprocessed/SE0.txt
label
no     6008
yes     309
Name: idx, dtype: int64
label_code
0.0    6008
1.0     309
Name: idx, dtype: int64
/Users/saurabh/Downloads/ncsu/study/thesis/datasets/Classified_SE_preprocessed/SE1.txt


  This is separate from the ipykernel package so we can avoid doing imports until


label
no     47201
yes     1441
Name: idx, dtype: int64
label_code
0.0    47201
1.0     1441
Name: idx, dtype: int64
/Users/saurabh/Downloads/ncsu/study/thesis/datasets/Classified_SE_preprocessed/SE2.txt
label
no     60637
yes      195
Name: idx, dtype: int64
label_code
0.0    60637
1.0      195
Name: idx, dtype: int64
/Users/saurabh/Downloads/ncsu/study/thesis/datasets/Classified_SE_preprocessed/SE3.txt
label
no     83583
yes      654
Name: idx, dtype: int64
label_code
0.0    83583
1.0      654
Name: idx, dtype: int64
/Users/saurabh/Downloads/ncsu/study/thesis/datasets/Classified_SE_preprocessed/SE4.txt
label
no     50700
yes      719
Name: idx, dtype: int64
label_code
0.0    50700
1.0      719
Name: idx, dtype: int64
/Users/saurabh/Downloads/ncsu/study/thesis/datasets/Classified_SE_preprocessed/SE5.txt
label
no     59350
yes     1856
Name: idx, dtype: int64
label_code
0.0    59350
1.0     1856
Name: idx, dtype: int64
/Users/saurabh/Downloads/ncsu/study/thesis/datasets/Classified_SE_p

In [122]:
# Vocabulary size cap shared by both vectorizers.
# no_features = min(1000, len(words))
no_features = 100


def _feature_names(vectorizer):
    """Return the fitted vocabulary terms as a plain list, ordered by column.

    Newer scikit-learn releases expose ``get_feature_names_out()`` (and dropped
    ``get_feature_names()``); older releases only have the latter.
    """
    if hasattr(vectorizer, "get_feature_names_out"):
        return vectorizer.get_feature_names_out().tolist()
    return vectorizer.get_feature_names()


# TF-IDF over uni- to tri-grams, ignoring terms seen in >95% of documents or in
# fewer than two documents.
# tfidf_vectorizer = TfidfVectorizer(max_features=no_features)
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1, 3), max_df=0.95, min_df=2, max_features=no_features)
X_tfidf = tfidf_vectorizer.fit_transform(raw_df['content'])
tfidf_feature_names = _feature_names(tfidf_vectorizer)

# Raw term counts over unigrams (default settings apart from the size cap).
tf_vectorizer = CountVectorizer(max_features=no_features)
# tf_vectorizer = CountVectorizer(ngram_range=(1, 3), max_df=0.95, min_df=2, max_features=no_features)
X_tf = tf_vectorizer.fit_transform(raw_df['content'])
tf_feature_names = _feature_names(tf_vectorizer)

In [123]:
# Sanity check: each document-term matrix has one row per document and at most
# `no_features` columns.
print("tf-idf vectorized: ", X_tfidf.shape)
print("tf vectorized: ", X_tf.shape)

tf-idf vectorized:  (6317, 100)
tf vectorized:  (6317, 100)


In [124]:
# Hyper-parameters used by the LatentDirichletAllocation fits in the following cells.
no_topics = 2  # passed as n_components
num_iter = 50  # passed as max_iter

In [125]:
# nmf = NMF(n_components=no_topics, random_state=1, alpha=.1, l1_ratio=.5, init='nndsvd').fit(X_tfidf)

In [126]:
# Online LDA with `no_topics` components, fitted on the raw term-count matrix.
lda_tf = LatentDirichletAllocation(
    n_components=no_topics,
    max_iter=num_iter,
    learning_method='online',
    learning_offset=50.,
    random_state=9,
    evaluate_every=100,
).fit(X_tf)

In [127]:
# Same LDA configuration as for the count matrix, fitted on the TF-IDF matrix.
lda_tfidf = LatentDirichletAllocation(
    n_components=no_topics,
    max_iter=num_iter,
    learning_method='online',
    learning_offset=50.,
    random_state=9,
    evaluate_every=100,
).fit(X_tfidf)

In [128]:
# pyLDAvis is used below to visualise the fitted topic models inside the notebook.
# NOTE(review): this import sits mid-notebook rather than in the import cell, and
# recent pyLDAvis releases reportedly expose this module as `pyLDAvis.lda_model`
# — confirm against the installed version.
import pyLDAvis.sklearn
pyLDAvis.enable_notebook()

In [129]:
# Visualise the LDA model fitted on term counts (model, matrix and vectorizer
# must belong together).
pyLDAvis.sklearn.prepare(lda_tf, X_tf, tf_vectorizer)
# pyLDAvis.sklearn.prepare(lda_tf, dtm_tf, tf_vectorizer, mds='tsne')

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


In [130]:
# Visualise the LDA model fitted on the TF-IDF matrix.
pyLDAvis.sklearn.prepare(lda_tfidf, X_tfidf, tfidf_vectorizer)

In [131]:
def display_topics(model, feature_names, no_top_words, topic_word_prob, feature_names_set):
    """Print and record the highest-weighted terms of every topic in ``model``.

    Parameters
    ----------
    model : object with a ``components_`` attribute
        Iterated row by row; each row must support ``argsort()`` and integer
        indexing (e.g. a NumPy array of per-term weights).
    feature_names : sequence
        Term for each column of ``components_``.
    no_top_words : int
        How many of the largest-weight terms to keep per topic.
    topic_word_prob : dict
        Updated in place: topic index -> list of ``(term, weight)`` pairs in
        descending weight order.
    feature_names_set : set
        Updated in place with every term kept for any topic.

    Returns
    -------
    tuple
        The same ``topic_word_prob`` and ``feature_names_set`` objects.
    """
    for topic_no, weights in enumerate(model.components_):
        # Column indices from largest to smallest weight, truncated to the top N.
        ranked = weights.argsort()[::-1][:no_top_words]
        top_terms = [(feature_names[col], weights[col]) for col in ranked]
        topic_word_prob[topic_no] = top_terms

        names_only = [term for term, _ in top_terms]
        feature_names_set.update(names_only)

        print("Topic %d:" % (topic_no))
        print(", ".join(names_only))
    return topic_word_prob, feature_names_set

In [132]:
# Collect, for each topic, its top terms and weights, plus the union of those terms.
no_top_words = 100
topic_word_prob = {}
feature_names_set = set()
# `lda` is not defined anywhere in this notebook; the count-based model `lda_tf`
# is the one that matches `tf_feature_names` and the later `lda_tf.transform(X_tf)`.
topic_word_prob, feature_names_set = display_topics(lda_tf, tf_feature_names, no_top_words, topic_word_prob, feature_names_set)
# Sort so the feature columns built from this list have the same order on every
# run (iteration order of a set of strings can differ between kernel sessions).
feature_names_list = sorted(feature_names_set)

Topic 0:
use, would, move, like, robot, one, play, game, chess, posit, know, time, motor, question, control, open, black, need, white, make, want, look, get, amp, way, could, also, player, two, pawn, think, differ, find, seem, start, tri, possibl, point, work, system, problem, good, sensor, exampl, say, follow, mean, read, see, help, someth, even, current, first, line, howev, abl, right, idea, end, thank, new, run, chang, data, com, creat, file, valu, function, output, name, http, titl, site, type, set, code, local, return, form, block, error, imag, array, page, class, section, user, field, content, plugin, categori, entri, templat, craft, asset, url, php, div
Topic 1:
entri, craft, set, field, user, use, type, name, imag, page, php, get, work, error, tri, class, templat, section, plugin, site, categori, code, form, valu, like, url, block, way, titl, creat, array, http, div, asset, content, local, file, com, return, want, would, function, new, need, data, output, first, exampl, one, fo

In [133]:
# Dump the per-topic (term, weight) lists and the combined term list for inspection.
print(topic_word_prob)
print(feature_names_list)

{0: [('use', 2775.5176540149055), ('would', 2223.1053025515084), ('move', 2031.628355799887), ('like', 1804.0835374230312), ('robot', 1670.7705765958103), ('one', 1621.491722292872), ('play', 1592.8593973567295), ('game', 1554.630376988787), ('chess', 1438.1433004648786), ('posit', 1369.4069074417728), ('know', 1278.8279944532185), ('time', 1241.840224277875), ('motor', 1237.3461011874501), ('question', 1078.9721561656825), ('control', 1021.3632235521964), ('open', 962.3166499384926), ('black', 946.1192562083041), ('need', 936.2592836250654), ('white', 928.1880972254155), ('make', 883.8918920990906), ('want', 864.074183567241), ('look', 862.320946900028), ('get', 845.7358385610067), ('amp', 817.1780447844216), ('way', 792.4304075370324), ('could', 775.9432452404782), ('also', 772.2546974956672), ('player', 762.5030226178554), ('two', 753.7943782269708), ('pawn', 749.0761342902363), ('think', 747.6280734345357), ('differ', 747.2623674892313), ('find', 744.2698435467739), ('seem', 725.06

In [134]:
# Topic weights of every document under the count-based LDA model
# (one row per document, one column per topic).
lda_x = lda_tf.transform(X_tf)
print(lda_x.shape)

(6317, 2)


In [135]:
# Attach each document's topic weights to raw_df as columns Topic_0, Topic_1, ...
# (assigning a Series aligns on raw_df's index).
for topic_no in range(no_topics):
    raw_df["Topic_{}".format(topic_no)] = pd.Series(lda_x[:, topic_no])
print(raw_df.shape)
raw_df.sample(5)

(6317, 7)


Unnamed: 0,index,content,label,idx,label_code,Topic_0,Topic_1
6142,6172,form span two page requir field page option fi...,no,6172,0.0,0.168854,0.831146
4547,4577,test stumbl across craftsessionid cooki differ...,no,4577,0.0,0.562772,0.437228
5419,5449,page drag drop abil reorder group structur ent...,no,5449,0.0,0.211544,0.788456
5872,5902,possibl manual clean databas cach templat beco...,no,5902,0.0,0.280171,0.719829
4013,4043,use algorithm quiet search zero window search ...,no,4043,0.0,0.648428,0.351572


In [136]:
# Column dtypes of raw_df after the topic columns were added.
raw_df.dtypes

index           int64
content        object
label          object
idx             int64
label_code    float64
Topic_0       float64
Topic_1       float64
dtype: object

In [137]:
# Turn the per-topic lists of (term, weight) pairs into term -> weight lookups.
topic_index0 = 0
topic_index1 = 1
topic0_word_prob_map = dict(topic_word_prob[topic_index0])
topic1_word_prob_map = dict(topic_word_prob[topic_index1])
print(topic0_word_prob_map)
print(topic1_word_prob_map)

{'use': 2775.5176540149055, 'would': 2223.1053025515084, 'move': 2031.628355799887, 'like': 1804.0835374230312, 'robot': 1670.7705765958103, 'one': 1621.491722292872, 'play': 1592.8593973567295, 'game': 1554.630376988787, 'chess': 1438.1433004648786, 'posit': 1369.4069074417728, 'know': 1278.8279944532185, 'time': 1241.840224277875, 'motor': 1237.3461011874501, 'question': 1078.9721561656825, 'control': 1021.3632235521964, 'open': 962.3166499384926, 'black': 946.1192562083041, 'need': 936.2592836250654, 'white': 928.1880972254155, 'make': 883.8918920990906, 'want': 864.074183567241, 'look': 862.320946900028, 'get': 845.7358385610067, 'amp': 817.1780447844216, 'way': 792.4304075370324, 'could': 775.9432452404782, 'also': 772.2546974956672, 'player': 762.5030226178554, 'two': 753.7943782269708, 'pawn': 749.0761342902363, 'think': 747.6280734345357, 'differ': 747.2623674892313, 'find': 744.2698435467739, 'seem': 725.0645911148151, 'start': 685.8738489552956, 'tri': 682.0796295459921, 'pos

In [138]:
def doc_word_mapping(words, topic0, topic1):
    """Build the feature vector of one document.

    The vector has one entry per term of the module-level ``feature_names_list``
    followed by ``topic0`` and ``topic1``.  Each term entry is the number of
    times the term occurs in ``words`` (split on single spaces) multiplied by
    the term's weight in the document's dominant topic; terms missing from that
    topic's weight map contribute ``0``.  Topic 1 is dominant only when
    ``topic1 > topic0``; ties go to topic 0.

    Reads the module-level ``topic0_word_prob_map`` / ``topic1_word_prob_map``.
    """
    dominant_map = topic1_word_prob_map if topic1 > topic0 else topic0_word_prob_map
    term_counts = Counter(words.split(' '))
    # Counter returns 0 for absent terms, so no membership test is needed.
    features = [
        term_counts[term] * dominant_map[term] if term in dominant_map else 0
        for term in feature_names_list
    ]
    features.extend([topic0, topic1])
    return features

In [139]:
# idx = 5
# tmp_vals = raw_df.loc[idx, ['content', 'Topic_0', 'Topic_1']].values
# words, topic0, topic1 = tmp_vals[0], tmp_vals[1], tmp_vals[2]
# doc_word_mapping(words, topic0, topic1)

In [140]:
# Build one feature vector per document via doc_word_mapping (row-wise apply)
# and store it as a list in the 'word_prob' column.
raw_df['word_prob'] = raw_df.apply(lambda x: doc_word_mapping(x['content'], x['Topic_0'], x['Topic_1']), axis=1)

In [141]:
# Spot-check the feature vector of the row labelled 5; its last two entries are
# the document's topic weights.
print(raw_df['word_prob'][5])

[0.0, 0.0, 0.0, 0.0, 0.0, 996.3568773979187, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 336.52216370926897, 0.0, 0.0, 0.0, 0.0, 0.0, 217.1700013671455, 0.0, 0.0, 0.0, 0.0, 0.0, 1382.272293522713, 537.9047553153915, 0.0, 0.0, 337.8200010360883, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1688.2185471224984, 0.0, 0.0, 0.0, 0.0, 381.43084973967245, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.4878350066698597, 0.5121649933301403]


In [142]:
# Keep only what the classifier needs: the feature vector and the numeric label.
final_df = raw_df[["word_prob", "label_code"]]
print(final_df.shape)
final_df.sample(5)

(6317, 2)


Unnamed: 0,word_prob,label_code
2485,"[434.9976146132532, 0.0, 0.0, 0.0, 0.0, 0.0, 0...",0.0
2298,"[434.9976146132532, 0.0, 16.45572017313043, 0....",0.0
6055,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 225.41057130111...",0.0
1550,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 504.15159412514...",0.0
2933,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1621....",0.0


In [143]:
def cm(y_true, y_pred, beta=3):
    """Print and return binary classification metrics for 0/1 labels.

    Parameters
    ----------
    y_true, y_pred : array-like
        True and predicted labels; class 1 is treated as the positive class.
    beta : number, default 3
        Weight of recall in the F-beta score reported under the name "f2".
        The default keeps the value this function has always produced, which
        is the F-beta score for beta = 3 despite its label.

    Returns
    -------
    tuple of float
        ``(accuracy, precision, recall, f1, f_beta)``.  Any ratio whose
        denominator is zero (e.g. precision when nothing is predicted
        positive) is reported as 0.0 instead of NaN.
    """
    # Pin the label set so the matrix is always 2x2, even when only one class
    # occurs in y_true / y_pred; otherwise the 4-way unpacking fails.
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred, labels=[0, 1]).ravel()

    def _ratio(num, den):
        # Guarded division: an undefined metric is reported as 0.0.
        return float(num) / float(den) if den else 0.0

    acc = _ratio(tp + tn, tn + fp + fn + tp)
    prec = _ratio(tp, tp + fp)
    rec = _ratio(tp, tp + fn)
    f1 = _ratio(2 * prec * rec, prec + rec)

    beta_sq = beta ** 2
    f2 = _ratio((1 + beta_sq) * prec * rec, beta_sq * prec + rec)

    print("accuracy", acc)
    print("precision", prec)
    print("recall", rec)
    print("f1", f1)
    print("f2", f2)
    print(tn, fp, fn, tp)
    return acc, prec, rec, f1, f2

In [144]:
# X: list of per-document feature lists; y: the 0./1. label column.
X = final_df['word_prob'].values.tolist()
y = final_df['label_code']

In [145]:
# 80/20 split, stratified on the label so both parts keep the class ratio;
# fixed seed for repeatability.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=9, stratify=y)

In [146]:
# Count positives (label 1) and negatives (label 0) on each side of the split.
train_pos = len(y_train[y_train == 1])
train_neg = len(y_train[y_train == 0])
test_pos = len(y_test[y_test == 1])
test_neg = len(y_test[y_test == 0])

print("# positive in train set: {}".format(train_pos),
      "\n# negative in train set: {}".format(train_neg))
print("# positive in test set: {}".format(test_pos),
      "\n# negative in test set: {}".format(test_neg))

# positive in train set: 247 
# negative in train set: 4806
# positive in test set: 62 
# negative in test set: 1202


In [147]:
# Fit the scaler on the training part only, then apply the same transform to
# both parts (X_train / X_test are overwritten with the scaled arrays).
scaler = MinMaxScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [148]:
# Classifier under test: linear-kernel SVM. The commented lines are alternative
# models that were tried.
# NOTE(review): LinearSVC is not imported in the import cell — add the import
# before re-enabling that line.
# clf = MultinomialNB()
# clf = LinearSVC(C=100, loss='hinge', random_state=9, max_iter=500000)
clf = SVC(C=100, kernel='linear', random_state=9)
# clf = DecisionTreeClassifier(random_state=0)
# clf = RandomForestClassifier(n_estimators=100, random_state=9)

clf.fit(X_train, y_train)
# y_pred = clf.predict(X_test)

SVC(C=100, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='linear', max_iter=-1, probability=False, random_state=9,
  shrinking=True, tol=0.001, verbose=False)

In [149]:
# Number of support vectors per class in the fitted SVM.
clf.n_support_

array([284, 233], dtype=int32)

In [151]:
# Predict on the held-out part and report the metrics computed by cm().
y_pred = clf.predict(X_test)
cm(y_test, y_pred)

accuracy 0.9596518987341772
precision 0.9230769230769231
recall 0.1935483870967742
f1 0.31999999999999995
f2 0.21015761821366025
1201 1 50 12


(0.9596518987341772,
 0.9230769230769231,
 0.1935483870967742,
 0.31999999999999995,
 0.21015761821366025)