# Data Cleanup

commits-labeled.txt is the annotated dataset of 5,631 commits from a set of different large-scale open-source projects on Github. This file is pre-processed to generate commits.train

In [1]:
!cat ../data/commits-labeled.txt | sed -e "s/\([.\!?,'/()]\)/ \1 /g" | tr "[:upper:]" "[:lower:]" > ../data/commits.train

# Training

Now we train a supervised classifier using automatic hyperparameter tuning. The model is saved to disk.

In [5]:
import fasttext
model = fasttext.train_supervised(input='../data/commits.train', autotuneValidationFile='../data/commits.train', loss="hs")
model.save_model("../data/model_commits_v2.bin")
# Quantize the model
# model.quantize(input=None,
#                   qout=False,
#                   cutoff=0,
#                   retrain=False,
#                   epoch=None,
#                   lr=None,
#                   thread=None,
#                   verbose=None,
#                   dsub=2,
#                   qnorm=False,
#                  )
model.quantize(input='../data/commits.train', qnorm=True, retrain=True, cutoff=100000)
model.save_model("../data/model_commits_v2_quant.bin")

# Classification

In [6]:
import pandas as pd
import sys
from fasttext import load_model

In [7]:
classifier = load_model("../data/model_commits_v2_quant.bin") 

In [8]:
from pygit2 import Repository
from pygit2 import GIT_SORT_TOPOLOGICAL, GIT_SORT_REVERSE

#Path of repository to be analyzed
repo = Repository('/home/tussharm/origin/.git')

orig_messages = []

for commit in repo.walk(repo.head.target, GIT_SORT_TOPOLOGICAL):
    orig_messages.append(commit.message)
    
print("Total commits : " + str(len(orig_messages)))

df = pd.DataFrame(orig_messages, columns = ['message']) 

Total commits : 32034


In [9]:
# replace \n with space
df = df.replace('\n','', regex=True)

In [10]:
# check the shape of the Github data
df.shape

(32034, 1)

In [11]:
df.columns

Index(['message'], dtype='object')

##### Normalize the data

In [12]:
# convert the commit message column into string
commits = list(df['message'].astype(str))

##### Predict using fastText 

In [13]:
# predict the label with fastText
labels = classifier.predict(commits)  

In [14]:
res = list(zip(*labels))
res_list = [x[0] for x in res]
lst2 = [item[0] for item in res_list]
df['labels_predicted'] = lst2

##### Check the predictions made

In [15]:
df

Unnamed: 0,message,labels_predicted
0,Merge pull request #24981 from gnufied/cleanup...,__label__nonfunctional
1,UPSTREAM: 91689: Ensure CleanupActionHandle al...,__label__features
2,UPSTREAM: 91221: Tolerate pod not found errors...,__label__corrective
3,UPSTREAM: 90773: Storage e2es leaving namespac...,__label__features
4,Merge pull request #25113 from dmage/image-own...,__label__nonfunctional
...,...,...
32029,bump(github.com/spf13/pflag): 463bdc838f2b35e9...,__label__features
32030,bump(github.com/spf13/cobra): 8d72c1e167c7ed19...,__label__features
32031,bump(github.com/GoogleCloudPlatform/kubernetes...,__label__features
32032,"Build, test, and CLI environment for OpenShift 3",__label__features


##### Predicted labels for all commits saved in a csv file

In [16]:
df.to_csv("../output/output.csv", sep='\t')