# Classification of Refactoring Types

Import the required packages for extracting and pre-processing of data

In [None]:
# import all the required packages
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, cross_validate, KFold, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import classification_report
import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('omw-1.4')
nltk.download('wordnet')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer

%matplotlib inline

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Read the data from the message csv file 

In [None]:
# Import the data from the csv file
df_message = pd.read_csv('/content/Message.csv')

In [None]:
df_message.head()

Unnamed: 0,Commit message,Class
0,extract method.,extract
1,Minor tweaks following review extraction of me...,extract
2,extract some stuff to a method[git p4: depot p...,extract
3,extract some methods in DoiServiceImpl,extract
4,refactoring getMenuSpace in Navigation: extrac...,extract


In [None]:
df_message.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5004 entries, 0 to 5003
Data columns (total 2 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   Commit message  5004 non-null   object
 1   Class           5004 non-null   object
dtypes: object(2)
memory usage: 78.3+ KB


In [None]:
df_message.Class.unique()

array(['extract', 'move', 'inline', 'pull up', 'push down', 'rename'],
      dtype=object)

# Pre-Processing

Convert the case of the Commit Message to lower case.

In [None]:
df_message['Commit message'] = df_message['Commit message'].str.lower()

Tokenize each word in a sentence using NLTK after the stop word removal. 

In [None]:
# Function Definition for tokenization process
def tokenize(column):
  """
    Function Definition to tokenize the input records and removing the stop words.

    @param column: Record to be tokenized.
    @type column: Series

    @return: list of tokens
  """
  stop_words = set(stopwords.words('english'))

  word_tokens = word_tokenize(column)
  filtered_sentence = [w for w in word_tokens if not w.lower() in stop_words]
  final_sentence = [w for w in filtered_sentence if w.isalpha()]
  return  final_sentence 

In [None]:
# Function call for tokenization 
df_message['tokenized'] = df_message.apply(lambda x: tokenize(x['Commit message']), axis=1)
df_message[['tokenized']].head()

Unnamed: 0,tokenized
0,"[extract, method]"
1,"[minor, tweaks, following, review, extraction,..."
2,"[extract, stuff, method, git, depot, paths, ch..."
3,"[extract, methods, doiserviceimpl]"
4,"[refactoring, getmenuspace, navigation, extrac..."


Tokenized Example

In [None]:
df_message['tokenized'].iloc[9]

['extract',
 'setup',
 'default',
 'httpparams',
 'public',
 'static',
 'methodgit',
 'svn',
 'id',
 'https']

Perform lemmatization on the tokenized data

In [None]:
# Function Definition for Lemmatization of words
def lemmatize(row):
  """
    Function Definition to lemmatize each word in the specific record.

    @param row: The record to be lemmatized.
    @type row: list

    @return: List of lemmatized tokens
  """
  lemmatizer = WordNetLemmatizer()
  final_sentence = [lemmatizer.lemmatize(w) for w in row]
  return final_sentence

In [None]:
# Function Call for lemmatization process
df_message['Lemmatized'] = df_message.apply(lambda x: lemmatize(x['tokenized']), axis=1)
df_message[['Lemmatized']].head()

Unnamed: 0,Lemmatized
0,"[extract, method]"
1,"[minor, tweak, following, review, extraction, ..."
2,"[extract, stuff, method, git, depot, path, cha..."
3,"[extract, method, doiserviceimpl]"
4,"[refactoring, getmenuspace, navigation, extrac..."


In [None]:
df_message.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5004 entries, 0 to 5003
Data columns (total 4 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   Commit message  5004 non-null   object
 1   Class           5004 non-null   object
 2   tokenized       5004 non-null   object
 3   Lemmatized      5004 non-null   object
dtypes: object(4)
memory usage: 156.5+ KB


In [None]:
df_message["Concate"] = [ " ".join(w) for w in df_message["Lemmatized"]]
df_message["Concate"]

0                                          extract method
1       minor tweak following review extraction method...
2              extract stuff method git depot path change
3                           extract method doiserviceimpl
4       refactoring getmenuspace navigation extract is...
                              ...                        
4999           rename getprotocol getmechanism testclient
5000          rename mapping method signed luke hutchison
5001    renaming refactor deserialization related code...
5002    renamed usage description match name used comm...
5003    renamed isoccupied point point hasroaduseron p...
Name: Concate, Length: 5004, dtype: object

In [None]:
df_message['Concate'].iloc[9]

'extract setup default httpparams public static methodgit svn id http'

Perform tf-idf vectorization on the cleaned, tokenized and lemmitized input after splitting the data into test and train

In [None]:
# Splitting the input and the output features 
X = df_message['Concate']
y = df_message['Class']
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2)

# Doing the data set into 80% of training data and 20% testing data and applying tf-idf vectorizer.
pre_processor = TfidfVectorizer(ngram_range=(2,2), lowercase=False)

xx = pre_processor.fit_transform(X_train)
xy = pre_processor.transform(X_test)

# Model Training

Random Forest

In [None]:
# Classification done using Random Forest
clf = RandomForestClassifier(n_estimators=8, max_depth=32, min_samples_split=1, n_jobs=-1)
clf.fit(xx, y_train)
y_pred = clf.predict(xy)
#print(y_pred)
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

     extract       0.63      0.12      0.20       181
      inline       0.36      0.02      0.04       171
        move       0.19      0.93      0.31       166
     pull up       0.28      0.10      0.15       146
   push down       0.16      0.04      0.07       163
      rename       0.56      0.14      0.22       174

    accuracy                           0.23      1001
   macro avg       0.36      0.23      0.17      1001
weighted avg       0.37      0.23      0.17      1001



Gradient Boosting Classifier

In [None]:
# Classification done using Gradient Boosting
gbc = GradientBoostingClassifier(max_leaf_nodes=20, min_samples_leaf=10, learning_rate=0.2)
ovr = OneVsRestClassifier(gbc)
ovr.fit(xx, y_train)
y_pred = ovr.predict(xy)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

     extract       0.52      0.23      0.32       181
      inline       0.18      0.04      0.07       171
        move       0.40      0.19      0.26       166
     pull up       0.39      0.18      0.25       146
   push down       0.17      0.73      0.28       163
      rename       0.63      0.17      0.26       174

    accuracy                           0.25      1001
   macro avg       0.38      0.26      0.24      1001
weighted avg       0.38      0.25      0.24      1001



Logistic Regression

In [None]:
# Classification done using Logistic Regression
lr = LogisticRegression(tol=1e-7, penalty='l2', solver='lbfgs', n_jobs=-1)
lr.fit(xx, y_train)
y_pred_lr = lr.predict(xy)
print(classification_report(y_test, y_pred_lr))

df_y_pred_lr = pd.DataFrame(y_pred_lr, columns=["y_pred"])
#df_y_pred_lr

              precision    recall  f1-score   support

     extract       0.54      0.38      0.44       181
      inline       0.36      0.23      0.28       171
        move       0.24      0.63      0.35       166
     pull up       0.30      0.31      0.31       146
   push down       0.35      0.20      0.26       163
      rename       0.68      0.37      0.48       174

    accuracy                           0.35      1001
   macro avg       0.41      0.35      0.35      1001
weighted avg       0.42      0.35      0.36      1001



Support Vector Classification

In [None]:
# Classification done using Support Vector Classification
svc = SVC(max_iter=1)
ovr = OneVsRestClassifier(svc)
ovr.fit(xx, y_train)
y_pred = ovr.predict(xy)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

     extract       0.18      0.99      0.31       181
      inline       0.00      0.00      0.00       171
        move       0.00      0.00      0.00       166
     pull up       0.00      0.00      0.00       146
   push down       0.00      0.00      0.00       163
      rename       0.50      0.01      0.01       174

    accuracy                           0.18      1001
   macro avg       0.11      0.17      0.05      1001
weighted avg       0.12      0.18      0.06      1001



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Naive Bayes

In [None]:
# Classification done using Naive Bayes
from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB().fit(xx.toarray(), y_train)
gnb_predictions = gnb.predict(xy.toarray())
df_y_pred_gnb = pd.DataFrame(gnb_predictions, columns=["y_pred"])  
print(classification_report(y_test, gnb_predictions))

accuracy = gnb.score(xy.toarray(), y_test)

              precision    recall  f1-score   support

     extract       0.38      0.27      0.31       181
      inline       0.30      0.18      0.22       171
        move       0.41      0.35      0.38       166
     pull up       0.22      0.20      0.21       146
   push down       0.37      0.24      0.29       163
      rename       0.34      0.76      0.47       174

    accuracy                           0.34      1001
   macro avg       0.34      0.33      0.31      1001
weighted avg       0.34      0.34      0.32      1001



# K-Fold Cross Validation 

Cross Validation is preformed with 10 folds and the performance is checked.

In [3]:
# Function Definition for cross validation
def cross_validation(model,_X,_y,_cv=5):
  """
    Function Definition to perform k-fold cross validation and then testing with the model.

    @param model: The classification model used.
    
    @param _X: Input features for testing
    @type _X: DataFrame

    @param _y: Output features for testing
    @type _y: DataFrame
    
    @param cv: Number of folds
    @type: int

    @return: Dictionary of values
  """
  _scoring=['accuracy','precision','recall','f1']
  results = cross_validate(estimator=model,X=_X,y=_y,cv=_cv,scoring=_scoring,return_train_score=True)
  return results

In [None]:
# Result of Random Forest Classifier with Cross Validation
cv_results = cross_validate(clf, xx,y_train, cv=10, scoring = 'accuracy')
print(cv_results['test_score'].mean())

0.22233603491271822


In [None]:
# Result of Logistic Regression Classifier with Cross Validation
cv_results = cross_validate(lr, xx,y_train, cv=10, scoring = 'accuracy')
print(cv_results['test_score'].mean())

0.34273067331670826
