In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

#load data
data1=pd.read_csv("problem_data_multisemester_balanced.csv")
data2=pd.read_csv("suggestion_data_multisemester_balanced.csv")

In [2]:
data1=data1.rename(columns={'TAG':'problem'})
data2=data2.rename(columns={'TAG':'suggestion'})

In [3]:
result = pd.merge(data1, data2, how='inner',on='REVIEW')
result

Unnamed: 0,problem,REVIEW,suggestion
0,0,The test plan covers possible test cases and a...,0
1,0,Multiple diagrams visualizing and explaining t...,0
2,0,The design is well thought of. They propose 2 ...,0
3,1,The team has added details about UI testing. \...,1
4,1,The team has provided a usecase diagram which ...,1
...,...,...,...
6302,1,"The user interface is very simple, clear and s...",1
6303,1,The Code is structured as per basic rails norm...,1
6304,1,"Overall, the code looks fine.\nGood code is se...",1
6305,1,I Don't see the GIT Link to comment on this. R...,1


In [4]:
result['TAG'] = result['problem'] + result['suggestion']
result['TAG'].loc[ result['TAG'] > 0] = 1

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value)


In [5]:
result.describe()

Unnamed: 0,problem,suggestion,TAG
count,6307.0,6307.0,6307.0
mean,0.573331,0.505787,0.597273
std,0.494633,0.500006,0.490486
min,0.0,0.0,0.0
25%,0.0,0.0,0.0
50%,1.0,1.0,1.0
75%,1.0,1.0,1.0
max,1.0,1.0,1.0


In [6]:
data=result.drop(columns=['problem', 'suggestion'])
data

Unnamed: 0,REVIEW,TAG
0,The test plan covers possible test cases and a...,0
1,Multiple diagrams visualizing and explaining t...,0
2,The design is well thought of. They propose 2 ...,0
3,The team has added details about UI testing. \...,1
4,The team has provided a usecase diagram which ...,1
...,...,...
6302,"The user interface is very simple, clear and s...",1
6303,The Code is structured as per basic rails norm...,1
6304,"Overall, the code looks fine.\nGood code is se...",1
6305,I Don't see the GIT Link to comment on this. R...,1


In [7]:
# Calculate reviews' length
for row in data:
    data['length']=[len(data.loc[line,'REVIEW'])for line in data.index]

data.head()

Unnamed: 0,REVIEW,TAG,length
0,The test plan covers possible test cases and a...,0,68
1,Multiple diagrams visualizing and explaining t...,0,92
2,The design is well thought of. They propose 2 ...,0,92
3,The team has added details about UI testing. \...,1,201
4,The team has provided a usecase diagram which ...,1,150


In [8]:
# Calculate reviews' readability indices
import textstat
for row in data:
    data['ARI']=[textstat.automated_readability_index(data.loc[line,'REVIEW'])for line in data.index] # Automated readability index
    data['CLI']=[textstat.coleman_liau_index(data.loc[line,'REVIEW'])for line in data.index] #The Coleman-Liau Index
    data['FRE']=[textstat.flesch_reading_ease(data.loc[line,'REVIEW'])for line in data.index] # Flesch Reading Ease
    data['FOG']=[textstat.gunning_fog(data.loc[line,'REVIEW'])for line in data.index] # Gunning fog index

data.head()

Unnamed: 0,REVIEW,TAG,length,ARI,CLI,FRE,FOG
0,The test plan covers possible test cases and a...,0,68,6.9,8.92,59.3,4.8
1,Multiple diagrams visualizing and explaining t...,0,92,10.6,11.88,47.79,16.67
2,The design is well thought of. They propose 2 ...,0,92,3.9,5.88,79.77,5.75
3,The team has added details about UI testing. \...,1,201,19.3,12.66,37.98,20.47
4,The team has provided a usecase diagram which ...,1,150,7.9,9.73,75.2,9.82


In [9]:
from textblob import TextBlob

In [10]:
# Calculate reviews' subjectivity and polarity
data['subjectivity']=0
data['polarity']=0
for idx,row in data.iterrows():
    text=TextBlob(data.loc[idx,'REVIEW'])
    data.loc[idx,'subjectivity']=text.sentiment.subjectivity
    data.loc[idx,'polarity']=text.sentiment.polarity

data.head()

Unnamed: 0,REVIEW,TAG,length,ARI,CLI,FRE,FOG,subjectivity,polarity
0,The test plan covers possible test cases and a...,0,68,6.9,8.92,59.3,4.8,1.0,0.0
1,Multiple diagrams visualizing and explaining t...,0,92,10.6,11.88,47.79,16.67,0.0,0.0
2,The design is well thought of. They propose 2 ...,0,92,3.9,5.88,79.77,5.75,1.0,0.0
3,The team has added details about UI testing. \...,1,201,19.3,12.66,37.98,20.47,0.5,0.5
4,The team has provided a usecase diagram which ...,1,150,7.9,9.73,75.2,9.82,0.75,0.8


In [11]:
from math import log2

def entropy(str):
    def shannon(boe):
        total = sum(boe.values()) 
        return sum(freq / total * log2(total / freq) for freq in boe.values())

    list = str.split(" ") 
    dict = {}
    for key in list: 
        if dict.get(key) == None: 
            dict[key] = 1
        else:
            dict[key] += 1 
    return shannon(dict)

In [12]:
# Calculate reviews' entropy
data['entropy']=0
for idx,row in data.iterrows():
    data.loc[idx,'entropy']=entropy(data.loc[idx,'REVIEW'])

data.head()

Unnamed: 0,REVIEW,TAG,length,ARI,CLI,FRE,FOG,subjectivity,polarity,entropy
0,The test plan covers possible test cases and a...,0,68,6.9,8.92,59.3,4.8,1.0,0.0,3.251629
1,Multiple diagrams visualizing and explaining t...,0,92,10.6,11.88,47.79,16.67,0.0,0.0,3.640224
2,The design is well thought of. They propose 2 ...,0,92,3.9,5.88,79.77,5.75,1.0,0.0,3.969816
3,The team has added details about UI testing. \...,1,201,19.3,12.66,37.98,20.47,0.5,0.5,4.85141
4,The team has provided a usecase diagram which ...,1,150,7.9,9.73,75.2,9.82,0.75,0.8,4.483856


In [13]:
# Select features
from sklearn.feature_selection import SelectKBest,f_classif
predictors=['length', 'ARI', 'CLI', 'FRE', 'FOG', 'subjectivity', 'polarity', 'entropy']
# The SelectKBest method selects the features according to the k highest score
sel=SelectKBest(f_classif,k=5)
sel.fit(data[predictors],data['TAG']) 
# Get the raw p-values for each feature, and transform from p-values into scores
print(sel.pvalues_)
scores=-np.log10(sel.pvalues_)
print('selected index:',sel.get_support(True))

[9.14261326e-130 8.99424817e-041 1.77099362e-027 3.55753945e-019
 3.20066888e-035 6.01806202e-012 4.39602774e-010 0.00000000e+000]
selected index: [0 1 2 4 7]


  scores=-np.log10(sel.pvalues_)


In [14]:
from sklearn import feature_selection  
from sklearn import model_selection
from sklearn import metrics 
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer
from sklearn.metrics import accuracy_score,roc_auc_score
from time import time
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble.forest import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier



In [15]:
Target = ['TAG']
data_columns=['length', 'ARI','CLI', 'FRE','FOG']
#data_columns=['length', 'ARI', 'CLI', 'FRE', 'FOG', 'subjectivity', 'polarity', 'entropy']
columns = Target + data_columns

# Split the data to train data and test data
X_train, X_test, y_train, y_test = model_selection.train_test_split(data[data_columns],data[Target], random_state = 0)

In [16]:
alg1=LogisticRegression(max_iter=1000,random_state=25)
alg2=SVC(probability=True,random_state=25)  # SVM
alg3=RandomForestClassifier(random_state=25)# Random Forest
alg4=AdaBoostClassifier(random_state=25)# AdaBoost
alg5=GradientBoostingClassifier(random_state=25)

In [17]:
scores=[]
methods=["LogisticRegression","SVM","RandomForest","AdaBoost","GradientBoosting"]
# LogisticRegression
alg1_1 = alg1.fit(X_train, y_train.values.ravel())
score = alg1_1.score(X_test, y_test)
scores.append(score)
# SVM
alg2_2 = alg2.fit(X_train, pd.DataFrame(y_train).values.ravel())
score2 = alg2_2.score(X_test, y_test)
scores.append(score2)
# Random Forest
alg3_3 = alg3.fit(X_train, pd.DataFrame(y_train).values.ravel())
score3 = alg3_3.score(X_test, y_test)
scores.append(score3)
# AdaBoost
alg4_4 = alg4.fit(X_train, pd.DataFrame(y_train).values.ravel())
score4 = alg4_4.score(X_test, y_test)
scores.append(score4)
# GradientBoosting
alg5_5 = alg5.fit(X_train, pd.DataFrame(y_train).values.ravel())
score5 = alg5_5.score(X_test, y_test)
scores.append(score5)
scores

[0.7526949904882688,
 0.7571337983512999,
 0.7349397590361446,
 0.7533291058972733,
 0.7545973367152822]

In [18]:
def fit_model(alg,parameters):
    scorer=make_scorer(roc_auc_score)
    grid = GridSearchCV(alg,parameters,scoring=scorer,cv=5)
    start=time()  #计时
    grid=grid.fit(X_train, pd.DataFrame(y_train).values.ravel())
    end=time()
    t=round(end-start,3)
    print(round(grid.best_score_, 4))
    print (grid.best_params_)
    print ('searching time for {} is {} s'.format(alg.__class__.__name__,t)) 
    return grid

In [19]:
alg1=LogisticRegression(C=0.01,random_state=25)
alg2=SVC(C=1,gamma=0.001,kernel='rbf',probability=True,random_state=25)
alg3=RandomForestClassifier(n_estimators=170,max_depth=4,min_samples_split=2,random_state=25)
alg4=AdaBoostClassifier(learning_rate=0.5,n_estimators=10,random_state=25)
alg5=GradientBoostingClassifier(random_state=25)

In [20]:
scores=[]
methods=["LogisticRegression","SVM","RandomForest","AdaBoost","GradientBoosting"]
# Logistic Regression
alg1_1 = alg1.fit(X_train, y_train.values.ravel())
score = alg1_1.score(X_test, y_test)
scores.append(score)
# SVM
alg2_2 = alg2.fit(X_train, pd.DataFrame(y_train).values.ravel())
score2 = alg2_2.score(X_test, y_test)
scores.append(score2)
# Random Forest
alg3_3 = alg3.fit(X_train, pd.DataFrame(y_train).values.ravel())
score3 = alg3_3.score(X_test, y_test)
scores.append(score3)
# AdaBoost
alg4_4 = alg4.fit(X_train, pd.DataFrame(y_train).values.ravel())
score4 = alg4_4.score(X_test, y_test)
scores.append(score4)
# GradientBoosting
alg5_5 = alg5.fit(X_train, pd.DataFrame(y_train).values.ravel())
score5 = alg5_5.score(X_test, y_test)
scores.append(score5)
scores

[0.7533291058972733,
 0.7564996829422955,
 0.7488902980342422,
 0.7520608750792644,
 0.7545973367152822]