In [4]:
import sys
import os
import csv
import pandas
import spacy
import sklearn
import sklearn.feature_extraction
import sklearn.model_selection
import sklearn.pipeline
import sklearn.naive_bayes
import sklearn.svm
import sklearn.tree
import matplotlib.pyplot as plt
import numpy as np
import scipy as sp

# START WORKAROUND https://stackoverflow.com/questions/15063936/csv-error-field-larger-than-field-limit-131072
# Raise the csv module's per-field size limit as far as it will go: start at
# sys.maxsize and keep shrinking the candidate for as long as
# csv.field_size_limit() rejects it with OverflowError.
maxInt = sys.maxsize
limitAccepted = False
while not limitAccepted:
    try:
        csv.field_size_limit(maxInt)
    except OverflowError:
        # Candidate was rejected: retry with a tenth of the value.
        maxInt = int(maxInt / 10)
    else:
        limitAccepted = True
# END WORKAROUND https://stackoverflow.com/questions/15063936/csv-error-field-larger-than-field-limit-131072

# Input/output CSV locations, relative to the notebook's working directory.
TRAINING_DATASET_PATH = '../training-data/discussions-sample-1000-annotated.csv'
DATASET_PATH = '../preprocess-dataset/output/reviews.csv'
CLASSIFIED_DATASET_PATH = '../design-classifier/output/reviews-predicted.csv'


def loadTrainingDataset(path=None):
    """Read the training dataset CSV into a pandas DataFrame.

    Args:
        path: CSV file to read. When omitted (or None), TRAINING_DATASET_PATH
            is used, which is the behaviour existing callers rely on.

    Returns:
        pandas.DataFrame with the contents of the CSV file.
    """
    # Resolve the default at call time so that re-assigning
    # TRAINING_DATASET_PATH in a later cell is still honoured.
    if path is None:
        path = TRAINING_DATASET_PATH
    return pandas.read_csv(path)

In [11]:
# Load the training dataset and print its column / dtype / null-count summary.
# The commented-out lines are alternative quick inspections of the same frame.
tdf = loadTrainingDataset()
# tdf.head()
tdf.info()
# tdf['isDesign'].value_counts()
# tdf.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 14 columns):
reviewRequestId           1000 non-null int64
repository                1000 non-null object
reviewRequestSubmitter    1000 non-null object
reviewId                  1000 non-null int64
diffCommentId             674 non-null float64
replyId                   1 non-null float64
replyDiffCommentId        0 non-null float64
type                      1000 non-null object
username                  1000 non-null object
timestamp                 1000 non-null object
text                      1000 non-null object
isDesign                  1000 non-null bool
concept                   949 non-null object
hadToLookAtCode           39 non-null object
dtypes: bool(1), float64(3), int64(2), object(8)
memory usage: 102.7+ KB


In [6]:
# Frequency of each raw 'concept' cell value; cells holding several labels
# are counted as distinct values here.
tdf['concept'].value_counts()


approval                         154
documentation                    106
betterSolution                   105
bot                               83
formatting                        68
                                ... 
acknowledgment;betterSolution      1
betterSolution;concurrency         1
documentation;previousComment      1
clarification;refactoring          1
documentation;todo                 1
Name: concept, Length: 78, dtype: int64

In [38]:
# Count how often each concept label occurs in the training dataset.
# A 'concept' cell may hold several labels separated by ';' or ','; every
# label in such a cell is counted once.
conceptCounts = {}
for rowConcepts in tdf['concept'].dropna():
    # Normalise both separators to ';' before splitting. split() on a string
    # without the separator returns the whole string, so single-label cells
    # need no special case.
    for concept in rowConcepts.replace(',', ';').split(';'):
        # Strip surrounding whitespace so 'a, b' yields 'b' rather than ' b',
        # and skip empty tokens produced by trailing or doubled separators.
        concept = concept.strip()
        if concept:
            conceptCounts[concept] = conceptCounts.get(concept, 0) + 1
conceptCounts


{'acknowledgment': 28,
 'approval': 160,
 'betterSolution': 135,
 'bot': 83,
 'bug': 63,
 'changesetPartitioning': 2,
 'clarification': 82,
 'concurrency': 5,
 'configuration': 6,
 'deadCode': 13,
 'dependencies': 5,
 'documentation': 113,
 'duplicateCode': 4,
 'encapsulation': 7,
 'exceptionHandling': 1,
 'featureAddition': 3,
 'formatting': 71,
 'generalQuestion': 2,
 'interfaceContract': 3,
 'logMessage': 15,
 'logPoint': 7,
 'methodExtraction': 3,
 'naming': 54,
 'performance': 9,
 'previousComment': 19,
 'redundantCode': 19,
 'refactoring': 28,
 'rejection': 2,
 'reviewProcess': 7,
 'safety': 3,
 'sequencing': 1,
 'testDesign': 14,
 'testRequirements': 19,
 'todo': 10,
 'ui': 5,
 'unknown': 6,
 'versionControl': 12,
 'wrongDependency': 4}

In [7]:
# Number of True / False values in the 'isDesign' column of the training dataset.
tdf['isDesign'].value_counts()

False    636
True     364
Name: isDesign, dtype: int64

In [3]:
# Collect the length (in characters) of the 'text' field of every row in
# DATASET_PATH.
textCellLengths = []
# newline='' is what the csv module documentation asks for: without it,
# newlines embedded in quoted fields are not interpreted correctly.
with open(DATASET_PATH, 'r', newline='') as fin:
    dcfin = csv.DictReader(x.replace('\0', '') for x in fin)  # ignore NULL bytes, which crash the csv.reader
    for row in dcfin:
        # DictReader fills fields missing from a short row with None; count
        # those as length 0 instead of failing on len(None).
        textCellLengths.append(len(row['text'] or ''))

In [4]:
# Number of rows whose 'text' length was recorded.
len(textCellLengths)

267843

In [39]:
# 99.37th percentile of the recorded text lengths.
# NOTE(review): the reason for choosing 99.37 is not visible here - confirm.
np.percentile(textCellLengths, 99.37)

9780.121799999848

In [41]:
# np.percentile(textCellLengths, 99)
biggest = np.asarray(textCellLengths)
biggest = biggest[biggest > 32767] 
len(biggest)

1505

In [None]:
# Histogram of text lengths. The distribution is heavily right-skewed (the
# percentile and the >32767 count computed in the cells above show a long
# tail), so the y-axis is logarithmic to keep the sparse upper bins visible.
fig, ax = plt.subplots()
ax.hist(textCellLengths, bins=10)
ax.set_yscale('log')
ax.set(
    title='Distribution of text cell lengths',
    xlabel='Text length (characters)',
    ylabel='Number of rows (log scale)',
)
plt.show()


In [18]:
# Load the dataset at CLASSIFIED_DATASET_PATH and print its
# column / dtype / null-count summary.
cdf = pandas.read_csv(CLASSIFIED_DATASET_PATH)
cdf.info()



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 267843 entries, 0 to 267842
Data columns (total 12 columns):
reviewRequestId           267843 non-null int64
repository                267843 non-null object
reviewRequestSubmitter    267843 non-null object
reviewId                  267843 non-null int64
diffCommentId             181526 non-null float64
replyId                   867 non-null float64
replyDiffCommentId        0 non-null float64
type                      267843 non-null object
username                  267843 non-null object
timestamp                 267843 non-null object
text                      267617 non-null object
isDesign                  267843 non-null bool
dtypes: bool(1), float64(3), int64(2), object(6)
memory usage: 22.7+ MB


In [12]:
# NOTE(review): this whole cell is disabled. When active, it rewrites
# repository names that carry a suffix or variant (e.g. 'hive-git',
# 'sqoop-trunk') to a single canonical name, in place, in cdf['repository'].
# The array shown as this cell's output appears to come from an earlier run
# in which the code was enabled; with the code commented out, later cells see
# the un-normalised names - confirm which behaviour is intended.
#
# repoNameMapper = {'bookkeeper-git': 'bookkeeper', 'cloudstack-git': 'cloudstack', 
#                   'deltacloud-git': 'deltacloud', 'drill-git': 'drill', 
#                   'falcon-git': 'falcon', 'flume-git': 'flume', 'giraph-git': 'giraph',
#                   'hadoop.git': 'hadoop', 'hbase-git': 'hbase', 'hcatalog-git': 'hcatalog',
#                   'helix-git': 'helix', 'hive-git': 'hive', 'mahout-git': 'mahout',
#                   'oozie-git': 'oozie', 'pig-git': 'pig', 'qpid-proton-git': 'qpid-proton', 
#                   'sqoop-SQOOP-1367': 'sqoop', 'sqoop-sqoop2': 'sqoop', 'sqoop-trunk': 'sqoop',
#                   'zookeeper-git': 'zookeeper'}
# 
# for index, row in cdf.iterrows():
#     if row['repository'] in repoNameMapper:
#         # row['repository'] = repoNameMapper[row['repository']]
#          cdf.at[index, 'repository'] = repoNameMapper[row['repository']]
#     
# 
# np.unique(cdf['repository'].to_numpy())

array(['Airavata', 'Flume', 'S4', 'SIS', 'Thrift', 'accumulo', 'ambari',
       'atlas', 'aurora', 'aurora-packaging', 'bigtop', 'bookkeeper',
       'climate', 'cloudstack', 'cloudstack-cloudmonkey',
       'cloudstack-docs', 'cotton', 'crunch', 'datafu', 'deltacloud',
       'directory', 'drill', 'falcon', 'flume', 'geode', 'giraph',
       'hadoop', 'hama', 'hawq', 'hbase', 'hcatalog', 'helix', 'hive',
       'infrastructure-puppet-kitchen', 'kafka', 'kylin', 'lens',
       'lucene-solr', 'mahout', 'mesos', 'mesos-incubating', 'metamodel',
       'mrunit', 'netbeans', 'nutch', 'oodt', 'oozie', 'phoenix', 'pig',
       'qpid', 'qpid-cpp', 'qpid-dispatch', 'qpid-proton', 'ranger',
       'rave', 'samza', 'samza-hello-samza', 'sentry', 'shindig',
       'slider', 'sqoop', 'streams', 'tajo', 'tez', 'tika', 'twill',
       'wave', 'whirr', 'zetacomponents', 'zookeeper'], dtype=object)

In [16]:
# Number of True / False values in the 'isDesign' column of the classified dataset.
cdf['isDesign'].value_counts()

False    158145
True     109698
Name: isDesign, dtype: int64

In [68]:
# Per-repository share of discussions flagged as design discussions.

# Repositories with this many discussions or fewer are left out of the ratio.
MIN_DISCUSSIONS = 1000

# Row count per (repository, isDesign) pair.
g = cdf.groupby(['repository', 'isDesign']).size()

designTotal = {}      # repository -> number of rows with isDesign == True
discussionTotal = {}  # repository -> number of rows overall
# Series.items() is used instead of Series.iteritems(): the latter was
# deprecated in pandas 1.5 and removed in pandas 2.0.
for (repository, isDesign), count in g.items():
    if isDesign:
        designTotal[repository] = designTotal.get(repository, 0) + count
    discussionTotal[repository] = discussionTotal.get(repository, 0) + count
print(designTotal)
print(discussionTotal)

# Ignore repositories with very little discussions. A repository without any
# design discussion is absent from designTotal, hence the .get(..., 0).
designRatio = {
    repository: designTotal.get(repository, 0) / total
    for repository, total in discussionTotal.items()
    if total > MIN_DISCUSSIONS
}

# NOTE: the column label says '%', but the stored values are fractions
# (design count divided by total count), not percentages.
designRatioDf = pandas.DataFrame.from_dict(designRatio, orient='index', columns=['Design Discussion %'])
designRatioDf.describe()


# repository - design - not design - total



{'Airavata': 14, 'Flume': 63, 'S4': 10, 'SIS': 3, 'Thrift': 26, 'accumulo': 1097, 'ambari': 8770, 'atlas': 4171, 'aurora': 3821, 'aurora-packaging': 47, 'bigtop': 52, 'bookkeeper': 81, 'bookkeeper-git': 309, 'climate': 139, 'cloudstack-cloudmonkey': 2, 'cloudstack-git': 1833, 'cotton': 1, 'crunch': 17, 'datafu': 110, 'deltacloud-git': 8, 'directory': 1, 'drill-git': 1358, 'falcon-git': 1447, 'flume-git': 1260, 'geode': 1971, 'giraph': 157, 'giraph-git': 401, 'hadoop.git': 61, 'hama': 5, 'hbase-git': 14281, 'hcatalog': 517, 'hcatalog-git': 107, 'helix-git': 196, 'hive': 603, 'hive-git': 9348, 'kafka': 3164, 'kylin': 1, 'lens': 2829, 'lucene-solr': 42, 'mahout': 100, 'mahout-git': 95, 'mesos': 29703, 'mesos-incubating': 417, 'metamodel': 16, 'mrunit': 1, 'nutch': 21, 'oodt': 163, 'oozie': 1722, 'oozie-git': 2941, 'phoenix': 12, 'pig': 562, 'pig-git': 726, 'qpid': 1103, 'qpid-cpp': 18, 'qpid-dispatch': 28, 'qpid-proton-git': 208, 'ranger': 2694, 'rave': 188, 'samza': 1730, 'samza-hello-sa

Unnamed: 0,Design Discussion %
count,27.0
mean,0.465486
std,0.108434
min,0.259139
25%,0.379921
50%,0.49004
75%,0.554475
max,0.600438


In [39]:
# Number of distinct review requests in the classified dataset.
cdf['reviewRequestId'].nunique(dropna=False)



36452