# Python Code Readability

Will Tholke, Alex Truong, Andrew Zhang

## Imports

In [1020]:
from scipy import sparse
from sklearn import linear_model
from collections import Counter
import numpy as np
import nltk
import math
import re
from scipy.stats import norm

In [1021]:
!python -m nltk.downloader punkt

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/willtholke/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## Data Loading & Pre-processing

In [1022]:
def preprocess(text):
    cleaned_text = text.replace('<newline>', '\n')
    return cleaned_text

In [1023]:
def load_ordinal_data(filename, ordering):
    X = []
    Y = []
    orig_Y = []

    for _ in ordering:
        Y.append([])

    with open(filename, encoding="utf-8") as file:
        for line in file:
            cols = line.split("\t")
            label = cols[2].lstrip().rstrip()
            text = cols[3]

            preprocess(text)
            X.append(text)

            index = ordering.index(label)
            for i in range(len(ordering)):
                if index > i:
                    Y[i].append(1)
                else:
                    Y[i].append(0)
            orig_Y.append(label)

    return X, Y, orig_Y

#### Ordinal Classifier Definition

In [1024]:
class OrdinalClassifier:

    def __init__(self, ordinal_values, feature_method, trainX, trainY, devX, devY, testX, testY, orig_trainY, orig_devY, orig_testY):
        self.ordinal_values=ordinal_values
        self.feature_vocab = {}
        self.feature_method = feature_method
        self.min_feature_count=2
        self.log_regs = [None] * (len(self.ordinal_values)-1)

        self.trainY=trainY
        self.devY=devY
        self.testY=testY

        self.orig_trainY=orig_trainY
        self.orig_devY=orig_devY
        self.orig_testY=orig_testY

        self.trainX = self.process(trainX, training=True)
        self.devX = self.process(devX, training=False)
        self.testX = self.process(testX, training=False)

        self.predictions = []

    # Featurize entire dataset
    def featurize(self, data):
        featurized_data = []
        for text in data:
            feats = self.feature_method(text)
            featurized_data.append(feats)
        return featurized_data

    # Read dataset and returned featurized representation as sparse matrix + label array
    def process(self, X_data, training = False):

        data = self.featurize(X_data)

        if training:
            fid = 0
            feature_doc_count = Counter()
            for feats in data:
                for feat in feats:
                    feature_doc_count[feat]+= 1

            for feat in feature_doc_count:
                if feature_doc_count[feat] >= self.min_feature_count:
                    self.feature_vocab[feat] = fid
                    fid += 1

        F = len(self.feature_vocab)
        D = len(data)
        X = sparse.dok_matrix((D, F))
        for idx, feats in enumerate(data):
            for feat in feats:
                if feat in self.feature_vocab:
                    X[idx, self.feature_vocab[feat]] = feats[feat]

        return X

    def train(self):
        (D,F) = self.trainX.shape

        for idx, ordinal_value in enumerate(self.ordinal_values[:-1]):
            best_dev_accuracy=0
            best_model=None
            for C in [0.0001, 0.001, 0.1, 1, 5, 10, 50, 100, 1000]:

                log_reg = linear_model.LogisticRegression(C = C, max_iter=1000)
                log_reg.fit(self.trainX, self.trainY[idx])

                development_accuracy = log_reg.score(self.devX, self.devY[idx])
                if development_accuracy > best_dev_accuracy:
                    best_dev_accuracy=development_accuracy
                    best_model=log_reg


            self.log_regs[idx]=best_model

    def test(self):
        cor=tot=0
        counts=Counter()
        preds=[None]*(len(self.ordinal_values)-1)
        for idx, ordinal_value in enumerate(self.ordinal_values[:-1]):
            preds[idx]=self.log_regs[idx].predict_proba(self.testX)[:,1]

        preds=np.array(preds)

        for data_point in range(len(preds[0])):


            ordinal_preds=np.zeros(len(self.ordinal_values))
            for ordinal in range(len(self.ordinal_values)-1):
                if ordinal == 0:
                    ordinal_preds[ordinal]=1-preds[ordinal][data_point]
                else:
                    ordinal_preds[ordinal]=preds[ordinal-1][data_point]-preds[ordinal][data_point]

            ordinal_preds[len(self.ordinal_values)-1]=preds[len(preds)-1][data_point]

            prediction=np.argmax(ordinal_preds)
            self.predictions.append(prediction+1)

            counts[prediction]+=1
            if prediction == self.ordinal_values.index(self.orig_testY[data_point]):
                cor+=1
            tot+=1

        return cor/tot

## Feature Encoding

### Feature 1: Code Length and Repetition (Section 2.1)

In [1028]:
def check_line_length(text, code_max_length=79, docstring_max_length=72):
    """ Check if lines of code, docstrings, and inline comments do not exceed their maximum allowable lengths. """
    lines = text.split('\n')
    for line in lines:
        if len(line) > code_max_length:
            return False
        if line.startswith('#') or line.strip().startswith('"""') or line.strip().startswith("'''"):
            if len(line) > docstring_max_length:
                return False
    return True

### Feature 2: Inline Comments and Docstrings (Section 2.2)

In [1029]:
def check_comments_docstrings(text):
    """ Check if at least one of the following is present: docstring, inline comment, assert statement, error handling. """
    if re.search(r'"""(.|\n)*?"""', text) or re.search(r"'''(.|\n)*?'''", text) or re.search(r'#', text):
        return True
    # Omitted: checks for assert statements and error handling

    return False

### Feature 3: Naming Conventions and Case (Section 2.3)

In [1030]:
def check_naming_conventions(text):
    """ Check if the class name is in CamelCase or the function name is in snake_case. Check if variables are in snake_case. """
    if not (re.search(r'\bclass\s+[A-Z][a-zA-Z0-9]*', text) or re.search(r'\bdef\s+[a-z_][a-zA-Z0-9]*', text)):
        return False

    variable_pattern = r'\b[a-z][a-zA-Z0-9_]*\s*='
    variable_matches = re.findall(variable_pattern, text)

    for match in variable_matches:
        var_name = match.strip().rstrip('=').strip()
        if not re.match(r'^[a-z][a-zA-Z0-9_]*$', var_name):
            return False

    return True

def check_descriptive_params(text):
    """ Check if function and class parameters are self-explanatory. """
    bad_params = {'x', 'y', 'temp', 'param', 'input', 'temp', 'i', 'j', 'k'}
    function_pattern = r'\bdef\s+[a-z_][a-zA-Z0-9]*\s*\((.*?)\)'
    class_pattern = r'\bclass\s+[A-Z][a-zA-Z0-9]*\s*\((.*?)\)'
    function_matches = re.findall(function_pattern, text)
    class_matches = re.findall(class_pattern, text)

    for params in function_matches + class_matches:
        param_list = [p.strip() for p in params.split(',')]
        for param in param_list:
            if param in bad_params:
                return False
    return True

### Feature 4: Whitespace (Section 2.4)

In [1031]:
def check_whitespace(text):
    """ Check for consistency between tabs and spaces for indentation. """
    lines = text.split('\n')
    space_indented = [line.startswith(' ') for line in lines if line.strip()]
    tab_indented = [line.startswith('\t') for line in lines if line.strip()]
    return not (any(space_indented) and any(tab_indented))

def check_blank_lines(text):
    """ Check whether blank lines are used sparingly. """
    lines = text.split('\n')
    blank_line = 0

    for line in lines:
        if not line.strip():
            blank_line += 1
            if blank_line > 1:
                return False
        else:
            blank_line = 0

    return True

### Feature 5: Miscellaneous (Section 2.5)

In [1032]:
def check_misc(text):
    """ Check for residual to-do statements and comparison of boolean values. """
    if re.search(r'\bTrue\b', text) or re.search(r'\bFalse\b', text) or re.search(r'todo', text, re.IGNORECASE):
        return False
    return True

### Feature Combination

In [1033]:
def feature_method(text):
    features = {
        'line_length': check_line_length(text),
        'comments_docstrings': check_comments_docstrings(text),
        'naming_conventions': check_naming_conventions(text),
        'descriptive_params': check_descriptive_params(text),
        'whitespace_consistency': check_whitespace(text),
        'blank_lines': check_blank_lines(text),
        'misc': check_misc(text),
    }
    return features

#### Training

In [1034]:
def confidence_intervals(accuracy, n, significance_level):
    critical_value=(1-significance_level)/2
    z_alpha=-1*norm.ppf(critical_value)
    se=math.sqrt((accuracy*(1-accuracy))/n)
    return accuracy-(se*z_alpha), accuracy+(se*z_alpha)

In [1035]:
def run(trainingFile, devFile, testFile, ordinal_values):
    trainX, trainY, orig_trainY = load_ordinal_data(trainingFile, ordinal_values)
    devX, devY, orig_devY = load_ordinal_data(devFile, ordinal_values)
    testX, testY, orig_testY = load_ordinal_data(testFile, ordinal_values)

    simple_classifier = OrdinalClassifier(ordinal_values, feature_method, trainX, trainY, devX, devY, testX, testY, orig_trainY, orig_devY, orig_testY)
    simple_classifier.train()
    accuracy = simple_classifier.test()

    lower, upper = confidence_intervals(accuracy, len(testY[0]), .95)
    print("Test accuracy for best dev model: %.3f, 95%% CIs: [%.3f %.3f]\n" % (accuracy, lower, upper))
    return simple_classifier.predictions

In [1037]:
trainingFile = "splits/train.txt"
devFile = "splits/dev.txt"
testFile = "splits/test.txt"

ordinal_values = ["1", "2", "3", "4", "5"]

run(trainingFile, devFile, testFile, ordinal_values);

Test accuracy for best dev model: 0.730, 95% CIs: [0.643 0.817]



### Part B: Analysis

In [1038]:
# The analysis here should communicate what kind of information others should know if they were to use our model as well as aspects of some fundamental concept that you hadn't considered while annotating. We will be graded on the depth of our analysis.

In [1039]:
run(trainingFile, devFile, testFile, ordinal_values);

Test accuracy for best dev model: 0.730, 95% CIs: [0.643 0.817]

