## TF-IDF + Log Reg
### Training Pipeline

In [4]:
# !pip install dagshub

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scipy.stats

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import *

import dagshub
import pickle

In [2]:
def load_corpus(DATASET_PATH, CODE_COLUMN):
    """Read a tab-separated dataset and return it together with its code column.

    Parameters
    ----------
    DATASET_PATH : str or path-like
        Location of the UTF-8, tab-separated file to read.
    CODE_COLUMN : str
        Name of the column holding the text to be vectorized.

    Returns
    -------
    tuple
        ``(df, corpus)`` where ``df`` is the full DataFrame and ``corpus`` is
        ``df[CODE_COLUMN]``.

    Raises
    ------
    KeyError
        If ``CODE_COLUMN`` is not a column of the file.
    """
    # NOTE(review): comment='#' makes pandas drop everything from a '#' to the
    # end of the line, which may truncate code snippets that contain '#'.
    # Confirm this is intended for the datasets in use.
    df = pd.read_csv(DATASET_PATH, encoding='utf-8', comment='#', sep='\t')
    # Quick visual sanity check of what was loaded.
    print(df.head())
    corpus = df[CODE_COLUMN]
    # The previous version also built train/test slices here that were never
    # returned (and the "train" slice was bounded by the test row count);
    # splitting is done by the callers, so that dead code was removed.
    return df, corpus

In [3]:
def tfidf_transform(corpus, tfidf_params, TFIDF_DIR):
    """Vectorize ``corpus`` with an already-fitted, pickled vectorizer.

    Parameters
    ----------
    corpus : iterable
        Documents handed to the loaded object's ``transform`` method.
    tfidf_params : Any
        Unused. Kept so the signature mirrors ``tfidf_fit_transform``; the
        vectorizer is loaded from disk, not rebuilt from these parameters.
    TFIDF_DIR : str or path-like
        Path of the pickle file holding the fitted vectorizer.

    Returns
    -------
    Whatever the loaded vectorizer's ``transform(corpus)`` returns.
    """
    # SECURITY: pickle.load executes code embedded in the file; only load
    # artifacts produced by a trusted run of this notebook.
    # The context manager closes the handle, which the bare open() never did.
    with open(TFIDF_DIR, 'rb') as tfidf_file:
        tfidf = pickle.load(tfidf_file)
    return tfidf.transform(corpus)

In [4]:
def tfidf_fit_transform(code_blocks, tfidf_params, TFIDF_DIR):
    """Fit a TF-IDF vectorizer, persist it, and return the vectorized corpus.

    Parameters
    ----------
    code_blocks : iterable of str
        Documents used both to fit the vectorizer and to produce the output.
    tfidf_params : dict or None
        Keyword arguments for ``TfidfVectorizer`` (e.g. ``min_df``,
        ``max_df``, ``smooth_idf``). ``None`` or an empty dict means defaults.
    TFIDF_DIR : str or path-like
        Destination file for the pickled, fitted vectorizer.

    Returns
    -------
    The result of ``transform(code_blocks)`` on the fitted vectorizer.
    """
    # The dict must be unpacked into keyword arguments. Passing it positionally
    # bound it to the first constructor parameter (`input`), so the configured
    # settings were never applied.
    tfidf = TfidfVectorizer(**(tfidf_params or {}))
    tfidf.fit(code_blocks)
    # Close the file deterministically so the pickle is fully flushed to disk.
    with open(TFIDF_DIR, "wb") as tfidf_file:
        pickle.dump(tfidf, tfidf_file)
    return tfidf.transform(code_blocks)

In [5]:
def logreg_evaluate(df, code_blocks, TAG_TO_PREDICT):
    """Train TF-IDF + logistic regression and report hold-out metrics.

    The function reads three notebook-level globals that must be defined
    before it is called: ``tfidf_params``, ``TFIDF_DIR`` and ``MODEL_DIR``.
    It fits and pickles the vectorizer (via ``tfidf_fit_transform``), fits and
    pickles the classifier to ``MODEL_DIR``, prints accuracy / F1 / a
    confidence interval of the prediction errors, and draws an error
    histogram, a precision-recall curve and a confusion matrix.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the label column ``TAG_TO_PREDICT``.
    code_blocks : iterable of str
        Documents to vectorize; expected to be row-aligned with ``df``.
    TAG_TO_PREDICT : str
        Name of the label column in ``df``.

    Returns
    -------
    dict
        ``{'test_accuracy': ..., 'test_f1_score': ...}`` on the 25% hold-out.
    """
    def mean_confidence_interval(data, confidence=0.95):
        """Return (mean, lower, upper) using a Student-t interval with n-1 dof."""
        a = 1.0 * np.array(data)
        n = len(a)
        m, se = np.mean(a), scipy.stats.sem(a)
        h = se * scipy.stats.t.ppf((1 + confidence) / 2., n-1)
        return m, m-h, m+h

    # NOTE(review): the vectorizer is fitted on the full corpus before the
    # split, so IDF statistics also see the hold-out rows.
    code_blocks_tfidf = tfidf_fit_transform(code_blocks, tfidf_params, TFIDF_DIR)
    # Seed the split so the reported metrics are reproducible across runs
    # (same seed the classifier already uses).
    X_train, X_test, y_train, y_test = train_test_split(
        code_blocks_tfidf, df[TAG_TO_PREDICT], test_size=0.25, random_state=421
    )
    clf = LogisticRegression(random_state=421).fit(X_train, y_train)

    print("saving the model")
    # Context manager guarantees the model file is flushed and closed.
    with open(MODEL_DIR, 'wb') as model_file:
        pickle.dump(clf, model_file)

    y_pred = clf.predict(X_test)
    accuracy = clf.score(X_test, y_test)
    # scikit-learn metrics take (y_true, y_pred) in that order.
    f1 = f1_score(y_test, y_pred)
    print(f'Mean Accuracy {round(accuracy*100, 2)}%')
    print(f'F1-score {round(f1*100, 2)}%')

    # Signed error per sample; assumes numeric labels -- TODO confirm.
    errors = y_test - y_pred
    plt.hist(errors)
    plot_precision_recall_curve(clf, X_test, y_test)
    plot_confusion_matrix(clf, X_test, y_test, values_format='d')

    conf_interval = mean_confidence_interval(errors, 0.95)
    print(conf_interval)
    metrics = {'test_accuracy': accuracy
               , 'test_f1_score': f1}
    return metrics

In [6]:
def get_predictions(X, y, TAG_TO_PREDICT, MODEL_DIR):
    """Score a pickled classifier on already-vectorized data.

    Loads the model from ``MODEL_DIR``, predicts on ``X``, prints accuracy,
    weighted F1 and a confidence interval of the prediction errors, and draws
    an error histogram, a precision-recall curve and a confusion matrix.

    Parameters
    ----------
    X : array-like or sparse matrix
        Feature matrix accepted by the loaded model's ``predict``.
    y : array-like
        True labels aligned with the rows of ``X``.
    TAG_TO_PREDICT : str
        Unused; kept for backward compatibility with existing callers.
    MODEL_DIR : str or path-like
        Path of the pickled classifier.

    Returns
    -------
    dict
        ``{'test_accuracy': ..., 'test_f1_score': ...}``.
    """
    def mean_confidence_interval(data, confidence=0.95):
        """Return (mean, lower, upper) using a Student-t interval with n-1 dof."""
        a = 1.0 * np.array(data)
        n = len(a)
        m, se = np.mean(a), scipy.stats.sem(a)
        h = se * scipy.stats.t.ppf((1 + confidence) / 2., n-1)
        return m, m-h, m+h

    # SECURITY: pickle.load executes code embedded in the file; only load
    # models produced by a trusted run. The handle is closed by the `with`.
    with open(MODEL_DIR, 'rb') as model_file:
        clf = pickle.load(model_file)

    y_pred = clf.predict(X)
    # scikit-learn metrics take (y_true, y_pred). The order matters for the
    # weighted F1: per-class weights come from the support of the first
    # argument, so passing predictions first weighted by predicted counts.
    accuracy = accuracy_score(y, y_pred)
    f1 = f1_score(y, y_pred, average='weighted')
    print(f'Mean Accuracy {round(accuracy*100, 2)}%')
    print(f'F1-score {round(f1*100, 2)}%')

    # Signed error per sample; assumes numeric labels -- TODO confirm.
    errors = y - y_pred
    plt.hist(errors)
    plot_precision_recall_curve(clf, X, y)
    plot_confusion_matrix(clf, X, y, values_format='d')

    conf_interval = mean_confidence_interval(errors, 0.95)
    print(conf_interval)
    metrics = {'test_accuracy': accuracy
               , 'test_f1_score': f1}
    return metrics

### Evaluation

In [7]:
# if __name__ == '__main__':
#     DATASET_PATH = './data/code_blocks_regex_graph_v2.1.csv'
#     MODEL_DIR = './models/logreg_regex_graph_v2.1.sav'
#     TFIDF_DIR = './models/tfidf_logreg_graph_v2.1.pickle'
#     CODE_COLUMN = 'code_block'
#     TAG_TO_PREDICT = 'preprocessing'
#     SCRIPT_DIR = 'logreg_classifier.ipynb'

#     df, code_blocks = load_corpus(DATASET_PATH, CODE_COLUMN)
#     nrows = df.shape[0]
#     print("loaded")
#     tfidf_params = {'min_df': 5
#                     , 'max_df': 0.3
#                     , 'smooth_idf': True}
#     data_meta = {'DATASET_PATH': DATASET_PATH
#                 ,'nrows': nrows
#                 ,'label': TAG_TO_PREDICT
#                 ,'model': MODEL_DIR
#                 ,'script_dir': SCRIPT_DIR}
#     print("tfidf-ed")
#     with dagshub.dagshub_logger() as logger:
#         metrics = logreg_evaluate(df, code_blocks, TAG_TO_PREDICT)
#         # metrics = get_predictions(features, df[TAG_TO_PREDICT], TAG_TO_PREDICT, MODEL_DIR)
#         logger.log_hyperparams(data_meta)
#         logger.log_hyperparams(tfidf_params)
#         logger.log_metrics(metrics)
#     print("finished")

### Validation (different chunk_sizes)

In [8]:
# if __name__ == '__main__':
#     all_metrics = []
#     chunk_sizes = [5, 10, 15, 20, 25, 30, 40]
#     for i in chunk_sizes:
#         DATASET_PATH = './data/chunks_{}_validate.csv'.format(i)
#         MODEL_DIR = './models/logreg_regex_graph_v2.1.sav'
#         TFIDF_DIR = './models/tfidf_logreg_graph_v2.1.pickle'
#         CODE_COLUMN = 'code'
#         TAG_TO_PREDICT = 'tag'
#         df, corpus = load_corpus(DATASET_PATH, CODE_COLUMN)
#         nrows = df.shape[0]
#         print("loaded")
#         params = {'min_df': 5
#                 , 'max_df': 0.3
#                 , 'smooth_idf': True}
#         data_meta = {'DATASET_PATH': DATASET_PATH
#                     ,'nrows': nrows
#                     ,'label': TAG_TO_PREDICT
#                     ,'model': 'Logistic Regression'}
#         features = tfidf_transform(corpus, params, TFIDF_DIR)
#         print("tfidf-ed")
#         with dagshub.dagshub_logger() as logger:
#             # metrics = logreg_evaluate(df, features, TAG_TO_PREDICT)
#             metrics = get_predictions(features, df[TAG_TO_PREDICT], TAG_TO_PREDICT, MODEL_DIR)
#             all_metrics.append(metrics)
#             logger.log_hyperparams(data_meta)
#             logger.log_hyperparams(params)
#             logger.log_metrics(metrics)
#         print("finished")

In [9]:
# all_acc = [all_metrics[i]['test_accuracy'] for i in range(len(all_metrics))]
# all_f1 = [all_metrics[i]['test_f1_score'] for i in range(len(all_metrics))]
# plt.scatter(chunk_sizes, all_acc)
# plt.scatter(chunk_sizes, all_f1)

## Log Reg Interpretation

In [10]:
# def analyze_predictions(X, y, TAG_TO_PREDICT, MODEL_DIR):
#     clf = pickle.load(open(MODEL_DIR, 'rb'))
#     # result = loaded_model.score(X, y)
#     y_pred = clf.predict(X)
#     accuracy = accuracy_score(y_pred, y)
#     f1 = f1_score(y_pred, y, average='weighted')
#     print(f'Mean Accuracy {round(accuracy*100, 2)}%')
#     print(f'F1-score {round(f1*100, 2)}%')
#     errors = y - y_pred
#     plt.hist(errors)
#     plot_precision_recall_curve(clf, X, y)
#     plot_confusion_matrix(clf, X, y, values_format='d')
#     def mean_confidence_interval(data, confidence=0.95):
#         a = 1.0 * np.array(data)
#         n = len(a)
#         m, se = np.mean(a), scipy.stats.sem(a)
#         h = se * scipy.stats.t.ppf((1 + confidence) / 2., n-1)
#         return m, m-h, m+h
#     conf_interval = mean_confidence_interval(errors, 0.95)
#     print(conf_interval)
#     metrics = {'test_accuracy': accuracy
#                , 'test_f1_score': f1}
#     return X, y, y_pred
# if __name__ == '__main__':
#     DATASET_PATH = './data/chunks_10_validate.csv'
#     MODEL_DIR = './models/logreg_regex_graph_v2.sav'
#     TFIDF_DIR = './models/tfidf_logreg_graph_v2.pickle'
#     CODE_COLUMN = 'code'
#     TAG_TO_PREDICT = 'tag'
#     df, corpus = load_corpus(DATASET_PATH, CODE_COLUMN)
#     nrows = df.shape[0]
#     print("loaded")
#     params = {'min_df': 5
#              , 'max_df': 0.3
#              , 'smooth_idf': True}
#     data_meta = {'DATASET_PATH': DATASET_PATH
#                 ,'nrows': nrows
#                 ,'label': TAG_TO_PREDICT
#                 ,'model': 'Logistic Regression'}
#     features = tfidf_transform(corpus, params, TFIDF_DIR)
#     print("tfidf-ed")
#     X, y, y_pred = analyze_predictions(features, df[TAG_TO_PREDICT], TAG_TO_PREDICT, MODEL_DIR)
#     print("finished")

In [11]:
# pd.set_option('max_colwidth', 500)
# pd.set_option('max_rows', 500)

In [12]:
# ## False Positives (preprocessing y/n)
# corpus[(y_pred-y)==-1].reset_index(drop=True)[:]

In [35]:
from operator import itemgetter
def show_most_informative_features(model, vectorizer=None, text=None, n=20):
    """
    Print and return the strongest positive and negative features of a
    linear classifier.

    Two calling modes are supported:

    * ``vectorizer`` given: ``model`` is the classifier itself and
      ``vectorizer`` is its fitted vectorizer.
    * ``vectorizer`` omitted: ``model`` is treated as a pipeline exposing
      ``named_steps`` with a ``'vectorizer'`` step and, when present, a
      ``'classifier'`` step.

    Parameters
    ----------
    model : classifier or pipeline
        Must (directly, or through its ``'classifier'`` step) expose ``coef_``.
    vectorizer : fitted vectorizer, optional
        Supplies the feature names and vectorizes ``text``.
    text : str, optional
        A single document to classify; the prediction is printed when given.
    n : int
        Number of rows to show (top ``n`` and bottom ``n`` coefficients).

    Returns
    -------
    str
        One line per rank: most positive coefficient and name on the left,
        most negative on the right.

    Raises
    ------
    TypeError
        If the classifier has no ``coef_`` attribute.
    """
    if vectorizer is None:
        # Pipeline mode: pull both stages out of the pipeline. Fall back to
        # the model itself when there is no step named 'classifier'.
        steps = model.named_steps
        vectorizer = steps['vectorizer']
        classifier = steps.get('classifier', model)
        # The pipeline vectorizes internally, so it gets the raw document.
        sample = [text] if text is not None else None
    else:
        classifier = model
        # Only vectorize when a document was actually supplied; previously
        # text=None was still pushed through transform([None]).
        sample = vectorizer.transform([text]) if text is not None else None

    # get_feature_names() was removed in scikit-learn 1.2; prefer the newer
    # accessor and fall back for older installations.
    names_getter = getattr(vectorizer, 'get_feature_names_out', None)
    if names_getter is None:
        names_getter = vectorizer.get_feature_names
    feat_names = names_getter()

    # Check to make sure that we can perform this computation
    if not hasattr(classifier, 'coef_'):
        raise TypeError(
            "Cannot compute most informative features on {}.".format(
                classifier.__class__.__name__
            )
        )

    # Pair each coefficient of the first coef_ row with its feature name,
    # largest first.
    coefs = sorted(
        zip(classifier.coef_[0], feat_names),
        key=itemgetter(0), reverse=True
    )

    # Top n (most positive) paired with bottom n (most negative, worst first).
    topn = zip(coefs[:n], coefs[:-(n+1):-1])

    # If a document was supplied, report how the model classifies it.
    if sample is not None:
        print("Classified as: {}".format(model.predict(sample)))

    # Two columns: most positive features on the left, most negative on the right.
    output = []
    for (cp, fnp), (cn, fnn) in topn:
        print(cp, fnp, cn, fnn)
        output.append(
            "{:0.2f}{: >15}    {:0.2f}{: >15}".format(
                cp, fnp, cn, fnn
            )
        )

    return "\n".join(output)

In [36]:
# Artifacts and validation data used for the interpretation run.
MODEL_DIR = './models/logreg_regex_graph_v2.1.sav'
TFIDF_DIR = './models/tfidf_logreg_graph_v2.1.pickle'
DATASET_PATH = './data/chunks_10_validate.csv'
CODE_COLUMN = 'code'
TAG_TO_PREDICT = 'tag'
df, corpus = load_corpus(DATASET_PATH, CODE_COLUMN)

# Load both pickles through context managers so the file handles are closed
# (the inline open() calls previously leaked them).
# SECURITY: pickle.load executes code from the file; only load trusted artifacts.
with open(MODEL_DIR, 'rb') as model_file:
    logreg_model = pickle.load(model_file)
with open(TFIDF_DIR, 'rb') as tfidf_file:
    tfidf_vectorizer = pickle.load(tfidf_file)

# Classify the row labelled 5 and list the 500 strongest features per side.
interpret = show_most_informative_features(model=logreg_model,
                                vectorizer=tfidf_vectorizer,
                                text=corpus[5],
                                n = 500)
# with open('logreg_interpret.txt', mode='w') as f:
#     f.write(interpret)

2.3542671378198365 preprocess_data -0.8115179934301682 optim
2.342591875572756 converters -0.8112821672026299 revenue
2.3088188325493317 normalize_train_df -0.8112814693317606 max_depth
2.3080509442376456 reset_index -0.8095950170915466 cmap
2.305053061122067 ps -0.8047111148717763 dataframe
2.260900244373241 iowa_model -0.8046399704184807 thresh
2.2451201005523447 nthread -0.7962325925645427 imputed_x_train
2.240481402449969 count_vectorizer -0.794739429361593 test_index
2.2296438526009816 x_test_scale -0.7931005736137324 rf
2.2290998691768245 data_new_scaled -0.7907898954797188 datetimeindex
2.211130640681788 224 -0.7898133774608898 isnull
2.205590909029939 preprocessed -0.7896623174046836 datasets
2.2036217412115833 mask -0.7896136899535711 12
2.195748737030879 payload -0.7881389924885668 scatterplot
2.1953703862459375 fit_intercept -0.7876030358874339 feature_importances_
2.1929268336200267 q0 -0.7845745506246161 num_words
2.1868158331436174 grp -0.7841018450234203 to_categorical
2