In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scipy.stats

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.multioutput import MultiOutputRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import *

import dagshub
import pickle

## Weights Analysis

In [26]:
# Version of the regex graph; it selects the dataset and the saved artifacts below.
GRAPH_VER = 4

# Input dataset with the code blocks and their tag columns.
DATASET_PATH = f'./data/code_blocks_regex_graph_v{GRAPH_VER}.csv'

# Pickled artifacts. Despite the *_DIR names, both values are file paths.
MODEL_DIR = f'./models/logreg_regex_graph_v{GRAPH_VER}.sav'
TFIDF_DIR = f'./models/tfidf_logreg_graph_v{GRAPH_VER}.pickle'

# Column that holds the raw source text of each code block.
CODE_COLUMN = 'code_block'

# Tag columns to predict.
TAGS_TO_PREDICT = [
    'import',
    'data_import',
    'data_export',
    'preprocessing',
    'visualization',
    'model',
    'train',
    'predict',
]
# PREDICT_COL = 'pred_{}'.format(TAGS_TO_PREDICT)

# Validation-set settings (currently disabled):
# VAL_CHUNK_SIZE = 10
# VAL_CODE_COLUMN = 'code'
# VAL_TAGS_TO_PREDICT = 'tag'
# VAL_DATASET_PATH = './data/chunks_{}_validate.csv'.format(VAL_CHUNK_SIZE)

In [33]:
def load_corpus(DATASET_PATH, CODE_COLUMN):
    """
    Load the tab-separated dataset and return it together with its code column.

    Rows containing any missing value are dropped. The original row labels are
    kept (the index is NOT reset), so label-based lookups such as ``corpus[5]``
    refer to the row numbered 5 in the file and raise ``KeyError`` if that row
    was dropped.

    Parameters
    ----------
    DATASET_PATH : str
        Path of the tab-separated file to read.
    CODE_COLUMN : str
        Name of the column holding the code text.

    Returns
    -------
    (pandas.DataFrame, pandas.Series)
        The cleaned frame and its ``CODE_COLUMN`` series.

    Raises
    ------
    KeyError
        If ``CODE_COLUMN`` is not a column of the file.
    """
    # NOTE(review): comment='#' makes pandas discard everything after a '#'
    # on each line; for a corpus of code snippets this may truncate cells that
    # contain '#' comments - confirm this is intended.
    raw = pd.read_csv(DATASET_PATH, encoding='utf-8', comment='#', sep='\t')

    # Non-inplace dropna yields the same frame without mutating in place.
    df = raw.dropna(axis=0)
    corpus = df[CODE_COLUMN]

    # The previous train/test slicing was removed: its results were never
    # returned or used, and the "train" slice was bounded by the test row count.
    return df, corpus

In [34]:
from operator import itemgetter
def show_most_informative_features(model, vectorizer=None, text=None, n=20):
    """
    Print and return the n highest and n lowest coefficients of a linear model.

    Feature names come from the vectorizer and weights from the first row of
    the classifier's ``coef_``. Each output row pairs the k-th largest
    coefficient with the k-th smallest one.

    Parameters
    ----------
    model : object
        With ``vectorizer`` given: a fitted estimator exposing ``coef_`` and
        ``predict``. With ``vectorizer=None``: a pipeline-like object whose
        ``named_steps`` mapping has a 'vectorizer' step (and, if present, a
        'classifier' step whose ``coef_`` is inspected).
    vectorizer : object, optional
        Fitted vectorizer providing ``transform`` and feature names.
    text : str, optional
        Raw document. When given, the model's prediction for it is printed.
    n : int
        Number of rows (top/bottom pairs) to report.

    Returns
    -------
    str
        The formatted rows joined by newlines.

    Raises
    ------
    TypeError
        If the classifier has no ``coef_`` attribute.
    """
    if vectorizer is None:
        # Pipeline case: take the vectorizer (and the classifier, when the
        # pipeline has a step of that name) from the pipeline itself. The
        # pipeline vectorizes raw text on its own, so predict gets raw text.
        steps = model.named_steps
        vectorizer = steps['vectorizer']
        classifier = steps['classifier'] if 'classifier' in steps else model
        sample = None if text is None else [text]
    else:
        classifier = model
        # Only vectorize when a document was actually supplied; transforming
        # [None] would fail inside the vectorizer.
        sample = None if text is None else vectorizer.transform([text])

    # get_feature_names() was removed in scikit-learn 1.2; prefer the
    # replacement and fall back for older versions.
    if hasattr(vectorizer, 'get_feature_names_out'):
        feat_names = vectorizer.get_feature_names_out()
    else:
        feat_names = vectorizer.get_feature_names()

    # Check to make sure that we can perform this computation
    if not hasattr(classifier, 'coef_'):
        raise TypeError(
            "Cannot compute most informative features on {}.".format(
                classifier.__class__.__name__
            )
        )

    # Pair each coefficient of the first coef_ row with its feature name,
    # sorted from most positive to most negative.
    ranked = sorted(
        zip(classifier.coef_[0], feat_names),
        key=itemgetter(0), reverse=True
    )

    # Top n from the front, bottom n walking backwards from the end.
    top_and_bottom = zip(ranked[:n], ranked[:-(n + 1):-1])

    # If text was given, report the predicted value first.
    if sample is not None:
        print("Classified as: {}".format(model.predict(sample)))

    output = []
    for (cp, fnp), (cn, fnn) in top_and_bottom:
        # Raw (unrounded) values are printed as well; callers that discard the
        # return value rely on this output.
        print(cp, fnp, cn, fnn)
        output.append(
            "{:0.2f}{: >15}    {:0.2f}{: >15}".format(
                cp, fnp, cn, fnn
            )
        )

    return "\n".join(output)

In [36]:
# Load the dataset, the fitted model and the TF-IDF vectorizer saved with it.
df, corpus = load_corpus(DATASET_PATH, CODE_COLUMN)

# NOTE(security): pickle.load can execute arbitrary code; only load artifacts
# produced by this project. The files are opened with `with` so the handles
# are closed after loading.
with open(MODEL_DIR, 'rb') as model_file:
    model = pickle.load(model_file)
with open(TFIDF_DIR, 'rb') as tfidf_file:
    tfidf = pickle.load(tfidf_file)

# A model exposing `estimators_` (e.g. a multi-output wrapper) is inspected
# one fitted sub-estimator at a time; any other model is inspected directly.
estimators = model.estimators_ if hasattr(model, 'estimators_') else [model]

for estimator in estimators:
    # corpus[5] is a label-based lookup: the row labelled 5 in the file.
    # The returned table is discarded here; the function prints its rows.
    show_most_informative_features(model=estimator,
                                   vectorizer=tfidf,
                                   text=corpus[5],
                                   n=500)

1889343024887102 score -0.8447525908559158 random_grid
2.188844015587808 neg_root_mean_squared_error -0.8379948649636675 any
2.1718848333955547 final_x_test -0.837435680165424 query
2.16726862340458 jaccard_similarity_score -0.8365062624260183 citytracker
2.166547215444048 predicted_data -0.8320213967117405 county
2.163906859447693 neg_mean_absolute_error -0.8320045300723122 imread
2.155739060154632 model_type -0.8286695761065626 onfusion_matrix
2.1477646423118673 yhat -0.8283448211194873 imshow
2.1422554860230654 n_jobs -0.8263191886098348 super
2.137728052476054 mean_ratings -0.8216608001514049 df
2.1360717176538024 train_idx -0.8197142237926279 train_path
2.135570165267831 redictions -0.8187856768894801 crosstab
2.129616566803731 ravel -0.818184820255393 timeseries
2.1199419648945796 run_experiment -0.8181689423426325 mh
2.1109153117755017 this_preds -0.8162024402323635 cast
2.1104237832593173 mean_train_score -0.8133992503614267 code
2.109287888554826 df_out -0.8123208812630214 tic