In [1]:
import pandas as pd
import itertools
import os 
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.model_selection import cross_val_score
import seaborn as sns
from sklearn.calibration import CalibratedClassifierCV 
pd.set_option('display.max_columns', 999)
import tensorflow as tf
from sklearn.preprocessing import LabelBinarizer, LabelEncoder  
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit
from tensorflow import keras
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout
from keras.preprocessing import text, sequence
from keras import utils
from numpy import argmax
import pickle 
import threading 

Using TensorFlow backend.


In [6]:
summaries_df = pd.read_csv('bill_summaries.csv')
subjects_df = pd.read_csv('bill_metadata/subjects/comparative_agendas_labeled_bills.csv')
merged_df = pd.merge(subjects_df, summaries_df, how='left',
              left_on='json_bill_id', right_on='bill_number')
df = merged_df[['bill_number', 'summary', 'majortopic', 'subtopic']]
X = df['summary'].astype(str)
y = df['majortopic'] #.astype(str)
print ('Null values:', y.isna().sum())
y = y.fillna(404)
print ('Null values after filling:', y.isna().sum()) 

Null values: 2520
Null values after filling: 0


In [None]:
tfidf = TfidfVectorizer(sublinear_tf=True, min_df=5, norm='l2',
                        encoding='latin-1', ngram_range=(1, 2), stop_words='english')
features = tfidf.fit_transform(X)
labels = y
features.shape

In [2]:
def top_probs_and_classes(model_scores, model_classes, top_n):
    score_number_list = []
    prob_list = []
    predicted_label_list = []
    descending_order = (-model_scores).argsort()
    #ordered_scores = descending_order[0][:top_n]
    for model_score in range(len(model_scores)):
        ordered_scores = descending_order[model_score][:top_n]
        for score in ordered_scores:
            predicted_label = model_classes[score]
            prob = (model_scores[model_score][score]) * 100
            prob = "%.3f" % round(prob, 3)
            prob_list.append(prob)
            predicted_label_list.append(predicted_label)
            score_number_list.append(model_score)
        
    data_dict = {'score_number': score_number_list,
                 'probability': prob_list,
                'predicted_label': predicted_label_list}
        
    df = pd.DataFrame(data=data_dict)
    
    return df 

In [3]:
def top_n_accuracy(known_label_list, prediction_df):
    known_labels = known_label_list.reset_index(drop=True)
    hits = 0
    
    for i in range(len(known_labels)):
        if known_labels[i] in list(prediction_df['predicted_label'][
            prediction_df['score_number'] == i]):
            hits +=1
        else:
            pass
        
    accuracy = round(((hits / len(known_labels))*100), ndigits=2) 
    
    
    #print (accuracy,'%')
    return accuracy 

In [7]:
sub_df_1 = df[df['majortopic'] == 1]
sub_df_2 = df[df['majortopic'] == 2]
sub_df_3 = df[df['majortopic'] == 3]
sub_df_4 = df[df['majortopic'] == 4]
sub_df_5 = df[df['majortopic'] == 5]
sub_df_6 = df[df['majortopic'] == 6]
sub_df_7 = df[df['majortopic'] == 7]
sub_df_8 = df[df['majortopic'] == 8]
sub_df_9 = df[df['majortopic'] == 9]
sub_df_10 = df[df['majortopic'] == 10]
sub_df_12 = df[df['majortopic'] == 12]
sub_df_13 = df[df['majortopic'] == 13]
sub_df_14 = df[df['majortopic'] == 14]
sub_df_15 = df[df['majortopic'] == 15]
sub_df_16 = df[df['majortopic'] == 16]
sub_df_17 = df[df['majortopic'] == 17]
sub_df_18 = df[df['majortopic'] == 18]
sub_df_19 = df[df['majortopic'] == 19] 
sub_df_20 = df[df['majortopic'] == 20]
sub_df_21 = df[df['majortopic'] == 21]
sub_df_99 = df[df['majortopic'] == 99] 
sub_df_404 = df[df['majortopic'].isna()]

In [None]:
sub_df_1

In [None]:
list_of_sub_dfs = [sub_df_1,
                  sub_df_2,
                  sub_df_3,
                  sub_df_4,
                  sub_df_5,
                  sub_df_6,
                  sub_df_7,
                  sub_df_8,
                  sub_df_9,
                  sub_df_10,
                  sub_df_12,
                  sub_df_13,
                  sub_df_14,
                  sub_df_15,
                  sub_df_16,
                  sub_df_17,
                  sub_df_18,
                  sub_df_19,
                  sub_df_20,
                  sub_df_21,
                  sub_df_99,
                  sub_df_404]

In [None]:
for l in list_of_sub_dfs:
    print (l['subtopic'].nunique())

In [None]:
sub_df_404['subtopic'].nunique()

In [None]:
def minor_subject_classifier_eval(summaries_and_labels_df):
    
    too_few_labels = []
    too_few_counts = [] 
    
    unique_labels = list(summaries_and_labels_df['subtopic'].unique())
    for label in unique_labels:
        
        if len(summaries_and_labels_df['subtopic'][summaries_and_labels_df[
            'subtopic'] == label])/len(summaries_and_labels_df)*100 > 0.1:
            pass
        else:
            too_few_labels.append(label)
            too_few_counts.append(len(summaries_and_labels_df['subtopic'][
                summaries_and_labels_df['subtopic'] == label]))
            
    if len(too_few_labels) > 0:
        summaries_and_labels_df = summaries_and_labels_df[~
            summaries_and_labels_df['subtopic'].isin(too_few_labels)]
    else:
        pass  
            
            
    
    X = summaries_and_labels_df['summary'].astype(str)
    y = summaries_and_labels_df['subtopic'] #.astype(str)
    y = y.fillna(404)
    
    
    tfidf = TfidfVectorizer(sublinear_tf=True, min_df=5, norm='l2',
                        encoding='latin-1', ngram_range=(1, 2), stop_words='english')
    
    features = tfidf.fit_transform(X)
    labels = y 

    models = [
        RandomForestClassifier(n_estimators=200, max_depth=3, random_state=42),
        LinearSVC(),
        MultinomialNB(),
        LogisticRegression(random_state=42),]

    CV = 5
    cv_df = pd.DataFrame(index=range(CV * len(models)))
    entries = []
    for model in models:
        model_name = model.__class__.__name__
        accuracies = cross_val_score(model, features, labels, scoring='accuracy', cv=CV,
                                    n_jobs=-1, verbose=1)
        for fold_idx, accuracy in enumerate(accuracies):
            entries.append((model_name, fold_idx, accuracy))
    cv_df = pd.DataFrame(entries, columns=['model_name', 'fold_idx', 'accuracy'])
    
    error_dict = {'label': too_few_labels, 'count': too_few_counts}
    
    error_df = pd.DataFrame(error_dict)
                                                          
    print ('Mean Scores:', cv_df.groupby('model_name').accuracy.mean(),
          '\n'*2, 'Median Scores:', cv_df.groupby('model_name').accuracy.median())
    
    
                                                          
    return cv_df 
     

In [5]:
def ml_model_trainer(input_df, label_col, input_list, n, desired_output):
    
    too_few_labels = []
    too_few_counts = [] 
    
    unique_labels = list(input_df[label_col].unique())
    if len(unique_labels) > 1:
        for label in unique_labels:
        
            if len(input_df[label_col][input_df[
                label_col] == label])/len(input_df)*100 > 0.1:
                pass
            else:
                too_few_labels.append(label)
                too_few_counts.append(len(input_df[label_col][
                    input_df[label_col] == label]))
            
        if len(too_few_labels) > 0:
            input_df = input_df[~
                input_df[label_col].isin(too_few_labels)]
        else:
            pass
    
        error_dict = {'label': too_few_labels, 'count': too_few_counts}
        error_df = pd.DataFrame(error_dict)
    
        X = input_df['summary'].astype(str)
        y = input_df[label_col] #.astype(str)
        y = y.fillna(404)
    
    
    
        tfidf = TfidfVectorizer(sublinear_tf=True,
                                min_df=5,
                                norm='l2',
                                encoding='latin-1',
                                ngram_range=(1, 2),
                                stop_words='english')
        if desired_output == 'accuracy_score':
            X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42)
            model = LinearSVC()
            clf = CalibratedClassifierCV(model)
            X_train = tfidf.fit_transform(X_train)
            X_test = tfidf.transform(X_test)
            clf.fit(X_train, y_train)
            classes = clf.classes_
            scores = clf.predict_proba(X_test)
            prediction_df = top_probs_and_classes(model_scores=scores,
                                              model_classes=classes,
                                              top_n=n)
            accuracy_score = top_n_accuracy(y_test, prediction_df)
        
            print ('\n'*2, 'Insufficient Scores Report:', '\n', error_df)
            print ('\n'*2, 'Model Accuracy Scores:', accuracy_score,'%') 
        else:
            model = LinearSVC()
            clf = CalibratedClassifierCV(model)
            features = input_df['summary'].fillna('No summary available.')
            features = tfidf.fit_transform(features)
            labels = input_df[label_col]
            clf.fit(features, labels)
            if desired_output == 'data':
                input_list = tfidf.transform(input_list)
                classes = clf.classes_
                scores = clf.predict_proba(input_list)
                prediction_df = top_probs_and_classes(model_scores=scores,
                                                model_classes=classes,
                                                top_n=n)
                print ('\n'*2, 'Insufficient Scores Report:', '\n', error_df)
                return prediction_df
            else:
                s = pickle.dumps(clf)
                predictor = pickle.loads(s)
                print ('\n'*2, 'Insufficient Scores Report:', '\n', error_df)
                model_dict = {'model':predictor, 'vectorizer': tfidf}
                return model_dict
    else:
        print ('no dice')

In [None]:
test_input_list = sub_df_1['summary'] 

In [None]:
preliminary_test = ml_model_trainer(df, 'majortopic', test_input_list, 3, 0)

In [None]:
preliminary_test 

In [4]:
def slice_every_n(df, n, category):
    column_names = []
    value_cols = []
    for i in range(n):
        column_names.append(category + '_label ' + str(i+1))
        column_names.append(category + '_prob ' + str(i+1))
    for i in range(n):
        value_cols.append(df['predicted_label'].iloc[i::n].astype('float'))
        value_cols.append(df['probability'].iloc[i::n].astype('float'))
    table = pd.DataFrame(value_cols)
    transpose = table.transpose()
    v = transpose.values
    i = np.arange(v.shape[1])
    a = np.isnan(v).argsort(0, kind='mergesort')
    v[:] = v[a, i] 
    output_df = transpose
    output_df.columns = (column_names)
    output_df = output_df.dropna()
    
    return output_df

In [None]:
slicer_test = slice_every_n(preliminary_test, 3)

In [None]:
slicer_test

In [None]:
column_slice = slicer_test.loc[:0,::2].values

In [None]:
column_slice

In [None]:
column_slice.values

In [None]:
column_slice.head(1).val

 subdfs without multiple unique subtopics: 99, 9, 404

# Pickling the subtopic models

In [9]:
empty_list = [] 

In [None]:
pickle_major = ml_model_trainer(df, 'majortopic', empty_list, 3, 'pickle') 

In [10]:
pickle_1 = ml_model_trainer(sub_df_1, 'subtopic', empty_list, 3, 'pickle')
pickle_2 = ml_model_trainer(sub_df_2, 'subtopic', empty_list, 3, 'pickle')
pickle_3 = ml_model_trainer(sub_df_3, 'subtopic', empty_list, 3, 'pickle')
pickle_4 = ml_model_trainer(sub_df_4, 'subtopic', empty_list, 3, 'pickle')
pickle_5 = ml_model_trainer(sub_df_5, 'subtopic', empty_list, 3, 'pickle')
pickle_6 = ml_model_trainer(sub_df_6, 'subtopic', empty_list, 3, 'pickle')
pickle_7 = ml_model_trainer(sub_df_7, 'subtopic', empty_list, 3, 'pickle')
pickle_8 = ml_model_trainer(sub_df_8, 'subtopic', empty_list, 3, 'pickle')
pickle_10 = ml_model_trainer(sub_df_10, 'subtopic', empty_list, 3, 'pickle')
pickle_12 = ml_model_trainer(sub_df_12, 'subtopic', empty_list, 3, 'pickle')
pickle_13 = ml_model_trainer(sub_df_13, 'subtopic', empty_list, 3, 'pickle')
pickle_14 = ml_model_trainer(sub_df_14, 'subtopic', empty_list, 3, 'pickle')
pickle_15 = ml_model_trainer(sub_df_15, 'subtopic', empty_list, 3, 'pickle')
pickle_16 = ml_model_trainer(sub_df_16, 'subtopic', empty_list, 3, 'pickle')
pickle_17 = ml_model_trainer(sub_df_17, 'subtopic', empty_list, 3, 'pickle')
pickle_18 = ml_model_trainer(sub_df_18, 'subtopic', empty_list, 3, 'pickle')
pickle_19 = ml_model_trainer(sub_df_19, 'subtopic', empty_list, 3, 'pickle')
pickle_20 = ml_model_trainer(sub_df_20, 'subtopic', empty_list, 3, 'pickle')
pickle_21 = ml_model_trainer(sub_df_21, 'subtopic', empty_list, 3, 'pickle') 





 Insufficient Scores Report: 
     label  count
0  1000.0      1
1  1001.0      2
2  1006.0      1
3  1209.0      1






 Insufficient Scores Report: 
     label  count
0   321.0      1
1   325.0      1
2   332.0      1
3  2000.0      3
4  2004.0      3






 Insufficient Scores Report: 
     label  count
0   308.0      1
1  1209.0      1
2     NaN      0






 Insufficient Scores Report: 
 Empty DataFrame
Columns: [label, count]
Index: []






 Insufficient Scores Report: 
    label  count
0  302.0      1
1  507.0      1
2  509.0      1






 Insufficient Scores Report: 
     label  count
0  1612.0      1






 Insufficient Scores Report: 
    label  count
0  408.0      1






 Insufficient Scores Report: 
    label  count
0  804.0      1






 Insufficient Scores Report: 
    label  count
0  710.0      1






 Insufficient Scores Report: 
     label  count
0   104.0      1
1  2103.0      3
2  2104.0      1






 Insufficient Scores Report: 
     label  count
0   332.0      1
1  1403.0      1






 Insufficient Scores Report: 
     label  count
0  1421.0      1
1  1504.0      1






 Insufficient Scores Report: 
 Empty DataFrame
Columns: [label, count]
Index: []






 Insufficient Scores Report: 
     label  count
0  1618.0      2
1  1621.0      1






 Insufficient Scores Report: 
     label  count
0  1205.0      1
1  1702.0      3






 Insufficient Scores Report: 
     label  count
0  1801.0      1
1  1825.0      1






 Insufficient Scores Report: 
     label  count
0  1916.0      2






 Insufficient Scores Report: 
     label  count
0   208.0      1
1  2033.0      1






 Insufficient Scores Report: 
     label  count
0  1203.0      1
1  2000.0      3
2  2001.0      1


In [None]:
pickle_1.classes_

In [11]:
pickle_dict = {1: pickle_1, 2: pickle_2, 3: pickle_3, 4: pickle_4,
              5: pickle_5, 6: pickle_6, 7: pickle_7, 8: pickle_8,
              10: pickle_10, 12: pickle_12, 13: pickle_13, 14: pickle_14,
              15: pickle_15, 16: pickle_16, 17: pickle_17, 18: pickle_18,
              19: pickle_19, 20: pickle_20, 21: pickle_21}

one_subclass_dict = {99:9999, 9:900, 404:404}

In [None]:
one_subclass_dict.keys()

In [None]:
summaries_df.head(3)

In [None]:
one_subclass_df_dict = {''}

In [None]:
len(summaries_df)

In [None]:
def major_label_function(df, n):
    tfidf = pickle_major['vectorizer']
    clf = pickle_major['model']
    #transformed_summaries = tfidf.fit_transform(df['summary'])
    transformed_summaries = tfidf.transform(df['summary'])
    input_list = transformed_summaries
    m_classes = clf.classes_
    m_scores = clf.predict_proba(input_list)
    m_prediction_df = top_probs_and_classes(model_scores=m_scores,
                                            model_classes=m_classes,
                                            top_n=n)
    m_df = slice_every_n(m_prediction_df, n, 'major')
    m_df = pd.concat([df, m_df], axis=1)
    return m_df

In [None]:
major_subjects_df = major_label_function(summaries_df, 3)

In [None]:
len(major_subjects_df) 

In [None]:
len(summaries_df)

In [None]:
major_subjects_df.head(3)

In [12]:
single_label_dict = {'major_label': [9, 99, 404],
                     'minor_label 1': [900, 9999, 404], 'minor_prob 3': [100, 100, 100],
                     'minor_label 2': [900, 9999, 404], 'minor_prob 2': [100, 100, 100],
                    'minor_label 3': [900, 9999, 404], 'minor_prob 3': [100, 100, 100]}


In [13]:
single_label_df = pd.DataFrame(single_label_dict)

In [14]:
single_label_df

Unnamed: 0,major_label,minor_label 1,minor_prob 3,minor_label 2,minor_prob 2,minor_label 3
0,9,900,100,900,100,900
1,99,9999,100,9999,100,9999
2,404,404,100,404,100,404


In [None]:
pickle_dict[5]

In [None]:
pickle_major

In [15]:
major_subjects_df = pd.read_csv('major_subjects_data.csv')

In [16]:
def sub_label_function(df, n, label_list):
    row_list = []
    m_labels = label_list
    for label in range(len(m_labels)):
        if m_labels[label] in one_subclass_dict.keys():
            s_df = single_label_df[single_label_df['major_label'] == m_labels[label]]
        else:      
            model = pickle_dict[m_labels[label]]
            tfidf = model['vectorizer']
            clf = model['model']
            text = [df['summary'][label]] 
            transformed_summary = tfidf.transform(text)
            input_list = transformed_summary
            s_classes = clf.classes_
            s_scores = clf.predict_proba(input_list)
            s_prediction_df = top_probs_and_classes(model_scores=s_scores,
                                                  model_classes=s_classes,
                                                  top_n=n)
            s_df = slice_every_n(s_prediction_df, n, 'minor')
        row_list.append(s_df)
    output_df = pd.DataFrame(row_list)
    return output_df
    

In [None]:
def t_sub_label_function(df, n, label_list, output_list):
    row_list = []
    m_labels = label_list
    for label in range(len(m_labels)):
        if m_labels[label] in one_subclass_dict.keys():
            s_df = single_label_df[single_label_df['major_label'] == m_labels[label]]
        else:      
            model = pickle_dict[m_labels[label]]
            tfidf = model['vectorizer']
            clf = model['model']
            text = [df['summary'][label]] 
            transformed_summary = tfidf.transform(text)
            input_list = transformed_summary
            s_classes = clf.classes_
            s_scores = clf.predict_proba(input_list)
            s_prediction_df = top_probs_and_classes(model_scores=s_scores,
                                                  model_classes=s_classes,
                                                  top_n=n)
            s_df = slice_every_n(s_prediction_df, n, 'minor')
        row_list.append(s_df)
    output_df_1 = pd.DataFrame(row_list)
    output_df = pd.concat([m_labels, output_df_1], axis=1)
    output_list.append(output_df)
    

In [None]:
def data_threading_optimization(df, label_list, n_splits):
    list_of_dfs = np.array_split(df, n_splits)
    list_of_label_lists = np.array_split(label_list, n_splits)
    for i in list_of_dfs:
        i.reset_index(drop=True, inplace=True)
    for c in list_of_label_lists:
        c.reset_index(drop=True, inplace=True)
    output_dict = {'sub_dfs': list_of_dfs, 'sub_lists': list_of_label_lists}
    return output_dict
    
    

In [None]:
major_label_1_subdata = data_threading_optimization(major_subjects_df,
                                                   major_label_list,
                                                   12)

In [None]:
len(major_label_1_subdata) 

In [None]:
len(major_label_1_subdata['sub_dfs'])

In [None]:
len(major_label_1_subdata['sub_dfs'][4])

In [None]:
major_label_1_subdata['sub_dfs'][4] 

In [None]:
ol1 = []
ol2 = []
ol3 = []
ol4 = []
ol5 = []
ol6 = []
ol7 = []
ol8 = []
ol9 = []
ol10 = []
ol11 = []
ol12 = []
ol_list = [ol1, ol2, ol3, ol4, ol5, ol6, ol7, ol8, ol9, ol10, ol11, ol12]

In [None]:
for i in ol_list:
    print (len(i))

In [None]:
ol1[0]

In [None]:
def thread_executor(sub_data_dict, output_lists, start_val):
    dfs = sub_data_dict['sub_dfs']
    lists = sub_data_dict['sub_lists']
    n = 3 
    t1 = threading.Thread(target=t_sub_label_function,
                          args=(dfs[0], n,
                                lists[0], output_lists[0]))
    t2 = threading.Thread(target=t_sub_label_function,
                          args=(dfs[1], n,
                                lists[1], output_lists[1]))
    t3 = threading.Thread(target=t_sub_label_function,
                          args=(dfs[2], n,
                                lists[2], output_lists[2]))
    t4 = threading.Thread(target=t_sub_label_function,
                          args=(dfs[3], n,
                                lists[3], output_lists[3]))
    t5 = threading.Thread(target=t_sub_label_function,
                          args=(dfs[4], n,
                                lists[4], output_lists[4]))
    t6 = threading.Thread(target=t_sub_label_function,
                          args=(dfs[5], n,
                                lists[5], output_lists[5]))
    t7 = threading.Thread(target=t_sub_label_function,
                          args=(dfs[6], n,
                                lists[6], output_lists[6]))
    t8 = threading.Thread(target=t_sub_label_function,
                          args=(dfs[7], n,
                                lists[7], output_lists[7]))
    t9 = threading.Thread(target=t_sub_label_function,
                          args=(dfs[8], n,
                                lists[8], output_lists[8]))
    t10 = threading.Thread(target=t_sub_label_function,
                          args=(dfs[9], n,
                                lists[9], output_lists[9]))
    t11 = threading.Thread(target=t_sub_label_function,
                          args=(dfs[10], n,
                                lists[10], output_lists[10]))
    t12 = threading.Thread(target=t_sub_label_function,
                          args=(dfs[11], n,
                                lists[11], output_lists[11]))
    if start_val == 1: 
        t1.start()
        t2.start()
        t3.start()
        t4.start()
        t5.start()
        t6.start()
        t7.start()
        t8.start()
        t9.start()
        t10.start()
        t11.start()
        t12.start()
    else:
        pass

In [None]:
test_thread_executor = thread_executor(major_label_1_subdata,
                                      ol_list,
                                      1)

In [None]:
major_subjects_df.to_csv('major_subjects_data.csv', index=False)

In [17]:
major_label_list_1 = major_subjects_df['major_label 1'] 
major_label_list_2 = major_subjects_df['major_label 2']
major_label_list_3 = major_subjects_df['major_label 3']

In [None]:
subtopic_1_df = sub_label_function(major_subjects_df, 3, major_label_list_1) 

In [None]:
subtopic_2_df = sub_label_function(major_subjects_df, 3, major_label_list_2)

In [None]:
subtopic_3_df = sub_label_function(major_subjects_df, 3, major_label_list_3)

In [None]:
subtopic_1_df = test_subtopic_df

In [None]:
test_subtopic_df[0][0]

In [None]:
test_subtopic_df.head()

In [None]:
test_subtopic_df.shape 

In [None]:
test_subtopic_df.info()

In [None]:
concat_test = pd.concat([r for r in test_subtopic_df[0]], ignore_index=True)

In [None]:
concat_test

In [None]:
for label in range(len(major_label_list)):
    print (label)

In [None]:
def major_label_function(df, n):
    tfidf = pickle_major['vectorizer']
    clf = pickle_major['model']
    #transformed_summaries = tfidf.fit_transform(df['summary'])
    transformed_summaries = tfidf.transform(df['summary'])
    input_list = transformed_summaries
    m_classes = clf.classes_
    m_scores = clf.predict_proba(input_list)
    m_prediction_df = top_probs_and_classes(model_scores=m_scores,
                                            model_classes=m_classes,
                                            top_n=n)
    
    m_df = slice_every_n(m_prediction_df, n, 'major')
    m_df = pd.concat([df, m_df], axis=1)
    return m_df

In [None]:
"""

        m_labels = m_df.loc[:0,::2].values
        
        for label in m_labels:
            if label in one_subclass_dict.keys():
                
            model = pickle_dict[label]
            input_list = transformed_summaries[bill]
            s_classes = model.classes_
            s_scores = model.predict_proba(input_list)
            s_prediction_df = top_probs_and_classes(model_scores=s_scores,
                                              model_classes=s_classes,
                                              top_n=n)
            s_df = slice_every_n(s_prediction_df, n, 'minor')
            m_df =pd.concat([m_df, s_df], axis=1)
            
        output_df.append(m_df)
    

"""