# Imports and Get Data

In [1]:
import json
import pandas as pd
import re
import random
import numpy as np
from sklearn.model_selection import train_test_split
from collections import Counter
#from sklearn.metrics import roc_curve, auc, roc_auc_score
import sklearn.metrics # import confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LogisticRegressionCV
import sklearn.model_selection as ms
import matplotlib.pyplot as plt
import seaborn as sns
import time

In [2]:
# Load the raw merchant swipe dump; the path is relative to the notebook's working directory.
data = pd.read_json('merchantSwipeDump.json')

In [55]:
data.sample(20)

Unnamed: 0,mapped_brand,mcc,merchant_string,network,mapped_brand_response,merchant_string1
124857,,5541.0,kwik trip 298,V,,"[kwik, trip]"
183961,,5814.0,papa john's #02644,V,,"[papa, john's]"
54055,,5541.0,colvin oil i llc,V,,"[colvin, oil, i, llc]"
98402,,5814.0,jimmy johns - 1526,V,,"[jimmy, johns]"
70290,,5399.0,goodwill store #29,V,,[goodwill]
293067,,4121.0,uber *eats hjiyo,V,,"[uber, *eats, hjiyo]"
17732,,5541.0,blues son grocery,V,,"[blues, son, grocery]"
71627,,5812.0,goma tei ramen ala mo,V,,"[goma, tei, ramen, ala, mo]"
279985,,5814.0,wendys #6541,V,,[wendys]
68288,,7993.0,gameflp*73-dstp,V,,"[gameflp*73, dstp]"


# Cleaning Functions

In [3]:
def preprocess(df):
    """Normalize the ``mcc`` and ``merchant_string`` columns of ``df`` in place.

    * ``mcc`` values are converted to strings with a literal trailing ``".0"``
      removed (``5310.0`` -> ``"5310"``); missing values are stored as ``np.nan``.
    * ``merchant_string`` values are lowercased; non-string entries (for
      example missing values) are left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``mcc`` and ``merchant_string`` columns. It is mutated.

    Returns
    -------
    None
    """
    def _mcc_to_str(value):
        # Missing codes stay missing instead of becoming the string "nan".
        if pd.isna(value):
            return np.nan
        text = str(value)
        # Remove only an exact ".0" suffix. str.strip(".0") removes every
        # leading/trailing "." and "0", which turns "5310.0" into "531" and
        # "6010.0" into "601".
        return text[:-2] if text.endswith(".0") else text

    df['mcc'] = df['mcc'].apply(_mcc_to_str)
    df['merchant_string'] = df['merchant_string'].apply(
        lambda x: x.lower() if isinstance(x, str) else x)

In [4]:
# MCC codes that decide the mapped brand on their own
mcc_dict = {'6011': 'atm', '6010': 'atm', '7523':'parking'}

def mcc_dict_funct(df, col_origin, col_output, mcc_dict):
    """Overwrite ``df[col_output]`` wherever ``df[col_origin]`` equals a key of ``mcc_dict``.

    For every ``code -> label`` pair, rows whose ``col_origin`` value equals
    ``code`` receive ``label`` in ``col_output``; all other rows keep their
    current ``col_output`` value. ``df`` is modified in place and nothing is
    returned.
    """
    for mcc_code in mcc_dict:
        is_code = df[col_origin] == mcc_code
        df[col_output] = np.where(is_code, mcc_dict[mcc_code], df[col_output])

In [5]:
def most_common_words(df, col, top_n=1000):
    """Count the tokens in a column of token lists and return the most frequent ones.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose ``col`` column holds an iterable of tokens per row.
    col : str
        Name of the token-list column.
    top_n : int, default 1000
        Number of most frequent tokens to return.

    Returns
    -------
    wordcnt_df : pandas.DataFrame
        Columns ``keyword`` and ``cnt``, sorted by ``cnt`` descending.
    top_words : list
        The ``top_n`` most frequent tokens.
    top_words_escaped : list
        ``top_words`` with a backslash prepended to tokens that start with
        ``*``, so they can be used as ``str.contains`` patterns.
    """
    # Flatten the per-row token lists into one list.
    tokens = [tok for row in df[col].tolist() for tok in row]
    counts = Counter(tokens)

    # Building from (keyword, cnt) pairs keeps both columns present even when
    # there are no tokens, so the sort below cannot fail on a missing column.
    # A stable sort keeps tied tokens in first-seen order between runs.
    wordcnt_df = pd.DataFrame(list(counts.items()), columns=['keyword', 'cnt'])\
        .sort_values(by='cnt', ascending=False, kind='mergesort')

    top_words = list(wordcnt_df['keyword'].iloc[:top_n])
    # startswith() is safe for empty tokens, unlike indexing with [0].
    # NOTE(review): only a leading "*" is escaped; other regex metacharacters
    # in a token are passed through unchanged.
    top_words_escaped = ["\\" + w if w.startswith("*") else w for w in top_words]
    return wordcnt_df, top_words, top_words_escaped

In [6]:
def dummify_data(df, most_common_words1):
    """Add one 0/1 indicator column per keyword to ``df`` in place.

    Each entry of ``most_common_words1`` is used as a regular-expression
    pattern against ``df['merchant_string']``; the new column is named after
    the pattern and holds 1 where it matches and 0 elsewhere. Rows with a
    missing merchant string get 0.

    NOTE(review): a keyword equal to an existing column name overwrites that
    column, because the indicator is stored under the keyword itself.
    """
    merchant_strings = df['merchant_string']
    for keyword in most_common_words1:
        # na=False: without it a missing string yields NaN, which np.where
        # treats as truthy and flags as a match for every keyword.
        df[keyword] = np.where(merchant_strings.str.contains(keyword, na=False), 1, 0)

# Clean and Prepare Data

In [7]:
%run clean.py
preprocess(data)
data['mapped_brand_response'] = ""
mcc_dict_funct(data, 'mcc', 'mapped_brand_response', mcc_dict)

clean(data, old_col='merchant_string', col='merchant_string1',split_string=split_string,
  remove_string = remove_string,lowercase=False, remove_empty_strings_bi=True,
  join_mcc_bi=False,rejoin_col_strings_bi=False)

wordcnt_df, most_common_words, most_common_words1 = most_common_words(data,'merchant_string1')

data_dummified = data.copy()

In [8]:
#list(pd.read_csv('most_common_words1.csv'))

In [9]:
# Build the keyword indicator columns from the most common cleaned merchant words
start_time = time.time()
dummify_data(data_dummified, most_common_words1)

# One-hot encode the mcc and network columns
data_dummified = pd.get_dummies(data_dummified, columns=['mcc', 'network'], prefix=['mcc', 'network'])

# Drop the response column and every column whose name starts with "merchant_string"
del_cols = ['mapped_brand_response'] + [
    col_name for col_name in data_dummified.columns
    if re.search('^merchant_string', col_name)
]
data_dummified = data_dummified.drop(del_cols, axis = 1)

print("--- %s seconds ---" % (time.time() - start_time))
# slow cell: earlier runs took about 7 mins (417 sec)

--- 256.6143112182617 seconds ---


In [10]:
#data_dummified.sample(20)

In [153]:
# One-column frame listing the feature column names of data_dummified, in order
data_cols = pd.DataFrame(list(data_dummified.columns))

In [154]:
# Save the column list to data_cols.csv in the working directory
data_cols.to_csv('data_cols.csv')

# Modelling Functions

In [11]:
# Train/Test/Holdout
def train_test_holdout(df, label_col='mapped_brand', test_size=0.2, random_state=42):
    """Split ``df`` into labelled train/holdout sets and an unlabelled test set.

    Rows where ``label_col`` is not missing form the full training set; rows
    where it is missing form the test set. The full training set is then split
    into train-without-holdout and holdout with ``train_test_split``.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature frame that also contains ``label_col``.
    label_col : str, default 'mapped_brand'
        Name of the target column.
    test_size : float, default 0.2
        Passed to ``train_test_split`` as the holdout share.
    random_state : int, default 42
        Passed to ``train_test_split`` as the seed.

    Returns
    -------
    tuple
        ``(X_train_full, y_train_full, X_test, y_test, X_train_wo_holdout,
        X_holdout, y_train_wo_holdout, y_holdout)``.
    """
    is_labelled = df[label_col].notna()
    train_full = df[is_labelled]
    test = df[~is_labelled]

    # Separate features and target for each set
    X_train_full = train_full.drop(label_col, axis=1)
    y_train_full = train_full[label_col]
    X_test = test.drop(label_col, axis=1)
    y_test = test[label_col]

    # Train/holdout split of the labelled rows
    X_train_wo_holdout, X_holdout, y_train_wo_holdout, y_holdout = train_test_split(
        X_train_full, y_train_full, test_size=test_size, random_state=random_state)

    return X_train_full, y_train_full, X_test, y_test, X_train_wo_holdout, X_holdout, y_train_wo_holdout, y_holdout

def multinomial_regression(X_train_wo_holdout, y_train_wo_holdout,X_holdout, y_holdout):
    """Fit a multinomial logistic regression and evaluate it on a second set.

    The model is fitted on ``X_train_wo_holdout`` / ``y_train_wo_holdout`` and
    then applied to ``X_holdout``.

    Returns
    -------
    tuple
        ``(model, class probability matrix, per-row maximum probability as a
        Series named 'max_prob', predicted labels, score on
        (X_holdout, y_holdout), predicted labels wrapped in a DataFrame)``.
    """
    multinomial = LogisticRegression(multi_class='multinomial', random_state = 42, solver='lbfgs', C=1e5, class_weight = 'balanced')
    multinomial.fit(X_train_wo_holdout, y_train_wo_holdout)

    # Class probabilities, and the highest probability in each row
    mapped_brand_proba = multinomial.predict_proba(X_holdout)
    probs = pd.DataFrame(mapped_brand_proba).max(axis = 1).rename('max_prob')

    # Hard predictions, as an array and as a one-column frame
    mapped_brand_predicted = multinomial.predict(X_holdout)
    mapped_brand_predicted1 = pd.DataFrame(mapped_brand_predicted)

    score = multinomial.score(X_holdout, y_holdout)

    return multinomial, mapped_brand_proba, probs, mapped_brand_predicted, score, mapped_brand_predicted1

def _build_prediction_output(df, mapped_brand_predicted1, probs, source_df=None):
    """Join predictions onto the raw rows and flag which ones match the known brand.

    Shared by ``multinomial_output_train`` and ``multinomial_output_test``.
    """
    if source_df is None:
        # Backward-compatible default: the notebook-level raw frame.
        source_df = data

    # Only the index of ``df`` is needed to align predictions with raw rows,
    # so the wide feature frame is not copied.
    predictions = pd.DataFrame(
        {'mapped_brand_predicted1': np.asarray(mapped_brand_predicted1).ravel(),
         'probs': np.asarray(probs).ravel()},
        index=df.index)

    output = source_df.join(predictions, how = 'inner')
    output['mcc'] = output['mcc'].fillna(-1)
    output['network'] = output['network'].fillna(-1)
    # Keep a response that is already filled in; otherwise use the model's prediction.
    output['mapped_brand_response'] = np.where(output['mapped_brand_response']=='', output['mapped_brand_predicted1'], output['mapped_brand_response'])
    #can add rule about replacing item in column when probability is above a certain threshold
    output = output.drop('mapped_brand_predicted1', axis=1)

    # Add flag on whether mapped brand and predicted mapped brand are same
    output['correct_flag'] = np.where(output['mapped_brand'] == output['mapped_brand_response'], 1, 0)
    return output


def multinomial_output_train(df, mapped_brand_predicted1, probs, source_df=None):
    """Build the scored output for a labelled set, plus accuracy summaries.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature frame the predictions were made on; only its index is used.
    mapped_brand_predicted1 : array-like
        Predicted labels, one per row of ``df``.
    probs : array-like
        Maximum predicted probability, one per row of ``df``.
    source_df : pandas.DataFrame, optional
        Raw frame to join predictions onto. Defaults to the notebook-level ``data``.

    Returns
    -------
    output : pandas.DataFrame
        Raw rows with ``probs``, the filled ``mapped_brand_response`` and ``correct_flag``.
    correct_overall : pandas.DataFrame
        ``sum`` / ``count`` / ``mean`` of ``correct_flag``.
    correct_by_brand : pandas.DataFrame
        The same statistics per ``mapped_brand``, most frequent brands first.
    """
    output = _build_prediction_output(df, mapped_brand_predicted1, probs, source_df)

    # Aggregate only the flag column; aggregating the whole frame also
    # processes the text and list columns.
    correct_overall = output[['correct_flag']].agg(['sum','count','mean'])

    correct_by_brand = output.groupby('mapped_brand')['correct_flag'].agg(['sum','count','mean']).reset_index()\
    .sort_values(by='count',ascending=False)\
    .rename(columns={'sum':'nbr_correct', 'count':'nbr_records', 'mean':'pct_correct'})

    return output, correct_overall, correct_by_brand

def multinomial_output_test(df, mapped_brand_predicted1, probs, source_df=None):
    """Build the scored output for the test set (no accuracy summaries).

    Takes the same parameters as ``multinomial_output_train`` and returns only
    the ``output`` frame.
    """
    return _build_prediction_output(df, mapped_brand_predicted1, probs, source_df)


# Regex pattern -> brand label. The keys are used as patterns with
# Series.str.contains, in insertion order, and a later match overwrites an
# earlier one. Keys containing a literal "*" are raw strings so that "\*" is a
# regex escape rather than an invalid Python string escape.
mapping_dict = {'vend': 'vendingmachine', #'usa': 'vendingmachine',
                'wal-mart': 'walmart', 'walmart': 'walmart', 'wm supercenter': 'walmart',
                'uber ': 'uber', # or ubereats
                'paypal': 'paypal',
                "mcdonald's": 'mcdonalds', 'mcdonalds': 'mcdonalds',
                'target t-': 'target', 'target.com': 'target',
                'cvs/pharm': 'cvs',
                'walgreens': 'walgreens',
                'starbucks': 'starbucks', 
                'chick-fil-a': 'chickfila',
                'gamestop': 'gamestop', 
                r'google \*': 'googleplay', # or google,
                'kroger': 'kroger',
                'chipotle': 'chipotle',
                r'apl\* itunes.com/bill': 'appleitunes', # needs a \ before *
                'dunkin': 'dunkindonuts',
                'amazon': 'amazon',
                'lyft': 'lyft',
                '7-eleven': 'seveneleven', '7 eleven': 'seveneleven',
                "victoria's secret": 'victoriassecret', 'victoriassecret.com':'victoriassecret',
                'etsy.com': 'etsy', 'etsy': 'etsy',
                'duane reade': 'duanereade',
                'taco bell': 'tacobell',
                'dollar-general': 'dollargeneral', 'dollar general': 'dollargeneral', 'dollar ge': 'dollargeneral',
                "wendy's": 'wendys', 'wendys': 'wendys',
                'amc ': 'amc',
                'safeway store': 'safeway', 'safeway': 'safeway',
                'panera bread': 'panerabread',
                'subway restaurant': 'subway',
                'sonic': 'sonic',
                'rite aid store': 'riteaidpharmacy',
                'chevron/': 'chevron',
                'forever 21': 'forever21',
                'dollar tr': 'dollartree',
                "claire's": 'claires',
                'dairy queen': 'dairyqueen',
                r"sq \*tomy's": 'tomys', # needs a \ before *
                'qt ': 'quiktrip',
                'microsoft ': 'microsoft',
                'ulta.com': 'ultabeauty', 'ulta #': 'ultabeauty',
                'playstation network': 'playstation',
                'barnes an': 'barnesandnoble', 'barnes & noble': 'barnesandnoble', 'barnesnob': 'barnesandnoble',
                'burger king': 'burgerking',
                r'riotgam\*': 'riotgames',
                'michaels stores': 'michaels',
                'sephora': 'sephora',
                'five guys': 'fiveguys', '5guys': 'fiveguys',
                'five below': 'fivebelow',
                'bath and body works': 'bathandbodyworks', 'bath & body works' : 'bathandbodyworks',
                'shake shack': 'shakeshack',
                'chopt': 'chopt',
                'urban-out': 'urbanoutfitters', 'urban out': 'urbanoutfitters',
                "domino's": 'dominos',
                'regal cinemas': 'regalcinemas', 'edwards':'regalcinemas',
                'circle k': 'circlek',
                'sweetgreen': 'sweetgreen',
                'wholefds': 'wholefoods',
                'coca cola': 'cocacola', 'coca-cola': 'cocacola',
                'nyctaxi': 'nyctaxi', 'nyc taxi': 'nyctaxi',
                'shell': 'shell',
                'pacsun': 'pacsun',
                'tjmaxx': 'tjmaxx', 't j maxx': 'tjmaxx', 'tj maxx': 'tjmaxx',
                'toys r us': 'toysrus',
                'lush us': 'lush', 'lush upper west': 'lush',
                'best buy': 'bestbuy',
                'steamgames.com': 'steam',
                'jamba juice': 'jambajuice',
                'jimmy johns': 'jimmyjohns'
               }
def mapping_dict_funct(df, col_origin, col_output, mapping_dict):
    """Label rows of ``df`` by matching regex patterns against a text column.

    For each ``pattern -> label`` pair in ``mapping_dict`` (in insertion
    order), rows whose ``col_origin`` text matches ``pattern`` get ``label``
    written to ``col_output``; a later match overwrites an earlier one. Rows
    with missing text are never matched. ``df`` is modified in place and
    nothing is returned.
    """
    origin_text = df[col_origin]
    for key, value in mapping_dict.items():
        # na=False: without it a missing string yields NaN, which np.where
        # treats as truthy, so the row would receive every label in turn.
        df[col_output] = np.where(origin_text.str.contains(key, na=False), value, df[col_output])

# Modelling on Train

In [12]:
# Exclude the rows flagged mcc_6011, then the rows flagged mcc_7523; both codes
# are already labelled by mcc_dict.
# NOTE(review): mcc_dict also maps '6010', which is not excluded here -- confirm intent.
data_dummified3 = data_dummified.loc[data_dummified['mcc_6011'] != 1]
data_dummified4 = data_dummified3.loc[data_dummified3['mcc_7523'] != 1]

In [13]:
%%time
# Test/Train/Holdout split, saving off each df returned
X_train_full, y_train_full, X_test, y_test, X_train_wo_holdout, X_holdout, y_train_wo_holdout, y_holdout = \
train_test_holdout(data_dummified4) 

# can add input parameter for 80/20 split
#takes about 14 seconds to run

CPU times: user 3.21 s, sys: 7.11 s, total: 10.3 s
Wall time: 12.9 s


In [14]:
%%time
multinomial_train, mapped_brand_proba_train, probs_train, mapped_brand_predicted_train, score_train, mapped_brand_predicted1_train = multinomial_regression(X_train_wo_holdout, y_train_wo_holdout, X_holdout, y_holdout)

CPU times: user 1min 25s, sys: 4.63 s, total: 1min 29s
Wall time: 46.4 s


In [15]:
score_train

0.9525831564048125

In [16]:
# Join the holdout predictions back onto the raw rows and compute overall and per-brand accuracy
output, correct_overall, correct_by_brand = multinomial_output_train(X_holdout, mapped_brand_predicted1_train, probs_train)

In [18]:
output.head(5)

Unnamed: 0,mapped_brand,mcc,merchant_string,network,mapped_brand_response,merchant_string1,probs,correct_flag
314731,starbucks,5814,starbucks store 17000,V,starbucks,[starbucks],1.0,1
314734,starbucks,5814,starbucks store 00885,V,starbucks,[starbucks],1.0,1
314737,starbucks,5814,starbucks store 47931,V,starbucks,[starbucks],1.0,1
314748,starbucks,5814,starbucks store 29856,V,starbucks,[starbucks],1.0,1
314750,starbucks,5814,starbucks store 21929,V,starbucks,[starbucks],1.0,1


In [18]:
# Share of correct rows among those whose mcc is neither '6011' nor '7523'
not_atm_or_parking = (output['mcc'] !='6011') & (output['mcc'] !='7523')
len(output[(output['correct_flag'] == 1) & not_atm_or_parking]) / len(output[not_atm_or_parking])

NameError: name 'output' is not defined

# Prepare Data for Testing

In [26]:
# Work on a copy so the raw frame stays untouched
testing_data_dummified = data.copy()
# new_merchant_col is not defined in this notebook (presumably it comes from clean.py -- TODO confirm);
# in the sample output below, merchant_string_dict equals the uncleaned merchant_string.
new_merchant_col(testing_data_dummified, 'merchant_string', 'merchant_string_dict') # so not cleaned
testing_data_dummified['mapped_brand_dict_3'] = ''
# Dictionary rules run first; the MCC rules run second and overwrite them where the MCC matches
mapping_dict_funct(testing_data_dummified, 'merchant_string_dict', 'mapped_brand_dict_3', mapping_dict)
mcc_dict_funct(testing_data_dummified, 'mcc', 'mapped_brand_dict_3', mcc_dict)
testing_data_dummified.sample(5)

Unnamed: 0,mapped_brand,mcc,merchant_string,network,mapped_brand_response,merchant_string1,merchant_string_dict,mapped_brand_dict_3
192326,,5941,paypal *unlimitedwa,V,,"[paypal, *unlimitedwa]",paypal *unlimitedwa,paypal
212291,,8661,sq *bigstuf ministries,V,,"[sq, *bigstuf, ministries]",sq *bigstuf ministries,
165249,,5812,ota-ya japanese restau,V,,"[ota, ya, japanese, restau]",ota-ya japanese restau,
234800,,5814,starbucks store 18624,V,,[starbucks],starbucks store 18624,starbucks
114366,,5309,duty free stores - kah,V,,"[duty, free, stores, kah]",duty free stores - kah,


In [28]:
# Add the keyword indicator columns (most common cleaned merchant words) to the testing frame
start_time = time.time()
dummify_data(testing_data_dummified, most_common_words1)  

# One-hot encode mcc and network; the merchant string columns are dropped in a later cell
testing_data_dummified=pd.get_dummies(testing_data_dummified, prefix=['mcc', 'network'], columns=['mcc', 'network'])

print("--- %s seconds ---" % (time.time() - start_time))
# slow cell: earlier runs took about 7 mins (417 sec)

--- 227.99500179290771 seconds ---


In [46]:
len(testing_data_dummified['mapped_brand'])

334836

In [47]:
# Use the known brand where there is one, otherwise the dictionary-derived brand.
# Vectorized: no per-row Python lookups, no assumption that the index is 0..n-1,
# and NaN is treated as missing in the same way as None.
known_brand = testing_data_dummified['mapped_brand']
testing_data_dummified['mapped_brand_response'] = known_brand.where(
    known_brand.notna(), testing_data_dummified['mapped_brand_dict_3'])

In [51]:
testing_data_dummified.sample(3)

Unnamed: 0,mapped_brand,merchant_string,mapped_brand_response,merchant_string1,merchant_string_dict,mapped_brand_dict_3,uber,\*trip,sq,paypal,...,mcc_8931,mcc_8999,mcc_9211,mcc_9222,mcc_9311,mcc_9399,mcc_9402,mcc_9405,network_D,network_V
218853,,sp dogwood skate sho,,"[sp, dogwood, skate, sho]",sp dogwood skate sho,,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
244656,,sq *heidi wiegert-duch,,"[sq, *heidi, wiegert, duch]",sq *heidi wiegert-duch,,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
52088,,checkers truck,,"[checkers, truck]",checkers truck,,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0


In [52]:
# Keep only the response and the feature columns
testing_data_dummified1 = testing_data_dummified.drop(
    columns=['mapped_brand', 'merchant_string','merchant_string1', 'merchant_string_dict', 'mapped_brand_dict_3'])

In [54]:
# Exclude the rows flagged mcc_6011, then the rows flagged mcc_7523
testing_data_dummified2 = testing_data_dummified1.loc[testing_data_dummified1['mcc_6011'] != 1]
testing_data_dummified3 = testing_data_dummified2.loc[testing_data_dummified2['mcc_7523'] != 1]

In [None]:
# Split on mapped_brand_response instead of mapped_brand.
# NOTE(review): `df` is not assigned anywhere in the visible notebook; it presumably
# should be testing_data_dummified3 -- confirm before running.
train_full = df[df.mapped_brand_response.notna()]
test = df[df.mapped_brand_response.isna()]

X_train_full1 = train_full.drop('mapped_brand_response', axis=1)
y_train_full1 = train_full['mapped_brand_response']
X_test1 = test.drop('mapped_brand_response', axis=1)
y_test1 = test['mapped_brand_response']

# Dedented: this statement was indented as if inside a function, so the cell did not parse.
# NOTE(review): this splits X_train_full / y_train_full (the mapped_brand split), not the
# *_full1 frames built above -- confirm which pair is intended.
X_train_wo_holdout, X_holdout, y_train_wo_holdout, y_holdout = train_test_split(
    X_train_full, y_train_full, test_size=0.2, random_state=42)

# Modelling on Test

In [16]:
%%time
#THIS TAKES ABOUT 8 MINS TO RUN
# y_test holds the rows whose mapped_brand is missing (see train_test_holdout), so after
# fillna every target is the placeholder "Unknown"; the returned `score` compares
# predictions against that placeholder and is not a real accuracy.
y_test1 = y_test.fillna("Unknown")
#testing model on test using full train set
multinomial, mapped_brand_proba, probs, mapped_brand_predicted, score, mapped_brand_predicted1 = multinomial_regression(X_train_full, y_train_full, X_test, y_test1)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)


CPU times: user 2min 50s, sys: 59.4 s, total: 3min 50s
Wall time: 2min 34s


In [20]:
multinomial.score(X_test, y_test)

0.0

In [55]:
%%time
# Join the test-set predictions back onto the raw rows; this rebinds `output`,
# replacing the holdout output built earlier.
output = multinomial_output_test(X_test, mapped_brand_predicted1, probs)

CPU times: user 2.02 s, sys: 5.13 s, total: 7.15 s
Wall time: 6.11 s


In [56]:
# new_merchant_col and lowercase_col are not defined in this notebook
# (presumably they come from clean.py -- TODO confirm).
new_merchant_col(output, 'merchant_string', 'merchant_string_dict') # so not cleaned
lowercase_col(output, 'merchant_string_dict') # dictionary below needs lowercase to work
output['mapped_brand_dict_3'] = ''

# Run dictionary functions on data (MCC rules run last and overwrite dictionary matches)
mapping_dict_funct(output, 'merchant_string_dict', 'mapped_brand_dict_3', mapping_dict)
mcc_dict_funct(output, 'mcc', 'mapped_brand_dict_3', mcc_dict)
# Create new column flags for predicted, correct, train/test, row count
# Can be used to summarize, if desired
# predicted_flag: 1 when a dictionary or MCC rule produced a brand
output['predicted_flag'] = np.where(output['mapped_brand_dict_3'] != '',1,0)
# 1 when the model-filled response equals the dictionary brand.
# The column name is spelled 'prediced'; code that reads it must use the same spelling.
output['equals_prediced_flag_1'] = np.where(output['mapped_brand_response'] == output['mapped_brand_dict_3'],1,0)

In [24]:
# Rows that are neither ATM (6011) nor parking (7523) and that a dictionary rule labelled.
# Later cells read test_model_output, so this assignment has to run.
test_model_output = output[(output['mcc'] !='6011') & (output['mcc'] !='7523') & (output['predicted_flag'] == 1)]

In [None]:
%%time
test_accuracy = output.mean()[['equals_prediced_flag_1']]
###TAKES 46 MINUTES TO RUN
####ACCURACY: 0.99127!!!!!!
#####Potentially faster ways to get accuracy
##1
#test_model_output['equals_prediced_flag_1'].mean()
##2
#incorrect_test_model_output = output[(output['mcc'] !='6011') & (output['mcc'] !='7523') & (output['predicted_flag'] != 1)]
#len(incorrect_test_model_output)/len(test_model_output)

In [147]:
test_accuracy

equals_prediced_flag_1    0.99127
dtype: float64

In [30]:
# Rows where the model response differs from the dictionary brand.
# The flag column is created as 'equals_prediced_flag_1' (that spelling), so it is read
# under the same name here. test_model_output is the filtered frame from the In [24] cell.
test_model_output[test_model_output['equals_prediced_flag_1'] == 0]
# ideas: remove ATM/parking, a number of best buy/burger king wrong, maybe remove edwards or vend from dictionary, maybe stopword restaurant

In [140]:
#output[(output['mcc'] !='6011') & (output['mcc'] !='7523') & (output['predicted_flag'] != 1)].sample(30)

In [141]:
from sklearn.metrics import confusion_matrix

In [None]:
output[(output['mcc'] !='6011') & (output['mcc'] !='7523') & (output['predicted_flag'] == 1)].sample(5)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
# Ten most frequent dictionary brands, computed here so the cell does not
# depend on cells further down the notebook.
top10 = list(test_model_output['mapped_brand_dict_3'].value_counts().head(10).index)
conf_mat_top10 = test_model_output[test_model_output['mapped_brand_dict_3'].isin(top10)]

# One label per class, in the order used for the matrix rows and columns.
# The original passed the per-row values as tick labels.
# NOTE(review): assumes both columns hold only strings; a NaN would break sorted().
labels = sorted(set(conf_mat_top10['mapped_brand_dict_3']) | set(conf_mat_top10['mapped_brand_response']))
conf_mat = confusion_matrix(conf_mat_top10['mapped_brand_dict_3'], conf_mat_top10['mapped_brand_response'],
                            labels=labels)
fig, ax = plt.subplots(figsize=(10,10))
sns.heatmap(conf_mat, annot=True, fmt='d',
            xticklabels=labels, yticklabels=labels, ax=ax)
# Rows are the dictionary brand, columns are the model-filled response
plt.ylabel('Actual')
plt.xlabel('Predicted')
plt.show()

In [207]:
# Rows whose dictionary brand is in top10; top10 is assigned in the In [199] cell
# further down, which must have run first.
conf_mat_top10 = test_model_output[test_model_output['mapped_brand_dict_3'].isin(top10)]

In [199]:
# Ten most frequent dictionary-derived brands, most frequent first
top10 = test_model_output['mapped_brand_dict_3'].value_counts().head(10).index.tolist()

['uber',
 'paypal',
 'mcdonalds',
 'starbucks',
 'dollargeneral',
 'walmart',
 'walgreens',
 'tacobell',
 'gamestop',
 'dunkindonuts']

In [206]:
pd.DataFrame(test_model_output[test_model_output['mapped_brand_dict_3'].isin(top10)]['mapped_brand_response'].unique())

Unnamed: 0,0
0,starbucks
1,tacobell
2,dunkindonuts
3,aramark
4,walmart
5,walgreens
6,dollargeneral
7,gamestop
8,googleplay
9,hobbylobby


# Pickling

In [133]:
import pickle
import sys
def save_as_pickled_object(obj, filepath, max_bytes=2**31 - 1):
    """Pickle ``obj`` and write it to ``filepath`` in chunks of at most ``max_bytes``.

    Parameters
    ----------
    obj : object
        Any picklable object.
    filepath : str
        Destination path; an existing file is overwritten.
    max_bytes : int, default 2**31 - 1
        Largest number of bytes passed to a single ``write`` call.
    """
    bytes_out = pickle.dumps(obj)
    # len() is the payload size. sys.getsizeof() reports the memory footprint
    # of the bytes object, which is not the number of bytes to write.
    n_bytes = len(bytes_out)
    with open(filepath, 'wb') as f_out:
        for idx in range(0, n_bytes, max_bytes):
            f_out.write(bytes_out[idx:idx+max_bytes])


# Save the model fitted on the full training set to final_ML_model.sav in the working directory
save_as_pickled_object(multinomial,'final_ML_model.sav')

import os
def try_to_load_as_pickled_object_or_None(filepath):
    """Read ``filepath`` in chunks and unpickle its contents.

    This is a defensive way to write pickle.load, allowing for very large
    files on all platforms: the file is read in pieces of at most 2**31 - 1
    bytes and the pieces are unpickled together.

    Despite the name, no exception is caught here; any error raised while
    reading or unpickling reaches the caller.

    NOTE: pickle can execute arbitrary code while loading -- only use this on
    files from a trusted source.
    """
    chunk_limit = 2**31 - 1
    total_size = os.path.getsize(filepath)

    buffer = bytearray(0)
    with open(filepath, 'rb') as f_in:
        offset = 0
        # One read per chunk_limit-sized slice of the file
        while offset < total_size:
            buffer += f_in.read(chunk_limit)
            offset += chunk_limit

    return pickle.loads(buffer)

# Reload the saved model so its predictions can be compared with the in-memory one
multinomial2 = try_to_load_as_pickled_object_or_None('final_ML_model.sav')

In [138]:
%%time
#len(multinomial2.predict(X_test))
#314152
#len(multinomial.predict(X_test))
#314152
sum(multinomial2.predict(X_test) != multinomial.predict(X_test))

CPU times: user 50.7 s, sys: 31.5 s, total: 1min 22s
Wall time: 1min 2s


0

In [None]:
#X_holdout1 = X_holdout.copy()
#X_holdout1['multinom_pickle'] = multinomial2.predict(X_holdout)
#X_holdout1['multinom'] = multinomial_train.predict(X_holdout)
#X_holdout1[X_holdout1['multinom_pickle'] != X_holdout1['multinom']]

In [157]:
# Scratch copy for testing how to re-add feature columns that are missing from a frame
data_dummified2 = data_dummified.copy()

In [159]:
# Remove two feature columns to simulate a frame that lacks them
data_dummified2 = data_dummified2.drop(columns=['network_V', 'mcc_5814'])

In [163]:
# Keep the first five rows as an independent copy, so the column assignments that
# follow write to this frame and do not raise SettingWithCopyWarning.
data_dummified2 = data_dummified2[0:5].copy()

In [169]:
# Columns of data_dummified that are missing from data_dummified2.
# The list comes from a set difference, so its order is not guaranteed.
new_cols = list(set(data_dummified.columns) - set(data_dummified2.columns))

In [171]:
# Add each missing column to data_dummified2, filled with 0
for i in new_cols:
    data_dummified2[i] = 0

In [172]:
list(set(data_dummified.columns) - set(data_dummified2.columns))

[]

In [173]:
data_dummified2[new_cols[0]]

0    0
1    0
2    0
3    0
4    0
Name: mcc_5814, dtype: int64

In [174]:
data_dummified2[new_cols[1]]

0    0
1    0
2    0
3    0
4    0
Name: network_V, dtype: int64