In [1]:
import pandas as pd
import pickle
from sklearn.feature_extraction.text import CountVectorizer
import re
import numpy as np

# Changelog
# ======================
## 2019-09-14 
### citi_to_stdcategory 
#### added 'lifestyle#entertainment' : {'std_category' : 'entertainment', 'apply_tax' : 'False'},
#### added 'shopping#fashion and accessories' : {'std_category' : 'shopping', 'apply_tax' : 'False'},

### hsbc_to_stdcategory
#### added 'korea': {'std_category': 'hotels, travel & travel accessories','apply_tax': 'False'},

### scb_to_stdcategory
#### added 'online#online' : {'std_category' : 'hotels, travel & travel accessories', 'apply_tax' : 'False'}

### dbs_to_stdcategory
#### added 'mobile & e-payments' : {'std_category' : 'lifestyle', 'apply_tax' : 'False'}

## 2019-09-30
### hsbc_to_stdcategory
#### added 'cny' : {'std_category' : 'drink & dine', 'apply_tax' : 'False'}


## 2019-10-01

### scb_to_stdcategory
#### added 'dining#western' : {'std_category' : 'drink & dine', 'apply_tax' : 'False'}




## 2019-10-07

### scb_to_stdcategory
#### added 'online#retail-online' : {'std_category' : 'shopping', 'apply_tax' : 'False'}

In [2]:
#####################################################################
# Process : std_category to relevant_google_type mapping
#####################################################################
# Maps each standard category name to a list of type strings
# (presumably Google Places types, going by the variable name; confirm
# against the consumer of the pickle).
# Notes:
#   * 'store' is listed under both the hotels/travel and shopping categories.
#   * 'online marketplace & electricity plans' has no associated types.
stdcategory_to_googletype = {
    'drink & dine': [
        'bakery', 'bar', 'cafe', 'liquor_store', 'meal_delivery',
        'meal_takeaway', 'restaurant', 'food',
    ],
    'hotels, travel & travel accessories': ['travel_agency', 'hotel', 'store'],
    'wellness, health & leisure': ['beauty_salon', 'gym', 'spa', 'hair_care'],
    'entertainment': [
        'casino', 'movie_rental', 'movie_theater', 'museum', 'night_club',
        'stadium', 'zoo',
    ],
    'lifestyle': [
        'electronics_store', 'furniture_store', 'home_goods_store', 'school',
    ],
    'auto & petrol': [
        'car_dealer', 'car_rental', 'car_repair', 'car_wash', 'gas_station',
    ],
    'shopping': [
        'clothing_store', 'convenience_store', 'department_store',
        'hardware_store', 'jewelry_store', 'shoe_store', 'shopping_mall',
        'store', 'supermarket', 'pet_store',
    ],
    'online marketplace & electricity plans': [],
}

# Persist the mapping. The context manager guarantees the file is flushed
# and closed instead of relying on garbage collection of an anonymous handle.
with open('stdcategory_to_googletype.pickle', 'wb') as pickle_file:
    pickle.dump(stdcategory_to_googletype, pickle_file)

# Keyword lists per standard category. Presumably these are matched against
# free text to infer a category (TODO confirm with the consumer of the pickle).
# Some entries carry deliberate-looking leading/trailing spaces ('carte ',
# ' school'); they are kept exactly as-is because the strings are data.
std_category_taxonomy = {
    'drink & dine': [
        'restaurant', 'cafe', 'food', 'snacks', 'drinks', 'beer', 'wine',
        'liquor', 'meal', 'buffet', '1-for-1', 'dine', 'dining', 'oneforone',
        'cuisine', 'starter', 'la carte', 'a la carte', 'carte ', 'burger',
        'curry', 'chicken', 'beef', 'mutton', 'lamb', 'fish', 'pasta', 'rice',
        'noodle', 'salad', 'steak', 'egg', 'vegetarian', 'soup', 'halal',
        'takeaway', 'palate', 'midautumn', 'mid-autumn', 'mid autumn',
        'mooncake', 'durian', 'lunch', 'dinner', 'mediterranean', 'italian',
        'confectionary', 'chocolate', 'ice-cream', 'brunch',
    ],
    'hotels, travel & travel accessories': [
        'airticket', 'taxi', 'train', 'cruise', 'travel', 'trek', 'hotel',
        'motel', 'room', 'luggage', 'travel accessories', 'wifi', 'air miles',
        'miles', 'holiday', 'staycation', 'tour', 'sightseeing',
        'boarding pass', 'air fares', 'destinations', 'ticketing', 'flight',
        'car rental', 'journey', 'economy fare', 'economy class',
        'business class', 'first class', 'dutyfree', 'duty free', 'emirates',
        'singapore airlines', 'cathay pacific', 'qantas', 'qatar air',
        'eva air', 'ana all nippon',
    ],
    'wellness, health & leisure': [
        'health', 'wellness', 'yoga', 'spa', 'beauty', 'salon', 'hair care',
        'gym',
    ],
    'entertainment': [
        'movie', 'play', 'theatre', 'museum', 'art gallery', 'casino',
        'night club', 'golf', 'tournament', 'zoo', 'safari', 'bird park',
        'games', 'sentosa', 'studios', 'sports',
    ],
    'lifestyle': [
        'furniture', 'electronics', 'learning', 'classes', ' school',
        'mobile & e-payments',
    ],
    'auto & petrol': [
        'automotive', 'petrol', 'fuel', 'esso', 'chevron', 'caltex', 'shell',
        'sinopec', 'petro', 'cng', 'spc',
    ],
    'shopping': ['installment', 'mall', 'receipts', 'stocks last'],
    'online marketplace & electricity plans': [
        'online', 'website', 'promo code', 'shipping fees',
        'electricity plan', 'electricity', 'electric',
    ],
}

# Persist the taxonomy. The context manager guarantees the file is flushed
# and closed instead of relying on garbage collection of an anonymous handle.
with open('std_category_taxonomy.pickle', 'wb') as pickle_file:
    pickle.dump(std_category_taxonomy, pickle_file)

#####################################################################
# Process : bank_category to std_category
#####################################################################
# Citi & SCB: the lookup key is 'category#subcategory'.
# All other banks: the lookup key is the category only.
# (The per-bank tables could be combined into one big dict, with the
# bankkey stored as an item as well.)
# bankkey = 0 means only the category is used as the key
# bankkey = 1 means category and subcategory are concatenated with '#'
#####################################################################

# NOTE: 'scb' used to appear twice in this literal (1, then 0). In a dict
# display the last duplicate wins, so SCB silently ended up as 0, which
# contradicts the rule above and the 'category#subcategory' keys of
# scb_to_stdcategory. The stray duplicate is assumed to have been meant for
# DBS, the only bank with a table but no bankkey entry.
bankkey = {'citi': 1, 'scb': 1, 'ocbc': 0, 'uob': 0, 'dbs': 0, 'hsbc': 0}


# Citi lookup table: 'category#subcategory' -> {'std_category', 'apply_tax'}.
# Every Citi row uses apply_tax 'False' (a string, not a bool), so only the
# (key, standard category) pairs are spelled out; each row gets its own dict.
citi_to_stdcategory = {
    citi_key: {'std_category': std_name, 'apply_tax': 'False'}
    for citi_key, std_name in (
        ('bars#bars', 'drink & dine'),
        ('bars#nan', 'drink & dine'),
        ('hotels#online', 'hotels, travel & travel accessories'),
        ('hotels#hotels', 'hotels, travel & travel accessories'),
        ('hotels#bars', 'hotels, travel & travel accessories'),
        ('travel#car rental', 'hotels, travel & travel accessories'),
        ('travel#bars', 'hotels, travel & travel accessories'),
        ('travel#airlines', 'hotels, travel & travel accessories'),
        ('travel#others', 'hotels, travel & travel accessories'),
        ('travel#online', 'hotels, travel & travel accessories'),
        ('travel#tour/sightseeing', 'hotels, travel & travel accessories'),
        ('lifestyle#beauty, health and spa', 'wellness, health & leisure'),
        ('lifestyle#online', 'lifestyle'),
        ('lifestyle#others', 'entertainment'),
        ('lifestyle#bars', 'entertainment'),
        ('lifestyle#golf', 'entertainment'),
        ('lifestyle#fitness', 'entertainment'),
        ('lifestyle#entertainment', 'entertainment'),
        ('shopping#online', 'shopping'),
        ('shopping#others', 'shopping'),
        ('shopping#bars', 'shopping'),
        ('shopping#fashion and accessories', 'shopping'),
        ('restaurants#italian', 'drink & dine'),
        ('restaurants#bars', 'drink & dine'),
        ('restaurants#online', 'drink & dine'),
        ('restaurants#french', 'drink & dine'),
        ('restaurants#others', 'drink & dine'),
        ('restaurants#cafe and delights', 'drink & dine'),
        ('restaurants#fine dining', 'drink & dine'),
        ('restaurants#japanese', 'drink & dine'),
        ('restaurants#western', 'drink & dine'),
        ('restaurants#asian', 'drink & dine'),
        ('restaurants#international', 'drink & dine'),
        ('restaurants#chinese', 'drink & dine'),
    )
}

# SCB lookup table: 'category#subcategory' -> {'std_category', 'apply_tax'}.
# apply_tax is stored as the string 'True'/'False', not a bool. Its exact
# meaning is not visible here; the only 'True' row is the uncategorised
# 'nan#nan' entry, whose std_category is ''.
scb_to_stdcategory = {
    'retail#other_retail': {'std_category': 'shopping', 'apply_tax': 'False'},
    'automotive#caltex': {'std_category': 'auto & petrol', 'apply_tax': 'False'},
    'automotive#nan': {'std_category': 'auto & petrol', 'apply_tax': 'False'},
    'retail#jewellery': {'std_category': 'shopping', 'apply_tax': 'False'},
    'retail#nan': {'std_category': 'shopping', 'apply_tax': 'False'},
    'nan#nan': {'std_category': '', 'apply_tax': 'True'},
    'dining#fastfood': {'std_category': 'drink & dine', 'apply_tax': 'False'},
    'online#apparel': {'std_category': 'shopping', 'apply_tax': 'False'},
    'dining#wine': {'std_category': 'drink & dine', 'apply_tax': 'False'},
    'dining#western': {'std_category': 'drink & dine', 'apply_tax': 'False'},
    'online#lifestyle': {'std_category': 'online marketplace & electricity plans', 'apply_tax': 'False'},
    'bars#confectionary': {'std_category': 'drink & dine', 'apply_tax': 'False'},
    'dining#confectionary': {'std_category': 'drink & dine', 'apply_tax': 'False'},
    'dining#italian': {'std_category': 'drink & dine', 'apply_tax': 'False'},
    'dining#japanese': {'std_category': 'drink & dine', 'apply_tax': 'False'},
    'dining#european': {'std_category': 'drink & dine', 'apply_tax': 'False'},
    'dining#asian': {'std_category': 'drink & dine', 'apply_tax': 'False'},
    'dining#chillout': {'std_category': 'drink & dine', 'apply_tax': 'False'},
    'restrelax#lifestyle-restrelax': {'std_category': 'entertainment', 'apply_tax': 'False'},
    'food_wine#dining': {'std_category': 'drink & dine', 'apply_tax': 'False'},
    'food_wine#nan': {'std_category': 'drink & dine', 'apply_tax': 'False'},
    'restrelax#healthfitness': {'std_category': 'wellness, health & leisure', 'apply_tax': 'False'},
    'restrelax#beauty': {'std_category': 'wellness, health & leisure', 'apply_tax': 'False'},
    'restrelax#hotelstay': {'std_category': 'hotels, travel & travel accessories', 'apply_tax': 'False'},
    'lifestyle#health_beauty_spa': {'std_category': 'wellness, health & leisure', 'apply_tax': 'False'},
    'lifestyle#nan': {'std_category': 'wellness, health & leisure', 'apply_tax': 'False'},
    'dining#nan': {'std_category': 'drink & dine', 'apply_tax': 'False'},
    'dining#diningathotels': {'std_category': 'drink & dine', 'apply_tax': 'False'},
    'automotive#travel-automotive': {'std_category': 'auto & petrol', 'apply_tax': 'False'},
    'retail#online': {'std_category': 'shopping', 'apply_tax': 'False'},
    'retail#lifestyle': {'std_category': 'shopping', 'apply_tax': 'False'},
    'restrelax#wellness': {'std_category': 'wellness, health & leisure', 'apply_tax': 'False'},
    'restrelax#nan': {'std_category': 'wellness, health & leisure', 'apply_tax': 'False'},
    'dining#mediterranean': {'std_category': 'drink & dine', 'apply_tax': 'False'},
    # Legacy key with a trailing space, kept so existing lookups are unaffected.
    'online#nan ': {'std_category': 'hotels, travel & travel accessories', 'apply_tax': 'False'},
    # Same mapping under the clean key: every other '...#nan' key in this
    # table has no trailing space, so a plain 'online#nan' lookup used to miss.
    'online#nan': {'std_category': 'hotels, travel & travel accessories', 'apply_tax': 'False'},
    'online#travel': {'std_category': 'hotels, travel & travel accessories', 'apply_tax': 'False'},
    'online#online': {'std_category': 'hotels, travel & travel accessories', 'apply_tax': 'False'},
    'online#retail-online': {'std_category': 'shopping', 'apply_tax': 'False'},
}

# DBS lookup table: category -> {'std_category', 'apply_tax'}.
# Rows are listed as (bank category, standard category, apply_tax) and
# expanded into one fresh dict per row. apply_tax is a string, not a bool;
# in this table it is 'True' exactly where std_category is ''.
dbs_to_stdcategory = {
    dbs_key: {'std_category': std_name, 'apply_tax': tax_flag}
    for dbs_key, std_name, tax_flag in (
        ('1-for-1', 'drink & dine', 'False'),
        ('dine', 'drink & dine', 'False'),
        ('service', 'wellness, health & leisure', 'False'),
        ('play', 'entertainment', 'False'),
        ('home & living', 'lifestyle', 'False'),
        ('shop', 'shopping', 'False'),
        ('travel', 'hotels, travel & travel accessories', 'False'),
        ('mid-autumn festival', '', 'True'),
        ('online deals', '', 'True'),
        ('overseas deals', '', 'True'),
        ('mobile & e-payments', 'lifestyle', 'False'),
    )
}


# OCBC lookup table: category -> {'std_category', 'apply_tax'}.
# Rows are listed as (bank category, standard category, apply_tax) and
# expanded into one fresh dict per row. apply_tax is a string, not a bool;
# in this table it is 'True' exactly where std_category is ''.
ocbc_to_stdcategory = {
    ocbc_key: {'std_category': std_name, 'apply_tax': tax_flag}
    for ocbc_key, std_name, tax_flag in (
        ('dining', 'drink & dine', 'False'),
        ('dining:1', 'drink & dine', 'False'),
        ('electronics', 'lifestyle', 'False'),
        ('fashion', 'shopping', 'False'),
        ('featured', '', 'True'),
        ('health', 'wellness, health & leisure', 'False'),
        ('home', 'lifestyle', 'False'),
        ('instalment', 'shopping', 'False'),
        ('leisure', 'entertainment', 'False'),
        ('midautumn', 'drink & dine', 'False'),
        ('online', '', 'True'),
        ('petrol', 'auto & petrol', 'False'),
        ('regional', '', 'True'),
        ('sportshub', 'entertainment', 'False'),
        ('travel', 'hotels, travel & travel accessories', 'False'),
        ('voyage', '', 'True'),
        ('nan', '', 'True'),
    )
}

# HSBC lookup table: category -> {'std_category', 'apply_tax'}.
# Rows are listed as (bank category, standard category, apply_tax) and
# expanded into one fresh dict per row. apply_tax is a string, not a bool.
# Two rows use 'True': 'entertainer-with-hsbc' (which still has a category)
# and the uncategorised 'nan' row.
hsbc_to_stdcategory = {
    hsbc_key: {'std_category': std_name, 'apply_tax': tax_flag}
    for hsbc_key, std_name, tax_flag in (
        ('1-dines-free', 'drink & dine', 'False'),
        ('best-of-asian-cuisine', 'drink & dine', 'False'),
        ('casual-dining', 'drink & dine', 'False'),
        ('dining', 'drink & dine', 'False'),
        ('gusto-italiano', 'drink & dine', 'False'),
        ('high-tea', 'drink & dine', 'False'),
        ('lunch-brunch', 'drink & dine', 'False'),
        ('midautumn', 'drink & dine', 'False'),
        ('national-day', 'drink & dine', 'False'),
        ('hotel-dining', 'drink & dine', 'False'),
        ('entertainer-with-hsbc', 'online marketplace & electricity plans', 'True'),
        ('entertainer-with-hsbc-hong-kong', 'online marketplace & electricity plans', 'False'),
        ('entertainer-with-hsbc-london', 'online marketplace & electricity plans', 'False'),
        ('entertainer-with-hsbc-malaysia', 'online marketplace & electricity plans', 'False'),
        ('entertainer-with-hsbc-uae', 'online marketplace & electricity plans', 'False'),
        ('homenfurnishing', 'lifestyle', 'False'),
        ('kid-family', 'lifestyle', 'False'),
        ('wellness', 'wellness, health & leisure', 'False'),
        ('leisure', 'entertainment', 'False'),
        ('retail', 'shopping', 'False'),
        ('shell', 'auto & petrol', 'False'),
        ('caltex', 'auto & petrol', 'False'),
        ('hotels', 'hotels, travel & travel accessories', 'False'),
        ('flight', 'hotels, travel & travel accessories', 'False'),
        ('cruise', 'hotels, travel & travel accessories', 'False'),
        ('staycations', 'hotels, travel & travel accessories', 'False'),
        ('travel', 'hotels, travel & travel accessories', 'False'),
        ('travel-agents', 'hotels, travel & travel accessories', 'False'),
        ('travel-essentials', 'hotels, travel & travel accessories', 'False'),
        ('korea', 'hotels, travel & travel accessories', 'False'),
        ('resorts-world-sentosa', 'hotels, travel & travel accessories', 'False'),
        ('nan', '', 'True'),
        ('cny', 'drink & dine', 'False'),
    )
}

# UOB lookup table: category -> {'std_category', 'apply_tax'}.
# Rows are listed as (bank category, standard category, apply_tax) and
# expanded into one fresh dict per row. apply_tax is a string, not a bool.
# 'home-and-living' appears both with and without a trailing space; both
# keys are part of the data and are reproduced exactly.
uob_to_stdcategory = {
    uob_key: {'std_category': std_name, 'apply_tax': tax_flag}
    for uob_key, std_name, tax_flag in (
        ('dining', 'drink & dine', 'False'),
        ('fashion-and-beauty', 'shopping', 'False'),
        ('lb-fashion-and-beauty', 'shopping', 'False'),
        ('gd-fashion-and-beauty', 'shopping', 'False'),
        ('gd-home-and-living', 'lifestyle', 'False'),
        ('gd-lifestyle-and-family', 'wellness, health & leisure', 'True'),
        ('gd-marketplaces-and-services', 'online marketplace & electricity plans', 'False'),
        ('lb-marketplaces-and-services', 'online marketplace & electricity plans', 'False'),
        ('home-and-living', 'lifestyle', 'False'),
        ('home-and-living ', 'lifestyle', 'False'),
        ('lb-home-and-living', 'lifestyle', 'False'),
        ('lifestyle-and-family', 'wellness, health & leisure', 'False'),
        ('marketplaces-and-services', 'online marketplace & electricity plans', 'False'),
    )
}


In [3]:
# Bundle the per-bank category tables under their bank code so a single
# pickle carries all of them.
cat_to_stdcat = {
    'dbs': dbs_to_stdcategory,
    'citi': citi_to_stdcategory,
    'scb': scb_to_stdcategory,
    'uob': uob_to_stdcategory,
    'hsbc': hsbc_to_stdcategory,
    'ocbc': ocbc_to_stdcategory,
}

# Persist the bundle. The context manager guarantees the file is flushed
# and closed instead of relying on garbage collection of an anonymous handle.
with open('cat_to_stdcat.pickle', 'wb') as pickle_file:
    pickle.dump(cat_to_stdcat, pickle_file)