In [46]:
from pymongo import MongoClient
import os
import bson

def mongoCollection(connstr, db, collection):
    """Return a handle to one collection of a MongoDB database.

    Args:
        connstr: Connection string handed to ``MongoClient``.
        db: Name of the database to select on the client.
        collection: Name of the collection to select in that database.

    Returns:
        The object obtained by indexing the client with ``db`` and the
        result of that with ``collection``.
    """
    return MongoClient(connstr)[db][collection]

import re
import datetime

# Shared handle used by every cell below: database 'smsinfo', collection
# 'transactions', connection string read from the environment.
collection = mongoCollection(
    os.environ.get('MONGODB_CONN_STR'),
    'smsinfo',
    'transactions',
)

from IPython.display import HTML, display
import tabulate

def extract_transactions(i):
    """Flatten one enumerated document into a row for the transactions table.

    Args:
        i: An ``(index, document)`` pair; only the document at position 1 is
            read. It must contain a ``'transaction'`` mapping and an ``'_id'``.

    Returns:
        A list with the transaction's expense_amount, payment_mode, merchant,
        datetime, category and sub_category (``None`` for any that are
        absent), followed by ``str()`` of the document's ``'_id'``.
    """
    document = i[1]
    transaction = document['transaction']
    row = []
    for field in ('expense_amount', 'payment_mode', 'merchant',
                  'datetime', 'category', 'sub_category'):
        row.append(transaction.get(field))
    row.append(str(document['_id']))
    return row

def mark_duplicate(collection, _id):
    """Flag a single document as a duplicate.

    Sets the document's ``status`` field to ``{'duplicate': True}`` (the whole
    ``status`` value is replaced) and prints ``success`` when exactly one
    document was modified, ``failed`` otherwise.

    Args:
        collection: Object exposing ``update_one(filter, update)``.
        _id: Value passed to ``bson.ObjectId`` to build the ``_id`` filter.

    Returns:
        Whatever ``collection.update_one`` returned.
    """
    outcome = collection.update_one(
        {'_id': bson.ObjectId(_id)},
        {'$set': {'status': {'duplicate': True}}},
    )
    if outcome.modified_count == 1:
        print('success')
    else:
        print('failed')
    return outcome

def add_category_info(collection, _id, category='', sub_category=''):
    """Set the category fields on a single transaction document.

    Writes ``transaction.category`` and ``transaction.sub_category`` on the
    document selected by ``_id`` and prints ``success`` when exactly one
    document was modified, ``failed`` otherwise.

    Args:
        collection: Object exposing ``update_one(filter, update)``.
        _id: Value passed to ``bson.ObjectId`` to build the ``_id`` filter.
        category: Value stored in ``transaction.category``.
        sub_category: Value stored in ``transaction.sub_category``.

    Returns:
        Whatever ``collection.update_one`` returned.
    """
    outcome = collection.update_one(
        {'_id': bson.ObjectId(_id)},
        {'$set': {
            'transaction.category': category,
            'transaction.sub_category': sub_category,
        }},
    )
    if outcome.modified_count == 1:
        print('success')
    else:
        print('failed')
    return outcome

def add_category_info_for_merchant(collection, merchant, category='', sub_category=''):
    """Set the category fields on a merchant's transactions that have none yet.

    Selects documents whose ``transaction.merchant`` equals ``merchant`` and
    whose ``transaction.category`` matches ``None``, writes the given category
    and sub-category on them, and prints ``<merchant> modified <count>``.

    Args:
        collection: Object exposing ``update_many(filter, update)``.
        merchant: Merchant string to match; must be a ``str`` because it is
            concatenated into the printed summary.
        category: Value stored in ``transaction.category``.
        sub_category: Value stored in ``transaction.sub_category``.

    Returns:
        Whatever ``collection.update_many`` returned.
    """
    selector = {'transaction.merchant': merchant, 'transaction.category': None}
    assignment = {'$set': {
        'transaction.category': category,
        'transaction.sub_category': sub_category,
    }}
    outcome = collection.update_many(selector, assignment)
    summary = merchant + ' modified ' + str(outcome.modified_count)
    print(summary)
    return outcome

In [66]:
# Calendar month to report on.
year = 2020
month = 2

# Year/month of the first day after the reporting month; December rolls over
# into January of the following year.
if month == 12:
    next_month = 1
    next_year = year + 1
else:
    next_month = month + 1
    next_year = year

# Fetch (index, document) pairs filtered on status and on transaction.datetime
# in [first day of month, first day of next month); only the 'transaction'
# field is projected.
monthly = list(enumerate(collection.find(
    {
        'status': {'analysis_done': True},
        'transaction.datetime': {
            '$gte': datetime.datetime(year, month, 1),
            '$lt': datetime.datetime(next_year, next_month, 1),
        },
    },
    {'transaction': 1},
)))

# One table row per fetched document.
monthly_transactions = [extract_transactions(entry) for entry in monthly]

In [67]:
# Render the month's rows as an HTML table in the cell output.
table_html = tabulate.tabulate(monthly_transactions, tablefmt='html')
display(HTML(table_html))

0,1,2,3,4,5,6
970.0,XX983,VPS*PAY Sreen.,2020-02-01 00:00:00,meet-up,alcohol,5e3a783185ce1ed04ce1b6b9
120.0,XX983,VPS*DROMI .,2020-02-02 00:00:00,meet-up,food,5e3a783185ce1ed04ce1b6bd
500.0,CREDIT Card xx3690,PAYTM,2020-02-02 20:44:58,,,5e3a783285ce1ed04ce1b6c3
465.5,CREDIT Card xx3690,FRESHTOH3631510,2020-02-02 20:42:47,grocery,non-veg,5e3a783285ce1ed04ce1b6c4
465.5,CREDIT Card xx3690,FRESHTOH3631510 in BANGALORE,2020-02-02 20:42:48,grocery,non-veg,5e3a783285ce1ed04ce1b6c5
400.0,CREDIT Card xx3690,PAYTM3852398,2020-02-02 20:49:59,,,5e3a783285ce1ed04ce1b6c6
268.0,XX983,VPS*HORTICULT.,2020-02-03 00:00:00,grocery,vegetables,5e3a783385ce1ed04ce1b6c7
199.0,Debit Card xx1827,NETFLIX ENTERTAINMENT,2020-02-03 19:28:14,subscription,entertainment,5e3a783385ce1ed04ce1b6ca
500.0,CREDIT Card xx3690,PAYTM3852398,2020-02-03 23:14:18,,,5e3a783385ce1ed04ce1b6cb
60.0,XXX983,bharatpe90200570491@yesbankltd,2020-02-04 00:00:00,meet-up,breakout,5e3a783385ce1ed04ce1b6cc


# Feb 2020: `message.date` > 602238600089916032

`add_category_info(collection, '5e497fb75bc90c80c1c82a81', category='purchase', sub_category='toys')`

In [65]:
# add_category_info(collection, '5e3a783185ce1ed04ce1b6b9', category='meet-up', sub_category='alcohol')


# Lookup table: each entry lists the merchant strings that share one
# category / sub_category pair. The loop after this list passes every merchant
# string, one at a time, to add_category_info_for_merchant, which compares it
# for equality against 'transaction.merchant' — so near-duplicates such as
# 'Uber' / 'UBER' are listed separately.
# NOTE(review): a few names contain '¡' where sibling entries have '@';
# presumably that is how the merchant string is stored — confirm before
# normalising any of these strings.
merchant_category_mapping = [
    {'merchant': ['Uber', 'UBER'], 'category': 'taxi', 'sub_category': 'uber'},
    {'merchant': ['SWIGGY', 'zomato@hdfcbank', 'ZOMATO11120', 'Eat Fit', 'www.zomato.com', 'SWIGGYXL3549786', 'Paratha Corner', 'bharatpe.9040576993@icici'], 'category': 'food', 'sub_category': 'delivery'},
    {'merchant': ['..MATTO COFFEA_'], 'category': 'meet-up', 'sub_category': 'food'},
    {'merchant': ['bharatpe90200570491@yesbankltd', 'q22904860@ybl', 'bharatpe90200570491¡yesbankltd'], 'category': 'meet-up', 'sub_category': 'breakout'},
    {'merchant': ['NEW FRUIT LAND', 'NEW FRUITLAND'], 'category': 'grocery', 'sub_category': 'vegetables'},
    {'merchant': ['www.bigbasket.', 'M K RETAIL'], 'category': 'grocery', 'sub_category': 'all'},
    {'merchant': ['VIN*APPLE COM.', 'VSI*APPLE COM.'], 'category': 'subscription', 'sub_category': 'apps'},
    {'merchant': ['VPS*HORTICULT.'], 'category': 'grocery', 'sub_category': 'vegetables'},
    {'merchant': ['FRESHTOH3631510', 'FRESHTOH3631510 in BANGALORE', 'Freshtohome'], 'category': 'grocery', 'sub_category': 'non-veg'},
    {'merchant': ['AJIO', 'MYNTRA72883'], 'category': 'purchase', 'sub_category': 'cloths'},
    {'merchant': ['IIN*Amazon .'], 'category': 'purchase', 'sub_category': 'household'},
    {'merchant': ['amazonsellerservices.98397377@hdfcbank'], 'category': 'purchase', 'sub_category': 'toiletry'},
    {'merchant': ['Solanki medicals'], 'category': 'medical', 'sub_category': 'medicine'},
    {'merchant': ['Bharti Airtel Limited'], 'category': 'utility', 'sub_category': 'mobile'},
    {'merchant': ['Instapay BBPS'], 'category': 'utility', 'sub_category': 'bescom'},
    {'merchant': ['LIC'], 'category': 'insurance', 'sub_category': 'insurance'},
    {'merchant': ['..RAJDHANI PHOENIX_', '..ARENA_', 'bharatpe09600003315¡yesbankltd'], 'category': 'food', 'sub_category': 'eating-out'},
    {'merchant': ['The lassi club'], 'category': 'food', 'sub_category': 'snacks'},
    {'merchant': ['H M LEISURE'], 'category': 'purchase', 'sub_category': 'toys'},
    {'merchant': ['PEPPERFRY64213'], 'category': 'purchase', 'sub_category': 'furniture'},
    {'merchant': ['HPCL HINDUSTAN PETROLE', 'HINDUSTAN PETROLEUM CO'], 'category': 'auto', 'sub_category': 'fuel'},
    {'merchant': ['vijualoor@okhdfcbank', 'cru5ty.d3m0nx-2@okhdfcbank'], 'category': 'others', 'sub_category': 'others'},
]

# bharatpe09600003315¡yesbankltd => Empire hotel

# Apply every mapping entry: one bulk update per listed merchant string.
for _entry in merchant_category_mapping:
    _category = _entry['category']
    _sub_category = _entry['sub_category']
    for _name in _entry['merchant']:
        add_category_info_for_merchant(
            collection,
            _name,
            category=_category,
            sub_category=_sub_category,
        )


failed
failed
failed
failed
failed
failed
failed
success
Uber modified 0
UBER modified 0
SWIGGY modified 0
zomato@hdfcbank modified 0
ZOMATO11120 modified 0
Eat Fit modified 0
www.zomato.com modified 0
SWIGGYXL3549786 modified 0
Paratha Corner modified 0
bharatpe.9040576993@icici modified 0
..MATTO COFFEA_ modified 0
bharatpe90200570491@yesbankltd modified 0
q22904860@ybl modified 0
bharatpe90200570491¡yesbankltd modified 0
NEW FRUIT LAND modified 0
NEW FRUITLAND modified 0
www.bigbasket. modified 0
M K RETAIL modified 0
VIN*APPLE COM. modified 0
VSI*APPLE COM. modified 0
VPS*HORTICULT. modified 0
FRESHTOH3631510 modified 0
FRESHTOH3631510 in BANGALORE modified 0
Freshtohome modified 0
AJIO modified 0
MYNTRA72883 modified 0
IIN*Amazon . modified 0
amazonsellerservices.98397377@hdfcbank modified 0
Solanki medicals modified 0
Bharti Airtel Limited modified 0
Instapay BBPS modified 0
LIC modified 0
..RAJDHANI PHOENIX_ modified 0
..ARENA_ modified 0
bharatpe09600003315¡yesbankltd modified 

In [82]:
# Remove OTP

def ignore_otp(collection, gt=None, lte=None):
    """Mark messages whose text contains "OTP" as ignored.

    Sets ``status.ignore`` to ``True`` on every document whose
    ``message.text`` matches the regex ``OTP`` and prints
    ``otp ignored <count>``.

    Previously ``gt`` and ``lte`` were accepted but never used, so the update
    always touched every OTP message. They now restrict the update to
    documents whose ``message.date`` lies in ``(gt, lte]``.

    Args:
        collection: Object exposing ``update_many(filter, update)``.
        gt: Exclusive lower bound on ``message.date``; ``None`` (the default)
            means no lower bound.
        lte: Inclusive upper bound on ``message.date``; ``None`` (the default)
            means no upper bound.

    Returns:
        Whatever ``collection.update_many`` returned.
    """
    selector = {"message.text": {'$regex': u"OTP"}}

    # Only add the date clause when at least one bound was supplied, so a call
    # without bounds keeps updating every OTP message as it did before.
    date_bounds = {}
    if gt is not None:
        date_bounds['$gt'] = gt
    if lte is not None:
        date_bounds['$lte'] = lte
    if date_bounds:
        selector['message.date'] = date_bounds

    result = collection.update_many(
        selector,
        {'$set' : {'status.ignore': True}})
    print('otp ignored ' + str(result.modified_count))
    return result

In [83]:
# Bounds handed to ignore_otp.
# 602238600089916032
gt, lte = 0, 604643260481582080
result = ignore_otp(collection, gt=gt, lte=lte)

otp ignored 303


In [86]:
def extract_documents(i):
    """Flatten one enumerated document into a row for the messages table.

    The earlier version wrapped ``message.values()`` in a one-element list, so
    each row was ``[id, dict_values(...)]`` rather than one column per message
    field. The values are now spread into the row.

    Args:
        i: An ``(index, document)`` pair; only the document at position 1 is
            read. It must contain an ``'_id'`` and a ``'message'`` mapping.

    Returns:
        A list starting with ``str()`` of the document's ``'_id'`` followed by
        the values of its ``'message'`` mapping, in the mapping's own order.
    """
    document = i[1]
    return [str(document['_id'])] + list(document['message'].values())
    
# Fetch (index, document) pairs whose message.date lies in the window below.
monthly = list(enumerate(collection.find(
    {"message.date": {'$gt': 602238600089916032, '$lte': 604643260481582080}}
)))

# One table row per fetched document, rendered as HTML in the cell output.
monthly_messages = [extract_documents(entry) for entry in monthly]
display(HTML(tabulate.tabulate(monthly_messages, tablefmt='html')))

KeyError: 'transaction'