In [1]:
import requests
from datetime import datetime, timedelta
import math
import time

In [4]:
def get_date_time_hour(hour_delta):
    """
    returns the current local datetime truncated to the start of its hour and
    shifted by hour_delta hours

    hour_delta: number of hours to add; 0 gives the start of the current hour,
        negative values move into the past
    returns: naive datetime object (built from datetime.now())
    """
    start_of_current_hour = datetime.now().replace(minute = 0, second = 0, microsecond = 0)
    return start_of_current_hour + timedelta(hours = hour_delta)

In [17]:
def getModifiedUrl(url, from_param=None, size_param=None, startEpoch=None, endEpoch=None):
    """
    returns url with optional paging and time-window query parameters appended

    url: base url, with or without an existing query string
    from_param, size_param: appended as from=..&size=.. only when BOTH are given
    startEpoch, endEpoch: appended as sDateEpoch=..&eDateEpoch=.. only when BOTH are given
    returns: the extended url, or url unchanged when no complete pair was supplied
    """
    extra_params = []
    # `is not None` rather than truthiness: 0 is a legitimate offset / epoch.
    if from_param is not None and size_param is not None:
        extra_params.append("from=" + str(from_param))
        extra_params.append("size=" + str(size_param))
    if startEpoch is not None and endEpoch is not None:
        extra_params.append("sDateEpoch=" + str(startEpoch))
        extra_params.append("eDateEpoch=" + str(endEpoch))

    if not extra_params:
        return url

    # A url without any query string needs "?" before its first parameter.
    separator = "&" if "?" in url else "?"
    return url + separator + "&".join(extra_params)

In [42]:
def _fetch_elastic_json(request_url, timeout):
    """GET request_url and return the decoded JSON body; raises on timeout or HTTP error status."""
    response = requests.get(request_url, timeout = timeout)
    response.raise_for_status()
    return response.json()


def process_paginated_comments_from_elastic(url, size_param, force = False, to_process_till = None, request_timeout = 30):
    """
    Pages through the comment search results behind url and passes each page,
    as a list of (c_id, C_T) tuples, to process_list_cmts_elastic.

    A first request is made only to read hits.total; the pages are then
    requested again starting from offset 0.

    url: base query url; paging parameters are appended with getModifiedUrl
    size_param: number of comments requested per page, must be positive
    force: when more than ELASTIC_LIMIT comments would be processed, False aborts
        without processing anything, True processes only the first ELASTIC_LIMIT
    to_process_till: optional upper bound on the number of comments to process
    request_timeout: seconds to wait on each HTTP request
    returns: None
    raises: ValueError when size_param is not positive; requests exceptions on
        timeout or HTTP error status
    """
    if size_param <= 0:
        raise ValueError("size_param must be positive, got {}".format(size_param))

    # NOTE(review): presumably mirrors Elasticsearch's default result window
    # (from + size <= 10000) - confirm against the backing index settings.
    ELASTIC_LIMIT = 10000

    probe_url = getModifiedUrl(url, from_param = 0, size_param = size_param)
    total_comment_count = _fetch_elastic_json(probe_url, request_timeout)['hits']['total']

    if to_process_till is not None and to_process_till < total_comment_count:
        total_comment_count = to_process_till

    if total_comment_count > ELASTIC_LIMIT:
        print("total_comment_count greater than {}: {}".format(ELASTIC_LIMIT, total_comment_count))
        if not force:
            print("aborting")
            return
        total_comment_count = ELASTIC_LIMIT

    num_iters = math.ceil(total_comment_count/size_param)

    print("total_count : {}".format(total_comment_count))
    print("num_iters : {}".format(num_iters))

    for iter_val in range(num_iters):
        print("iteration : %d" %(iter_val))
        from_val = iter_val * size_param
        # Shrink the final page so the run never goes past total_comment_count
        # (which already honours to_process_till and ELASTIC_LIMIT).
        page_size = min(size_param, total_comment_count - from_val)
        final_url = getModifiedUrl(url, from_param = from_val, size_param = page_size)
        json_raw_response = _fetch_elastic_json(final_url, request_timeout)
        list_comments = [(x['_source']['c_id'], x['_source']['C_T']) for x in json_raw_response['hits']['hits']]
        process_list_cmts_elastic(list_comments)

In [37]:
def process_list_cmts_elastic(list_comments):
    """
    Runs process_per_comment on every comment and prints the outcome.

    For each comment the id and the check result are printed; the comment text
    is printed as well when the check result is truthy. The module-level
    global_counter is incremented once per comment.

    list_comments: iterable of (comment id, comment text) tuples
    returns: None
    """
    global global_counter
    for c_id, c_t in list_comments:
        # Fall back to 0 when no cell has defined global_counter yet, so the
        # function also works on a freshly started kernel.
        global_counter = globals().get("global_counter", 0) + 1
        process_res = process_per_comment(c_t)

        print("comment id {} : {}".format(c_id, process_res))
        if process_res:
            print("comment text : {}".format(c_t))

In [1]:
import re
def process_per_comment(c_text):
    """
    Tells whether c_text, once its space characters are removed, contains a
    run of nine or more consecutive digits.

    returns: True when such a run exists, False otherwise
    """
    text_without_spaces = re.sub(" ", "", c_text)
    return re.search('[0-9]{9,}', text_without_spaces) is not None

In [40]:
def get_date_time_from_epoch(time_epoch):
    """
    Formats an epoch timestamp given in milliseconds as a
    'YYYY-MM-DD HH:MM:SS' string in local time.
    """
    epoch_seconds = time_epoch/1000
    local_time_struct = time.localtime(epoch_seconds)
    return time.strftime('%Y-%m-%d %H:%M:%S', local_time_struct)

In [39]:
# Driver cell: scan the APPROVED comments of the most recently completed clock hour.
batch_write_size = 250  # passed as size_param, i.e. comments requested per page
url = "http://commentmoderator.indiatimes.com/mytimes/elasticCommentQuery?sort=desc&filterCommentStatus=APPROVED&appKey=ET"
# Read the clock once and derive the start from the end. Two separate
# get_date_time_hour calls could straddle an hour rollover and silently widen
# the window to two hours.
end_of_window = get_date_time_hour(0)
start_date_time = int((end_of_window - timedelta(hours = 1)).timestamp()*1000)
end_date_time = int(end_of_window.timestamp()*1000)
print("duration : {} - {}".format(get_date_time_from_epoch(start_date_time), get_date_time_from_epoch(end_date_time)))
url = getModifiedUrl(url, startEpoch = start_date_time, endEpoch = end_date_time)
process_paginated_comments_from_elastic(url, size_param = batch_write_size)

duration : 2019-06-21 11:00:00 - 2019-06-21 12:00:00
total_count : 43
num_iters : 1
iteration : 0
comment id 2506275397 : False
comment id 2506275418 : False
comment id 2506275242 : False
comment id 2506275268 : False
comment id 2506275148 : False
comment id 2506275041 : False
comment id 2506275007 : False
comment id 2506275010 : False
comment id 2506274902 : False
comment id 2506274892 : False
comment id 2506274625 : False
comment id 2506274213 : False
comment id 2506273945 : False
comment id 2506273869 : False
comment id 2506273487 : False
comment id 2506273402 : False
comment id 2506273416 : False
comment id 2506272402 : False
comment id 2506272278 : False
comment id 2506272027 : False
comment id 2506271926 : False
comment id 2506271872 : False
comment id 2506271707 : False
comment id 2506271654 : False
comment id 2506271262 : False
comment id 2506271218 : False
comment id 2506271209 : False
comment id 2506271061 : False
comment id 2506270978 : False
comment id 2506270900 : False
co

In [21]:
# base_url = "http://commentmoderator.indiatimes.com/mytimes/elasticCommentQuery?appKey={}&sort=desc&from=0&size=0&aggField=F_ADD&aggSize=150&aggMinDocCount={}&sDateEpoch={:.0f}&eDateEpoch={:.0f}"

In [68]:
import re
def process_per_comment_v2(c_text):
    """
    Tells whether c_text, once its space characters are removed, contains a
    run of nine or more consecutive digits whose first digit is 6, 7, 8, 9 or 0.

    returns: True when at least one such run exists, False otherwise
    """
    flagged_first_digits = '6,7,8,9,0'.split(",")
    digit_runs = re.findall('[0-9]{9,}', re.sub(" ", "", c_text))
    return any(run[0] in flagged_first_digits for run in digit_runs)

In [70]:
# Demo input for process_per_comment_v2: once spaces are removed, the first digit
# run starts with '2' (not flagged) and the second starts with '8' (flagged),
# so the call below evaluates to True.
c_text = "phone par customer service 287628 034786867886769798796856 centre support number 8287628034786867886769798796856====//"

process_per_comment_v2(c_text)

True