In [3]:
import requests
from bs4 import BeautifulSoup
from bs4.element import Tag
import json
import re
import os
from deco import *
from __future__ import print_function
import pandas as pd
import numpy as np

In [2]:
import time
from functools import wraps


def retry(ExceptionToCheck, tries=4, delay=3, backoff=2, logger=None):
    """Build a decorator that re-invokes a function when it raises.

    Adapted from
    http://www.saltycrane.com/blog/2009/11/trying-out-retry-decorator-python/
    (original: http://wiki.python.org/moin/PythonDecoratorLibrary#Retry).

    :param ExceptionToCheck: exception class (or tuple of classes) that
        triggers another attempt; any other exception propagates at once
    :type ExceptionToCheck: Exception or tuple
    :param tries: total number of calls made before giving up (attempts,
        not re-attempts)
    :type tries: int
    :param delay: seconds slept after the first failure
    :type delay: int
    :param backoff: factor the sleep time is multiplied by after each failure
    :type backoff: int
    :param logger: if truthy, messages go to ``logger.warning``; otherwise
        they are printed
    :type logger: logging.Logger instance
    """
    def decorate(f):

        @wraps(f)
        def attempt(*args, **kwargs):
            attempts_left = tries
            pause = delay
            # All attempts except the last one are guarded.
            while attempts_left > 1:
                try:
                    outcome = f(*args, **kwargs)
                except ExceptionToCheck as err:
                    notice = "%s, Retrying in %d seconds..." % (str(err), pause)
                    if not logger:
                        print(notice)
                    else:
                        logger.warning(notice)
                    time.sleep(pause)
                    attempts_left -= 1
                    pause *= backoff
                else:
                    return outcome
            # Final attempt: whatever it raises reaches the caller.
            return f(*args, **kwargs)

        return attempt

    return decorate

In [3]:
def persist(data, fileName, path):
    """Serialise ``data`` as JSON and write it to ``path + fileName``.

    ``path`` and ``fileName`` are joined by plain string concatenation, so
    ``path`` has to carry its own trailing separator. The data is serialised
    before the file is opened, so a serialisation error leaves any existing
    file untouched.
    """
    payload = json.dumps(data)
    target = path + fileName
    print("Writing to file " + target)
    with open(target, "w") as handle:
        handle.write(payload)

In [4]:
def persistDF(df, symbol, path):
    """Write ``df`` to ``<path>NSE:<symbol>.csv`` unless that file exists.

    An existing file is never overwritten; a message is printed instead.
    ``path`` is concatenated as-is, so it must end with a separator.
    """
    print("Persisting " + symbol)
    target = path + 'NSE:' + symbol + ".csv"
    if not os.path.exists(target):
        df.to_csv(target)
    else:
        print("%s already persisted" % symbol)

In [5]:
def load(fileName, path):
    """Read the JSON document stored at ``path + fileName`` and return it.

    The two arguments are concatenated without inserting a separator.
    """
    with open(path + fileName) as handle:
        return json.load(handle)

In [6]:
def splitAndStrip(text, delimiter):
    """Split ``text`` on ``delimiter`` and strip whitespace from each piece.

    Always returns a list. The previous ``map(...)`` form returns a one-shot
    iterator on Python 3, which breaks callers that index the result or
    measure its length; on Python 2 the result is unchanged.

    :param text: string to split
    :param delimiter: separator passed to ``str.split``
    :return: list of stripped pieces (empty strings are kept)
    """
    return [piece.strip() for piece in text.split(delimiter)]

In [7]:
def buildMCIndexList():
    """Collect every stock link from the moneycontrol A-Z quote index pages.

    One page is fetched per letter ``A``-``Z`` plus the ``others`` page. Every
    ``<a>`` inside the ``pcq_tbl MT10`` table becomes one entry.

    :return: list of ``{'title': ..., 'link': ...}`` dicts, in page order
    :raises AttributeError: if a page has no ``pcq_tbl MT10`` table
        (``soup.find`` returns ``None``)
    """
    index_url_prefix = "https://www.moneycontrol.com/india/stockpricequote/"
    # Build a real list: on Python 3 `map` returns an iterator that has no
    # `.append`, so the old `map(chr, ...)` form raised AttributeError.
    letters = [chr(code) for code in range(65, 91)]
    letters.append("others")
    IndexList = []
    for letter in letters:
        page = requests.get(index_url_prefix + letter)
        soup = BeautifulSoup(page.content, 'lxml')
        table = soup.find('table', class_='pcq_tbl MT10')
        for link in table.find_all('a'):
            IndexList.append({'title': link['title'], 'link': link['href']})
    return IndexList

In [8]:
def getMetaData(soup):
    """Parse the ``FL gry10`` div of a stock page into a key/value dict.

    The div text is split on ``|`` into segments and each segment on ``:``;
    the first piece becomes the key and the second the value (any further
    pieces are ignored, as before).

    :param soup: parsed page; must contain a ``div`` with class ``FL gry10``
    :return: dict mapping the stripped key to the stripped value
    :raises AttributeError: if the div is missing (``soup.find`` gives ``None``)
    """
    div = soup.find('div', class_='FL gry10')
    metadata = {}
    for segment in splitAndStrip(div.text, '|'):
        if not segment:
            # empty pieces come from leading/trailing/double separators
            continue
        # Materialise before indexing so this works whether the helper
        # returns a list or a lazy iterator (Python 3 `map`).
        parts = list(splitAndStrip(segment, ':'))
        if len(parts) < 2:
            # a segment without ':' has no value; previously an IndexError
            continue
        metadata[parts[0]] = parts[1]
    return metadata

In [9]:
def getInfo(soup):
    """Collect the link lists under selected headings of the ``slider`` element.

    Walks the direct children of the element with ``id='slider'``. For every
    ``<dt>`` whose text is one of the interesting headings, the next Tag child
    is taken as its content and each ``<a>`` inside it is recorded.

    :param soup: parsed stock page
    :return: ``{heading: {link text: absolute moneycontrol URL}}``; a heading
        with no following Tag maps to an empty dict
    :raises AttributeError: if the page has no ``slider`` element
    """
    slider = soup.find(id='slider')
    # One shared iterator: the inner `next()` calls deliberately consume the
    # content element so the outer loop resumes after it.
    children = iter(slider.children)
    interested = ["CORPORATE ACTION", "FINANCIALS", "ANNUAL REPORT", "SHAREHOLDING"]
    info = {}
    for child in children:
        if isinstance(child, Tag) and child.name == 'dt' and child.text in interested:
            key = child.text
            value = {}
            while True:
                # A default avoids StopIteration escaping the function when
                # the heading is the last Tag in the slider.
                next_child = next(children, None)
                if next_child is None:
                    break
                if isinstance(next_child, Tag):
                    lis = next_child.find_all('a')
                    value.update({li.text: 'https://www.moneycontrol.com' + li['href'] for li in lis})
                    break
            info.update({key: value})
    return info

In [10]:
def cleanIndex(index):
    """Return a fresh dict holding only the ``title`` and ``link`` of ``index``.

    Raises ``KeyError`` if either key is missing.
    """
    return {field: index[field] for field in ('title', 'link')}

In [11]:
def updateIndex(index):
    """Fetch the page behind ``index['link']`` and attach what was scraped.

    Starts from a cleaned copy (``title`` and ``link`` only), downloads the
    page, and adds the results of ``getInfo`` and ``getMetaData`` under the
    keys ``info`` and ``metadata``. The argument itself is not modified.
    """
    entry = cleanIndex(index)
    response = requests.get(entry['link'])
    soup = BeautifulSoup(response.content, 'html.parser')
    metadata = getMetaData(soup)
    entry['info'] = getInfo(soup)
    entry['metadata'] = metadata
    return entry

In [12]:
def persistIndex(index):
    """Store ``index`` as JSON under ``data/MC/``.

    The file name is the entry's title with every space removed.
    """
    persist(index, index['title'].replace(" ", ""), 'data/MC/')


In [13]:
def buildFileMap():
    """Build and persist a symbol -> file-name map for everything in ``data/MC``.

    Each file is loaded as JSON. If it has a ``metadata`` entry, the file is
    registered under its non-empty ``NSE`` value, falling back to a non-empty
    ``BSE`` value. The map is written to ``data/MC/fileMap``.
    """
    path = 'data/MC/'
    fileName = 'fileMap'
    fileMap = {}
    files = os.listdir('data/MC')
    files.sort()
    for f in files:
        if f == fileName:
            # the previously generated map lives in the same directory
            continue
        data = load(f, path)
        if 'metadata' not in data:
            continue
        metadata = data['metadata']
        # NSE takes precedence over BSE; the first non-empty one wins.
        for exchange in ('NSE', 'BSE'):
            # `!= 0` rather than `is not 0`: identity on an int literal only
            # works by interpreter accident and warns on Python 3.8+.
            if exchange in metadata and len(metadata[exchange]) != 0:
                fileMap[metadata[exchange]] = f
                break
    persist(fileMap, fileName, path)

In [14]:
def getFileMap():
    """Load and return the JSON map stored at ``data/MC/fileMap``."""
    return load('fileMap', 'data/MC/')

In [15]:
def parseCA(page_url):
    """Scrape the ``tbldivid`` table at ``page_url`` into a list of row dicts.

    Header cells (``<th>``) give the column names. Data cells (``<td>``) are
    read in document order and grouped into rows of ``len(columns)`` cells.
    Cells that are empty or contain ``information available for`` are dropped
    before grouping.

    :param page_url: URL of a corporate-action page
    :return: list of ``{column name: cell text}`` dicts, one per row
    :raises AttributeError: if the page has no ``tbldivid`` table
    :raises ValueError: if the table has no header cells (zero step size)
    """
    page = requests.get(page_url)
    soup = BeautifulSoup(page.content, 'lxml')
    table = soup.find('table', class_='tbldivid')
    # Real lists throughout: `len()` and slicing below do not work on the
    # lazy `map`/`filter` objects Python 3 returns.
    columns = [th.text for th in table.find_all('th')]
    # NOTE(review): dropping empty cells shifts later cells left within a
    # row; presumably rows never contain blank cells. Confirm against a page.
    data_list = [
        td.text for td in table.find_all('td')
        if len(td.text) != 0 and 'information available for' not in td.text
    ]
    width = len(columns)
    return [
        dict(zip(columns, data_list[start:start + width]))
        for start in range(0, len(data_list), width)
    ]

In [16]:
def parseAndPersistCA(symbol):
    """Scrape and cache the split and bonus history of ``symbol``.

    Does nothing if ``data/CA/<SYMBOL>`` already exists. Otherwise the
    stock's file is looked up through the file map, its ``CORPORATE ACTION``
    links for ``Splits`` and ``Bonus`` are parsed with ``parseCA``, and the
    result is persisted as a two-element list:
    ``[{'Splits': [...]}, {'Bonus': [...]}]``.

    :param symbol: exchange symbol in any letter case
    :raises KeyError: if the symbol is not in the file map, or the stored
        entry lacks the expected corporate-action links
    """
    # Normalise first so the cache check looks at the same name the file is
    # written under; previously a lower-case symbol never hit the cache.
    symbol = symbol.upper()
    if os.path.exists('data/CA/' + symbol):
        return
    interested = ['Splits', 'Bonus']
    fileName = getFileMap()[symbol]
    metaData = load(fileName, 'data/MC/')
    ca = metaData['info']['CORPORATE ACTION']
    # A list, not `map`: a Python 3 map object is not JSON-serialisable.
    data = [{action: parseCA(ca[action])} for action in interested]
    persist(data, symbol, 'data/CA/')

In [17]:
@concurrent
@retry(Exception, tries=3)
def updateAndPersistIndex(index):
    """Scrape and store one index entry unless it is already on disk.

    The entry counts as processed when ``data/MC/<title without spaces>``
    exists. Any exception triggers up to three attempts in total via
    ``retry``. ``concurrent`` comes from the star-imported ``deco`` package;
    presumably it dispatches the call to a worker. Confirm against deco docs.
    """
    title = index['title']
    target = 'data/MC/' + title.replace(" ", "")
    if os.path.exists(target):
        print("Already Processed " + title)
        return
    print("Processing : " + title)
    persistIndex(updateIndex(index))

In [18]:
# Entry point: build the full stock index list, then scrape and persist
# every entry in it.
# NOTE(review): `synchronized` is not defined in this notebook; it presumably
# comes from `from deco import *` and pairs with the `@concurrent` worker
# below. Confirm against the deco documentation before restructuring this
# function.
@synchronized
def run():
    # One dict per stock link found on the A-Z index pages.
    mcIndexList = buildMCIndexList()
    # Note: the message has no space between the label and the count.
    print("To Process" + str(len(mcIndexList)))
    for index in mcIndexList:
        # Skips entries already on disk; retried on failure by its decorator.
        updateAndPersistIndex(index)

In [19]:
def computeSplitFactor(splits):
    """Turn split records into a date-indexed frame of price factors.

    Each record contributes ``Old FV / New FV`` as its factor. The date is
    the ``Ex-Split Date``, or the ``Announcement Date`` when the ex-date is
    ``'-'``. Dates are parsed as ``dd-mm-YYYY``.

    :param splits: iterable of dicts with the keys named above
    :return: DataFrame indexed by ``date`` (ascending) with one ``factor``
        column, or an empty DataFrame when there are no records
    """
    records = []
    for entry in splits:
        ratio = float(entry['Old FV']) / float(entry['New FV'])
        when = entry['Ex-Split Date']
        if when == '-':
            when = entry['Announcement Date']
        records.append({'date': when, 'factor': ratio})
    if not records:
        return pd.DataFrame()
    frame = pd.DataFrame(records)
    frame['date'] = pd.to_datetime(frame['date'], format='%d-%m-%Y')
    return frame.set_index('date').sort_index()

In [20]:
def computeBonusFactor(bonuses):
    """Turn bonus-issue records into a date-indexed frame of price factors.

    A ``Bonus Ratio`` of ``a:b`` yields the factor ``(a + b) / b``. The date
    is the ``Ex-Bonus Date``, or the ``Announcement Date`` when the ex-date
    is ``'-'``. Dates are parsed as ``dd-mm-YYYY``.

    :param bonuses: iterable of dicts with the keys named above
    :return: DataFrame indexed by ``date`` (ascending) with one ``factor``
        column, or an empty DataFrame when there are no records
    """
    records = []
    for entry in bonuses:
        parts = entry['Bonus Ratio'].split(':')
        multiplier = (float(parts[0]) + float(parts[1])) / float(parts[1])
        when = entry['Ex-Bonus Date']
        if when == '-':
            when = entry['Announcement Date']
        records.append({'date': when, 'factor': multiplier})
    if not records:
        return pd.DataFrame()
    frame = pd.DataFrame(records)
    frame['date'] = pd.to_datetime(frame['date'], format='%d-%m-%Y')
    return frame.set_index('date').sort_index()

In [21]:
def coporateFactor(ca):
    """Combine bonus and split factors into one date-indexed frame.

    ``ca`` is indexed positionally: element 0 must hold the ``Splits``
    records and element 1 the ``Bonus`` records. The two factor frames are
    multiplied element-wise; a date present in only one of them is treated
    as factor 1 in the other.
    """
    bonus_part = computeBonusFactor(ca[1][u'Bonus'])
    split_part = computeSplitFactor(ca[0][u'Splits'])
    return bonus_part.mul(split_part, fill_value=1)

In [22]:
def adjustmentFactor(symbol):
    """Build a daily series of cumulative price-adjustment factors.

    Steps, as implemented below:

    1. Load the cached corporate actions from ``data/CA/<symbol>`` and
       combine them into per-date factors.
    2. Take a reverse cumulative product, so each action date carries its
       own factor times the factors of all later actions.
    3. Lay that over a daily calendar from 1990-01-01 to today.
    4. Blank out rows equal to 1, shift every value one row earlier, and
       back-fill, so each day takes the factor of the next recorded value.
       Days after the last action get 1.

    :param symbol: name of the cached file under ``data/CA/``
    :return: DataFrame indexed by date with one ``factor`` column and no
        duplicated index labels
    """
    ca = load(symbol, 'data/CA/')
    bNsFactor = coporateFactor(ca)
    # Reverse cumulative product, then restore ascending date order.
    bNsFactor = bNsFactor.sort_index(ascending=False).cumprod().sort_index()
    default_factor = pd.DataFrame(pd.date_range('1990-01-01', pd.Timestamp.today()), columns=['date'])
    default_factor['factor'] = 1
    default_factor = default_factor.set_index(['date'])
    adjFactor = default_factor.multiply(bNsFactor, fill_value=1)
    # Rows without an adjustment become NaN so the back-fill can overwrite
    # them with the next recorded factor.
    adjFactor[adjFactor['factor'] == 1] = np.nan
    # Shift one row earlier so a factor stops applying on its own date.
    adjFactor = adjFactor.shift(-1)
    # `.bfill()` replaces `fillna(method='bfill')`, which current pandas
    # deprecates; the result is the same.
    adjFactor = adjFactor.bfill().fillna(1)
    adjFactor = adjFactor[~adjFactor.index.duplicated()]
    return adjFactor

In [23]:
def createAdjOHLC(symbol):
    """Add adjusted-close columns to a symbol's raw price file and save it.

    Reads ``data/NSE:<symbol>_unadj.csv`` and does nothing further if the
    file is missing. The frame is indexed by its ``Date`` column with
    duplicate dates dropped. ``AdjClose`` is ``Close`` divided by the
    adjustment factor for the same date, and ``AdjFactor`` holds the factor.
    The result is written with ``persistDF``, which does not overwrite an
    existing ``data/NSE:<symbol>.csv``.

    :param symbol: exchange symbol used in the file names
    """
    print("Creating Adj OHLC for " + symbol)
    source = "data/" + 'NSE:' + symbol + "_unadj" + ".csv"
    if not os.path.exists(source):
        return
    # The old `date_parser=` callback used `pd.datetime`, which current
    # pandas no longer provides. It is dropped because the explicit
    # `pd.to_datetime` call below does the actual parsing.
    df = pd.read_csv(source, parse_dates=True, header=0)
    df.Date = pd.to_datetime(df.Date, format='%Y-%m-%d')
    df = df.set_index(['Date'])
    df = df[~df.index.duplicated()]
    # Restrict the factors to the date range covered by the price data.
    adjFactor = adjustmentFactor(symbol)[df.index.min():df.index.max()]
    df['AdjClose'] = (df['Close'] / adjFactor['factor']).dropna()
    df['AdjFactor'] = adjFactor['factor']
    persistDF(df, symbol, 'data/')

In [2]:
# NOTE(review): leftover scratch cell. `index_url` is not defined anywhere in
# the visible notebook, and the recorded output below is a NameError for
# `requests`, so this cell was run before the import cell. Define `index_url`
# and run the imports first, or delete the cell.
page = requests.get(index_url)

NameError: name 'requests' is not defined