# Options data extraction from NSE files

## Extract data


Read every zipped NSE file found in `directory` and keep only the rows whose contract descriptor matches `pattern`.

In [7]:
import pandas as pd
import numpy as np
import zipfile
import os

In [None]:
# PARAMETERS
# Directory whose top-level files are the zipped NSE reports to read.
directory:str = "/home/data"
# Regular expression matched against the start of each contract descriptor.
# Written as a raw string so that \d reaches the regex engine unchanged
# instead of being treated as a (invalid) string escape.
pattern:str = r"OPTIDXNIFTY\d"
# Destination of the combined table (HDF5).
output_file:str = "/tmp/options.h5"
# Number of worker processes; os.cpu_count() may return None, so fall back to 1.
cpus:int = os.cpu_count() or 1

In [None]:
# Collect the full paths of the files directly inside `directory`;
# subdirectories are not descended into. os.walk yields nothing when the
# directory is missing or unreadable, so next() is given a default that
# leaves `files` defined as an empty list instead of unassigned.
root, subdirs, names = next(os.walk(directory), (directory, [], []))
files = [os.path.join(root, name) for name in names]

In [None]:
def extract_data(filename, pattern):
    """Read every CSV member of a zip archive and keep rows matching ``pattern``.

    All members of the archive are parsed with ``pd.read_csv`` and
    concatenated. A ``date`` column is added, parsed day-first from
    characters 2-7 of the archive's base name, and all column names are
    lower-cased. Only rows whose ``contract_d`` value matches ``pattern``
    at its start are returned.

    Parameters
    ----------
    filename : str
        Path of the zip archive, using ``/`` as separator.
    pattern : str
        Regular expression applied to the ``contract_d`` column.

    Returns
    -------
    pandas.DataFrame
        The matching rows. If anything goes wrong (unreadable archive,
        no members, missing column, ...) the error is printed and an
        empty DataFrame is returned.
    """
    try:
        frames = []
        # The context managers close the archive and each member handle
        # even when parsing fails part-way through.
        with zipfile.ZipFile(filename) as archive:
            for member in archive.filelist:
                with archive.open(member.filename) as handle:
                    frames.append(pd.read_csv(handle))
        combined = pd.concat(frames)
        combined['DATE'] = pd.to_datetime(filename.split('/')[-1][2:8],dayfirst=True)
        combined.columns = [str(x).lower() for x in combined.columns]
        # na=False treats rows with a missing contract descriptor as
        # non-matching; otherwise a single NaN makes the mask unusable and
        # the whole archive is dropped by the handler below.
        return combined[combined.contract_d.str.match(pattern, na=False)]
    except Exception as e:
        print(e)
        # Return an empty dataframe so the caller can still concatenate
        # the results of all archives.
        return pd.DataFrame()


In [None]:
from multiprocessing import Pool
from functools import partial
# Bind the pattern argument up front so the resulting callable takes only a
# filename, the single-argument form that Pool.map applies to each item.
extract_data_partial = partial(extract_data, pattern=pattern)

In [None]:
%%time
# Spread the archives over `cpus` worker processes; `result` holds one
# DataFrame per entry of `files`, in the same order.
with Pool(processes=cpus) as pool:
    result = pool.map(extract_data_partial, files)

In [None]:
%%time
# Stack the per-archive frames into a single DataFrame. Each frame keeps its
# own row labels, so index values repeat in the combined table.
# NOTE(review): pd.concat raises on an empty list - confirm `files` is never empty.
data = pd.concat(result)
del result # Release the list of per-archive frames to free memory

## Split the contract descriptor into separate columns

In [None]:
%%time
# Drop the last two characters of `pattern` (assumed to be a trailing regex
# token such as \d) to get the literal prefix of the contract descriptor.
symbol = pattern[:-2]
l = len(symbol)
# Fixed-width layout after the prefix: 11 characters of expiry date,
# 2 characters of option type, and the remainder is the strike.
expiry_stop = l + 11
type_stop = l + 13
descriptor = data.contract_d.str
data['segment'] = descriptor[:6]
data['contract'] = descriptor[6:l]
# .values hands to_datetime a plain array, so the result is assigned by position.
data['expiry_date'] = pd.to_datetime(descriptor[l:expiry_stop].values)
data['opt_type'] = descriptor[expiry_stop:type_stop]
data['strike'] = descriptor[type_stop:].astype(int)

In [None]:
data.head()

In [None]:
%%time
# Prefer HDF5 output. On any failure, write the table as CSV to the same
# path with '.csv' appended, then print the error that caused the fallback.
try:
    data.to_hdf(output_file, key='data', format='fixed')
except Exception as err:
    csv_path = output_file+'.csv'
    data.to_csv(csv_path, index=False)
    print(err)