# Process Merged Data

This notebook processes the merged dataset. Steps include imputing missing values, creating additional columns, and dropping some rows. The resulting data will be used for portfolio construction.  

Author: Xin Wei (weixin0127@gmail.com), Indiana University Bloomington  
Version: 2019/12/31

In [1]:
import numpy as np
import pandas as pd
import math
import statsmodels.api as sm
import matplotlib.pyplot as plt
import time

from linearmodels import PanelOLS, RandomEffects
from datetime import datetime, timedelta

# Display options: render up to 100 columns / 100 rows of a DataFrame
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 100)
# Import `display` from the public IPython.display module; importing it from
# IPython.core.display is deprecated (since IPython 7.14).
from IPython.display import display, HTML
# Stretch the notebook container to the full browser width
display(HTML("<style>.container { width:100% !important; }</style>"))

## 1. Import Data

In [2]:
# Load the merged dataset; the first CSV column is used as the row index.
merged_data_path = "../Data/merged_data.csv"
print("Importing data... This may take a few minutes.")
merged_data = pd.read_csv(merged_data_path, index_col=0)
print("Importing data: done!")

Importing data... This may take a few minutes.


  mask |= (ar1 == a)


Importing data: done!


In [3]:
# Preview the first five rows of the loaded dataset
merged_data.head(n=5)

Unnamed: 0,time,assetCode,assetName,volume,close,open,returnsClosePrevRaw1,returnsOpenPrevRaw1,returnsClosePrevMktres1,returnsOpenPrevMktres1,returnsClosePrevRaw10,returnsOpenPrevRaw10,returnsClosePrevMktres10,returnsOpenPrevMktres10,returnsOpenNextMktres10,universe,takeSequence,bodySize,companyCount,sentenceCount,wordCount,firstMentionSentence,relevance,sentimentClass,sentimentNegative,sentimentNeutral,sentimentPositive,sentimentWordCount,noveltyCount12H,noveltyCount24H,noveltyCount3D,noveltyCount5D,noveltyCount7D,volumeCounts12H,volumeCounts24H,volumeCounts3D,volumeCounts5D,volumeCounts7D,dummy_ENER,dummy_BMAT,dummy_INDS,dummy_CYCS,dummy_NCYC,dummy_SHOP,dummy_FDRT,dummy_HECA,dummy_FINS,dummy_TECH,dummy_TCOM,dummy_MDIA,dummy_UTIL,dummy_REAL,dummy_RES,dummy_RESF,dummy_RCH,dummy_DIV,dummy_IPO,dummy_MRG,dummy_DEAL1,dummy_BKRT,dummy_CEO1,dummy_LAYOFS,dummy_ACB,dummy_CLJ,dummy_BRIB,dummy_HACK,dummy_FAKE1,dummy_SCAM1,dummy_EMRG,dummy_US,dummy_WEU,dummy_ASIA,dummy_CN,dummy_STX,dummy_DBT,dummy_CDM,dummy_COM,dummy_DRV,dummy_FRX,dummy_GVD,dummy_HOT,dummy_MTG,dummy_PVE,dummy_MUNI,dummy_HEDGE
0,2007-02-01,A.N,Agilent Technologies Inc,2606900.0,32.19,32.17,0.005938,0.005312,,,-0.00186,0.000622,,,0.034672,1.0,1.0,5483.0,6.5,33.0,893.0,9.0,0.533333,0.5,0.080418,0.568265,0.351318,418.5,0.0,0.0,0.0,0.0,0.0,1.0,1.5,4.5,6.0,7.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2007-02-01,AAI.N,AirTran Holdings Inc,2051600.0,11.12,11.08,0.004517,-0.007168,,,-0.078708,-0.088066,,,0.027803,0.0,1.4,504.0,2.2,5.4,103.0,0.8,0.9,-0.2,0.253214,0.346204,0.400582,100.0,2.6,2.6,2.6,2.6,2.6,11.8,11.8,14.8,15.8,27.8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2007-02-01,AAP.N,Advance Auto Parts Inc,1164800.0,37.51,37.99,-0.011594,0.025648,,,0.014332,0.045405,,,0.024433,1.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,2007-02-01,AAPL.O,Apple Inc,23747329.0,84.74,86.23,-0.011548,0.016324,,,-0.048613,-0.037182,,,-0.007425,1.0,1.916667,1266.25,2.833333,9.5,234.583333,5.166667,0.671952,-0.166667,0.271095,0.460981,0.267924,103.833333,1.25,1.25,1.25,1.25,1.25,7.25,8.083333,28.5,42.083333,43.416667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,2007-02-01,ABB.N,ABB Ltd,1208600.0,18.02,18.01,0.011791,0.025043,,,0.012929,0.020397,,,-0.017994,1.0,1.333333,2146.333333,2.833333,20.333333,411.833333,2.666667,0.754373,0.0,0.331734,0.361636,0.30663,221.833333,1.5,1.5,1.5,1.5,1.5,3.833333,3.833333,6.833333,6.833333,9.833333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## 2. Data Processing

In [6]:
class ProcessData():
    """ Prepare the merged dataset for portfolio construction.

    Input:: merged_data: merged dataset from market data and news data
            horizon: Determines the forecasting horizon of target variable to be created.
                     It is counted in rows (observations) per asset, not in calendar days.
            out_filepath: file path of where the returned data to be saved
    """
    # Rows dated before this day are treated as the crisis period and dropped.
    CRISIS_END_DATE = '2009-07-01'
    # A row is flagged as liquid when both thresholds are met.
    MIN_LIQUID_VOLUME = 500000
    MIN_LIQUID_DOLLAR_VOLUME = 25000000

    def __init__(self, merged_data, horizon, out_filepath):
        """ Pass inputs to class variables
        """
        self.merged_data = merged_data
        self.horizon = horizon
        self.out_filepath = out_filepath

    def handle_missing_values(self, data):
        """ Fill missing market-adjusted returns

        For each of the four return definitions, the implied market return of a
        row is (raw return - market-residual return). Its mean over all rows
        sharing the same 'time' is stored in a new 'return*PrevMkt*' column, and
        missing market-residual returns are filled with (raw return - that mean).

        Note: the new columns and the filled values are written into `data`
        itself; pass a copy if the caller's frame must stay untouched.

        Returns the frame with a fresh 0..n-1 index.
        """
        # (raw return, market-residual return, derived market return) per definition
        return_cols = [
            ('returnsClosePrevRaw1', 'returnsClosePrevMktres1', 'returnClosePrevMkt1'),
            ('returnsOpenPrevRaw1', 'returnsOpenPrevMktres1', 'returnOpenPrevMkt1'),
            ('returnsClosePrevRaw10', 'returnsClosePrevMktres10', 'returnClosePrevMkt10'),
            ('returnsOpenPrevRaw10', 'returnsOpenPrevMktres10', 'returnOpenPrevMkt10'),
        ]
        for raw_col, mktres_col, mkt_col in return_cols:
            data[mkt_col] = data[raw_col] - data[mktres_col]
            # Cross-sectional mean per date; rows with a missing residual are skipped by mean()
            data[mkt_col] = data.groupby('time')[mkt_col].transform('mean')
            data[mktres_col] = data[mktres_col].fillna(data[raw_col] - data[mkt_col])
        return data.reset_index(drop = True)

    def process_data(self):
        """ Process merged data to get it ready for portfolio construction

        Steps: drop rows before CRISIS_END_DATE, fill missing market-residual
        returns, build the forward-looking target 'returnsOpenNextMktres{horizon}',
        drop the last `horizon` dates, add derived columns and save the result
        to `self.out_filepath` as CSV.

        Assumes the rows of each asset appear in chronological order.

        Returns the processed DataFrame (the same data that is written to disk).
        """
        # Use the instance's horizon, not a notebook-level global of the same name.
        horizon = self.horizon
        target_col = 'returnsOpenNextMktres{}'.format(horizon)

        df = self.merged_data
        print('The merged data has {} rows'.format(len(df.index)))

        # Drop data during the crisis period
        df = df[df['time'] >= self.CRISIS_END_DATE].reset_index(drop = True).copy()
        print('After dropping crisis data, {} rows left'.format(len(df.index)))

        # Handle missing values
        df = self.handle_missing_values(df)
        print('Handling missing values: done!')

        # Create target variable with given forecasting horizon: for every row, the
        # sum of the asset's next `horizon` one-period returns. Both the rolling sum
        # and the shift are done inside each asset's group, so the last rows of one
        # asset never pick up returns that belong to another asset.
        # NOTE(review): rows without a fully realized target are set to 0, as before;
        # consider dropping them instead if a zero target would bias the model.
        df[target_col] = (
            df.groupby('assetCode')['returnsOpenPrevMktres1']
              .transform(lambda returns: returns.rolling(horizon).sum().shift(-horizon))
              .fillna(0)
        )
        print('Create target variable: done!')

        # Note: the tail rows will not have realized target of returnOpenNextMktres{horizon}
        # The cutoff is taken from the sorted distinct dates so it does not depend on row order.
        trading_days = df['time'].drop_duplicates().sort_values()
        cutoff = trading_days.iloc[-horizon]
        df = df[df['time'] < cutoff].copy()
        print('After cutting the tail rows, {} rows left'.format(len(df.index)))

        # Creating new columns
        df['dayTrend'] = df['close'] / df['open']
        df['firstMentionPosition'] = df['firstMentionSentence'] / df['sentenceCount']
        df['sentimentCoverage'] = df['sentimentWordCount'] / df['wordCount']
        df['dollarVolume'] = df['volume'] * df['open']
        df['liquid'] = ((df['volume'] >= self.MIN_LIQUID_VOLUME)
                        & (df['dollarVolume'] >= self.MIN_LIQUID_DOLLAR_VOLUME)).astype(int)
        print('Create new columns: done!')

        # Save data
        df.to_csv(self.out_filepath)
        print("Saving merged data: done!")
        return df

## 3. Call Class

In [7]:
# Settings for the quarterly rebalancing portfolio
# NOTE(review): horizon is applied as a number of rows per asset; confirm that 90
# observations is the intended length of a "quarter" for this dataset.
horizon = 90
out_filepath = "../Data/processed_data.csv"

# Run the full pipeline and report the wall-clock time it took
start_time = time.time()
a = ProcessData(merged_data, horizon, out_filepath)
a.process_data()
elapsed = timedelta(seconds = time.time() - start_time)
print("Running time -- {0}".format(elapsed))

The merged data has 4072956 rows
After dropping crisis data, 3140398 rows left
Handling missing values: done!
Create target variable: done!
After cutting the tail rows, 2975404 rows left
Create new columns: done!
Saving merged data: done!
Running time -- 0:06:51.396481
