In [None]:
import pandas as pd
import numpy as np
from IPython.display import display
from ipython_helpers import (
    print_full
)

In [None]:
def transform_quarterly_equity_df(df):
    """Add the derived equity columns to a raw quarterly frame, in place.

    The input frame itself is modified (new columns are added, ``tic`` and
    ``cusip`` are renamed, rows are re-sorted) and the same object is
    returned. Calling the function again on its own output is safe: every
    derived column is recomputed from the untouched source columns and the
    CUSIP truncation is a no-op on values that are already 8 characters long.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``cshoq``, ``mkvaltq``, ``prccq``, ``niy``, ``cheq``,
        ``atq``, ``ltq``, a datetime-typed ``datadate`` column, and either
        ``tic``/``cusip`` (first call) or ``TICKER``/``CUSIP`` (re-run).

    Returns
    -------
    pandas.DataFrame
        ``df`` itself, with ``out_shares``, ``given_market_value``,
        ``computed_market_value``, ``market_value``, ``net_income``,
        ``cash_and_st_investment``, ``total_assets``, ``total_liabilities``,
        ``book_equity``, ``year`` and ``month`` added, sorted by
        ``TICKER``, ``CUSIP``, ``year``, ``month``.
    """
    # Fix the unit of outstanding shares
    df.loc[:, 'out_shares'] = df['cshoq'] * 10**6

    # Fix the unit of given market value
    df.loc[:, 'given_market_value'] = df['mkvaltq'] * 10**6

    # Compute market value based on price and outstanding shares
    df.loc[:, 'computed_market_value'] = df['prccq'] * df['out_shares']

    # Determine market value based on the availability of given_market_value:
    # use given_market_value where it exists, otherwise fall back to
    # computed_market_value.
    df.loc[:, 'market_value'] = np.where(
        pd.notnull(df['given_market_value']),
        df['given_market_value'],
        df['computed_market_value'],
    )

    # Fix the unit of net income
    # NOTE(review): `niy` is the only input here without the `q` suffix used
    # by the other columns - confirm it is the intended net-income field.
    df.loc[:, 'net_income'] = df['niy'] * 10**6

    # Fix the unit of cash and short term investment
    df.loc[:, 'cash_and_st_investment'] = df['cheq'] * 10**6

    # Fix the unit of total assets
    df.loc[:, 'total_assets'] = df['atq'] * 10**6

    # Fix the unit of total liabilities
    df.loc[:, 'total_liabilities'] = df['ltq'] * 10**6

    # Compute book value of equity
    df.loc[:, 'book_equity'] = df['total_assets'] - df['total_liabilities']

    # Parse the datadate column to create year and month columns.
    # The year and month columns will be used to look up S&P500 market value.
    df.loc[:, 'year'] = df['datadate'].dt.year
    df.loc[:, 'month'] = df['datadate'].dt.month

    # Rename the columns to enable merge with SIGMA and EXRET dataframes.
    # Labels that are absent (e.g. on a re-run) are silently skipped by rename.
    df.rename(columns={'tic': 'TICKER', 'cusip': 'CUSIP'}, inplace=True)

    # Reduce CUSIP to its first 8 characters.
    # Fundamental data from Compustat has 9-character CUSIPs while price data
    # from CRSP has 8. The 9th character is dropped to enable merge with the
    # SIGMA and EXRET dataframes, which are based on CRSP price data.
    # Slicing with [:8] (rather than dropping the last character) leaves an
    # already-truncated value unchanged, so re-running this function does not
    # keep shortening the identifier. Missing CUSIPs are kept as missing
    # instead of being converted to a text fragment by astype(str).
    raw_cusip = df['CUSIP']
    df['CUSIP'] = raw_cusip.astype(str).str[:8].where(raw_cusip.notna())

    # Sort the values to establish order to the data
    df.sort_values(['TICKER', 'CUSIP', 'year', 'month'], inplace=True)

    return df


def transform_monthly_index_df(df):
    """Prepare the monthly index frame for (year, month) lookups.

    Adds ``market_value``, ``year`` and ``month`` columns to ``df`` in place,
    then returns a new frame indexed by ``['year', 'month']``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``totval`` and a datetime-typed ``caldt`` column.

    Returns
    -------
    pandas.DataFrame
        The augmented data with a (year, month) MultiIndex.
    """
    calendar_dates = df['caldt']

    # Fix the unit of total market value by multiplying by 1000
    df.loc[:, 'market_value'] = df['totval'] * 1000

    # Split the date into the two parts that later serve as the lookup key
    # for the S&P500 market value.
    for date_part in ('year', 'month'):
        df.loc[:, date_part] = getattr(calendar_dates.dt, date_part)

    lookup_key = ['year', 'month']
    return df.set_index(lookup_key)


def compute_RSIZE(row, index_df):
    """Return log(row market value / index market value) for the row's period.

    ``row`` must provide ``'market_value'``, ``'year'`` and ``'month'``;
    ``index_df`` is looked up with ``.loc[year, month]`` and must expose a
    ``'market_value'`` entry for that key.
    """
    firm_value = row['market_value']
    index_entry = index_df.loc[row['year'], row['month']]
    relative_value = firm_value / index_entry['market_value']
    return np.log(relative_value)


def make_campbell_features(df, index_df):
    """Add the ratio features and RSIZE to ``df`` in place and return it.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``total_assets``, ``market_value``, ``book_equity``,
        ``total_liabilities``, ``net_income``, ``cash_and_st_investment``,
        ``year`` and ``month``.
    index_df : pandas.DataFrame
        Indexed by (year, month) with unique keys and a ``market_value``
        column; supplies the denominator of RSIZE.

    Returns
    -------
    pandas.DataFrame
        ``df`` itself with ``ADJUSTED_TOTAL_ASSETS``, ``MARKET_TOTAL_ASSETS``,
        ``NITA``, ``TLTA``, ``NIMTA``, ``TLMTA``, ``CASHMTA`` and ``RSIZE``
        added.

    Notes
    -----
    RSIZE is ``log(market_value / index market_value for the same period)``.
    It is computed for all rows at once instead of one ``.loc`` lookup per
    row. A (year, month) pair that is absent from ``index_df`` yields NaN for
    that row rather than aborting the whole computation.
    """
    print("Currently making ADJUSTED_TOTAL_ASSETS variable")
    df.loc[:, 'ADJUSTED_TOTAL_ASSETS'] = df['total_assets'] + 0.1 * (df['market_value'] - df['book_equity'])

    print("Currently making MARKET_TOTAL_ASSETS variable")
    df.loc[:, 'MARKET_TOTAL_ASSETS'] = df['market_value'] + df['total_liabilities']

    print("Currently making NITA variable")
    df.loc[:, 'NITA'] = df['net_income'] / df['ADJUSTED_TOTAL_ASSETS']

    print("Currently making TLTA variable")
    df.loc[:, 'TLTA'] = df['total_liabilities'] / df['ADJUSTED_TOTAL_ASSETS']

    print("Currently making NIMTA variable")
    df.loc[:, 'NIMTA'] = df['net_income'] / df['MARKET_TOTAL_ASSETS']

    print("Currently making TLMTA variable")
    df.loc[:, 'TLMTA'] = df['total_liabilities'] / df['MARKET_TOTAL_ASSETS']

    print("Currently making CASHMTA variable")
    df.loc[:, 'CASHMTA'] = df['cash_and_st_investment'] / df['MARKET_TOTAL_ASSETS']

    print("Currently making RSIZE variable")
    # Fetch the index market value for every row's (year, month) in a single
    # aligned lookup. Plain arrays are used for the division so the result is
    # matched to df by position, whatever df's own index looks like.
    period_keys = pd.MultiIndex.from_arrays([df['year'], df['month']])
    index_market_value = index_df['market_value'].reindex(period_keys).to_numpy()
    df.loc[:, 'RSIZE'] = np.log(df['market_value'].to_numpy() / index_market_value)

    return df


def merge_dfs(left_df, right_df, var_name):
    """Left-join a single column of ``right_df`` onto ``left_df``.

    Rows are matched on ``TICKER``, ``CUSIP``, ``year`` and ``month``. Only
    those keys plus ``var_name`` are taken from ``right_df``; every row of
    ``left_df`` is kept, with missing values where no match exists.
    """
    join_keys = ['TICKER', 'CUSIP', 'year', 'month']
    right_subset = right_df[join_keys + [var_name]]
    return left_df.merge(right_subset, on=join_keys, how='left')



In [None]:
# Load the monthly CRSP index file; `caldt` is parsed as a datetime column.
monthly_index_df = pd.read_csv(
    "../Data/original_data/crsp_monthly_index_1961_2015.csv",
    parse_dates=['caldt'],
)

In [None]:
# Load the merged quarterly equity file; the three date columns are parsed as datetimes.
quarterly_equity_df = pd.read_csv(
    "../Data/original_data/merged_quarterly_1961_2015.csv",
    parse_dates=['datadate', 'dldte', 'ipodate'],
)

In [None]:
# Load the previously saved SIGMA table.
SIGMA_df = pd.read_csv(
    "../Data/campbell_data/campbell_SIGMA.csv"
)

In [None]:
# Load the previously saved EXRET table.
EXRET_df = pd.read_csv(
    "../Data/campbell_data/campbell_EXRET.csv"
)

In [None]:
# Add market_value/year/month to the index data and key the result by (year, month).
transformed_monthly_index_df = transform_monthly_index_df(df=monthly_index_df)

In [None]:
# Add the derived equity columns. The function works in place, so
# quarterly_equity_df itself is modified and is the object returned here.
transformed_quarterly_equity_df = transform_quarterly_equity_df(df=quarterly_equity_df)

In [None]:
# Keep only the rows of SIGMA_df that have a SIGMA value.
test_df = SIGMA_df[SIGMA_df['SIGMA'].notna()]

In [None]:
# Display any rows whose SIGMA is positive infinity.
test_df.loc[test_df['SIGMA'].eq(float("inf"))]