In [1]:
import os

# Switch into the tests directory so that the %%writefile cell below
# creates test_pandas.py there.
os.chdir('tests')

# Pandas

In [2]:
%%writefile test_pandas.py

from context import dero

import pandas as pd
from pandas.util.testing import assert_frame_equal
from pandas import Timestamp
from numpy import nan
import numpy

class DataFrameTest:
    """Container of DataFrame fixtures shared by the pandas test classes.

    All frames are created once, when the class body executes, and are stored
    as class attributes, so every subclass sees the same objects.
    """

    # Long-format panel: three (PERMNO, byvar) groups with four dates each.
    # 'Date' holds plain strings here.
    df = pd.DataFrame(
        data=[
            (10516, 'a', '1/1/2000', 1.01),
            (10516, 'a', '1/2/2000', 1.02),
            (10516, 'a', '1/3/2000', 1.03),
            (10516, 'a', '1/4/2000', 1.04),
            (10516, 'b', '1/1/2000', 1.05),
            (10516, 'b', '1/2/2000', 1.06),
            (10516, 'b', '1/3/2000', 1.07),
            (10516, 'b', '1/4/2000', 1.08),
            (10517, 'a', '1/1/2000', 1.09),
            (10517, 'a', '1/2/2000', 1.10),
            (10517, 'a', '1/3/2000', 1.11),
            (10517, 'a', '1/4/2000', 1.12),
        ],
        columns=['PERMNO', 'byvar', 'Date', 'RET'],
    )

    # Same panel as ``df`` with the 1/3/2000 row of (10516, 'a') repeated.
    df_duplicate_row = pd.DataFrame(
        data=[
            (10516, 'a', '1/1/2000', 1.01),
            (10516, 'a', '1/2/2000', 1.02),
            (10516, 'a', '1/3/2000', 1.03),
            (10516, 'a', '1/3/2000', 1.03),  # the duplicated row
            (10516, 'a', '1/4/2000', 1.04),
            (10516, 'b', '1/1/2000', 1.05),
            (10516, 'b', '1/2/2000', 1.06),
            (10516, 'b', '1/3/2000', 1.07),
            (10516, 'b', '1/4/2000', 1.08),
            (10517, 'a', '1/1/2000', 1.09),
            (10517, 'a', '1/2/2000', 1.10),
            (10517, 'a', '1/3/2000', 1.11),
            (10517, 'a', '1/4/2000', 1.12),
        ],
        columns=['PERMNO', 'byvar', 'Date', 'RET'],
    )

    # Same panel as ``df`` plus a 0/1 'weight' column.
    df_weight = pd.DataFrame(
        data=[
            (10516, 'a', '1/1/2000', 1.01, 0),
            (10516, 'a', '1/2/2000', 1.02, 1),
            (10516, 'a', '1/3/2000', 1.03, 1),
            (10516, 'a', '1/4/2000', 1.04, 0),
            (10516, 'b', '1/1/2000', 1.05, 1),
            (10516, 'b', '1/2/2000', 1.06, 1),
            (10516, 'b', '1/3/2000', 1.07, 1),
            (10516, 'b', '1/4/2000', 1.08, 1),
            (10517, 'a', '1/1/2000', 1.09, 0),
            (10517, 'a', '1/2/2000', 1.1, 0),
            (10517, 'a', '1/3/2000', 1.11, 0),
            (10517, 'a', '1/4/2000', 1.12, 1),
        ],
        columns=['PERMNO', 'byvar', 'Date', 'RET', 'weight'],
    )

    # Small frame whose grouping column contains a missing value.
    df_nan_byvar = pd.DataFrame(
        data=[
            ('a', 1),
            (nan, 2),
            ('b', 3),
            ('b', 4),
        ],
        columns=['byvar', 'val'],
    )

    # As above, but the value column contains a missing value as well.
    df_nan_byvar_and_val = pd.DataFrame(
        data=[
            ('a', 1),
            (nan, 2),
            ('b', nan),
            ('b', 4),
        ],
        columns=['byvar', 'val'],
    )

    # One-row frame with a real Timestamp in 'Date'.
    single_ticker_df = pd.DataFrame(
        data=[('a', Timestamp('2000-01-01 00:00:00'), 'ADM')],
        columns=['byvar', 'Date', 'TICKER'],
    )

    # ``df`` with 'Date' parsed to datetimes; ``df`` itself keeps its strings.
    df_datetime = df.assign(Date=pd.to_datetime(df['Date']))

    # Datetime version of the panel without the 'RET' column.
    df_datetime_no_ret = df_datetime.drop('RET', axis=1)

    # GVKEY stored as zero-padded strings; both columns contain one missing value.
    df_gvkey_str = pd.DataFrame(
        data=[
            ('001076', '3/1/1995'),
            ('001076', '4/1/1995'),
            ('001722', '1/1/2012'),
            ('001722', '7/1/2012'),
            ('001722', nan),
            (nan, '1/1/2012'),
        ],
        columns=['GVKEY', 'Date'],
    )
    df_gvkey_str['Date'] = pd.to_datetime(df_gvkey_str['Date'])

    # Same rows with GVKEY converted to float64 (built after the date parsing above).
    df_gvkey_num = df_gvkey_str.assign(GVKEY=df_gvkey_str['GVKEY'].astype('float64'))

    # Second string-GVKEY frame with different dates for the same keys.
    df_gvkey_str2 = pd.DataFrame(
        data=[
            ('001076', '2/1/1995'),
            ('001076', '3/2/1995'),
            ('001722', '11/1/2011'),
            ('001722', '10/1/2011'),
            ('001722', nan),
            (nan, '1/1/2012'),
        ],
        columns=['GVKEY', 'Date'],
    )
    df_gvkey_str2['Date'] = pd.to_datetime(df_gvkey_str2['Date'])

class TestCumulate(DataFrameTest):
    """Tests for ``dero.pandas.cumulate`` run against the shared ``df`` panel."""

    # Expected frame for the 'between' tests. cum_RET equals RET everywhere
    # except on the third date of each group, where it is the product of the
    # second and third returns.
    expect_between_1_3 = pd.DataFrame(
        data=[
            (10516, 'a', '1/1/2000', 1.01, 1.01),
            (10516, 'a', '1/2/2000', 1.02, 1.02),
            (10516, 'a', '1/3/2000', 1.03, 1.0506),
            (10516, 'a', '1/4/2000', 1.04, 1.04),
            (10516, 'b', '1/1/2000', 1.05, 1.05),
            (10516, 'b', '1/2/2000', 1.06, 1.06),
            (10516, 'b', '1/3/2000', 1.07, 1.1342),
            (10516, 'b', '1/4/2000', 1.08, 1.08),
            (10517, 'a', '1/1/2000', 1.09, 1.09),
            (10517, 'a', '1/2/2000', 1.1, 1.1),
            (10517, 'a', '1/3/2000', 1.11, 1.2210000000000003),
            (10517, 'a', '1/4/2000', 1.12, 1.12),
        ],
        columns=['PERMNO', 'byvar', 'Date', 'RET', 'cum_RET'],
    )

    # Expected frame for the 'first' tests. The first date of each group keeps
    # its own RET; from the second date on, cum_RET is the running product.
    expect_first = pd.DataFrame(
        data=[
            (10516, 'a', '1/1/2000', 1.01, 1.01),
            (10516, 'a', '1/2/2000', 1.02, 1.02),
            (10516, 'a', '1/3/2000', 1.03, 1.0506),
            (10516, 'a', '1/4/2000', 1.04, 1.092624),
            (10516, 'b', '1/1/2000', 1.05, 1.05),
            (10516, 'b', '1/2/2000', 1.06, 1.06),
            (10516, 'b', '1/3/2000', 1.07, 1.1342),
            (10516, 'b', '1/4/2000', 1.08, 1.224936),
            (10517, 'a', '1/1/2000', 1.09, 1.09),
            (10517, 'a', '1/2/2000', 1.10, 1.10),
            (10517, 'a', '1/3/2000', 1.11, 1.221),
            (10517, 'a', '1/4/2000', 1.12, 1.36752),
        ],
        columns=['PERMNO', 'byvar', 'Date', 'RET', 'cum_RET'],
    )

    def test_method_between_1_3(self):
        """'between' with time=[1, 3] matches ``expect_between_1_3``."""
        actual = dero.pandas.cumulate(
            self.df, 'RET', 'between',
            periodvar='Date', byvars=['PERMNO', 'byvar'], time=[1, 3],
        )

        assert_frame_equal(self.expect_between_1_3, actual, check_dtype=False)

    def test_method_between_m2_0(self):
        """'between' with time=[-2, 0] is expected to give the same frame as [1, 3]."""
        actual = dero.pandas.cumulate(
            self.df, 'RET', 'between',
            periodvar='Date', byvars=['PERMNO', 'byvar'], time=[-2, 0],
        )

        assert_frame_equal(self.expect_between_1_3, actual, check_dtype=False)

    def test_shifted_index(self):
        """Same 'between' call on a copy whose index labels are offset by 10."""
        shifted = self.df.copy()
        shifted.index = shifted.index + 10

        actual = dero.pandas.cumulate(
            shifted, 'RET', 'between',
            periodvar='Date', byvars=['PERMNO', 'byvar'], time=[-2, 0],
        )

        assert_frame_equal(self.expect_between_1_3, actual, check_dtype=False)

    def test_method_first(self):
        """'first' matches ``expect_first``."""
        actual = dero.pandas.cumulate(
            self.df, 'RET', 'first',
            periodvar='Date', byvars=['PERMNO', 'byvar'],
        )

        assert_frame_equal(self.expect_first, actual, check_dtype=False)

    def test_grossify(self):
        """With grossify=True, returns given net of 1 yield ``expect_first`` net of 1."""
        net_returns = self.df.copy()  # work on a copy; the fixture is shared
        net_returns['RET'] -= 1  # turn gross returns into net returns

        expected = self.expect_first.copy()
        for column in ('cum_RET', 'RET'):
            expected[column] -= 1

        actual = dero.pandas.cumulate(
            net_returns, 'RET', 'first',
            periodvar='Date', byvars=['PERMNO', 'byvar'], grossify=True,
        )

        assert_frame_equal(expected, actual, check_dtype=False)

class TestGroupbyMerge(DataFrameTest):
    """Tests for ``dero.pandas.groupby_merge``.

    Each test compares the function's output with the input frame plus one
    extra column named ``<column>_<function>``.
    """

    def test_subset_max(self):
        """'max' on RET by (PERMNO, byvar) adds the group maximum as RET_max."""
        byvars = ['PERMNO', 'byvar']
        out = dero.pandas.groupby_merge(self.df, byvars, 'max', subset='RET')

        # Expected output is the input panel plus each group's largest RET,
        # repeated on the four rows of that group.
        expect_df = self.df.copy()
        expect_df['RET_max'] = [1.04] * 4 + [1.08] * 4 + [1.12] * 4

        assert_frame_equal(expect_df, out)

    def test_subset_std(self):
        """'std' on RET by (PERMNO, byvar) adds the group std as RET_std."""
        byvars = ['PERMNO', 'byvar']
        out = dero.pandas.groupby_merge(self.df, byvars, 'std', subset='RET')

        # Within every group RET rises in steps of 0.01, so all three groups
        # share the same expected standard deviation.
        expect_df = self.df.copy()
        expect_df['RET_std'] = 0.012909944487358068

        assert_frame_equal(expect_df, out)

    def test_nan_byvar_transform(self):
        """An identity transform returns the values unchanged, NaN-byvar row included."""
        expect_df = self.df_nan_byvar.copy()
        expect_df['val_transform'] = expect_df['val']

        out = dero.pandas.groupby_merge(self.df_nan_byvar, 'byvar', 'transform', (lambda x: x))

        assert_frame_equal(expect_df, out)

    def test_nan_byvar_and_nan_val_transform_numeric(self):
        """A numeric transform with NaN in both byvar and val, default index."""
        expect_df = self.df_nan_byvar_and_val.copy()
        expect_df['val_transform'] = expect_df['val'] + 1

        out = dero.pandas.groupby_merge(self.df_nan_byvar_and_val, 'byvar', 'transform',
                                        (lambda x: x + 1))

        assert_frame_equal(expect_df, out)

    def test_nan_byvar_and_nan_val_and_nonstandard_index_transform_numeric(self):
        """Same numeric transform on a frame whose index is [5, 6, 7, 8].

        The output is expected to keep the input's index labels.
        """
        non_standard_index = self.df_nan_byvar_and_val.copy()
        non_standard_index.index = [5, 6, 7, 8]

        expect_df = self.df_nan_byvar_and_val.copy()
        expect_df['val_transform'] = expect_df['val'] + 1
        expect_df.index = [5, 6, 7, 8]

        out = dero.pandas.groupby_merge(non_standard_index, 'byvar', 'transform',
                                        (lambda x: x + 1))

        assert_frame_equal(expect_df, out)

    def test_nan_byvar_sum(self):
        """'sum' leaves NaN in the new column on the row whose byvar is NaN."""
        expect_df = pd.DataFrame(
            data=[
                ('a', 1, 1.0),
                (nan, 2, nan),
                ('b', 3, 7.0),
                ('b', 4, 7.0),
            ],
            columns=['byvar', 'val', 'val_sum'],
        )

        out = dero.pandas.groupby_merge(self.df_nan_byvar, 'byvar', 'sum')

        assert_frame_equal(expect_df, out)
        
        
class TestLongToWide:
    """Tests for ``dero.pandas.long_to_wide``.

    The four conversions under test (with/without a duplicated input row,
    with/without ``colindex``) are computed once in the class body and shared
    by the test methods.
    """

    # Expected result when colindex='Date': one row per (PERMNO, byvar) and one
    # RET column per date, named 'RET' + the date string.
    expect_df_with_colindex = pd.DataFrame(data = [
                                (10516, 'a', 1.01, 1.02, 1.03, 1.04),
                                (10516, 'b', 1.05, 1.06, 1.07, 1.08),
                                (10517, 'a', 1.09, 1.1, 1.11, 1.12),
                                ], columns = ['PERMNO', 'byvar',
                                              'RET1/1/2000', 'RET1/2/2000',
                                              'RET1/3/2000', 'RET1/4/2000'])

    # Expected result without colindex: 'Date' stays a regular column, so all
    # twelve rows remain, each carrying its group's returns as RET0..RET3.
    expect_df_no_colindex = pd.DataFrame(data = [
                            (10516, 'a', '1/1/2000', 1.01, 1.02, 1.03, 1.04),
                            (10516, 'a', '1/2/2000', 1.01, 1.02, 1.03, 1.04),
                            (10516, 'a', '1/3/2000', 1.01, 1.02, 1.03, 1.04),
                            (10516, 'a', '1/4/2000', 1.01, 1.02, 1.03, 1.04),
                            (10516, 'b', '1/1/2000', 1.05, 1.06, 1.07, 1.08),
                            (10516, 'b', '1/2/2000', 1.05, 1.06, 1.07, 1.08),
                            (10516, 'b', '1/3/2000', 1.05, 1.06, 1.07, 1.08),
                            (10516, 'b', '1/4/2000', 1.05, 1.06, 1.07, 1.08),
                            (10517, 'a', '1/1/2000', 1.09, 1.1, 1.11, 1.12),
                            (10517, 'a', '1/2/2000', 1.09, 1.1, 1.11, 1.12),
                            (10517, 'a', '1/3/2000', 1.09, 1.1, 1.11, 1.12),
                            (10517, 'a', '1/4/2000', 1.09, 1.1, 1.11, 1.12),
                            ], columns = ['PERMNO', 'byvar', 'Date', 'RET0',
                                          'RET1', 'RET2', 'RET3'])

    # Source of the input fixtures (this class does not inherit from DataFrameTest).
    input_data = DataFrameTest()

    ltw_no_dup_colindex    = dero.pandas.long_to_wide(input_data.df,
                                                     ['PERMNO', 'byvar'], 'RET', colindex='Date')
    ltw_dup_colindex       = dero.pandas.long_to_wide(input_data.df_duplicate_row,
                                                     ['PERMNO', 'byvar'], 'RET', colindex='Date')
    ltw_no_dup_no_colindex = dero.pandas.long_to_wide(input_data.df,
                                                     ['PERMNO', 'byvar'], 'RET')
    ltw_dup_no_colindex    = dero.pandas.long_to_wide(input_data.df_duplicate_row,
                                                     ['PERMNO', 'byvar'], 'RET')
    df_list = [ltw_no_dup_colindex, ltw_dup_colindex,
               ltw_no_dup_no_colindex, ltw_dup_no_colindex]

    def test_no_duplicates_with_colindex(self):
        """Clean input, colindex='Date'."""
        assert_frame_equal(self.expect_df_with_colindex, self.ltw_no_dup_colindex)

    def test_duplicates_with_colindex(self):
        """A duplicated input row must not change the colindex result."""
        assert_frame_equal(self.expect_df_with_colindex, self.ltw_dup_colindex)

    def test_no_duplicates_no_colindex(self):
        """Clean input, no colindex."""
        assert_frame_equal(self.expect_df_no_colindex, self.ltw_no_dup_no_colindex)

    def test_duplicates_no_colindex(self):
        """A duplicated input row must not change the no-colindex result."""
        assert_frame_equal(self.expect_df_no_colindex, self.ltw_dup_no_colindex)

    def test_no_extra_vars(self):
        """Neither '__idx__' nor '__key__' may appear as a column in any output."""
        for df in self.df_list:
            # Check each name on its own: testing the pair as a single tuple
            # would look for one tuple-valued label and pass vacuously.
            for helper_column in ('__idx__', '__key__'):
                assert helper_column not in df.columns
            

class TestPortfolioAverages:
    """Tests for ``dero.pandas.portfolio_averages`` on the ``df_weight`` fixture.

    Both the unweighted and the weighted call are made once in the class body;
    each returns a pair (averages, frame with portfolio assignments).
    """

    # Supplies df_weight (this class does not inherit from DataFrameTest).
    input_data = DataFrameTest()

    # Expected averages from the call without wtvar.
    expect_avgs_no_wt = pd.DataFrame(
        data=[
            (1, 'a', 1.0250000000000001),
            (1, 'b', 1.0550000000000002),
            (2, 'a', 1.1050000000000002),
            (2, 'b', 1.0750000000000002),
        ],
        columns=['portfolio', 'byvar', 'RET'],
    )

    # Expected averages from the call with wtvar='weight'; adds RET_wavg.
    expect_avgs_wt = pd.DataFrame(
        data=[
            (1, 'a', 1.0250000000000001, 1.025),
            (1, 'b', 1.0550000000000002, 1.0550000000000002),
            (2, 'a', 1.1050000000000002, 1.12),
            (2, 'b', 1.0750000000000002, 1.0750000000000002),
        ],
        columns=['portfolio', 'byvar', 'RET', 'RET_wavg'],
    )

    # Expected portfolio assignments: df_weight plus a 'portfolio' column.
    expect_ports = pd.DataFrame(
        data=[
            (10516, 'a', '1/1/2000', 1.01, 0, 1),
            (10516, 'a', '1/2/2000', 1.02, 1, 1),
            (10516, 'a', '1/3/2000', 1.03, 1, 1),
            (10516, 'a', '1/4/2000', 1.04, 0, 1),
            (10516, 'b', '1/1/2000', 1.05, 1, 1),
            (10516, 'b', '1/2/2000', 1.06, 1, 1),
            (10516, 'b', '1/3/2000', 1.07, 1, 2),
            (10516, 'b', '1/4/2000', 1.08, 1, 2),
            (10517, 'a', '1/1/2000', 1.09, 0, 2),
            (10517, 'a', '1/2/2000', 1.1, 0, 2),
            (10517, 'a', '1/3/2000', 1.11, 0, 2),
            (10517, 'a', '1/4/2000', 1.12, 1, 2),
        ],
        columns=['PERMNO', 'byvar', 'Date', 'RET', 'weight', 'portfolio'],
    )

    # Unweighted call.
    avgs, ports = dero.pandas.portfolio_averages(
        input_data.df_weight, 'RET', 'RET', ngroups=2, byvars='byvar',
    )

    # Weighted call.
    w_avgs, w_ports = dero.pandas.portfolio_averages(
        input_data.df_weight, 'RET', 'RET', ngroups=2, byvars='byvar', wtvar='weight',
    )

    def test_simple_averages(self):
        """Averages of the unweighted call."""
        assert_frame_equal(self.expect_avgs_no_wt, self.avgs, check_dtype=False)

    def test_weighted_averages(self):
        """Averages of the weighted call."""
        assert_frame_equal(self.expect_avgs_wt, self.w_avgs, check_dtype=False)

    def test_portfolio_construction(self):
        """Both calls must produce the same portfolio assignments."""
        print(self.ports)
        for assigned in (self.ports, self.w_ports):
            assert_frame_equal(self.expect_ports, assigned, check_dtype=False)

class TestWinsorize(DataFrameTest):
    """Tests for ``dero.pandas.winsorize``."""

    def test_winsor_40_subset_byvars(self):
        """Winsorize RET at .4 within each (PERMNO, byvar) group."""
        actual = dero.pandas.winsorize(self.df, .4, subset='RET', byvars=['PERMNO', 'byvar'])

        # In the expected frame the two lowest returns of every group share
        # one value and the two highest share another.
        expected = pd.DataFrame(
            data=[
                (10516, 'a', '1/1/2000', 1.0216),
                (10516, 'a', '1/2/2000', 1.0216),
                (10516, 'a', '1/3/2000', 1.028),
                (10516, 'a', '1/4/2000', 1.028),
                (10516, 'b', '1/1/2000', 1.0616),
                (10516, 'b', '1/2/2000', 1.0616),
                (10516, 'b', '1/3/2000', 1.068),
                (10516, 'b', '1/4/2000', 1.068),
                (10517, 'a', '1/1/2000', 1.1016000000000001),
                (10517, 'a', '1/2/2000', 1.1016000000000001),
                (10517, 'a', '1/3/2000', 1.108),
                (10517, 'a', '1/4/2000', 1.108),
            ],
            columns=['PERMNO', 'byvar', 'Date', 'RET'],
        )

        assert_frame_equal(expected, actual)
    
class TestRegBy(DataFrameTest):
    """Tests for ``dero.pandas.reg_by``: 'weight' on 'RET', one fit per 'key'."""

    def create_indf(self):
        """Return a copy of ``df_weight`` with a 'key' column '<PERMNO>_<byvar>'."""
        base = self.df_weight
        return base.assign(key=base['PERMNO'].astype(str) + '_' + base['byvar'])

    def test_regby_nocons(self):
        """With cons=False only the RET coefficient is reported per key."""
        data = self.create_indf()

        rb = dero.pandas.reg_by(data, 'weight', 'RET', 'key', cons=False)
        print('Reg by: ', rb)

        expected = pd.DataFrame(
            data=[
                (0.48774684748988806, '10516_a'),
                (0.9388636664168903, '10516_b'),
                (0.22929206076239614, '10517_a'),
            ],
            columns=['coef_RET', 'key'],
        )
        assert_frame_equal(expected, rb)

    def test_regby_cons(self):
        """The default call reports a constant and the RET coefficient per key."""
        data = self.create_indf()

        rb = dero.pandas.reg_by(data, 'weight', 'RET', 'key')
        print('Reg by: ', rb)

        expected = pd.DataFrame(
            data=[
                (0.49999999999999645, 5.329070518200751e-15, '10516_a'),
                (0.9999999999999893, 1.0658141036401503e-14, '10516_b'),
                (-32.89999999999997, 29.999999999999982, '10517_a'),
            ],
            columns=['const', 'coef_RET', 'key'],
        )
        assert_frame_equal(expected, rb)

    def test_regby_cons_low_obs(self):
        """A key with a single observation gets NaN for both estimates."""
        # .loc slicing is label-inclusive: rows 0..8 are kept, which leaves
        # '10517_a' with exactly one observation.
        data = self.create_indf().loc[:8, :]

        rb = dero.pandas.reg_by(data, 'weight', 'RET', 'key')
        print('Reg by: ', rb)

        expected = pd.DataFrame(
            data=[
                (0.49999999999999645, 5.329070518200751e-15, '10516_a'),
                (0.9999999999999893, 1.0658141036401503e-14, '10516_b'),
                (nan, nan, '10517_a'),
            ],
            columns=['const', 'coef_RET', 'key'],
        )
        assert_frame_equal(expected, rb)
        
class TestExpandMonths(DataFrameTest):
    """Tests for ``dero.pandas.expand_months`` on the one-row ``single_ticker_df``.

    The expected output has one row per day of January 2000, each repeating
    the input row and adding the day in a 'Daily Date' column.
    """

    # Days of January 2000 expected in the default (trade-day) output: the
    # weekends and Monday the 17th are absent (presumably a market holiday --
    # confirm against the calendar expand_months uses).
    _trade_days = (3, 4, 5, 6, 7, 10, 11, 12, 13, 14,
                   18, 19, 20, 21, 24, 25, 26, 27, 28, 31)

    # Every day of the month, expected when trade_days=False.
    _calendar_days = tuple(range(1, 32))

    @staticmethod
    def _expected_frame(days):
        """Build the expected frame with one row for each day number in *days*."""
        month_start = Timestamp('2000-01-01 00:00:00')
        rows = [
            (Timestamp('2000-01-{:02d} 00:00:00'.format(day)), 'a', month_start, 'ADM')
            for day in days
        ]
        return pd.DataFrame(data=rows, columns=['Daily Date', 'byvar', 'Date', 'TICKER'])

    @staticmethod
    def _assert_equal_ignoring_column_order(expected, actual):
        """Compare two frames after putting their columns in sorted order."""
        # sort_index(axis=1) replaces the DataFrame.sort(axis=1) call used
        # previously; DataFrame.sort no longer exists in current pandas.
        assert_frame_equal(expected.sort_index(axis=1), actual.sort_index(axis=1))

    def test_expand_months_tradedays(self):
        """The default call yields only the trade days of the month."""
        em = dero.pandas.expand_months(self.single_ticker_df)

        self._assert_equal_ignoring_column_order(self._expected_frame(self._trade_days), em)

    def test_expand_months_calendardays(self):
        """trade_days=False yields every calendar day of the month."""
        em = dero.pandas.expand_months(self.single_ticker_df, trade_days=False)

        self._assert_equal_ignoring_column_order(self._expected_frame(self._calendar_days), em)
        
        
class TestPortfolio(DataFrameTest):
    """Tests for ``dero.pandas.portfolio``: two groups on RET within (PERMNO, byvar)."""

    def test_portfolio_byvars(self):
        """Within each group the two lowest returns get portfolio 1, the rest 2."""
        actual = dero.pandas.portfolio(self.df, 'RET', ngroups=2, byvars=['PERMNO', 'byvar'])

        expected = pd.DataFrame(
            data=[
                (10516, 'a', '1/1/2000', 1.01, 1),
                (10516, 'a', '1/2/2000', 1.02, 1),
                (10516, 'a', '1/3/2000', 1.03, 2),
                (10516, 'a', '1/4/2000', 1.04, 2),
                (10516, 'b', '1/1/2000', 1.05, 1),
                (10516, 'b', '1/2/2000', 1.06, 1),
                (10516, 'b', '1/3/2000', 1.07, 2),
                (10516, 'b', '1/4/2000', 1.08, 2),
                (10517, 'a', '1/1/2000', 1.09, 1),
                (10517, 'a', '1/2/2000', 1.1, 1),
                (10517, 'a', '1/3/2000', 1.11, 2),
                (10517, 'a', '1/4/2000', 1.12, 2),
            ],
            columns=['PERMNO', 'byvar', 'Date', 'RET', 'portfolio'],
        )

        assert_frame_equal(expected, actual, check_dtype=False)

    def test_portfolio_with_nan_and_byvars(self):
        """A row whose RET is NaN is expected to land in portfolio 0."""
        # Blank out the first return on a copy so the shared fixture is untouched.
        with_missing = self.df.copy()
        with_missing.loc[0, 'RET'] = nan

        actual = dero.pandas.portfolio(with_missing, 'RET', ngroups=2, byvars=['PERMNO', 'byvar'])

        expected = pd.DataFrame(
            data=[
                (10516, 'a', '1/1/2000', nan, 0),
                (10516, 'a', '1/2/2000', 1.02, 1),
                (10516, 'a', '1/3/2000', 1.03, 1),  # changed from 2 to 1 when nan handling was updated
                (10516, 'a', '1/4/2000', 1.04, 2),
                (10516, 'b', '1/1/2000', 1.05, 1),
                (10516, 'b', '1/2/2000', 1.06, 1),
                (10516, 'b', '1/3/2000', 1.07, 2),
                (10516, 'b', '1/4/2000', 1.08, 2),
                (10517, 'a', '1/1/2000', 1.09, 1),
                (10517, 'a', '1/2/2000', 1.1, 1),
                (10517, 'a', '1/3/2000', 1.11, 2),
                (10517, 'a', '1/4/2000', 1.12, 2),
            ],
            columns=['PERMNO', 'byvar', 'Date', 'RET', 'portfolio'],
        )

        assert_frame_equal(expected, actual, check_dtype=False)
        
class TestConvertSASDateToPandasDate:
    """Tests for ``dero.pandas.convert_sas_date_to_pandas_date``.

    The inputs hold dates as floats in the 'datadate' column; the converted
    column is wrapped in a DataFrame and compared with datetime64 values.
    """

    # Five numeric dates, no missing values.
    df_sasdate = pd.DataFrame(
        data=[
            ('011508', 16114.0),
            ('011508', 16482.0),
            ('011508', 17178.0),
            ('011508', 17197.0),
            ('011508', 17212.0),
        ],
        columns=['gvkey', 'datadate'],
    )

    # Same dates with a NaN inserted before the last one.
    df_sasdate_nan = pd.DataFrame(
        data=[
            ('011508', 16114.0),
            ('011508', 16482.0),
            ('011508', 17178.0),
            ('011508', 17197.0),
            ('011508', nan),
            ('011508', 17212.0),
        ],
        columns=['gvkey', 'datadate'],
    )

    def test_convert(self):
        """Each numeric date maps to the expected calendar date."""
        stamps = [
            '2004-02-13T00:00:00.000000000',
            '2005-02-15T00:00:00.000000000',
            '2007-01-12T00:00:00.000000000',
            '2007-01-31T00:00:00.000000000',
            '2007-02-15T00:00:00.000000000',
        ]
        expected = pd.DataFrame(data=[(numpy.datetime64(s),) for s in stamps], columns=[0])

        series = dero.pandas.convert_sas_date_to_pandas_date(self.df_sasdate['datadate'])
        converted = pd.DataFrame(series)

        assert_frame_equal(expected, converted)

    def test_convert_nan(self):
        """A NaN input is expected to become NaT; other rows convert as usual."""
        stamps = [
            '2004-02-13T00:00:00.000000000',
            '2005-02-15T00:00:00.000000000',
            '2007-01-12T00:00:00.000000000',
            '2007-01-31T00:00:00.000000000',
            'NaT',
            '2007-02-15T00:00:00.000000000',
        ]
        expected = pd.DataFrame(data=[(numpy.datetime64(s),) for s in stamps], columns=[0])

        series = dero.pandas.convert_sas_date_to_pandas_date(self.df_sasdate_nan['datadate'])
        converted = pd.DataFrame(series)

        assert_frame_equal(expected, converted)
        
class TestMapWindows(DataFrameTest):
    """
    Tests for dero.pandas._map_windows with method='first' and method='between'.

    `times` and `expect_dfs` are parallel lists: expect_dfs[i] is the expected output of the
    'between' method for times[i].
    """

    # Window definitions passed as the second argument of _map_windows.
    times = [
        [-4, -2, 0],
        [-3, 1, 2],
        [4, 5, 6],
        [0, 1, 2],
        [-1, 0, 1]
    ]

    # Eight consecutive daily observations for each of two PERMNOs, dates still as strings.
    df_period_str = pd.DataFrame([
                                (10516, '1/1/2000', 1.01),
                                (10516, '1/2/2000', 1.02),
                                (10516, '1/3/2000', 1.03),
                                (10516, '1/4/2000', 1.04),
                                (10516, '1/5/2000', 1.05),
                                (10516, '1/6/2000', 1.06),
                                (10516, '1/7/2000', 1.07),
                                (10516, '1/8/2000', 1.08),
                                (10517, '1/1/2000', 1.09),
                                (10517, '1/2/2000', 1.10),
                                (10517, '1/3/2000', 1.11),
                                (10517, '1/4/2000', 1.12),
                                (10517, '1/5/2000', 1.05),
                                (10517, '1/6/2000', 1.06),
                                (10517, '1/7/2000', 1.07),
                                (10517, '1/8/2000', 1.08),
                               ], columns = ['PERMNO','Date', 'RET'])

    # Same data with 'Date' parsed to datetimes; this is the frame the tests pass in.
    df_period = df_period_str.copy()
    df_period['Date'] = pd.to_datetime(df_period['Date'])

    expect_dfs = [
        # Expected for times[0] == [-4, -2, 0]
        pd.DataFrame(data = [
                    (10516, Timestamp('2000-01-01 00:00:00'), 1.01, 0),
                    (10516, Timestamp('2000-01-02 00:00:00'), 1.02, 1),
                    (10516, Timestamp('2000-01-03 00:00:00'), 1.03, 1),
                    (10516, Timestamp('2000-01-04 00:00:00'), 1.04, 2),
                    (10516, Timestamp('2000-01-05 00:00:00'), 1.05, 2),
                    (10516, Timestamp('2000-01-06 00:00:00'), 1.06, 3),
                    (10516, Timestamp('2000-01-07 00:00:00'), 1.07, 3),
                    (10516, Timestamp('2000-01-08 00:00:00'), 1.08, 3),
                    (10517, Timestamp('2000-01-01 00:00:00'), 1.09, 0),
                    (10517, Timestamp('2000-01-02 00:00:00'), 1.1, 1),
                    (10517, Timestamp('2000-01-03 00:00:00'), 1.11, 1),
                    (10517, Timestamp('2000-01-04 00:00:00'), 1.12, 2),
                    (10517, Timestamp('2000-01-05 00:00:00'), 1.05, 2),
                    (10517, Timestamp('2000-01-06 00:00:00'), 1.06, 3),
                    (10517, Timestamp('2000-01-07 00:00:00'), 1.07, 3),
                    (10517, Timestamp('2000-01-08 00:00:00'), 1.08, 3),
                    ], columns = ['PERMNO', 'Date', 'RET', '__map_window__']),
        # Expected for times[1] == [-3, 1, 2]
        pd.DataFrame(data = [
                    (10516, Timestamp('2000-01-01 00:00:00'), 1.01, 0),
                    (10516, Timestamp('2000-01-02 00:00:00'), 1.02, 1),
                    (10516, Timestamp('2000-01-03 00:00:00'), 1.03, 1),
                    (10516, Timestamp('2000-01-04 00:00:00'), 1.04, 1),
                    (10516, Timestamp('2000-01-05 00:00:00'), 1.05, 1),
                    (10516, Timestamp('2000-01-06 00:00:00'), 1.06, 2),
                    (10516, Timestamp('2000-01-07 00:00:00'), 1.07, 3),
                    (10516, Timestamp('2000-01-08 00:00:00'), 1.08, 3),
                    (10517, Timestamp('2000-01-01 00:00:00'), 1.09, 0),
                    (10517, Timestamp('2000-01-02 00:00:00'), 1.1, 1),
                    (10517, Timestamp('2000-01-03 00:00:00'), 1.11, 1),
                    (10517, Timestamp('2000-01-04 00:00:00'), 1.12, 1),
                    (10517, Timestamp('2000-01-05 00:00:00'), 1.05, 1),
                    (10517, Timestamp('2000-01-06 00:00:00'), 1.06, 2),
                    (10517, Timestamp('2000-01-07 00:00:00'), 1.07, 3),
                    (10517, Timestamp('2000-01-08 00:00:00'), 1.08, 3),
                    ], columns = ['PERMNO', 'Date', 'RET', '__map_window__']),
        # Expected for times[2] == [4, 5, 6]
        pd.DataFrame(data = [
                    (10516, Timestamp('2000-01-01 00:00:00'), 1.01, 0),
                    (10516, Timestamp('2000-01-02 00:00:00'), 1.02, 1),
                    (10516, Timestamp('2000-01-03 00:00:00'), 1.03, 2),
                    (10516, Timestamp('2000-01-04 00:00:00'), 1.04, 3),
                    (10516, Timestamp('2000-01-05 00:00:00'), 1.05, 3),
                    (10516, Timestamp('2000-01-06 00:00:00'), 1.06, 3),
                    (10516, Timestamp('2000-01-07 00:00:00'), 1.07, 3),
                    (10516, Timestamp('2000-01-08 00:00:00'), 1.08, 3),
                    (10517, Timestamp('2000-01-01 00:00:00'), 1.09, 0),
                    (10517, Timestamp('2000-01-02 00:00:00'), 1.1, 1),
                    (10517, Timestamp('2000-01-03 00:00:00'), 1.11, 2),
                    (10517, Timestamp('2000-01-04 00:00:00'), 1.12, 3),
                    (10517, Timestamp('2000-01-05 00:00:00'), 1.05, 3),
                    (10517, Timestamp('2000-01-06 00:00:00'), 1.06, 3),
                    (10517, Timestamp('2000-01-07 00:00:00'), 1.07, 3),
                    (10517, Timestamp('2000-01-08 00:00:00'), 1.08, 3),
                    ], columns = ['PERMNO', 'Date', 'RET', '__map_window__']),
        # Expected for times[3] == [0, 1, 2] (same values as the previous frame)
        pd.DataFrame(data = [
                    (10516, Timestamp('2000-01-01 00:00:00'), 1.01, 0),
                    (10516, Timestamp('2000-01-02 00:00:00'), 1.02, 1),
                    (10516, Timestamp('2000-01-03 00:00:00'), 1.03, 2),
                    (10516, Timestamp('2000-01-04 00:00:00'), 1.04, 3),
                    (10516, Timestamp('2000-01-05 00:00:00'), 1.05, 3),
                    (10516, Timestamp('2000-01-06 00:00:00'), 1.06, 3),
                    (10516, Timestamp('2000-01-07 00:00:00'), 1.07, 3),
                    (10516, Timestamp('2000-01-08 00:00:00'), 1.08, 3),
                    (10517, Timestamp('2000-01-01 00:00:00'), 1.09, 0),
                    (10517, Timestamp('2000-01-02 00:00:00'), 1.1, 1),
                    (10517, Timestamp('2000-01-03 00:00:00'), 1.11, 2),
                    (10517, Timestamp('2000-01-04 00:00:00'), 1.12, 3),
                    (10517, Timestamp('2000-01-05 00:00:00'), 1.05, 3),
                    (10517, Timestamp('2000-01-06 00:00:00'), 1.06, 3),
                    (10517, Timestamp('2000-01-07 00:00:00'), 1.07, 3),
                    (10517, Timestamp('2000-01-08 00:00:00'), 1.08, 3),
                    ], columns = ['PERMNO', 'Date', 'RET', '__map_window__']),
        # Expected for times[4] == [-1, 0, 1] (same values as the previous frame)
        pd.DataFrame(data = [
                    (10516, Timestamp('2000-01-01 00:00:00'), 1.01, 0),
                    (10516, Timestamp('2000-01-02 00:00:00'), 1.02, 1),
                    (10516, Timestamp('2000-01-03 00:00:00'), 1.03, 2),
                    (10516, Timestamp('2000-01-04 00:00:00'), 1.04, 3),
                    (10516, Timestamp('2000-01-05 00:00:00'), 1.05, 3),
                    (10516, Timestamp('2000-01-06 00:00:00'), 1.06, 3),
                    (10516, Timestamp('2000-01-07 00:00:00'), 1.07, 3),
                    (10516, Timestamp('2000-01-08 00:00:00'), 1.08, 3),
                    (10517, Timestamp('2000-01-01 00:00:00'), 1.09, 0),
                    (10517, Timestamp('2000-01-02 00:00:00'), 1.1, 1),
                    (10517, Timestamp('2000-01-03 00:00:00'), 1.11, 2),
                    (10517, Timestamp('2000-01-04 00:00:00'), 1.12, 3),
                    (10517, Timestamp('2000-01-05 00:00:00'), 1.05, 3),
                    (10517, Timestamp('2000-01-06 00:00:00'), 1.06, 3),
                    (10517, Timestamp('2000-01-07 00:00:00'), 1.07, 3),
                    (10517, Timestamp('2000-01-08 00:00:00'), 1.08, 3),
                    ], columns = ['PERMNO', 'Date', 'RET', '__map_window__'])
    ]

    # Expected output of method='first' for times[0].
    expect_df_first = pd.DataFrame(data = [
                    (10516, Timestamp('2000-01-01 00:00:00'), 1.01, 0),
                    (10516, Timestamp('2000-01-02 00:00:00'), 1.02, 1),
                    (10516, Timestamp('2000-01-03 00:00:00'), 1.03, 1),
                    (10516, Timestamp('2000-01-04 00:00:00'), 1.04, 1),
                    (10516, Timestamp('2000-01-05 00:00:00'), 1.05, 1),
                    (10516, Timestamp('2000-01-06 00:00:00'), 1.06, 1),
                    (10516, Timestamp('2000-01-07 00:00:00'), 1.07, 1),
                    (10516, Timestamp('2000-01-08 00:00:00'), 1.08, 1),
                    (10517, Timestamp('2000-01-01 00:00:00'), 1.09, 0),
                    (10517, Timestamp('2000-01-02 00:00:00'), 1.1, 1),
                    (10517, Timestamp('2000-01-03 00:00:00'), 1.11, 1),
                    (10517, Timestamp('2000-01-04 00:00:00'), 1.12, 1),
                    (10517, Timestamp('2000-01-05 00:00:00'), 1.05, 1),
                    (10517, Timestamp('2000-01-06 00:00:00'), 1.06, 1),
                    (10517, Timestamp('2000-01-07 00:00:00'), 1.07, 1),
                    (10517, Timestamp('2000-01-08 00:00:00'), 1.08, 1),
                    ], columns = ['PERMNO', 'Date', 'RET', '__map_window__'])

    def run_for_each_time(func):
        """
        Decorator that can be applied to any function whose args are (self, time, expect_df) which runs the function
        for each time in self.times and picks the appropriate matching expect_df.

        The returned wrapper takes only `self`. If the wrapped function raises an AssertionError,
        it is re-raised with the failing `time` included in the message so the broken window
        definition can be identified.
        """
        # NOTE(review): do not add functools.wraps here -- pytest presumably resolves test
        # arguments from the wrapped signature and would then look for fixtures named
        # `time` and `expect_df`; verify before changing.
        def run(self):
            # The two lists are matched by position, so a length mismatch means a window
            # or an expected frame would otherwise be silently left untested.
            assert len(self.times) == len(self.expect_dfs), \
                'times and expect_dfs must have the same length, got {} and {}'.format(
                    len(self.times), len(self.expect_dfs))
            for time, expect_df in zip(self.times, self.expect_dfs):
                try:
                    func(self, time, expect_df)
                except AssertionError as err:
                    raise AssertionError('{} failed for time={!r}:\n{}'.format(
                        func.__name__, time, err))
        return run

    def test_method_first(self):
        """method='first' on times[0] must reproduce expect_df_first."""
        result = dero.pandas._map_windows(self.df_period, self.times[0], method='first',
                                          periodvar='Date', byvars=['PERMNO'])

        assert_frame_equal(result, self.expect_df_first)


    @run_for_each_time
    def test_method_between(self, time, expect_df):
        """method='between' must reproduce the expected frame paired with each entry of times."""
        result = dero.pandas._map_windows(self.df_period, time, method='between',
                                          periodvar='Date', byvars=['PERMNO'])

        assert_frame_equal(result, expect_df)
    
class TestLeftMergeLatest(DataFrameTest):
    """Tests dero.pandas.left_merge_latest with its default backend and with backend='sql'."""

    def test_left_merge_latest(self):
        """
        Merge df_gvkey_str with df_gvkey_str2 on 'GVKEY' using both backends and compare each
        result with the same expected frame (dtypes are not checked).
        """
        expect_df = pd.DataFrame(data = [
            ('001076', Timestamp('1995-03-01 00:00:00'), Timestamp('1995-02-01 00:00:00')),
            ('001076', Timestamp('1995-04-01 00:00:00'), Timestamp('1995-03-02 00:00:00')),
            ('001722', Timestamp('2012-01-01 00:00:00'), Timestamp('2011-11-01 00:00:00')),
            ('001722', Timestamp('2012-07-01 00:00:00'), Timestamp('2011-11-01 00:00:00')),
            # Missing values in the datetime columns are written as pd.NaT. A timedelta NaT
            # is a different missing-value type and would keep the column from being
            # datetime-typed on pandas versions that distinguish the two.
            ('001722', pd.NaT, pd.NaT),
            # A missing GVKEY is written as nan (not a datetime NaT): the column holds strings.
            # NOTE(review): assumes the merged frames carry nan for the missing key -- confirm.
            (nan, numpy.datetime64('2012-01-01T00:00:00.000000000'), numpy.datetime64('NaT')),
            ], columns = ['GVKEY', 'Date', 'Date_y'])

        lm = dero.pandas.left_merge_latest(self.df_gvkey_str, self.df_gvkey_str2, on='GVKEY')
        lm_sql = dero.pandas.left_merge_latest(self.df_gvkey_str, self.df_gvkey_str2,
                                               on='GVKEY', backend='sql')

        assert_frame_equal(expect_df, lm, check_dtype=False)
        assert_frame_equal(expect_df, lm_sql, check_dtype=False)

        
class TestVarChangeByGroups(DataFrameTest):
    """Tests for dero.pandas.var_change_by_groups using the shared DataFrameTest fixtures."""

    def test_multi_byvar_single_var(self):
        """One variable ('RET') and two by-variables: expects one added 'RET_change' column."""
        expected = pd.DataFrame(
            data=[
                (10516, 'a', '1/1/2000', 1.01, nan),
                (10516, 'a', '1/2/2000', 1.02, 0.010000000000000009),
                (10516, 'a', '1/3/2000', 1.03, 0.010000000000000009),
                (10516, 'a', '1/4/2000', 1.04, 0.010000000000000009),
                (10516, 'b', '1/1/2000', 1.05, nan),
                (10516, 'b', '1/2/2000', 1.06, 0.010000000000000009),
                (10516, 'b', '1/3/2000', 1.07, 0.010000000000000009),
                (10516, 'b', '1/4/2000', 1.08, 0.010000000000000009),
                (10517, 'a', '1/1/2000', 1.09, nan),
                (10517, 'a', '1/2/2000', 1.1, 0.010000000000000009),
                (10517, 'a', '1/3/2000', 1.11, 0.010000000000000009),
                (10517, 'a', '1/4/2000', 1.12, 0.010000000000000009),
            ],
            columns=['PERMNO', 'byvar', 'Date', 'RET', 'RET_change'],
        )

        group_cols = ['PERMNO', 'byvar']
        actual = dero.pandas.var_change_by_groups(self.df, 'RET', group_cols)

        assert_frame_equal(expected, actual)

    def test_multi_byvar_multi_var(self):
        """Two variables and two by-variables: expects a '_change' column for each variable."""
        expected = pd.DataFrame(
            data=[
                (10516, 'a', '1/1/2000', 1.01, 0, nan, nan),
                (10516, 'a', '1/2/2000', 1.02, 1, 0.010000000000000009, 1.0),
                (10516, 'a', '1/3/2000', 1.03, 1, 0.010000000000000009, 0.0),
                (10516, 'a', '1/4/2000', 1.04, 0, 0.010000000000000009, -1.0),
                (10516, 'b', '1/1/2000', 1.05, 1, nan, nan),
                (10516, 'b', '1/2/2000', 1.06, 1, 0.010000000000000009, 0.0),
                (10516, 'b', '1/3/2000', 1.07, 1, 0.010000000000000009, 0.0),
                (10516, 'b', '1/4/2000', 1.08, 1, 0.010000000000000009, 0.0),
                (10517, 'a', '1/1/2000', 1.09, 0, nan, nan),
                (10517, 'a', '1/2/2000', 1.1, 0, 0.010000000000000009, 0.0),
                (10517, 'a', '1/3/2000', 1.11, 0, 0.010000000000000009, 0.0),
                (10517, 'a', '1/4/2000', 1.12, 1, 0.010000000000000009, 1.0),
            ],
            columns=['PERMNO', 'byvar', 'Date', 'RET', 'weight', 'RET_change', 'weight_change'],
        )

        change_vars = ['RET', 'weight']
        group_cols = ['PERMNO', 'byvar']
        actual = dero.pandas.var_change_by_groups(self.df_weight, change_vars, group_cols)

        assert_frame_equal(expected, actual)
        
class TestFillExcludedRows(DataFrameTest):
    """Tests for dero.pandas.fill_excluded_rows using the df_gvkey_str fixture."""

    # Expected result when no fill variables are given: each listed GVKEY paired with each
    # listed Date.
    expect_df_nofill = pd.DataFrame(
        data=[
            ('001076', Timestamp('1995-03-01 00:00:00')),
            ('001076', Timestamp('1995-04-01 00:00:00')),
            ('001076', Timestamp('2012-01-01 00:00:00')),
            ('001076', Timestamp('2012-07-01 00:00:00')),
            ('001722', Timestamp('1995-03-01 00:00:00')),
            ('001722', Timestamp('1995-04-01 00:00:00')),
            ('001722', Timestamp('2012-01-01 00:00:00')),
            ('001722', Timestamp('2012-07-01 00:00:00')),
        ],
        columns=['GVKEY', 'Date'],
    )

    def test_no_fillvars_str_byvars(self):
        """By-variables given as column names."""
        by_columns = ['GVKEY', 'Date']
        actual = dero.pandas.fill_excluded_rows(self.df_gvkey_str, by_columns)
        assert_frame_equal(self.expect_df_nofill, actual)

    def test_no_fillvars_series_byvars(self):
        """By-variables given as a mix of a Series and a column name."""
        by_columns = [self.df_gvkey_str['GVKEY'], 'Date']
        actual = dero.pandas.fill_excluded_rows(self.df_gvkey_str, by_columns)
        assert_frame_equal(self.expect_df_nofill, actual)

    def test_fillvars(self):
        """With fill variable 'var' and value=0, the added rows are expected to hold 0.0."""
        # Input: the fixture plus a constant column 'var' equal to 1 on every row.
        with_var = self.df_gvkey_str.copy()
        with_var['var'] = 1

        expected = pd.DataFrame(
            data=[
                ('001076', Timestamp('1995-03-01 00:00:00'), 1.0),
                ('001076', Timestamp('1995-04-01 00:00:00'), 1.0),
                ('001076', Timestamp('2012-01-01 00:00:00'), 0.0),
                ('001076', Timestamp('2012-07-01 00:00:00'), 0.0),
                ('001722', Timestamp('1995-03-01 00:00:00'), 0.0),
                ('001722', Timestamp('1995-04-01 00:00:00'), 0.0),
                ('001722', Timestamp('2012-01-01 00:00:00'), 1.0),
                ('001722', Timestamp('2012-07-01 00:00:00'), 1.0),
            ],
            columns=['GVKEY', 'Date', 'var'],
        )

        actual = dero.pandas.fill_excluded_rows(with_var, ['GVKEY', 'Date'], 'var', value=0)
        assert_frame_equal(expected, actual)


Overwriting test_pandas.py


# Pandas Utilities 

In [3]:
%%writefile test_pdutils.py

from context import dero

# import pandas as pd
# from pandas.util.testing import assert_frame_equal
# from pandas import Timestamp
# from numpy import nan
# import numpy


    
   

Overwriting test_pdutils.py


# Data

In [4]:
%%writefile test_data.py

from context import dero

import pandas as pd
from pandas.util.testing import assert_frame_equal
from pandas import Timestamp
from numpy import nan
import numpy
import datetime

class DataFrameTest:
    """Shared input DataFrames for the test classes in this module (a fixture holder, no tests)."""

    # Ticker observations keyed by 'byvar' and 'Date'.
    ticker_df = pd.DataFrame(
        data=[
            ('a', Timestamp('2000-01-01 00:00:00'), 'ADM'),
            ('a', Timestamp('2000-01-02 00:00:00'), 'ADM'),
            ('a', Timestamp('2000-01-03 00:00:00'), 'ADM'),
            ('a', Timestamp('2000-01-04 00:00:00'), 'ADM'),
            ('b', Timestamp('2000-01-01 00:00:00'), 'ADM'),
            ('b', Timestamp('2000-01-02 00:00:00'), 'ADM'),
            ('b', Timestamp('2000-01-03 00:00:00'), 'ADM'),
            ('b', Timestamp('2000-01-04 00:00:00'), 'ADM'),
            ('a', Timestamp('2008-01-01 00:00:00'), 'AAN'),
            ('a', Timestamp('2009-01-02 00:00:00'), 'AAN'),
            ('a', Timestamp('2010-01-03 00:00:00'), 'AAN'),
            ('a', Timestamp('2011-01-04 00:00:00'), 'AAN'),
        ],
        columns=['byvar', 'Date', 'TICKER'],
    )

    # PERMNO observations on the same byvar/Date rows, with two missing PERMNOs.
    permno_df_with_nan = pd.DataFrame(
        data=[
            ('a', Timestamp('2000-01-01 00:00:00'), 10516.0),
            ('a', Timestamp('2000-01-02 00:00:00'), 10516.0),
            ('a', Timestamp('2000-01-03 00:00:00'), 10516.0),
            ('a', Timestamp('2000-01-04 00:00:00'), 10516.0),
            ('b', Timestamp('2000-01-01 00:00:00'), 10516.0),
            ('b', Timestamp('2000-01-02 00:00:00'), 10516.0),
            ('b', Timestamp('2000-01-03 00:00:00'), 10516.0),
            ('b', Timestamp('2000-01-04 00:00:00'), 10516.0),
            ('a', Timestamp('2008-01-01 00:00:00'), nan),
            ('a', Timestamp('2009-01-02 00:00:00'), nan),
            ('a', Timestamp('2010-01-03 00:00:00'), 78049.0),
            ('a', Timestamp('2011-01-04 00:00:00'), 10517.0),
        ],
        columns=['byvar', 'Date', 'PERMNO'],
    )

    # Returns by PERMNO/byvar with 'Date' kept as strings.
    df = pd.DataFrame(
        [
            (10516, 'a', '1/1/2000', 1.01),
            (10516, 'a', '1/2/2000', 1.02),
            (10516, 'a', '1/3/2000', 1.03),
            (10516, 'a', '1/4/2000', 1.04),
            (10516, 'b', '1/1/2000', 1.05),
            (10516, 'b', '1/2/2000', 1.06),
            (10516, 'b', '1/3/2000', 1.07),
            (10516, 'b', '1/4/2000', 1.08),
            (10517, 'a', '1/1/2000', 1.09),
            (10517, 'a', '1/2/2000', 1.10),
            (10517, 'a', '1/3/2000', 1.11),
            (10517, 'a', '1/4/2000', 1.12),
        ],
        columns=['PERMNO', 'byvar', 'Date', 'RET'],
    )

    # Ticker rows carrying an extra numeric column plus 'Year' and 'Month' columns.
    df_ticker_extra_cols_year_month = pd.DataFrame(
        data=[
            ('a', Timestamp('2000-01-01 00:00:00'), 'ADM', 1.01, 2000, 1),
            ('a', Timestamp('2000-01-02 00:00:00'), 'ADM', 1.02, 2000, 1),
            ('a', Timestamp('2000-01-03 00:00:00'), 'ADM', 1.03, 2000, 1),
            ('a', Timestamp('2000-01-04 00:00:00'), 'ADM', 1.04, 2000, 1),
            ('b', Timestamp('2000-01-01 00:00:00'), 'ADM', 1.05, 2000, 1),
            ('b', Timestamp('2000-01-02 00:00:00'), 'ADM', 1.06, 2000, 1),
            ('b', Timestamp('2000-01-03 00:00:00'), 'ADM', 1.07, 2000, 1),
            ('b', Timestamp('2000-01-04 00:00:00'), 'ADM', 1.08, 2000, 1),
            ('a', Timestamp('2008-01-01 00:00:00'), 'AAN', 1.09, 2008, 1),
            ('a', Timestamp('2009-01-02 00:00:00'), 'AAN', 1.1, 2009, 1),
            ('a', Timestamp('2010-01-03 00:00:00'), 'AAN', 1.11, 2010, 1),
            ('a', Timestamp('2011-01-04 00:00:00'), 'AAN', 1.12, 2011, 1),
        ],
        columns=['byvar', 'Date', 'TICKER', 'other', 'Year', 'Month'],
    )

    # Copy of `df` with 'Date' parsed to datetimes; `df` itself keeps its string dates.
    df_datetime = df.assign(Date=pd.to_datetime(df['Date']))

    # GVKEY stored as strings, including one missing date and one missing key;
    # 'Date' is parsed to datetimes right after construction.
    df_gvkey_str = pd.DataFrame(
        [
            ('001076', '3/1/1995'),
            ('001076', '4/1/1995'),
            ('001722', '1/1/2012'),
            ('001722', '7/1/2012'),
            ('001722', nan),
            (nan, '1/1/2012'),
        ],
        columns=['GVKEY', 'Date'],
    )
    df_gvkey_str = df_gvkey_str.assign(Date=pd.to_datetime(df_gvkey_str['Date']))

    # Same rows with GVKEY cast to float64.
    df_gvkey_num = df_gvkey_str.assign(GVKEY=df_gvkey_str['GVKEY'].astype('float64'))
    
    
class TestMergeDSENames(DataFrameTest):
    """Tests for dero.data.merge_dsenames called with its default merge settings."""

    def test_on_ticker_get_permno(self):
        """ticker_df is expected to come back with a 'PERMNO' column appended."""
        expected = pd.DataFrame(
            data=[
                ('a', Timestamp('2000-01-01 00:00:00'), 'ADM', 10516.0),
                ('a', Timestamp('2000-01-02 00:00:00'), 'ADM', 10516.0),
                ('a', Timestamp('2000-01-03 00:00:00'), 'ADM', 10516.0),
                ('a', Timestamp('2000-01-04 00:00:00'), 'ADM', 10516.0),
                ('b', Timestamp('2000-01-01 00:00:00'), 'ADM', 10516.0),
                ('b', Timestamp('2000-01-02 00:00:00'), 'ADM', 10516.0),
                ('b', Timestamp('2000-01-03 00:00:00'), 'ADM', 10516.0),
                ('b', Timestamp('2000-01-04 00:00:00'), 'ADM', 10516.0),
                ('a', Timestamp('2008-01-01 00:00:00'), 'AAN', nan),
                ('a', Timestamp('2009-01-02 00:00:00'), 'AAN', nan),
                ('a', Timestamp('2010-01-03 00:00:00'), 'AAN', 78049.0),
                ('a', Timestamp('2011-01-04 00:00:00'), 'AAN', 10517.0),
            ],
            columns=['byvar', 'Date', 'TICKER', 'PERMNO'],
        )

        # The default is to merge on ticker and get permno.
        merged = dero.data.merge_dsenames(self.ticker_df, other_byvars='byvar')

        assert_frame_equal(expected, merged)

    def test_on_ticker_get_permno_extra_cols(self):
        """Extra input columns are expected to be kept, with 'PERMNO' appended last."""
        expected = pd.DataFrame(
            data=[
                ('a', Timestamp('2000-01-01 00:00:00'), 'ADM', 1.01, 2000, 1, 10516.0),
                ('a', Timestamp('2000-01-02 00:00:00'), 'ADM', 1.02, 2000, 1, 10516.0),
                ('a', Timestamp('2000-01-03 00:00:00'), 'ADM', 1.03, 2000, 1, 10516.0),
                ('a', Timestamp('2000-01-04 00:00:00'), 'ADM', 1.04, 2000, 1, 10516.0),
                ('b', Timestamp('2000-01-01 00:00:00'), 'ADM', 1.05, 2000, 1, 10516.0),
                ('b', Timestamp('2000-01-02 00:00:00'), 'ADM', 1.06, 2000, 1, 10516.0),
                ('b', Timestamp('2000-01-03 00:00:00'), 'ADM', 1.07, 2000, 1, 10516.0),
                ('b', Timestamp('2000-01-04 00:00:00'), 'ADM', 1.08, 2000, 1, 10516.0),
                ('a', Timestamp('2008-01-01 00:00:00'), 'AAN', 1.09, 2008, 1, nan),
                ('a', Timestamp('2009-01-02 00:00:00'), 'AAN', 1.1, 2009, 1, nan),
                ('a', Timestamp('2010-01-03 00:00:00'), 'AAN', 1.11, 2010, 1, 78049.0),
                ('a', Timestamp('2011-01-04 00:00:00'), 'AAN', 1.12, 2011, 1, 10517.0),
            ],
            columns=['byvar', 'Date', 'TICKER', 'other', 'Year', 'Month', 'PERMNO'],
        )

        source = self.df_ticker_extra_cols_year_month
        merged = dero.data.merge_dsenames(source, other_byvars='byvar')

        assert_frame_equal(expected, merged)
        
class TestGetGvkeyOrPermno(DataFrameTest):
    """Tests for dero.data.get_gvkey_or_permno called with its default lookup direction."""

    def test_get_gvkey_with_nan(self):
        """Rows with a missing PERMNO are expected to get a missing GVKEY as well."""
        expected = pd.DataFrame(
            data=[
                ('a', Timestamp('2000-01-01 00:00:00'), 10516.0, 1722),
                ('a', Timestamp('2000-01-02 00:00:00'), 10516.0, 1722),
                ('a', Timestamp('2000-01-03 00:00:00'), 10516.0, 1722),
                ('a', Timestamp('2000-01-04 00:00:00'), 10516.0, 1722),
                ('b', Timestamp('2000-01-01 00:00:00'), 10516.0, 1722),
                ('b', Timestamp('2000-01-02 00:00:00'), 10516.0, 1722),
                ('b', Timestamp('2000-01-03 00:00:00'), 10516.0, 1722),
                ('b', Timestamp('2000-01-04 00:00:00'), 10516.0, 1722),
                ('a', Timestamp('2008-01-01 00:00:00'), nan, nan),
                ('a', Timestamp('2009-01-02 00:00:00'), nan, nan),
                ('a', Timestamp('2010-01-03 00:00:00'), 78049.0, 1076),
                ('a', Timestamp('2011-01-04 00:00:00'), 10517.0, 1076),
            ],
            columns=['byvar', 'Date', 'PERMNO', 'GVKEY'],
        )

        # The default is to look up on permno and get gvkey.
        looked_up = dero.data.get_gvkey_or_permno(
            self.permno_df_with_nan,
            datevar='Date',
            other_byvars='byvar',
        )

        assert_frame_equal(expected, looked_up)
        
class TestGetAbret(DataFrameTest):
    """Tests for dero.data.get_abret at daily frequency with two by-variables."""

    def create_indf(self):
        """Return a copy of df_datetime with every 'Date' moved one day later."""
        one_day = datetime.timedelta(days=1)
        shifted = self.df_datetime.copy()
        # Push forward to get enough obs on trading days.
        shifted['Date'] = shifted['Date'] + one_day
        return shifted

    def test_multiple_byvars_daily(self):
        """Only an 'ABRET' column is expected to be added to the input."""
        source = self.create_indf()

        expected = pd.DataFrame(
            data=[
                (10516, 'a', Timestamp('2000-01-02 00:00:00'), 1.01, nan),
                (10516, 'a', Timestamp('2000-01-03 00:00:00'), 1.02, 1.020482537872975),
                (10516, 'a', Timestamp('2000-01-04 00:00:00'), 1.03, 1.0327593010764478),
                (10516, 'a', Timestamp('2000-01-05 00:00:00'), 1.04, 1.0400611667726307),
                (10516, 'b', Timestamp('2000-01-02 00:00:00'), 1.05, nan),
                (10516, 'b', Timestamp('2000-01-03 00:00:00'), 1.06, 1.060482537872975),
                (10516, 'b', Timestamp('2000-01-04 00:00:00'), 1.07, 1.072759301076448),
                (10516, 'b', Timestamp('2000-01-05 00:00:00'), 1.08, 1.0800611667726308),
                (10517, 'a', Timestamp('2000-01-02 00:00:00'), 1.09, nan),
                (10517, 'a', Timestamp('2000-01-03 00:00:00'), 1.1, 1.100482537872975),
                (10517, 'a', Timestamp('2000-01-04 00:00:00'), 1.11, 1.1127593010764478),
                (10517, 'a', Timestamp('2000-01-05 00:00:00'), 1.12, 1.1200611667726308),
            ],
            columns=['PERMNO', 'byvar', 'Date', 'RET', 'ABRET'],
        )

        actual = dero.data.get_abret(source, ['PERMNO', 'byvar'], freq='d', abret_fac=1)

        assert_frame_equal(expected, actual)

    def test_multiple_byvars_daily_includecoef_includefac(self):
        """With includecoef and includefac set, factor and coefficient columns are expected too."""
        source = self.create_indf()

        expected = pd.DataFrame(
            data=[
                (10516, 'a', Timestamp('2000-01-02 00:00:00'), 1.01, nan, nan,
                 1.031101001907351, 0.06796308070068235, nan),
                (10516, 'a', Timestamp('2000-01-03 00:00:00'), 1.02, -0.0071, 0.00021,
                 1.031101001907351, 0.06796308070068235, 1.020482537872975),
                (10516, 'a', Timestamp('2000-01-04 00:00:00'), 1.03, -0.0406, 0.00021,
                 1.031101001907351, 0.06796308070068235, 1.0327593010764478),
                (10516, 'a', Timestamp('2000-01-05 00:00:00'), 1.04, -0.0009, 0.00021,
                 1.031101001907351, 0.06796308070068235, 1.0400611667726307),
                (10516, 'b', Timestamp('2000-01-02 00:00:00'), 1.05, nan, nan,
                 1.071101001907351, 0.06796308070068591, nan),
                (10516, 'b', Timestamp('2000-01-03 00:00:00'), 1.06, -0.0071, 0.00021,
                 1.071101001907351, 0.06796308070068591, 1.060482537872975),
                (10516, 'b', Timestamp('2000-01-04 00:00:00'), 1.07, -0.0406, 0.00021,
                 1.071101001907351, 0.06796308070068591, 1.072759301076448),
                (10516, 'b', Timestamp('2000-01-05 00:00:00'), 1.08, -0.0009, 0.00021,
                 1.071101001907351, 0.06796308070068591, 1.0800611667726308),
                (10517, 'a', Timestamp('2000-01-02 00:00:00'), 1.09, nan, nan,
                 1.111101001907351, 0.06796308070068235, nan),
                (10517, 'a', Timestamp('2000-01-03 00:00:00'), 1.1, -0.0071, 0.00021,
                 1.111101001907351, 0.06796308070068235, 1.100482537872975),
                (10517, 'a', Timestamp('2000-01-04 00:00:00'), 1.11, -0.0406, 0.00021,
                 1.111101001907351, 0.06796308070068235, 1.1127593010764478),
                (10517, 'a', Timestamp('2000-01-05 00:00:00'), 1.12, -0.0009, 0.00021,
                 1.111101001907351, 0.06796308070068235, 1.1200611667726308),
            ],
            columns=['PERMNO', 'byvar', 'Date', 'RET', 'mktrf', 'rf', 'const', 'coef_mktrf', 'ABRET'],
        )

        actual = dero.data.get_abret(
            source,
            ['PERMNO', 'byvar'],
            freq='d',
            abret_fac=1,
            includecoef=True,
            includefac=True,
        )

        assert_frame_equal(expected, actual)
        
class TestGetCRSP:
    """Compare dero.data.GetCRSP.pull_crsp output with hand-recorded expected frames.

    Both class attributes are created once, when the class body executes, and
    are shared by every test method.
    """

    # Holder of the input frames; the tests read df_datetime and ticker_df from it.
    input_data = DataFrameTest()

    # Object under test, constructed with debug=True.
    # NOTE(review): presumably debug mode reads a small sample data set - confirm.
    crsp = dero.data.GetCRSP(debug=True)

    def test_get_prc_shrout_same_period_monthly(self):
        """A pull with no options adds PRC and SHROUT columns to the input rows."""
        # In the expected output each PERMNO has one (PRC, SHROUT) pair on all rows.
        pair_10516 = (11.75, 608360.0)
        pair_10517 = (-16.8125, 3830.0)
        expected = pd.DataFrame(
            [
                (10516, 'a', Timestamp('2000-01-01 00:00:00'), 1.01) + pair_10516,
                (10516, 'a', Timestamp('2000-01-02 00:00:00'), 1.02) + pair_10516,
                (10516, 'a', Timestamp('2000-01-03 00:00:00'), 1.03) + pair_10516,
                (10516, 'a', Timestamp('2000-01-04 00:00:00'), 1.04) + pair_10516,
                (10516, 'b', Timestamp('2000-01-01 00:00:00'), 1.05) + pair_10516,
                (10516, 'b', Timestamp('2000-01-02 00:00:00'), 1.06) + pair_10516,
                (10516, 'b', Timestamp('2000-01-03 00:00:00'), 1.07) + pair_10516,
                (10516, 'b', Timestamp('2000-01-04 00:00:00'), 1.08) + pair_10516,
                (10517, 'a', Timestamp('2000-01-01 00:00:00'), 1.09) + pair_10517,
                (10517, 'a', Timestamp('2000-01-02 00:00:00'), 1.1) + pair_10517,
                (10517, 'a', Timestamp('2000-01-03 00:00:00'), 1.11) + pair_10517,
                (10517, 'a', Timestamp('2000-01-04 00:00:00'), 1.12) + pair_10517,
            ],
            columns=['PERMNO', 'byvar', 'Date', 'RET', 'PRC', 'SHROUT'],
        )

        # PRC and SHROUT are what pull_crsp fetches by default.
        actual = self.crsp.pull_crsp(self.input_data.df_datetime)

        assert_frame_equal(expected, actual)

    def test_get_prc_shrout_same_period_daily(self):
        """With freq='d', PRC/SHROUT are expected to be NaN on 2000-01-01 and 2000-01-02."""
        missing = (nan, nan)
        expected = pd.DataFrame(
            [
                (10516, 'a', Timestamp('2000-01-01 00:00:00'), 1.01) + missing,
                (10516, 'a', Timestamp('2000-01-02 00:00:00'), 1.02) + missing,
                (10516, 'a', Timestamp('2000-01-03 00:00:00'), 1.03, 12.0, 608360.0),
                (10516, 'a', Timestamp('2000-01-04 00:00:00'), 1.04, 11.875, 608360.0),
                (10516, 'b', Timestamp('2000-01-01 00:00:00'), 1.05) + missing,
                (10516, 'b', Timestamp('2000-01-02 00:00:00'), 1.06) + missing,
                (10516, 'b', Timestamp('2000-01-03 00:00:00'), 1.07, 12.0, 608360.0),
                (10516, 'b', Timestamp('2000-01-04 00:00:00'), 1.08, 11.875, 608360.0),
                (10517, 'a', Timestamp('2000-01-01 00:00:00'), 1.09) + missing,
                (10517, 'a', Timestamp('2000-01-02 00:00:00'), 1.1) + missing,
                (10517, 'a', Timestamp('2000-01-03 00:00:00'), 1.11, 17.625, 3830.0),
                (10517, 'a', Timestamp('2000-01-04 00:00:00'), 1.12, 17.5625, 3830.0),
            ],
            columns=['PERMNO', 'byvar', 'Date', 'RET', 'PRC', 'SHROUT'],
        )

        # Debug output, visible when pytest shows captured stdout.
        print('Printing input df: ', self.input_data.df_datetime)

        # PRC and SHROUT are what pull_crsp fetches by default.
        actual = self.crsp.pull_crsp(self.input_data.df_datetime, freq='d')

        print('Printing output df: ', actual)

        assert_frame_equal(expected, actual)

    def test_get_ret_0_3_monthly(self):
        """get=['RET'] with time=[0,3] yields RET and RET3; the input RET shows up as RET_old."""
        # One (RET, RET3) pair per PERMNO in the expected output.
        pair_10516 = (-0.03092783503234386, -0.036363635212183)
        pair_10517 = (-0.07876712083816527, -0.09854014962911606)
        expected = pd.DataFrame(
            [
                (10516, 'a', Timestamp('2000-01-01 00:00:00'), 1.01) + pair_10516,
                (10516, 'a', Timestamp('2000-01-02 00:00:00'), 1.02) + pair_10516,
                (10516, 'a', Timestamp('2000-01-03 00:00:00'), 1.03) + pair_10516,
                (10516, 'a', Timestamp('2000-01-04 00:00:00'), 1.04) + pair_10516,
                (10516, 'b', Timestamp('2000-01-01 00:00:00'), 1.05) + pair_10516,
                (10516, 'b', Timestamp('2000-01-02 00:00:00'), 1.06) + pair_10516,
                (10516, 'b', Timestamp('2000-01-03 00:00:00'), 1.07) + pair_10516,
                (10516, 'b', Timestamp('2000-01-04 00:00:00'), 1.08) + pair_10516,
                (10517, 'a', Timestamp('2000-01-01 00:00:00'), 1.09) + pair_10517,
                (10517, 'a', Timestamp('2000-01-02 00:00:00'), 1.1) + pair_10517,
                (10517, 'a', Timestamp('2000-01-03 00:00:00'), 1.11) + pair_10517,
                (10517, 'a', Timestamp('2000-01-04 00:00:00'), 1.12) + pair_10517,
            ],
            columns=['PERMNO', 'byvar', 'Date', 'RET_old', 'RET', 'RET3'],
        )

        # freq is left at its default (monthly).
        actual = self.crsp.pull_crsp(
            self.input_data.df_datetime,
            get=['RET'],
            time=[0,3],
            other_byvars='byvar',
        )

        assert_frame_equal(expected, actual)

    def test_get_ret_0_3_daily(self):
        """Daily counterpart of the RET / RET3 pull (freq='d', time=[0,3])."""
        expected = pd.DataFrame(
            [
                (10516, 'a', Timestamp('2000-01-01 00:00:00'), 1.01,
                 -0.010309278033673763, -0.015789473429322243),
                (10516, 'a', Timestamp('2000-01-02 00:00:00'), 1.02,
                 -0.010309278033673763, -0.015789473429322243),
                (10516, 'a', Timestamp('2000-01-03 00:00:00'), 1.03,
                 -0.010309278033673763, 0.0053475936874747285),
                (10516, 'a', Timestamp('2000-01-04 00:00:00'), 1.04,
                 -0.010416666977107523, 0.01595744676887989),
                (10516, 'b', Timestamp('2000-01-01 00:00:00'), 1.05,
                 -0.010309278033673763, -0.015789473429322243),
                (10516, 'b', Timestamp('2000-01-02 00:00:00'), 1.06,
                 -0.010309278033673763, -0.015789473429322243),
                (10516, 'b', Timestamp('2000-01-03 00:00:00'), 1.07,
                 -0.010309278033673763, 0.0053475936874747285),
                (10516, 'b', Timestamp('2000-01-04 00:00:00'), 1.08,
                 -0.010416666977107523, 0.01595744676887989),
                (10517, 'a', Timestamp('2000-01-01 00:00:00'), 1.09,
                 -0.034246575087308884, -0.007117437664419413),
                (10517, 'a', Timestamp('2000-01-02 00:00:00'), 1.1,
                 -0.034246575087308884, -0.007117437664419413),
                (10517, 'a', Timestamp('2000-01-03 00:00:00'), 1.11,
                 -0.034246575087308884, 0.00358422938734293),
                (10517, 'a', Timestamp('2000-01-04 00:00:00'), 1.12,
                 -0.0035460991784930225, 0.0),
            ],
            columns=['PERMNO', 'byvar', 'Date', 'RET_old', 'RET', 'RET3'],
        )

        actual = self.crsp.pull_crsp(
            self.input_data.df_datetime,
            freq='d',
            get=['RET'],
            other_byvars='byvar',
            time=[0,3],
        )

        assert_frame_equal(expected, actual)

    def test_get_abret_daily(self):
        """Daily ABRET for abret=1, 3 and 4 with window=30.

        The expected frames contain only the 2000-01-03 and 2000-01-04 rows.
        """
        columns = ['PERMNO', 'byvar', 'Date', 'RET_old', 'RET', 'ABRET']

        # Every column except ABRET is identical across the three expected frames.
        shared_rows = [
            (10516, 'a', Timestamp('2000-01-03 00:00:00'), 1.03, -0.010309278033673763),
            (10516, 'a', Timestamp('2000-01-04 00:00:00'), 1.04, -0.010416666977107523),
            (10516, 'b', Timestamp('2000-01-03 00:00:00'), 1.07, -0.010309278033673763),
            (10516, 'b', Timestamp('2000-01-04 00:00:00'), 1.08, -0.010416666977107523),
            (10517, 'a', Timestamp('2000-01-03 00:00:00'), 1.11, -0.034246575087308884),
            (10517, 'a', Timestamp('2000-01-04 00:00:00'), 1.12, -0.0035460991784930225),
        ]

        # ABRET for each row of shared_rows, keyed by the value passed as `abret`.
        #
        # The abret=3 and abret=4 values were re-recorded: for some reason either
        # smb or hml seems to have changed when a new factor file was downloaded.
        # Superseded values, in the same row order:
        #   abret=3: -0.013343896667688964, -0.006417020408552444,
        #            -0.013343896667688964, -0.006417020408552444,
        #            -0.02258928932915736, 0.021195351098620765
        #   abret=4: -0.013287494835301781, -0.0064283138787431925,
        #            -0.013287494835301781, -0.0064283138787431925,
        #            -0.023520741010140338, 0.021088853167304484
        abret_column = {
            1: (-0.011225175770145413, -0.007502490855258082,
                -0.011225175770145413, -0.007502490855258082,
                -0.025145672948656786, 0.021587078051334464),
            3: (-0.013624490709800353, -0.006426335061958407,
                -0.013624490709800353, -0.006426335061958407,
                -0.02325567700348382, 0.021339495004597375),
            4: (-0.013604973773934336, -0.006442478998303897,
                -0.013604973773934336, -0.006442478998303897,
                -0.024563144388417765, 0.0211944197477086),
        }
        expected = {
            abret_arg: pd.DataFrame(
                [row + (abret,) for row, abret in zip(shared_rows, values)],
                columns=columns,
            )
            for abret_arg, values in abret_column.items()
        }

        # Run every pull (abret=1, then 3, then 4) before comparing anything.
        actual = {
            abret_arg: self.crsp.pull_crsp(self.input_data.df_datetime, freq='d', get=['RET'],
                                           other_byvars='byvar', abret=abret_arg, window=30)
            for abret_arg in (1, 3, 4)
        }

        for abret_arg in (1, 3, 4):
            assert_frame_equal(expected[abret_arg], actual[abret_arg])

    def test_get_abret_includecoef_includefac_daily(self):
        """includefac / includecoef add mktrf, rf, coef_mktrf and const next to ABRET."""
        # Trailing (RET, ABRET, mktrf, rf, coef_mktrf, const) values; PERMNO 10516
        # has the same ones for byvar 'a' and 'b' on a given date.
        tail_10516_jan3 = (-0.010309278033673763, -0.011225175770145413,
                           -0.0071, 0.00021, -0.1289996811931902, -0.004742858764353774)
        tail_10516_jan4 = (-0.010416666977107523, -0.007502490855258082,
                           -0.0406, 0.00021, 0.07177773699136555, -0.004474063724469225)
        tail_10517_jan3 = (-0.034246575087308884, -0.025145672948656786,
                           -0.0071, 0.00021, 1.2818172026270562, 0.00024156806287732818)
        tail_10517_jan4 = (-0.0035460991784930225, 0.021587078051334464,
                           -0.0406, 0.00021, 0.619043774133682, 0.0011960335526057253)
        expected = pd.DataFrame(
            [
                (10516, 'a', Timestamp('2000-01-03 00:00:00'), 1.03) + tail_10516_jan3,
                (10516, 'a', Timestamp('2000-01-04 00:00:00'), 1.04) + tail_10516_jan4,
                (10516, 'b', Timestamp('2000-01-03 00:00:00'), 1.07) + tail_10516_jan3,
                (10516, 'b', Timestamp('2000-01-04 00:00:00'), 1.08) + tail_10516_jan4,
                (10517, 'a', Timestamp('2000-01-03 00:00:00'), 1.11) + tail_10517_jan3,
                (10517, 'a', Timestamp('2000-01-04 00:00:00'), 1.12) + tail_10517_jan4,
            ],
            columns=['PERMNO', 'byvar', 'Date', 'RET_old', 'RET', 'ABRET',
                     'mktrf', 'rf', 'coef_mktrf', 'const'],
        )

        actual = self.crsp.pull_crsp(
            self.input_data.df_datetime,
            freq='d',
            get=['RET'],
            other_byvars='byvar',
            abret=1,
            window=30,
            includefac=True,
            includecoef=True,
        )

        assert_frame_equal(expected, actual)

    def test_get_abret_monthly(self):
        """Monthly (default freq) ABRET with abret=4 and window=36."""
        # One (RET, ABRET) pair per PERMNO. An earlier recording expected ABRET of
        # -0.01320320400122417 (10516) and -0.08677181967190316 (10517).
        pair_10516 = (-0.03092783503234386, -0.01211808891565103)
        pair_10517 = (-0.07876712083816527, -0.08399250241224895)
        expected = pd.DataFrame(
            [
                (10516, 'a', Timestamp('2000-01-01 00:00:00'), 1.01) + pair_10516,
                (10516, 'a', Timestamp('2000-01-02 00:00:00'), 1.02) + pair_10516,
                (10516, 'a', Timestamp('2000-01-03 00:00:00'), 1.03) + pair_10516,
                (10516, 'a', Timestamp('2000-01-04 00:00:00'), 1.04) + pair_10516,
                (10516, 'b', Timestamp('2000-01-01 00:00:00'), 1.05) + pair_10516,
                (10516, 'b', Timestamp('2000-01-02 00:00:00'), 1.06) + pair_10516,
                (10516, 'b', Timestamp('2000-01-03 00:00:00'), 1.07) + pair_10516,
                (10516, 'b', Timestamp('2000-01-04 00:00:00'), 1.08) + pair_10516,
                (10517, 'a', Timestamp('2000-01-01 00:00:00'), 1.09) + pair_10517,
                (10517, 'a', Timestamp('2000-01-02 00:00:00'), 1.1) + pair_10517,
                (10517, 'a', Timestamp('2000-01-03 00:00:00'), 1.11) + pair_10517,
                (10517, 'a', Timestamp('2000-01-04 00:00:00'), 1.12) + pair_10517,
            ],
            columns=['PERMNO', 'byvar', 'Date', 'RET_old', 'RET', 'ABRET'],
        )

        actual = self.crsp.pull_crsp(
            self.input_data.df_datetime,
            get=['RET'],
            other_byvars='byvar',
            abret=4,
            window=36,
        )

        assert_frame_equal(expected, actual)

    def test_get_abret_0_3_cumretfirst_dropfirst_daily(self):
        """Daily pull with time=[0,3], cumret='first', abret=1, window=30, drop_first=True.

        The expected frame holds RET3, ABRET3, cum_RET3 and cum_ABRET3 (no plain
        RET / ABRET columns); dtypes are not compared.
        """
        # Trailing (RET3, ABRET3, cum_RET3, cum_ABRET3) values. Within a PERMNO the
        # first two dates share one tuple; 10516 repeats its tuples for 'a' and 'b'.
        tail_10516_jan1_2 = (-0.015789473429322243, -0.0157200164855561,
                             -0.026041666719972656, -0.023104568060886788)
        tail_10516_jan3 = (0.005347593687474728, 0.00575376791862503,
                           -0.020833333285060984, -0.01748373846474416)
        tail_10516_jan4 = (0.01595744676887989, 0.012994269572996098,
                           -0.005208333323095782, -0.0047166573025027025)
        tail_10517_jan1_2 = (-0.007117437664419413, -0.0065582128902672565,
                             -0.010638297703057686, 0.01488729250752785)
        tail_10517_jan3 = (0.00358422938734293, 0.008033133923996294,
                           -0.0070921984149733275, 0.023040018046002864)
        tail_10517_jan4 = (0.0, -0.011673811100752163,
                           -0.0070921984149733275, 0.011097242126823836)
        expected = pd.DataFrame(
            [
                (10516, 'a', Timestamp('2000-01-01 00:00:00'), 1.01) + tail_10516_jan1_2,
                (10516, 'a', Timestamp('2000-01-02 00:00:00'), 1.02) + tail_10516_jan1_2,
                (10516, 'a', Timestamp('2000-01-03 00:00:00'), 1.03) + tail_10516_jan3,
                (10516, 'a', Timestamp('2000-01-04 00:00:00'), 1.04) + tail_10516_jan4,
                (10516, 'b', Timestamp('2000-01-01 00:00:00'), 1.05) + tail_10516_jan1_2,
                (10516, 'b', Timestamp('2000-01-02 00:00:00'), 1.06) + tail_10516_jan1_2,
                (10516, 'b', Timestamp('2000-01-03 00:00:00'), 1.07) + tail_10516_jan3,
                (10516, 'b', Timestamp('2000-01-04 00:00:00'), 1.08) + tail_10516_jan4,
                (10517, 'a', Timestamp('2000-01-01 00:00:00'), 1.09) + tail_10517_jan1_2,
                (10517, 'a', Timestamp('2000-01-02 00:00:00'), 1.1) + tail_10517_jan1_2,
                (10517, 'a', Timestamp('2000-01-03 00:00:00'), 1.11) + tail_10517_jan3,
                (10517, 'a', Timestamp('2000-01-04 00:00:00'), 1.12) + tail_10517_jan4,
            ],
            columns=['PERMNO', 'byvar', 'Date', 'RET_old', 'RET3',
                     'ABRET3', 'cum_RET3', 'cum_ABRET3'],
        )

        actual = self.crsp.pull_crsp(
            self.input_data.df_datetime,
            freq='d',
            get=['RET'],
            other_byvars='byvar',
            time=[0,3],
            cumret='first',
            abret=1,
            window=30,
            drop_first=True,
        )

        assert_frame_equal(expected, actual, check_dtype=False)

    def test_on_ticker_get_cumret_between_time_0_3_dropfirst(self):
        """Pull keyed on TICKER (coid='TICKER') with cumret='between' and drop_first=True.

        The expected frame gains a PERMNO column, which is NaN on some 'AAN'
        rows; dtypes are not compared.
        """
        # All eight 'ADM' rows end in the same (TICKER, PERMNO, RET3, cum_RET3).
        adm_tail = ('ADM', 10516.0, -0.036363635212183, -0.15005286684689223)
        expected = pd.DataFrame(
            [
                ('a', Timestamp('2000-01-01 00:00:00')) + adm_tail,
                ('a', Timestamp('2000-01-02 00:00:00')) + adm_tail,
                ('a', Timestamp('2000-01-03 00:00:00')) + adm_tail,
                ('a', Timestamp('2000-01-04 00:00:00')) + adm_tail,
                ('b', Timestamp('2000-01-01 00:00:00')) + adm_tail,
                ('b', Timestamp('2000-01-02 00:00:00')) + adm_tail,
                ('b', Timestamp('2000-01-03 00:00:00')) + adm_tail,
                ('b', Timestamp('2000-01-04 00:00:00')) + adm_tail,
                ('a', Timestamp('2008-01-01 00:00:00'), 'AAN', nan, nan, nan),
                ('a', Timestamp('2009-01-02 00:00:00'), 'AAN', nan, nan, nan),
                ('a', Timestamp('2010-01-03 00:00:00'), 'AAN', 78049.0, nan, nan),
                ('a', Timestamp('2011-01-04 00:00:00'), 'AAN', 10517.0,
                 0.13525237143039703, 0.5010296116623514),
            ],
            columns=['byvar', 'Date', 'TICKER', 'PERMNO', 'RET3', 'cum_RET3'],
        )

        actual = self.crsp.pull_crsp(
            self.input_data.ticker_df,
            coid='TICKER',
            get='RET',
            cumret='between',
            time=[0,3],
            other_byvars='byvar',
            drop_first=True,
        )

        assert_frame_equal(expected, actual, check_dtype=False)
        
class TestLoadAndMergeCompustat(DataFrameTest):
    """Compare dero.data.load_and_merge_compustat output with recorded expected frames.

    The input frames df_gvkey_str and df_gvkey_num are read from self and are
    not defined in this class, so they come from DataFrameTest.
    NOTE(review): presumably they differ only in holding GVKEY as strings
    versus numbers - confirm against DataFrameTest.
    """

    def test_freq_a(self):
        """freq='a': both input frames must merge to the same sale / capx frame."""
        nat_delta = numpy.timedelta64('NaT','ns')
        nat_date = numpy.datetime64('NaT')

        expected = pd.DataFrame(
            [
                ('001076', Timestamp('1995-03-01 00:00:00'), Timestamp('1994-03-31 00:00:00'),
                 185.18400000000003, 112.70299999999999),
                ('001076', Timestamp('1995-04-01 00:00:00'), Timestamp('1995-03-31 00:00:00'),
                 228.892, 113.575),
                ('001722', Timestamp('2012-01-01 00:00:00'), Timestamp('2011-06-30 00:00:00'),
                 80676.0, 1247.0),
                ('001722', Timestamp('2012-07-01 00:00:00'), Timestamp('2012-06-30 00:00:00'),
                 89038.0, 1477.0),
                # The last two expected rows are mostly NaT placeholders.
                ('001722', nat_delta, nat_delta, nat_delta, nat_delta),
                (nat_date, numpy.datetime64('2012-01-01T00:00:00.000000000'), nat_date,
                 nat_date, nat_date),
            ],
            columns=['GVKEY', 'Date', 'datadate', 'sale', 'capx'],
        )

        from_str = dero.data.load_and_merge_compustat(
            self.df_gvkey_str, get=['sale','capx'], freq='a', gvkeyvar='GVKEY', debug=True)

        from_num = dero.data.load_and_merge_compustat(
            self.df_gvkey_num, get=['sale','capx'], freq='a', gvkeyvar='GVKEY', debug=True)

        # dtypes are deliberately not compared.
        assert_frame_equal(expected, from_str, check_dtype=False)
        assert_frame_equal(expected, from_num, check_dtype=False)

    def test_freq_q(self):
        """freq='q': same request, but the expected value columns are saleq / capxq."""
        nat_delta = numpy.timedelta64('NaT','ns')
        nat_date = numpy.datetime64('NaT')

        expected = pd.DataFrame(
            [
                ('001076', Timestamp('1995-03-01 00:00:00'), Timestamp('1994-12-31 00:00:00'),
                 56.511, 21.96799999999999),
                ('001076', Timestamp('1995-04-01 00:00:00'), Timestamp('1995-03-31 00:00:00'),
                 59.551, 29.421000000000006),
                ('001722', Timestamp('2012-01-01 00:00:00'), Timestamp('2011-12-31 00:00:00'),
                 23306.0, 409.0),
                ('001722', Timestamp('2012-07-01 00:00:00'), Timestamp('2012-06-30 00:00:00'),
                 22675.0, 284.0),
                # The last two expected rows are mostly NaT placeholders.
                ('001722', nat_delta, nat_delta, nat_delta, nat_delta),
                (nat_date, numpy.datetime64('2012-01-01T00:00:00.000000000'), nat_date,
                 nat_date, nat_date),
            ],
            columns=['GVKEY', 'Date', 'datadate', 'saleq', 'capxq'],
        )

        from_str = dero.data.load_and_merge_compustat(
            self.df_gvkey_str, get=['sale','capx'], freq='q', gvkeyvar='GVKEY', debug=True)

        from_num = dero.data.load_and_merge_compustat(
            self.df_gvkey_num, get=['sale','capx'], freq='q', gvkeyvar='GVKEY', debug=True)

        # dtypes are deliberately not compared.
        assert_frame_equal(expected, from_str, check_dtype=False)
        assert_frame_equal(expected, from_num, check_dtype=False)



Overwriting test_data.py


# End