### Import Packages

In [1]:
import os
import pyodbc
import numpy as np
import pandas as pd
import seaborn as sns
from datetime import datetime
import matplotlib.pyplot as plt
%matplotlib inline

### Import Data

#### Connect to LASR

In [2]:
# cnxn = pyodbc.connect('Driver={SQL Server Native Client 11.0};\
#                        Server=lasr-sqldb-prd-im,17001;\
#                        Database=IM_I_IDWBRRESULTS_S;\
#                        Trusted_Connection=yes;')

In [3]:
# sql_query = """
# select
# *
# from
# (
# select
# raportbenchmarkid, 
# postdt, 
# rtnperiodtypcd, 
# rtnpct
# from IM_I_IDWBRRADM_S.dbo.RA_BMRK_RTN
# where  rtnperiodtypcd in ('1MO', '3MO', '6MO', '12MO') and crncyuid = 1 and rtnpct is not null 
# ) as A
# left join
#   (
#   select
#     A.raportbenchmarkid, 
#     A.bmrkrpttypnm, 
#     A.portbmrkprioritynum, 
#     A.invportid,
#        case
#          when A.INVPORTID = C.AGGRID then 1
#          else 0
#        end as has_aggrid_flag
# from 
#     IM_M_RADM_S.dbo.vwRaPortBenchmark as A
# left join
#      (
#     select
#     distinct B.AGGRID
#     from IM_I_IDWBRRESULTS_S.dbo.MA_PP_PFMHIST as B
#     where B.incltyp = 'I' and B.crncyid = 1
#     ) as C
# on 
#     A.INVPORTID = C.AGGRID
#   ) as B
# on A.raportbenchmarkid = B.raportbenchmarkid
    
# """

# data = pd.read_sql(sql_query, cnxn)
# data.head()

In [4]:
# Later cells read their CSV files by bare file name, so the working directory
# has to be the data folder. Only switch when we are not already there:
# re-running this cell must not try to descend into a non-existent data/data.
if os.path.basename(os.getcwd()) != 'data':
    os.chdir('data')

In [5]:
# Read benchmark_data.csv from the current working directory into a DataFrame.
data = pd.read_csv('benchmark_data.csv')
# Preview the first rows as the cell output.
data.head()

Unnamed: 0.1,Unnamed: 0,raportbenchmarkid,postdt,rtnperiodtypcd,rtnpct,raportbenchmarkid.1,bmrkrpttypnm,portbmrkprioritynum,invportid,has_aggrid_flag
0,0,180482,1997-11-28,3MO,110.634502,180482,Override,2,117031069.0,1
1,1,15663,2004-02-27,3MO,110.706567,15663,Both,1,143058113.0,1
2,2,11995,1990-06-29,1MO,103.455464,11995,Both,2,109134611.0,1
3,3,25039,2013-09-30,1MO,107.189345,25039,Both,1,206221840.0,1
4,4,25402,1997-11-28,3MO,101.880439,25402,Both,1,180239834.0,1


### Exploratory Data Analysis

In [6]:
# (rows, columns) of the loaded frame.
data.shape

(13107544, 10)

In [7]:
# Column names of the loaded frame as a NumPy array.
data.columns.values

array(['Unnamed: 0', 'raportbenchmarkid', 'postdt', 'rtnperiodtypcd',
       'rtnpct', 'raportbenchmarkid.1', 'bmrkrpttypnm',
       'portbmrkprioritynum', 'invportid', 'has_aggrid_flag'],
      dtype=object)

In [8]:
# Distinct values found in the rtnperiodtypcd column.
data.rtnperiodtypcd.unique()

array(['3MO', '1MO', '12MO', '6MO'], dtype=object)

#### Check which Benchmarks have multiple period types

In [None]:
#data.groupby(['raportbenchmarkid', 'postdt'])['rtnperiodtypcd'].count()

#### Examples

In [None]:
#data[(data['raportbenchmarkid'] == 1) & (data['postdt'] == '1976-12-31')]

In [None]:
#data[(data['raportbenchmarkid'] == 1) & (data['postdt'] == '1993-03-31')]

In [None]:
#data[(data['raportbenchmarkid'] == 1) & (data['postdt'] == '1993-10-29')]

### Data Preprocessing

In [9]:
# Convert the year-month-day strings in postdt into datetime values.
data['postdt'] = pd.to_datetime(data['postdt'], format='%Y-%m-%d')

# Numeric rank per period code; codes missing from the mapping become NaN.
period_rank = {'1MO': 1, '3MO': 2, '6MO': 3, '12MO': 4}
data['priority'] = data['rtnperiodtypcd'].map(period_rank)

data.head()

Unnamed: 0.1,Unnamed: 0,raportbenchmarkid,postdt,rtnperiodtypcd,rtnpct,raportbenchmarkid.1,bmrkrpttypnm,portbmrkprioritynum,invportid,has_aggrid_flag,priority
0,0,180482,1997-11-28,3MO,110.634502,180482,Override,2,117031069.0,1,2
1,1,15663,2004-02-27,3MO,110.706567,15663,Both,1,143058113.0,1,2
2,2,11995,1990-06-29,1MO,103.455464,11995,Both,2,109134611.0,1,1
3,3,25039,2013-09-30,1MO,107.189345,25039,Both,1,206221840.0,1,1
4,4,25402,1997-11-28,3MO,101.880439,25402,Both,1,180239834.0,1,2


In [10]:
# main_df = pd.pivot_table(data, columns=['raportbenchmarkid', 'postdt','rtnperiodtypcd', 'bmrkrpttypnm', 
#                                         'portbmrkprioritynum', 'invportid', 'has_aggrid_flag'], 
#                                values=['priority'], 
#                                aggfunc=np.min).reset_index()

# main_df.head()

In [11]:
# Column names after the priority column has been added.
data.columns.values

array(['Unnamed: 0', 'raportbenchmarkid', 'postdt', 'rtnperiodtypcd',
       'rtnpct', 'raportbenchmarkid.1', 'bmrkrpttypnm',
       'portbmrkprioritynum', 'invportid', 'has_aggrid_flag', 'priority'],
      dtype=object)

### Groupby: Keep the Minimum-Priority Rows per Benchmark and Date

In [12]:
# Smallest priority value for each (benchmark id, posting date) pair.
group_keys = ['raportbenchmarkid', 'postdt']
left_df = data.groupby(group_keys)['priority'].min().reset_index()

# Join back onto the full data to recover every row carrying that minimum.
final_df = left_df.merge(data, how='left', on=group_keys + ['priority'])
final_df.head()

Unnamed: 0.1,raportbenchmarkid,postdt,priority,Unnamed: 0,rtnperiodtypcd,rtnpct,raportbenchmarkid.1,bmrkrpttypnm,portbmrkprioritynum,invportid,has_aggrid_flag
0,1,1976-12-31,4,7638647,12MO,123.64,1,Mgr Rpts,1,10600.0,1
1,1,1977-12-30,4,7647287,12MO,92.8,1,Mgr Rpts,1,10600.0,1
2,1,1978-12-29,4,7571095,12MO,106.41,1,Mgr Rpts,1,10600.0,1
3,1,1979-12-31,4,7485879,12MO,118.19,1,Mgr Rpts,1,10600.0,1
4,1,1980-12-31,4,7663575,12MO,131.48,1,Mgr Rpts,1,10600.0,1


In [13]:
# Remove the leftover CSV index column and the duplicated join key in place.
final_df.drop(columns=['Unnamed: 0', 'raportbenchmarkid.1'], inplace=True)
# Move postdt out of the columns and into the index.
final_df.set_index(keys='postdt', inplace=True)

In [14]:
# Turn the postdt index back into a regular column (it becomes the first column).
final_df = final_df.reset_index()

In [15]:
# Preview the first rows of final_df.
final_df.head()

Unnamed: 0,postdt,raportbenchmarkid,priority,rtnperiodtypcd,rtnpct,bmrkrpttypnm,portbmrkprioritynum,invportid,has_aggrid_flag
0,1976-12-31,1,4,12MO,123.64,Mgr Rpts,1,10600.0,1
1,1977-12-30,1,4,12MO,92.8,Mgr Rpts,1,10600.0,1
2,1978-12-29,1,4,12MO,106.41,Mgr Rpts,1,10600.0,1
3,1979-12-31,1,4,12MO,118.19,Mgr Rpts,1,10600.0,1
4,1980-12-31,1,4,12MO,131.48,Mgr Rpts,1,10600.0,1


In [None]:
#final_df.to_csv('final_df_0917_v3.csv')

### Make the Final Dataframe

In [16]:
# Column names of final_df as a NumPy array.
final_df.columns.values

array(['postdt', 'raportbenchmarkid', 'priority', 'rtnperiodtypcd',
       'rtnpct', 'bmrkrpttypnm', 'portbmrkprioritynum', 'invportid',
       'has_aggrid_flag'], dtype=object)

In [17]:
# Columns carried into the expansion step, in the order the loop unpacks them.
selected_columns = [
    'raportbenchmarkid',
    'rtnperiodtypcd',
    'postdt',
    'bmrkrpttypnm',
    'portbmrkprioritynum',
    'invportid',
    'has_aggrid_flag',
]
test_df = final_df[selected_columns]

In [18]:
# Preview the first rows of test_df.
test_df.head()

Unnamed: 0,raportbenchmarkid,rtnperiodtypcd,postdt,bmrkrpttypnm,portbmrkprioritynum,invportid,has_aggrid_flag
0,1,12MO,1976-12-31,Mgr Rpts,1,10600.0,1
1,1,12MO,1977-12-30,Mgr Rpts,1,10600.0,1
2,1,12MO,1978-12-29,Mgr Rpts,1,10600.0,1
3,1,12MO,1979-12-31,Mgr Rpts,1,10600.0,1
4,1,12MO,1980-12-31,Mgr Rpts,1,10600.0,1


In [19]:
# Print index range, column dtypes and memory usage of test_df.
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3484212 entries, 0 to 3484211
Data columns (total 7 columns):
raportbenchmarkid      int64
rtnperiodtypcd         object
postdt                 datetime64[ns]
bmrkrpttypnm           object
portbmrkprioritynum    int64
invportid              float64
has_aggrid_flag        int64
dtypes: datetime64[ns](1), float64(1), int64(3), object(2)
memory usage: 186.1+ MB


In [20]:
# Expand every row of test_df into one row per month covered by its return
# period: a 12MO row becomes 12 month-start dates ending in the month of
# postdt, 6MO becomes 6, 3MO becomes 3 and 1MO becomes the single month start.
# The results are collected in parallel lists, one list per output column.

# Number of monthly rows each period code expands into.
periods_by_code = {'1MO': 1, '3MO': 3, '6MO': 6, '12MO': 12}

# The month-start dates depend only on (postdt, number of periods), so each
# combination is computed once and reused instead of calling pd.date_range
# again for every single row.
month_starts_cache = {}

new_raportbenchmarkid_list = []
new_date_list = []
new_rtntyp_list = []
new_bmrkrpttypnm_list = []
new_portbmrkprioritynum_list = []
new_invportid_list = []
new_hasaggrid_list = []

for (raportbenchmarkid, rtnperiodtypcd, postdt, bmrkrpttypnm,
     portbmrkprioritynum, invportid, has_aggrid_flag) in test_df.itertuples(index=False):
    n_periods = periods_by_code.get(rtnperiodtypcd)
    if n_periods is None:
        # Unknown period code: report it and leave the row out of the output.
        print("Fail")
        continue

    cache_key = (postdt, n_periods)
    month_starts = month_starts_cache.get(cache_key)
    if month_starts is None:
        # freq='MS' yields month-start dates; with end=postdt the last entry
        # is the first day of postdt's own month.
        month_starts = list(pd.date_range(end=postdt, periods=n_periods, freq='MS'))
        month_starts_cache[cache_key] = month_starts

    # Repeat the row's attributes once per generated month so that all the
    # lists stay aligned position by position.
    n_rows = len(month_starts)
    new_date_list.extend(month_starts)
    new_raportbenchmarkid_list.extend([raportbenchmarkid] * n_rows)
    new_rtntyp_list.extend([rtnperiodtypcd] * n_rows)
    new_bmrkrpttypnm_list.extend([bmrkrpttypnm] * n_rows)
    new_portbmrkprioritynum_list.extend([portbmrkprioritynum] * n_rows)
    new_invportid_list.extend([invportid] * n_rows)
    new_hasaggrid_list.extend([has_aggrid_flag] * n_rows)

In [21]:
# Assemble the expanded lists into one frame; `columns` pins the column order.
new_df = pd.DataFrame(
    {
        'raportbenchmarkid_new': new_raportbenchmarkid_list,
        'postdt_new': new_date_list,
        'rtntypcd_new': new_rtntyp_list,
        'bmrkrpttypnm_new': new_bmrkrpttypnm_list,
        'portbmrkprioritynum_new': new_portbmrkprioritynum_list,
        'invportid_new': new_invportid_list,
        'hasaggrid_new': new_hasaggrid_list,
    },
    columns=[
        'raportbenchmarkid_new',
        'postdt_new',
        'rtntypcd_new',
        'bmrkrpttypnm_new',
        'portbmrkprioritynum_new',
        'invportid_new',
        'hasaggrid_new',
    ],
)
new_df.head()

Unnamed: 0,raportbenchmarkid_new,postdt_new,rtntypcd_new,bmrkrpttypnm_new,portbmrkprioritynum_new,invportid_new,hasaggrid_new
0,1,1976-01-01,12MO,Mgr Rpts,1,10600.0,1
1,1,1976-02-01,12MO,Mgr Rpts,1,10600.0,1
2,1,1976-03-01,12MO,Mgr Rpts,1,10600.0,1
3,1,1976-04-01,12MO,Mgr Rpts,1,10600.0,1
4,1,1976-05-01,12MO,Mgr Rpts,1,10600.0,1


In [22]:
# (rows, columns) of the expanded frame.
new_df.shape

(3687427, 7)

### Export to CSV

In [23]:
#new_df.to_csv('new_df_0917.csv')

In [24]:
#final_df.to_csv('final_df_0917.csv')

### Matching with Portfolio Type

In [25]:
# Read portfolio_types.csv from the current working directory.
portfolio_df = pd.read_csv('portfolio_types.csv')

In [26]:
# Lower-case the key column name in place so it matches the name used on new_df.
portfolio_df.rename(columns={'AGGRID': 'aggrid'}, inplace=True)
portfolio_df.head()

Unnamed: 0,portfoliouid,InvestmentPortfolioTypeCode,aggrid
0,195700,AM,159317735.0
1,109349,GM,87279079.0
2,86400,FL,69160435.0
3,9893,AT,7771398.0
4,80719,GA,64706795.0


In [27]:
# Column names of new_df as a NumPy array.
new_df.columns.values

array(['raportbenchmarkid_new', 'postdt_new', 'rtntypcd_new',
       'bmrkrpttypnm_new', 'portbmrkprioritynum_new', 'invportid_new',
       'hasaggrid_new'], dtype=object)

In [28]:
# Rename invportid_new in place to the join key name shared with portfolio_df.
new_df.rename(columns={'invportid_new': 'aggrid'}, inplace=True)

In [29]:
# Preview new_df after the key column rename.
new_df.head()

Unnamed: 0,raportbenchmarkid_new,postdt_new,rtntypcd_new,bmrkrpttypnm_new,portbmrkprioritynum_new,aggrid,hasaggrid_new
0,1,1976-01-01,12MO,Mgr Rpts,1,10600.0,1
1,1,1976-02-01,12MO,Mgr Rpts,1,10600.0,1
2,1,1976-03-01,12MO,Mgr Rpts,1,10600.0,1
3,1,1976-04-01,12MO,Mgr Rpts,1,10600.0,1
4,1,1976-05-01,12MO,Mgr Rpts,1,10600.0,1


### Match the Dataframes

In [31]:
# Attach the portfolio attributes to every expanded row via the aggrid key.
#
# pandas matches null join keys against each other, and every repeated key on
# the right-hand side multiplies the matching left rows. Either effect can
# inflate a left join far beyond the size of new_df, so the lookup side is
# cleaned before merging.
# NOTE(review): presumably this row blow-up is what raised the MemoryError
# recorded for this cell - confirm against the contents of portfolio_df.
portfolio_lookup = (
    portfolio_df
    .dropna(subset=['aggrid'])  # a row without a key can never be a real match
    .drop_duplicates()          # fully identical rows only duplicate output rows
)

final_df = pd.merge(new_df, portfolio_lookup, on='aggrid', how='left')
final_df.head()

MemoryError: 