In [1]:
import os
import pyodbc
import numpy as np
import pandas as pd
import seaborn as sns
from datetime import datetime
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Open the ODBC connection that the two load cells below read from.
# No credentials are embedded: the string relies on Trusted_Connection=yes.
# NOTE(review): the backslash line-continuations sit *inside* the string
# literal, so the leading spaces of each continued line become part of the
# connection string. The captured outputs further down suggest the driver
# tolerated this, but keep it in mind if the string is ever edited.
cnxn = pyodbc.connect('Driver={SQL Server Native Client 11.0};\
                       Server=lasr-sqldb-prd-im,17001;\
                       Database=IM_I_IDWBRRADM_S;\
                       Trusted_Connection=yes;')

In [3]:
# Portfolio return keys from RA_PORT_RTN.
# Subquery A: rows with CRNCYUID = 1, a non-null RTNPCT and a period type of
#             1MO/3MO/6MO/12MO (RTNPERIODENDDT is exposed as `postdt`).
# Subquery B: the highest RAMARKETCYCLEID per (RAPORTFOLIOID, RTNPERIODENDDT,
#             RTNPERIODTYPCD), under the same CRNCYUID / RTNPCT filter.
# The inner join keeps only the A rows whose RAMARKETCYCLEID equals that maximum.
# Only the three key columns are selected; the return value itself is not pulled.
sql_query_port = """
select
A.RAPORTFOLIOID,
A.postdt,
A.RTNPERIODTYPCD
from
(
select
RAPORTFOLIOID,
RTNPERIODENDDT as postdt,
RTNPERIODTYPCD,
RAMARKETCYCLEID
from IM_I_IDWBRRADM_S.dbo.RA_PORT_RTN
where CRNCYUID = 1 and RTNPCT is not NULL
  and RTNPERIODTYPCD IN ('1MO','3MO','6MO','12MO')
) as A
  inner join
(
select
RAPORTFOLIOID,
RTNPERIODENDDT as postdt,
RTNPERIODTYPCD,
max(RAMARKETCYCLEID) as max_cycle
from IM_I_IDWBRRADM_S.dbo.RA_PORT_RTN
where CRNCYUID = 1 and RTNPCT is not NULL
group by RAPORTFOLIOID,RTNPERIODENDDT,RTNPERIODTYPCD
) as B
on A.RAPORTFOLIOID=B.RAPORTFOLIOID and A.postdt=B.postdt and A.RTNPERIODTYPCD=B.RTNPERIODTYPCD and A.RAMARKETCYCLEID=B.max_cycle
"""

# Run the query over the open connection `cnxn` and load the result set into a DataFrame.
df_port = pd.read_sql(sql_query_port, cnxn)
    
# Print (rows, columns), then show the first rows as the cell's output.
print(df_port.shape)
df_port.head()

(5833023, 3)


Unnamed: 0,RAPORTFOLIOID,postdt,RTNPERIODTYPCD
0,201589,1990-12-31,3MO
1,48056,2005-11-30,1MO
2,124740,1998-10-30,1MO
3,155800,2004-08-31,1MO
4,215213,1993-10-29,12MO


In [4]:
# Benchmark return keys per portfolio.
# VWB: portfolio -> benchmark pairs from vwRaPortBenchmark, left-joined to
#      RA_PORT_BMRK_COMPONENT on RAPORTBENCHMARKID to pick up STARTDT / ENDDT.
# BMK: RA_BMRK_RTN rows (crncyuid = 1, non-null rtnpct) whose RAMARKETCYCLEID is
#      the highest one for their (raportbenchmarkid, RTNPERIODENDDT, RTNPERIODTYPCD).
# VWB is left-joined to BMK on the benchmark id with RTNPERIODENDDT falling
# between STARTDT and ENDDT.
# NOTE(review): the final WHERE requires non-null BMK columns and non-null
# STARTDT/ENDDT, so rows without a match on the right side of either left join
# are dropped anyway - the outer joins end up behaving like inner joins.
sql_query_bmk = """
select
VWB.RAPORTFOLIOID,
VWB.RAPORTBENCHMARKID,
VWB.STARTDT,
VWB.ENDDT,
BMK.RTNPERIODENDDT,
BMK.RTNPERIODTYPCD
from
  (
	select
	VBMK.RAPORTFOLIOID,
	VBMK.RAPORTBENCHMARKID,
	BCMP.STARTDT,
	BCMP.ENDDT
	from IM_M_RADM_S.dbo.vwRaPortBenchmark as VBMK
	  left join 
		 IM_I_IDWBRRADM_S.dbo.RA_PORT_BMRK_COMPONENT as BCMP
	  on VBMK.RAPORTBENCHMARKID = BCMP.RAPORTBENCHMARKID
  ) as VWB
  left join
  (
	select
	A.raportbenchmarkid,
	A.RTNPERIODENDDT,
	A.RTNPERIODTYPCD
	from IM_I_IDWBRRADM_S.dbo.RA_BMRK_RTN as A
	  inner join
		  (
		  select
		  raportbenchmarkid,
		  RTNPERIODENDDT,
		  RTNPERIODTYPCD,
		  max(RAMARKETCYCLEID) as max_RAMARKETCYCLEID
		  from IM_I_IDWBRRADM_S.dbo.RA_BMRK_RTN
		  where crncyuid = 1 and rtnpct is not null
		  group by raportbenchmarkid, RTNPERIODENDDT, RTNPERIODTYPCD
		  ) as B
	  on A.raportbenchmarkid=B.raportbenchmarkid and A.RTNPERIODENDDT=B.RTNPERIODENDDT and A.RTNPERIODTYPCD=B.RTNPERIODTYPCD and A.RAMARKETCYCLEID=B.max_RAMARKETCYCLEID
		 and A.crncyuid=1 and A.rtnpct is not NULL
  ) as BMK
on BMK.raportbenchmarkid=VWB.raportbenchmarkid and BMK.RTNPERIODENDDT between VWB.STARTDT and VWB.ENDDT
where BMK.RTNPERIODTYPCD IN ('1MO','3MO','6MO','12MO') and VWB.STARTDT is not NULL and VWB.ENDDT is not NULL and BMK.RTNPERIODENDDT is not NULL
"""

# Run the query over the open connection `cnxn` and load the result set into a DataFrame.
df_bmk = pd.read_sql(sql_query_bmk, cnxn)
# Close the connection now that both frames are loaded.
# NOTE(review): after this, re-running either load cell needs the connect cell
# to be re-run first; the close is also skipped if read_sql above raises.
cnxn.close()
    
# Print (rows, columns), then show the first rows as the cell's output.
print(df_bmk.shape)
df_bmk.head()

(11171376, 6)


Unnamed: 0,RAPORTFOLIOID,RAPORTBENCHMARKID,STARTDT,ENDDT,RTNPERIODENDDT,RTNPERIODTYPCD
0,30430,181277,2009-11-30,9999-12-31 00:00:00,2016-10-31,6MO
1,300964,179417,1900-01-01,9999-12-31 00:00:00,2001-08-31,3MO
2,182806,4754,2001-08-31,2006-03-31 00:00:00,2005-01-31,6MO
3,219327,26812,1966-12-30,9999-12-31 00:00:00,1998-12-31,1MO
4,127346,181190,2007-10-31,9999-12-31 00:00:00,2017-04-28,3MO


# Munge the data

In [None]:
def assign_hier(var1):
    """Return the hierarchy rank of a return-period type code.

    '1MO' -> 1, '3MO' -> 2, '6MO' -> 3, '12MO' -> 4.
    Any other value (unknown code, different casing, None, ...) -> 5.
    """
    ordered_codes = ('1MO', '3MO', '6MO', '12MO')
    for rank, code in enumerate(ordered_codes, start=1):
        if var1 == code:
            return rank
    # Not one of the four known codes: catch-all bucket.
    return 5

def _add_hier_and_month_start(frame, date_col):
    """Add the two derived columns to `frame` in place.

    'hier'       - rank of frame['RTNPERIODTYPCD'] as given by assign_hier.
    'postdt_new' - frame[date_col] with the day-of-month replaced by 1, so that
                   dates within the same month share one value.
    """
    frame['hier'] = frame['RTNPERIODTYPCD'].apply(assign_hier)
    frame['postdt_new'] = frame[date_col].apply(lambda d: d.replace(day=1))


# Same derivation for both datasets; only the name of the date column differs.
_add_hier_and_month_start(df_port, 'postdt')
_add_hier_and_month_start(df_bmk, 'RTNPERIODENDDT')

In [None]:
# Plain-text preview of both frames, now including the derived columns.
for preview in (df_port.head(), df_bmk.head()):
    print(preview)

In [None]:
# Key-uniqueness check. Each print shows the per-column non-null row counts
# left after dropping repeated key combinations; a key is unique only if these
# counts equal the frame's row count (compare with the .shape printed at load).
port_key = ['RAPORTFOLIOID', 'postdt_new', 'RTNPERIODTYPCD']
print(df_port.drop_duplicates(subset=port_key).count())

# The benchmark id column is named 'RAPORTBENCHMARKID' in df_bmk (upper case, as
# selected by the query). pandas column labels are case-sensitive, so the
# lower-case spelling raises KeyError.
bmk_key = ['RAPORTFOLIOID', 'RAPORTBENCHMARKID', 'postdt_new', 'RTNPERIODTYPCD']
print(df_bmk.drop_duplicates(subset=bmk_key).count())

In [None]:
# Shape of the array of distinct benchmark ids, i.e. how many there are.
# The column label is upper case in df_bmk; attribute/label lookup in pandas is
# case-sensitive, so `df_bmk.raportbenchmarkid` raises AttributeError.
df_bmk['RAPORTBENCHMARKID'].unique().shape

In [None]:
# Spot check: portfolio rows whose month-start date equals '12-01-1963'.
# NOTE(review): presumably read month-first (1 Dec 1963) when postdt_new is
# datetime-like - confirm the column dtype.
port_spot_mask = df_port['postdt_new'] == '12-01-1963'
test1 = df_port.loc[port_spot_mask]
test1

In [None]:
# Spot check: benchmark rows whose month-start date equals '12-01-1963'.
# NOTE(review): presumably read month-first (1 Dec 1963) when postdt_new is
# datetime-like - confirm the column dtype.
bmk_spot_mask = df_bmk['postdt_new'] == '12-01-1963'
test2 = df_bmk.loc[bmk_spot_mask]
test2

# Get the finest available granularity (lowest hier, i.e. shortest return period) for each RAPORTFOLIOID/postdt_new combination, in each dataset

In [None]:
# Lowest hier (shortest return period present) per portfolio and month.
port_group_keys = ['RAPORTFOLIOID', 'postdt_new']
df_port_min = (
    df_port
    .groupby(port_group_keys)['hier']
    .min()
    .reset_index()
)
print(df_port_min.shape)
df_port_min.head()

In [None]:
# Lowest hier (shortest return period present) per portfolio, benchmark and month.
# The benchmark id column in df_bmk is 'RAPORTBENCHMARKID' (upper case); grouping
# by the lower-case spelling raises KeyError because labels are case-sensitive.
bmk_group_keys = ['RAPORTFOLIOID', 'RAPORTBENCHMARKID', 'postdt_new']
df_bmk_min = df_bmk.groupby(bmk_group_keys)['hier'].min().reset_index()
print(df_bmk_min.shape)
df_bmk_min.head()

# Combine And Take Max

In [None]:
# Stack the per-portfolio and per-benchmark minima into one frame with a fresh
# 0..n-1 index. pd.concat replaces DataFrame.append, which was deprecated in
# pandas 1.4 and removed in 2.0. Columns present in only one input (the
# benchmark id) are filled with NaN for the rows coming from the other input.
df_combo = pd.concat([df_port_min, df_bmk_min], ignore_index=True)
print(df_combo.shape)
df_combo.head()

In [None]:
# Highest hier per portfolio and month across the stacked portfolio and
# benchmark rows.
combo_group_keys = ['RAPORTFOLIOID', 'postdt_new']
df_combo_max = (
    df_combo
    .groupby(combo_group_keys)['hier']
    .max()
    .reset_index()
)
print(df_combo_max.shape)
df_combo_max.head()

In [None]:
# Summary table: for every month and hier value, the number of portfolio rows
# (the count ends up in the 'RAPORTFOLIOID' column of the result).
summary_keys = ['postdt_new', 'hier']
df_final = (
    df_combo_max
    .groupby(summary_keys)['RAPORTFOLIOID']
    .count()
    .reset_index()
)
print(df_final.shape)
df_final.head()

In [None]:
# Write the summary table to a CSV file in the current working directory.
# The frame's integer index is written too, as the first (unnamed) column.
summary_csv_path = '1_1_RA_PORT_RTN_w_BMK.csv'
df_final.to_csv(summary_csv_path)