### Import Packages

In [1]:
import os
import pyodbc
import numpy as np
import pandas as pd
import seaborn as sns
from datetime import datetime
import matplotlib.pyplot as plt
%matplotlib inline

### Import Data

#### `IM_I_IDWBRRADM_S.dbo.RA_PORT_RTN`

In [3]:
# Query against IM_I_IDWBRRADM_S.dbo.RA_PORT_RTN.
#   * Subquery A: rows with CRNCYUID = 1 and a non-null RTNPCT, exposing
#     RTNPERIODENDDT under the alias `postdt`.
#   * Subquery B: over the same filtered rows, the minimum RAMARKETCYCLEID for
#     each (RAPORTFOLIOID, RTNPERIODENDDT, RTNPERIODTYPCD) combination.
#   * The inner join keeps only the rows of A whose RAMARKETCYCLEID equals that
#     minimum, and returns three columns: RAPORTFOLIOID, postdt, RTNPERIODTYPCD.
# NOTE(review): the business meaning of CRNCYUID = 1 is not visible here — confirm.
sql_query = """
select
A.RAPORTFOLIOID,
A.postdt,
A.RTNPERIODTYPCD
from
(
select
RAPORTFOLIOID,
RTNPERIODENDDT as postdt,
RTNPERIODTYPCD,
RAMARKETCYCLEID
from IM_I_IDWBRRADM_S.dbo.RA_PORT_RTN
where CRNCYUID = 1 and RTNPCT is not NULL
) as A
  inner join
(
select
RAPORTFOLIOID,
RTNPERIODENDDT as postdt,
RTNPERIODTYPCD,
min(RAMARKETCYCLEID) as min_cycle
from IM_I_IDWBRRADM_S.dbo.RA_PORT_RTN
where CRNCYUID = 1 and RTNPCT is not NULL
group by RAPORTFOLIOID,RTNPERIODENDDT,RTNPERIODTYPCD
) as B
on A.RAPORTFOLIOID=B.RAPORTFOLIOID and A.postdt=B.postdt and A.RTNPERIODTYPCD=B.RTNPERIODTYPCD and A.RAMARKETCYCLEID=B.min_cycle
"""

# Open the ODBC connection (Trusted_Connection=yes, so no credentials are
# embedded in the notebook).
cnxn = pyodbc.connect('Driver={SQL Server Native Client 11.0};\
                       Server=lasr-sqldb-prd-im,17001;\
                       Database=IM_I_IDWBRRADM_S;\
                       Trusted_Connection=yes;')

try:
    # Run the query and load the full result set into a DataFrame.
    data = pd.read_sql(sql_query, cnxn)
finally:
    # Always release the connection, even if the query or the fetch fails;
    # previously an exception in read_sql left the connection open.
    cnxn.close()

In [4]:
# Report the size of the loaded result set as (rows, columns), then preview
# the first five records.
n_rows, n_cols = data.shape
print((n_rows, n_cols))
data.head(5)

(17298077, 3)


Unnamed: 0,RAPORTFOLIOID,postdt,RTNPERIODTYPCD
0,326626,2013-02-28,MCTC
1,246060,2004-04-30,LIFE
2,63655,1990-12-31,LIFE
3,51412,1977-07-29,1MO
4,320613,2011-08-31,1MO


In [6]:
def _to_month_start(dt):
    """Return `dt` with its day-of-month replaced by 1 (year and month kept)."""
    return dt.replace(day=1)


# Derive a month-level key from each record's `postdt`, so that later steps
# can group records by calendar month.
data['postdt_new'] = data['postdt'].apply(_to_month_start)
data.head()

Unnamed: 0,RAPORTFOLIOID,postdt,RTNPERIODTYPCD,postdt_new
0,326626,2013-02-28,MCTC,2013-02-01
1,246060,2004-04-30,LIFE,2004-04-01
2,63655,1990-12-31,LIFE,1990-12-01
3,51412,1977-07-29,1MO,1977-07-01
4,320613,2011-08-31,1MO,2011-08-01


In [7]:
# Check that (portfolio, month, period type) identifies each record uniquely.
key_cols = ['RAPORTFOLIOID', 'postdt_new', 'RTNPERIODTYPCD']

# True for every row that repeats an earlier row's key (first occurrence kept).
dup_mask = data.duplicated(subset=key_cols, keep='first')

# Same printout as before: per-column non-null counts after removing duplicates.
print(data[~dup_mask].count())

# Enforce the check instead of relying on the reader to compare the printed
# counts with the frame's shape by eye.
assert not dup_mask.any(), (
    "{} duplicate rows found for key {}".format(int(dup_mask.sum()), key_cols)
)

RAPORTFOLIOID     17298077
postdt            17298077
RTNPERIODTYPCD    17298077
postdt_new        17298077
dtype: int64


In [10]:
# Count the non-null RAPORTFOLIOID entries within each
# (month, return-period-type) group, then flatten the group keys back into
# ordinary columns.
group_keys = ['postdt_new', 'RTNPERIODTYPCD']
portfolio_counts = data.groupby(group_keys)['RAPORTFOLIOID'].count()
df_final = portfolio_counts.reset_index()
df_final.head()

Unnamed: 0,postdt_new,RTNPERIODTYPCD,RAPORTFOLIOID
0,1934-01-01,1MO,1
1,1934-01-01,LIFE,1
2,1934-01-01,LIFEPRD,1
3,1934-01-01,YTD,1
4,1934-02-01,1MO,1


In [12]:
# Give the summary columns their reporting names in a single pass:
# the period-type code becomes `periodtype`, and the per-group count
# (stored under the RAPORTFOLIOID label) becomes `Portfolio_Cnt`.
column_labels = {
    'RTNPERIODTYPCD': 'periodtype',
    'RAPORTFOLIOID': 'Portfolio_Cnt',
}
df_final = df_final.rename(columns=column_labels)
print(df_final.shape)
df_final.head()

(17921, 3)


Unnamed: 0,postdt_new,periodtype,Portfolio_Cnt
0,1934-01-01,1MO,1
1,1934-01-01,LIFE,1
2,1934-01-01,LIFEPRD,1
3,1934-01-01,YTD,1
4,1934-02-01,1MO,1


In [14]:
# Persist the summary table to a CSV file in the current working directory.
# No `index` argument is passed, so pandas' default applies and the row index
# is written as the first (unnamed) column.
summary_csv = 'summary_RA_PORT_RTN.csv'
df_final.to_csv(summary_csv)