In [1]:
import os
import pyodbc
import numpy as np
import pandas as pd
from datetime import datetime
import matplotlib.pyplot as plt
%matplotlib inline

In [57]:
cnxn = pyodbc.connect('Driver={SQL Server Native Client 11.0};\
                       Server=lasr-sqldb-uat,17001;\
                       Database=IM_I_IDWBRRESULTS_S;\
                       Trusted_Connection=yes;')

In [5]:
sql_query_monthly = """
select 
aggrid,
postdt,
rtntyp,
totrtn1mo,
totrtn3mo,
totrtn6mo,
totrtn12mo
from zRed05_IM_I_IDWBRRESULTS_S.dbo.MA_PP_PFMHIST
where incltyp = 'I' and crncyid = 1
"""
# Save the data 
df_monthly = pd.read_sql(sql_query_monthly, cnxn)

In [58]:
sql_query_daily = """
select
aggrid,
DATEADD(m, DATEDIFF(m, 0, postdt), 0) as postdt_m
from zRed05_IM_I_IDWBRRESULTS_S.dbo.MA_PP_PFHIST
where INCLTYP = 'I' and TOTRTNDAY is not NULL  and CRNCYID = 1
group by aggrid, DATEADD(m, DATEDIFF(m, 0, postdt), 0)
having sum(1) >= 15    
"""
# Save the data 
df_daily = pd.read_sql(sql_query_daily, cnxn)

In [61]:
# Close the connection with LASR
cnxn.close()

ProgrammingError: Attempt to use a closed connection.

In [59]:
print(df_daily.shape)
df_daily.head()

(9042541, 2)


Unnamed: 0,aggrid,postdt_m
0,78157052,2002-02-01
1,114049604,2001-09-01
2,152142922,2001-10-01
3,23112689,2001-12-01
4,149050063,2001-11-01


In [10]:
print(df_monthly.shape)
df_monthly.head()

(17356519, 7)


Unnamed: 0,aggrid,postdt,rtntyp,totrtn1mo,totrtn3mo,totrtn6mo,totrtn12mo
0,210740437,2015-10-30,M,,,,
1,63751950,2013-11-29,M,,,,
2,65706796,2013-11-29,M,,,,
3,10230972,2007-02-28,M,,,,
4,133720008,2009-06-30,M,,,,


# Fix up the data, starting with daily

In [62]:
df_daily['rtntyp'] = 'D'
df_daily.head()

Unnamed: 0,aggrid,postdt_m,rtntyp
0,78157052,2002-02-01,D
1,114049604,2001-09-01,D
2,152142922,2001-10-01,D
3,23112689,2001-12-01,D
4,149050063,2001-11-01,D


In [13]:
def align_values(row):
    if row['rtntyp'] == 'M':
        return row['totrtn1mo']
    elif row['rtntyp'] == 'Q':
        return row['totrtn3mo']
    elif row['rtntyp'] == 'S':
        return row['totrtn6mo']
    elif row['rtntyp'] == 'A':
        return row['totrtn12mo']
    else:
        return np.nan
    
df_monthly['rtntyp_values'] = df_monthly.apply(lambda x: align_values(x), axis=1)
df_monthly = df_monthly[['aggrid', 'postdt', 'rtntyp', 'rtntyp_values']]
# Drop rows with NULL values
df_monthly = df_monthly.dropna(axis=0)
print(df_monthly.shape)
df_monthly.head()

NameError: name 'data' is not defined

In [17]:
df_monthly.drop(['rtntyp_values'], axis=1, inplace=True)
df_monthly.head()

Unnamed: 0,aggrid,postdt,rtntyp
976312,143035123,1998-02-27,M
976344,143035123,1998-03-31,M
976376,143035123,1998-04-30,M
976408,33063727,1998-05-29,M
976440,33063727,1998-07-31,M


In [28]:
#new_df = df_monthly
new_aggrid_list = []
new_date_list = []
new_rtntyp_list = []
new_hier_list = []

for aggrid, postdt, rtntyp in df_monthly.itertuples(index=False):
    if rtntyp == 'A':
        for d in pd.date_range(end=postdt, periods=12, freq='MS'):
            new_aggrid_list.append(aggrid)
            new_date_list.append(d)
            new_rtntyp_list.append(rtntyp)
            new_hier_list.append(4)
    if rtntyp == 'S':
        for d in pd.date_range(end=postdt, periods=6, freq='MS'):
            new_aggrid_list.append(aggrid)
            new_date_list.append(d)
            new_rtntyp_list.append(rtntyp)
            new_hier_list.append(3)
    if rtntyp == 'Q':
        for d in pd.date_range(end=postdt, periods=3, freq='MS'):
            new_aggrid_list.append(aggrid)
            new_date_list.append(d)
            new_rtntyp_list.append(rtntyp)
            new_hier_list.append(2)
    if rtntyp == 'M':
            new_aggrid_list.append(aggrid)
            new_date_list.append(postdt.replace(day=1))
            new_rtntyp_list.append(rtntyp)
            new_hier_list.append(1)

df_monthly_1stm = pd.DataFrame(columns=['aggrid', 'postdt', 'rtntyp'])
df_monthly_1stm['aggrid'] = new_aggrid_list
df_monthly_1stm['postdt'] = new_date_list
df_monthly_1stm['rtntyp'] = new_rtntyp_list
df_monthly_1stm['hier'] = new_hier_list

print(df_monthly_1stm.shape)
df_monthly_1stm.head()

(11023643, 4)


Unnamed: 0,aggrid,postdt,rtntyp,hier
0,143035123,1998-02-01,M,1
1,143035123,1998-03-01,M,1
2,143035123,1998-04-01,M,1
3,33063727,1998-05-01,M,1
4,33063727,1998-07-01,M,1


In [29]:
#check no duplicate records
print(df_monthly_1stm.drop_duplicates(subset=['aggrid', 'postdt'], keep='first', inplace=False).count())
#there are a small few - remove them
df_monthly_1stm.drop_duplicates(subset=['aggrid', 'postdt'], keep='first', inplace=True)
print(df_monthly_1stm.shape)
df_monthly_1stm.head()

aggrid    11023248
postdt    11023248
rtntyp    11023248
hier      11023248
dtype: int64
(11023248, 4)


Unnamed: 0,aggrid,postdt,rtntyp,hier
0,143035123,1998-02-01,M,1
1,143035123,1998-03-01,M,1
2,143035123,1998-04-01,M,1
3,33063727,1998-05-01,M,1
4,33063727,1998-07-01,M,1


In [63]:
df_daily.rename(columns={'postdt_m': 'postdt'}, inplace=True)
df_daily['hier']=0
df_daily.head()

Unnamed: 0,aggrid,postdt,rtntyp,hier
0,78157052,2002-02-01,D,0
1,114049604,2001-09-01,D,0
2,152142922,2001-10-01,D,0
3,23112689,2001-12-01,D,0
4,149050063,2001-11-01,D,0


# Combine Daily and Monthly

In [64]:
df_combo = df_monthly_1stm.append(df_daily, ignore_index=True)
print(df_combo.shape)
df_combo.head()

(20065789, 4)


Unnamed: 0,aggrid,postdt,rtntyp,hier
0,143035123,1998-02-01,M,1
1,143035123,1998-03-01,M,1
2,143035123,1998-04-01,M,1
3,33063727,1998-05-01,M,1
4,33063727,1998-07-01,M,1


In [65]:
df_left = df_combo.groupby(['aggrid', 'postdt'])['hier'].min().reset_index()
print(df_left.shape)

df_final = df_left.merge(df_combo, left_on=['aggrid', 'postdt', 'hier'], right_on=['aggrid', 'postdt', 'hier'], how='left')

df_final.drop(['hier'], axis=1, inplace=True)
print(df_final.shape)
df_final.head()

(11109250, 3)
(11109250, 3)


Unnamed: 0,aggrid,postdt,rtntyp
0,10600,1975-01-01,A
1,10600,1975-02-01,A
2,10600,1975-03-01,A
3,10600,1975-04-01,A
4,10600,1975-05-01,A


In [66]:
df_summary_out = df_final.groupby(['postdt','rtntyp'])['aggrid'].count().reset_index()
df_summary_out.rename(columns={'aggrid': 'aggrid_cnt'}, inplace=True)
print(df_summary_out.shape)
df_summary_out.head()

(2363, 3)


Unnamed: 0,postdt,rtntyp,aggrid_cnt
0,1934-01-01,M,4
1,1934-02-01,M,4
2,1934-03-01,M,4
3,1934-04-01,M,4
4,1934-05-01,M,4


In [67]:
#output the summarized version
df_summary_out.to_csv('1_1_PF_HIST_Daily_n_Monthly_summary.csv')