### Import Packages

In [1]:
import os
import pyodbc
import numpy as np
import pandas as pd
import seaborn as sns
from datetime import datetime
import matplotlib.pyplot as plt
%matplotlib inline

### Import Data

#### `IM_I_IDWBRRESULTS_S.dbo.MA_PP_PFMHIST`

In [2]:
# Open an ODBC connection to the IM_I_IDWBRRESULTS_S database. The string
# carries no user name or password; it relies on Trusted_Connection=yes.
# The backslash continuations are inside the string literal, so the leading
# spaces of each continued line are part of the connection string itself.
# NOTE(review): no visible cell closes `cnxn` - consider cnxn.close() once
# the query result has been loaded.
cnxn = pyodbc.connect('Driver={SQL Server Native Client 11.0};\
                       Server=lasr-sqldb-prd-im,17001;\
                       Database=IM_I_IDWBRRESULTS_S;\
                       Trusted_Connection=yes;')

In [None]:
# Query MA_PP_PFMHIST for aggrid, postdt, incltyp, rtntyp and the four
# totrtn*mo columns. The WHERE clause keeps only rows with incltyp = 'I' and
# crncyid = 1, so incltyp is constant in the result and crncyid is filtered
# on but not returned.
sql_query = """
select 
    aggrid, postdt, incltyp, rtntyp,
    totrtn1mo, totrtn3mo, totrtn6mo, totrtn12mo
from 
    IM_I_IDWBRRESULTS_S.dbo.MA_PP_PFMHIST
where 
    incltyp = 'I' and crncyid = 1

    
"""

# Run the query over the open connection and load the full result set into
# a DataFrame; the last line displays its first rows.
data = pd.read_sql(sql_query, cnxn)
data.head()

### Exploratory Data Analysis

In [None]:
# Distinct post dates in ascending order; the smallest and largest bound the
# period covered by the query result.
date_range = sorted(data['postdt'].unique())
start_date, end_date = min(date_range), max(date_range)
print("StartDate:", start_date)
print("EndDate:", end_date)

In [None]:
# (rows, columns) of the query result.
data.shape

In [None]:
# Per-column dtype and non-null count, plus memory usage.
data.info()

In [None]:
# Summary statistics (describe() covers the numeric columns by default).
data.describe()

### Data Preprocessing

In [None]:
def align_values(row):
    """Return the total-return field of `row` selected by its 'rtntyp' code.

    'M' reads 'totrtn1mo', 'Q' reads 'totrtn3mo', 'S' reads 'totrtn6mo' and
    'A' reads 'totrtn12mo'. Any other code yields NaN.

    Parameters
    ----------
    row : mapping-like (e.g. a pandas Series from ``DataFrame.apply(axis=1)``)
        Must provide 'rtntyp' and the totrtn*mo key for that code.

    Returns
    -------
    The value stored under the matching key, or ``np.nan`` for an
    unrecognised code.
    """
    column_by_rtntyp = {
        'M': 'totrtn1mo',
        'Q': 'totrtn3mo',
        'S': 'totrtn6mo',
        'A': 'totrtn12mo',
    }
    column = column_by_rtntyp.get(row['rtntyp'])
    return np.nan if column is None else row[column]

In [None]:
# Data cleaning
# Pick the return column that matches each row's rtntyp, keep only the four
# columns used downstream, order by post date, and drop rows with any NULL.
wanted_columns = ['aggrid', 'postdt', 'rtntyp', 'rtntyp_values']
data = (
    data
    .assign(rtntyp_values=data.apply(align_values, axis=1))
    .loc[:, wanted_columns]
    .sort_values('postdt')
    .dropna(axis=0)
)

In [None]:
# # Not using the 'rtntyp_values' column
# new_df = data[['aggrid','postdt', 'rtntyp']]

# Plain assignment: new_df is a second name for the same DataFrame object as
# `data`, not a copy.
new_df = data

In [None]:
# Number of month-start dates each rtntyp code is expanded into. These match
# the totrtn1mo / totrtn3mo / totrtn6mo / totrtn12mo column chosen for the
# code when 'rtntyp_values' was built.
months_per_rtntyp = {'M': 1, 'Q': 3, 'S': 6, 'A': 12}

new_aggrid_list = []
new_date_list = []
new_rtntyp_list = []
new_rtntyp_values_list = []

# Expand every row into one entry per month it covers. All codes go through
# the same pd.date_range call, so every generated date is a pandas Timestamp
# (the earlier 'M' branch used date.replace(day=1), which keeps whatever type
# postdt arrived as).
# NOTE(review): new_df is rebound to the expanded frame in a later cell, so
# re-running this cell after that one would expand already-expanded rows.
for aggrid, date, rtntyp, rtntyp_value in new_df.itertuples(index=False):
    n_months = months_per_rtntyp.get(rtntyp)
    if n_months is None:
        # Codes outside M/Q/S/A contribute nothing, as before.
        continue
    # With freq='MS' and end=date this yields the n_months month-starts whose
    # last element is the first day of date's own month.
    month_starts = pd.date_range(end=date, periods=n_months, freq='MS')
    new_date_list.extend(month_starts)
    new_aggrid_list.extend([aggrid] * n_months)
    new_rtntyp_list.extend([rtntyp] * n_months)
    new_rtntyp_values_list.extend([rtntyp_value] * n_months)

### Final Dataframe for Visualization

In [None]:
# Assemble the four parallel lists into a single long-format frame; the lists
# are index-aligned, so position i in each list describes the same row.
new_df = pd.DataFrame({
    'aggrid_new': new_aggrid_list,
    'postdt_new': new_date_list,
    'rtntyp_new': new_rtntyp_list,
    'rtntyp_value_new': new_rtntyp_values_list,
})
new_df.head()

In [None]:
# Last rows of the expanded frame.
new_df.tail()

#### Pivot Table for Visualization

In [None]:
# One row per rtntyp code, one column per month; each cell is the number of
# non-null rtntyp_value_new entries for that pair, with 0 where there are none.
# NOTE(review): this counts rows, not distinct aggrid_new values - if one
# aggrid can contribute several rows to the same month and code, the figure
# is larger than the number of portfolios. Confirm this is intended.
pivot_new_df = new_df.pivot_table(
    index=['rtntyp_new'],
    columns=['postdt_new'],
    values='rtntyp_value_new',
    aggfunc='count',
    fill_value=0,
)

pivot_new_df

### Visualization

In [None]:
plt.style.use('fivethirtyeight')

# Stacked area chart: x is the month (pivot columns), one layer per rtntyp
# code (pivot rows), layer height is the count in the pivot table.
fig, ax = plt.subplots(figsize=(20, 12))
ax.stackplot(pivot_new_df.columns, pivot_new_df.values, labels=pivot_new_df.index)

# Title, axis labels and legend
ax.set_title('Historical Availability of Portfolios by Frequency Type \n')
ax.set_xlabel('\n Years')
ax.set_ylabel('Count of Portfolios \n')
ax.legend(loc='upper left')
plt.show()

### Export to CSV

In [None]:
#new_df.to_csv('final_df.csv')