### Import Packages

In [1]:
import os
import pyodbc
import numpy as np
import pandas as pd
import seaborn as sns
from datetime import datetime
import matplotlib.pyplot as plt
%matplotlib inline

### Import Data

#### `IM_I_IDWBRRADM_S.dbo.RA_BMRK_RTN`

In [2]:
# Open the ODBC connection used by the query cell below. The connection string
# is assembled from adjacent string literals (one key=value pair per line)
# rather than backslash continuations inside a single literal, which would
# embed each line's leading indentation into the string handed to the driver.
# Trusted_Connection=yes means no credentials are stored in the notebook.
cnxn = pyodbc.connect(
    'Driver={SQL Server Native Client 11.0};'
    'Server=lasr-sqldb-prd-im,17001;'
    'Database=IM_I_IDWBRRADM_S;'
    'Trusted_Connection=yes;'
)

In [3]:
# Pull benchmark id, posting date, return-period type code and return
# percentage from RA_BMRK_RTN, restricted to crncyuid = 1 and rows whose
# rtnpct is not null. The query has no ORDER BY, so the row order of the
# resulting frame is whatever the server happens to return.
sql_query = """

select
    raportbenchmarkid,
    postdt,
    rtnperiodtypcd,
    rtnpct
from 
    IM_I_IDWBRRADM_S.dbo.RA_BMRK_RTN  
where
    crncyuid = 1 and rtnpct is not null

    
"""

# Load the result set into a DataFrame. The connection is closed in `finally`
# so it is released even when the query fails part-way through.
try:
    data = pd.read_sql(sql_query, cnxn)
finally:
    # Close the connection with LASR
    cnxn.close()

data.head()

Unnamed: 0,raportbenchmarkid,postdt,rtnperiodtypcd,rtnpct
0,32467,2014-06-30,3MO,101.947973
1,180688,1999-12-31,LIFEPRD,83.770256
2,32436,2010-09-30,3MO,100.0
3,32210,1999-05-28,3MO,108.792862
4,11682,2001-02-28,LIFEPRD,94.918007


In [9]:
# Disabled on purpose: un-commenting this writes the entire raw extract held in
# `data` to a CSV file in the working directory.
#data.to_csv('rawdata_RA_BMRK_RTN.csv')

### Exploratory Data Analysis

In [5]:
# Summarize the raw extract: row count, column dtypes and memory footprint.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 38101477 entries, 0 to 38101476
Data columns (total 4 columns):
raportbenchmarkid    int64
postdt               datetime64[ns]
rtnperiodtypcd       object
rtnpct               float64
dtypes: datetime64[ns](1), float64(1), int64(1), object(1)
memory usage: 1.1+ GB


In [6]:
# (rows, columns) of the raw extract, before any preprocessing.
data.shape

(38101477, 4)

In [7]:
# Column names exactly as returned by the query.
data.columns.values

array(['raportbenchmarkid', 'postdt', 'rtnperiodtypcd', 'rtnpct'],
      dtype=object)

### Data Preprocessing

In [8]:
# Snap every posting date to the first day of its month so that all rows from
# the same calendar month share one key. This is done with vectorized datetime
# arithmetic (subtract "day-of-month - 1" days) instead of a per-row Python
# lambda, which is very slow on tens of millions of rows. Any time-of-day
# component is preserved and missing dates stay NaT, matching the result of
# calling `dt.replace(day=1)` on each value.
days_into_month = pd.to_timedelta(data['postdt'].dt.day - 1, unit='D')
data['postdt_new'] = data['postdt'] - days_into_month
data.head()

Unnamed: 0,raportbenchmarkid,postdt,rtnperiodtypcd,rtnpct,postdt_new
0,32467,2014-06-30,3MO,101.947973,2014-06-01
1,180688,1999-12-31,LIFEPRD,83.770256,1999-12-01
2,32436,2010-09-30,3MO,100.0,2010-09-01
3,32210,1999-05-28,3MO,108.792862,1999-05-01
4,11682,2001-02-28,LIFEPRD,94.918007,2001-02-01


In [10]:
# Shape check after adding postdt_new: same rows, one more column.
data.shape

(38101477, 5)

In [11]:
# Confirm postdt_new now appears among the columns.
data.columns.values

array(['raportbenchmarkid', 'postdt', 'rtnperiodtypcd', 'rtnpct',
       'postdt_new'], dtype=object)

In [12]:
# Keep a single row per (benchmark, return-period type, month). The result is
# assigned back to `data` rather than mutating the frame with inplace=True, so
# the step reads as an explicit transformation and can be chained.
# NOTE: keep='first' retains whichever duplicate comes first in the frame; the
# source query has no ORDER BY, so which row survives is not deterministic.
dedupe_keys = ['raportbenchmarkid', 'rtnperiodtypcd', 'postdt_new']
data = data.drop_duplicates(subset=dedupe_keys, keep='first')

In [13]:
# Shape after de-duplication; compare with the previous shape to see how many
# rows were dropped.
data.shape

(38087635, 5)

In [14]:
# Build the summary table: for every (month, return-period type) pair, count
# the non-null raportbenchmarkid values and expose that count as the column
# 'count_raportbenchmarkid'.
group_keys = ['postdt_new', 'rtnperiodtypcd']
benchmark_counts = data.groupby(group_keys)['raportbenchmarkid'].count()
new_df = benchmark_counts.reset_index(name='count_raportbenchmarkid')
new_df.head()

Unnamed: 0,postdt_new,rtnperiodtypcd,count_raportbenchmarkid
0,1958-12-01,12MO,1
1,1958-12-01,ANN,1
2,1958-12-01,LIFE,8
3,1958-12-01,LIFEPRD,8
4,1958-12-01,YTD,1


In [15]:
# Number of (month, return-period type) groups in the summary, and its columns.
new_df.shape

(11182, 3)

In [16]:
# Persist the summary table to the working directory. The frame's index is
# written as well (to_csv default), so the file has a leading unnamed column.
summary_csv_path = 'summary_RA_BMRK_RTN.csv'
new_df.to_csv(summary_csv_path)

In [None]:
# Disabled scratch check: un-commenting this lists the distinct postdt_new
# values in the summary, newest first.
#sorted(new_df.postdt_new.unique(), reverse=True)