In [4]:
import pandas as pd
import numpy as np
import sys
sys.path.insert(0, '../../')
from copy import deepcopy
import datetime
from helpers.ipython_helpers import (
    print_full
)
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
matplotlib.style.use('ggplot')
%matplotlib inline
import seaborn as sns

In [33]:
def initialize_rating_to_ranking_map():
    """Build the lookup from rating label to ordinal ranking.

    Labels are numbered from 1 in list order, so "AAA" is 1 and "D" is 22.
    "N.M." and "SD" are then re-pointed at the ranking of "D" rather than
    keeping their own positions (23 and 24).

    Returns:
        dict: rating label (str) -> ranking (int).
    """
    ordered_ratings = [
        "AAA", "AA+", "AA", "AA-", "A+", "A", "A-",
        "BBB+", "BBB", "BBB-", "BB+", "BB", "BB-", "B+", "B", "B-",
        "CCC+", "CCC", "CCC-", "CC", "C", "D", "N.M.", "SD"
    ]

    rating_ranking_map = {
        label: position for position, label in enumerate(ordered_ratings, start=1)
    }

    # Both labels share the ranking assigned to "D".
    for alias in ('N.M.', 'SD'):
        rating_ranking_map[alias] = rating_ranking_map["D"]

    return rating_ranking_map

def initialize_broad_ranking_map(rating_to_ranking_map):
    """Collapse fine-grained rankings into five broad buckets.

    Bucket boundaries on the input ranking:
        1..7   -> 1
        8..10  -> 2
        11..13 -> 3
        14..16 -> 4
        anything else (including values <= 0) -> 5

    Args:
        rating_to_ranking_map: dict of rating label -> numeric ranking.

    Returns:
        dict: rating label -> broad bucket (1-5).
    """
    upper_bounds = (7, 10, 13, 16)

    def bucket_for(ranking):
        # Only positive rankings can land in buckets 1-4.
        if ranking > 0:
            for bucket, upper in enumerate(upper_bounds, start=1):
                if ranking <= upper:
                    return bucket
        return 5

    return {
        rating: bucket_for(ranking)
        for rating, ranking in rating_to_ranking_map.items()
    }


def initialize_windsorized_ranking_map(rating_to_ranking_map):
    """Clamp both tails of the ranking scale and shift it down by one.

    Rankings of 2 or less become 1, rankings of 17 or more become 16, and
    every ranking in between is reduced by one, giving a 1..16 scale.

    Args:
        rating_to_ranking_map: dict of rating label -> numeric ranking.

    Returns:
        dict: rating label -> windsorized ranking.
    """
    return {
        rating: 1 if ranking <= 2 else (16 if ranking >= 17 else ranking - 1)
        for rating, ranking in rating_to_ranking_map.items()
    }

def get_ranking(rating, ranking_type):
    """Look up the ranking of a rating label in one of the module-level maps.

    Args:
        rating: rating label used as the dictionary key.
        ranking_type: 'original', 'broad' or 'windsorized'; selects
            RATING_TO_RANKING_MAP, RATING_TO_BROAD_RANKING_MAP or
            RATING_TO_WINDSORIZED_RANKING_MAP respectively.

    Returns:
        The ranking stored for `rating` in the selected map.

    Raises:
        KeyError: if `rating` is not a key of the selected map.
        ValueError: if `ranking_type` is not one of the supported names.
            Previously the function fell through and returned None, which
            silently produced columns of None in callers.
    """
    if ranking_type == 'original':
        return RATING_TO_RANKING_MAP[rating]
    elif ranking_type == 'broad':
        return RATING_TO_BROAD_RANKING_MAP[rating]
    elif ranking_type == 'windsorized':
        return RATING_TO_WINDSORIZED_RANKING_MAP[rating]

    raise ValueError(
        "Unknown ranking_type {0!r}; expected 'original', 'broad' or "
        "'windsorized'".format(ranking_type)
    )
    

def transform_rating_df(df):
    """Clean the raw rating frame and attach date parts and ranking columns.

    Steps:
        * rename 'gvkey' -> 'GVKEY' and 'splticrm' -> 'issuer_rating';
        * drop rows without an issuer rating and rows rated 'Suspended';
        * add 'year' and 'month' taken from 'datadate' (which must be a
          datetime column, since the `.dt` accessor is used);
        * add 'ranking', 'windsorized_ranking' and 'broad_ranking' via
          get_ranking();
        * add 'is_investment_grade', True where 'ranking' <= 10.

    Args:
        df: raw rating DataFrame.

    Returns:
        A new DataFrame; the input frame is not modified.
    """
    rated = (
        df.rename(columns={'gvkey': 'GVKEY', 'splticrm': 'issuer_rating'})
          .dropna(subset=['issuer_rating'])
    )
    suspended_labels = rated[rated['issuer_rating'] == 'Suspended'].index
    rated = rated.drop(suspended_labels)

    rated.loc[:, "year"] = rated["datadate"].dt.year
    rated.loc[:, "month"] = rated["datadate"].dt.month

    ranking_columns = (
        ("ranking", "original"),
        ('windsorized_ranking', "windsorized"),
        ('broad_ranking', "broad"),
    )
    for column, ranking_type in ranking_columns:
        # Bind ranking_type as a default so each lambda keeps its own value.
        rated.loc[:, column] = rated["issuer_rating"].apply(
            lambda rating, kind=ranking_type: get_ranking(rating, kind)
        )

    rated.loc[:, 'is_investment_grade'] = (rated['ranking'] <= 10).values

    return rated


def merge_campbell_rating_df(campbell_df, rating_df):
    """Inner-join the rating rows with the Campbell feature rows.

    Rows are matched on company ('GVKEY') and calendar month ('year',
    'month'); rating_df is the left side of the join, so its columns come
    first and same-named non-key columns receive pandas' default '_x'
    (rating) / '_y' (campbell) suffixes.

    Args:
        campbell_df: feature DataFrame with GVKEY, year and month columns.
        rating_df: rating DataFrame with GVKEY, year and month columns.

    Returns:
        The merged DataFrame containing only rows present in both inputs.
    """
    join_keys = ['GVKEY', 'year', 'month']
    return rating_df.merge(right=campbell_df, how='inner', on=join_keys)


def transform_merged_df(df):
    """Keep complete feature rows and project the merged frame to its final schema.

    Rows missing any of the eight '*_win' feature columns are dropped. The
    result keeps the identifier, date, feature and rating columns in a fixed
    order, renaming 'datadate_x' to 'datadate' and 'conm' to 'company_name'.

    The projection is done with a single column selection instead of
    assigning each column via `.loc` onto an initially empty DataFrame,
    which relied on setting-with-enlargement behaviour and repeated the
    column names in two places.

    Args:
        df: merged rating/feature DataFrame.

    Returns:
        A new DataFrame that keeps the index labels of the retained rows.

    Raises:
        KeyError: if any required column is missing from `df`.
    """
    feature_columns = [
        'CASHMTA_win', 'EXRET_AVG_win', 'MB_win', 'NIMTA_AVG_win',
        'PRICE_win', 'RSIZE_win', 'SIGMA_win', 'TLMTA_win',
    ]
    rating_columns = [
        'issuer_rating', 'ranking', 'windsorized_ranking',
        'broad_ranking', 'is_investment_grade',
    ]
    # Order here is the output column order.
    source_columns = (
        ['GVKEY', 'datadate_x', 'conm', 'year', 'month']
        + feature_columns
        + rating_columns
    )

    complete_rows = df.dropna(subset=feature_columns)

    # rename() returns a new object, so callers can add columns to the result
    # without touching `df`.
    return complete_rows[source_columns].rename(
        columns={'datadate_x': 'datadate', 'conm': 'company_name'}
    )

def make_month_diff_column(df):
    """Add a per-company 'month_diff' column and return a re-indexed frame.

    Each GVKEY group is passed to _mark_month_diff, which measures every row
    against the first row of its group. The index produced by the grouped
    apply is discarded in favour of a fresh 0..n-1 index.

    Args:
        df: DataFrame with 'GVKEY', 'year' and 'month' columns.

    Returns:
        DataFrame with the added 'month_diff' column.
    """
    per_company = df.groupby(by='GVKEY')
    return per_company.apply(_mark_month_diff).reset_index(drop=True)
    
    
def _mark_month_diff(df):
    """Set 'month_diff' on one company's rows, relative to the group's first row.

    The first row's year and month are the reference point; every row gets
    the value _compute_month_diff returns for it against that reference.
    The frame passed in is modified in place and also returned.
    """
    base_year = df['year'].iloc[0]
    base_month = df['month'].iloc[0]

    def months_from_base(row):
        return _compute_month_diff(row, base_year, base_month)

    df.loc[:, 'month_diff'] = df.apply(months_from_base, axis=1)

    return df


def _compute_month_diff(row, first_year, first_month):
    current_year = row['year']
    current_month = row['month']
    year_diff = current_year - first_year
    months_diff = current_month - first_month

    if year_diff > 0:
        months_diff = 12 * year_diff + months_diff 

    return months_diff 


def make_periodic_df(df, period, drop_null_next=False):
    """Sample rows spaced `period` months apart and attach next-state columns.

    Rows whose index label is returned by identify_keep_index are kept; each
    GVKEY group is then passed to _mark_next_states, which adds the 'next_*'
    columns by shifting the group up one row.

    NOTE(review): after filtering, rows from separate runs of the same GVKEY
    sit next to each other, so the shifted "next" value of a run's last row
    can come from a row more than `period` months later - confirm this is
    intended.

    Args:
        df: DataFrame with 'GVKEY', 'month_diff' and the rating columns.
        period: required gap, in months, between consecutive kept rows.
        drop_null_next: when truthy, drop rows whose 'next_rating' is null.

    Returns:
        The sampled DataFrame with 'next_*' columns.
    """
    spaced = df[df.index.isin(identify_keep_index(df, period))]
    marked = spaced.groupby(by='GVKEY').apply(_mark_next_states)

    return marked.dropna(subset=['next_rating']) if drop_null_next else marked
    
    
def identify_keep_index(df, period):
    """Return index labels of rows forming chains spaced exactly `period` months apart.

    Rows are scanned in frame order. Within one GVKEY, a row is kept when its
    'month_diff' is exactly `period` greater than that of the current anchor
    row; the anchor itself is added the first time it gets such a successor.
    A gap larger than `period`, or a change of GVKEY, starts a new chain with
    the current row as the new anchor. Rows closer than `period` to the
    anchor are skipped without moving the anchor.

    Args:
        df: DataFrame with 'GVKEY' and 'month_diff' columns; rows of one
            company are expected to be contiguous.
        period: required month gap between consecutive kept rows.

    Returns:
        list of index labels to keep, in frame order. An empty frame yields
        an empty list (it used to raise IndexError).
    """
    keep_index = []
    if len(df) == 0:
        return keep_index

    # Walk the three needed sequences in lockstep instead of materialising a
    # Series per row with iterrows(); same values, far less overhead.
    observations = zip(df.index, df['GVKEY'], df['month_diff'])
    prev_index, prev_gvkey, prev_month_diff = next(observations)
    # True while the anchor row has not been added to keep_index yet.
    include_prev_index = True

    for index, current_gvkey, current_month_diff in observations:
        if current_gvkey != prev_gvkey:
            # New company: restart the chain from this row.
            include_prev_index = True
            prev_gvkey = current_gvkey
            prev_index = index
            prev_month_diff = current_month_diff
            continue

        gap = current_month_diff - prev_month_diff

        if gap == period:
            if include_prev_index:
                keep_index.append(prev_index)
                include_prev_index = False
            keep_index.append(index)
            prev_index = index
            prev_month_diff = current_month_diff
        elif gap > period:
            # Chain broken: this row becomes the anchor of a new chain.
            include_prev_index = True
            prev_index = index
            prev_month_diff = current_month_diff
        # gap < period: keep waiting for a row exactly `period` months on.

    return keep_index

    
def _mark_next_states(df):
    df.loc[:, 'next_rating'] = df['issuer_rating'].shift(-1)
    df.loc[:, 'next_ranking'] = df['ranking'].shift(-1)
    df.loc[:, 'next_windsorized_ranking'] = df['windsorized_ranking'].shift(-1)
    df.loc[:, 'next_broad_ranking'] = df['broad_ranking'].shift(-1)
    df.loc[:, 'next_is_investment_grade'] = df['is_investment_grade'].shift(-1)

    return df
    
def display_transition_matrix(df):
    """Count transitions between rankings as a 23 x 23 table.

    For every row with a non-null 'prev_rating_ranking', the cell at
    (previous ranking, current ranking) is incremented, where the current
    ranking comes from 'rating_ranking'. Rankings 1..23 are covered.

    NOTE(review): none of the frames built in the visible part of this
    notebook carry 'prev_rating_ranking' / 'rating_ranking' columns -
    confirm which frame this is meant to receive.

    Returns:
        DataFrame indexed by previous ranking, with one column per current
        ranking, holding integer counts.
    """
    states = range(1, 24)
    counts = {origin: {target: 0 for target in states} for origin in states}

    transitions = zip(df['prev_rating_ranking'], df['rating_ranking'])
    for origin, target in transitions:
        if pd.isnull(origin):
            continue
        counts[int(origin)][int(target)] += 1

    # Outer keys become columns on construction; transpose so they are rows.
    return pd.DataFrame(counts).transpose()
    

def display_change_stats(df):
    """Print how many rows change state at each ranking granularity.

    A row counts as a change when its current value differs from the
    matching 'next_*' value. A missing 'next_*' value also compares as
    different, so frames that still contain null next states will count
    those rows as changes.

    Args:
        df: DataFrame with 'is_investment_grade', 'broad_ranking', 'ranking'
            and their 'next_*' counterparts.
    """
    def count_changes(current_column, next_column):
        # Sum the boolean mask directly instead of building a filtered frame.
        return int((df[current_column] != df[next_column]).sum())

    investment_grade_size = count_changes('is_investment_grade', 'next_is_investment_grade')
    broad_ranking_size = count_changes('broad_ranking', 'next_broad_ranking')
    ranking_size = count_changes('ranking', 'next_ranking')

    print("*******************************************************")
    print("The original df has {0} observations".format(df.shape[0]))
    print("Investment grade df has {0} observations with changes in rating".format(investment_grade_size))
    print("Broad ranking df has {0} observations with changes in rating".format(broad_ranking_size))
    # Message typo fixed ("observatios" -> "observations").
    print("Ranking df has {0} observations with changes in rating".format(ranking_size))
    
def plot_ranking_distribution(df, ranking_type):
    """Draw a count plot of one ranking column.

    Supported `ranking_type` values are 'ranking', 'windsorized_ranking',
    'broad_ranking' and 'is_investment_grade' (whose bars are ordered True
    then False). Any other value draws nothing. The seaborn 'whitegrid'
    style is applied for the plot and reset afterwards.

    Args:
        df: DataFrame containing the column named by `ranking_type`.
        ranking_type: name of the column to plot.
    """
    axis_labels = {
        'ranking': "Ranking",
        'windsorized_ranking': "Windsorized Ranking",
        'broad_ranking': "Broad Ranking",
        'is_investment_grade': "Is Investment Grade",
    }

    sns.set_style('whitegrid')

    if ranking_type in axis_labels:
        extra_options = {}
        if ranking_type == 'is_investment_grade':
            extra_options['order'] = (True, False)
        ax = sns.countplot(data=df, x=ranking_type, color='c', **extra_options)
        ax.set(xlabel=axis_labels[ranking_type], ylabel='Count')

    sns.reset_orig()
    

def make_streak_df(df):
    """Measure how long each company stays at the same ranking.

    A streak is a run of consecutive rows (in frame order) with the same
    'GVKEY', the same 'ranking', and 'month_diff' increasing by exactly one
    from row to row. Any break closes the current streak and starts a new
    one of length 1.

    Args:
        df: DataFrame with 'GVKEY', 'ranking' and 'month_diff' columns.

    Returns:
        DataFrame with one row per streak: 'ranking' (the ranking held) and
        'streaks' (its length in rows). An empty input yields an empty frame
        with those two columns (it used to raise IndexError).
    """
    ranking_list = []
    streaks_list = []

    if len(df) == 0:
        return pd.DataFrame({'ranking': ranking_list, 'streaks': streaks_list})

    # Iterate the three columns in lockstep rather than building a Series
    # per row with iterrows().
    observations = zip(df['GVKEY'], df['ranking'], df['month_diff'])
    last_gvkey, last_ranking, last_month_diff = next(observations)
    streaks = 1

    for gvkey, ranking, month_diff in observations:
        extends_streak = (
            last_gvkey == gvkey
            and last_ranking == ranking
            and last_month_diff + 1 == month_diff
        )
        if extends_streak:
            streaks += 1
        else:
            ranking_list.append(last_ranking)
            streaks_list.append(streaks)
            streaks = 1

        last_gvkey = gvkey
        last_ranking = ranking
        last_month_diff = month_diff

    # Close the streak that was still open when the rows ran out.
    ranking_list.append(last_ranking)
    streaks_list.append(streaks)

    return pd.DataFrame({'ranking': ranking_list, 'streaks': streaks_list})
    
def display_streaks_by_ranking_stats(df):
    """Summarise streak lengths per ranking.

    Args:
        df: DataFrame with 'ranking' and 'streaks' columns (the shape
            returned by make_streak_df).

    Returns:
        DataFrame indexed by ranking with the columns count, mean, std, min,
        25%, 50%, 75% and max describing 'streaks'.

    The previous implementation pivoted on a 'level_1' column, which only
    exists when groupby().describe() returns its statistics as an extra
    index level; with describe() returning one column per statistic that
    pivot raises KeyError. Both layouts are handled here.
    """
    stats = df.groupby('ranking')['streaks'].describe()

    # A Series result means the statistics are an inner index level (the
    # long layout); unstack turns them into columns.
    if isinstance(stats, pd.Series):
        stats = stats.unstack()

    return stats[["count", "mean", "std", "min", "25%", "50%", "75%", "max"]]


def display_heat_map_current_to_next_state(df, ranking_type):
    """Draw a heat map of current-state versus next-state counts.

    Args:
        df: DataFrame holding the column named by `ranking_type` and its
            'next_' + `ranking_type` counterpart.
        ranking_type: 'ranking', 'windsorized_ranking', 'broad_ranking' or
            'is_investment_grade'.

    Raises:
        ValueError: if `ranking_type` is not one of the supported names.

    The earlier version always pivoted on 'ranking' / 'next_ranking', even
    after narrowing the frame to another column pair, so every ranking type
    except 'ranking' failed with KeyError. The table is now built from the
    selected pair with pd.crosstab, which counts co-occurrences directly.
    """
    supported_types = (
        'ranking', 'windsorized_ranking', 'broad_ranking', 'is_investment_grade',
    )
    if ranking_type not in supported_types:
        raise ValueError(
            "Unknown ranking_type {0!r}; expected one of {1}".format(
                ranking_type, ", ".join(supported_types)
            )
        )

    current_column = ranking_type
    next_column = 'next_' + ranking_type

    # Rows: current state; columns: next state; cells: number of observations.
    transition_counts = pd.crosstab(df[current_column], df[next_column])
    sns.heatmap(transition_counts, fmt="d", linewidths=0.5)
    

In [2]:
# Module-level lookup tables read by get_ranking(). The broad and windsorized
# tables are derived from the base rating -> ranking table, so it is built first.
RATING_TO_RANKING_MAP = initialize_rating_to_ranking_map()
RATING_TO_BROAD_RANKING_MAP = initialize_broad_ranking_map(RATING_TO_RANKING_MAP)
RATING_TO_WINDSORIZED_RANKING_MAP = initialize_windsorized_ranking_map(RATING_TO_RANKING_MAP)

# Read Data

In [5]:
# Raw rating file; 'datadate' is parsed as a datetime because
# transform_rating_df() reads it through the .dt accessor.
original_rating_df = pd.read_csv("../../../data/credit_rating/raw_data/sp_credit_rating_1973_2016.csv", parse_dates=["datadate"])

In [6]:
# Feature file that is later joined to the ratings on GVKEY / year / month.
original_campbell_df = pd.read_csv("../../../data/features/simple_monthly_campbell_df.csv")

# Transform Data

In [7]:
# Cleaned ratings with year/month and the ranking columns attached.
transformed_rating_df = transform_rating_df(original_rating_df)

In [40]:
# Ratings-only pipeline (no feature merge): add month offsets per company,
# then keep rows that are exactly one month apart.
rating_month_diff_df = make_month_diff_column(transformed_rating_df)
monthly_rating_df = make_periodic_df(rating_month_diff_df, 1)

In [25]:
# Inner join: only company-months present in both ratings and features survive.
merged_df = merge_campbell_rating_df(original_campbell_df, transformed_rating_df)

In [26]:
# Drop rows with missing features and reduce to the final column set.
transformed_merged_df = transform_merged_df(merged_df)

In [30]:
# Month offsets per company on the merged data; input to the periodic frames.
month_diff_df = make_month_diff_column(transformed_merged_df)

In [31]:
# Report row, company and rating-category counts after each pipeline stage.
stage_frames = [
    ("Transformed Rating DF", transformed_rating_df),
    ("MERGED_DF", merged_df),
    ("Transformed_MERGED_DF", transformed_merged_df),
]

for stage_title, stage_frame in stage_frames:
    print(stage_title)
    print("There are {0} observations with rating data".format(stage_frame.shape[0]))
    print("There are {0} unique companies(gvkey)".format(stage_frame['GVKEY'].unique().shape[0]))
    print("There are {0} unique rating categories\n".format(stage_frame['issuer_rating'].unique().shape[0]))

Transformed Rating DF
There are 698610 observations with rating data
There are 5964 unique companies(gvkey)
There are 24 unique rating categories

MERGED_DF
There are 406976 observations with rating data
There are 3777 unique companies(gvkey)
There are 24 unique rating categories

Transformed_MERGED_DF
There are 380358 observations with rating data
There are 3693 unique companies(gvkey)
There are 24 unique rating categories



In [34]:
# One frame per sampling period (in months); rows without a next rating are
# dropped so every remaining row has a current and a next state.
monthly_df = make_periodic_df(month_diff_df, 1, drop_null_next=True)
quarterly_df = make_periodic_df(month_diff_df, 3, drop_null_next=True)
semi_annual_df = make_periodic_df(month_diff_df, 6, drop_null_next=True)
annual_df = make_periodic_df(month_diff_df, 12, drop_null_next=True)

# Save Data

In [41]:
# Persist the intermediate frames. The "_unmerged" files hold ratings only;
# the "_features" files hold ratings joined with the feature columns.
# to_csv() is called with defaults, so each frame's index is written as well.
transformed_rating_df.to_csv("../../../data/credit_rating/intermediate_data/transformed_rating_df_unmerged.csv")
monthly_rating_df.to_csv("../../../data/credit_rating/intermediate_data/monthly_credit_rating_df_unmerged.csv")
monthly_df.to_csv("../../../data/credit_rating/intermediate_data/monthly_credit_rating_features.csv")
quarterly_df.to_csv("../../../data/credit_rating/intermediate_data/quarterly_credit_rating_features.csv")
semi_annual_df.to_csv("../../../data/credit_rating/intermediate_data/semi_annual_credit_rating_features.csv")
annual_df.to_csv("../../../data/credit_rating/intermediate_data/annual_credit_rating_features.csv")