<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Read-in-Data" data-toc-modified-id="Read-in-Data-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Read in Data</a></span><ul class="toc-item"><li><span><a href="#Read-in-excel-files-combining-ticker-symbols-with-the-IQID" data-toc-modified-id="Read-in-excel-files-combining-ticker-symbols-with-the-IQID-1.1"><span class="toc-item-num">1.1&nbsp;&nbsp;</span>Read in excel files combining ticker symbols with the IQID</a></span></li><li><span><a href="#Read-in-independent-variables,-join-tickers" data-toc-modified-id="Read-in-independent-variables,-join-tickers-1.2"><span class="toc-item-num">1.2&nbsp;&nbsp;</span>Read in independent variables, join tickers</a></span></li><li><span><a href="#Join-in-the-credit-rating-data" data-toc-modified-id="Join-in-the-credit-rating-data-1.3"><span class="toc-item-num">1.3&nbsp;&nbsp;</span>Join in the credit rating data</a></span></li></ul></li><li><span><a href="#Generate-Model-Data" data-toc-modified-id="Generate-Model-Data-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Generate Model Data</a></span></li><li><span><a href="#Generate-Models" data-toc-modified-id="Generate-Models-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Generate Models</a></span><ul class="toc-item"><li><span><a href="#Support-Vector-Machines" data-toc-modified-id="Support-Vector-Machines-3.1"><span class="toc-item-num">3.1&nbsp;&nbsp;</span>Support Vector Machines</a></span></li><li><span><a href="#Random-Forest-Classifier" data-toc-modified-id="Random-Forest-Classifier-3.2"><span class="toc-item-num">3.2&nbsp;&nbsp;</span>Random Forest Classifier</a></span></li></ul></li><li><span><a href="#Prediction-Function" data-toc-modified-id="Prediction-Function-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Prediction Function</a></span></li><li><span><a href="#Plot-Accuracy" data-toc-modified-id="Plot-Accuracy-5"><span class="toc-item-num">5&nbsp;&nbsp;</span>Plot 
Accuracy</a></span></li><li><span><a href="#See-values" data-toc-modified-id="See-values-6"><span class="toc-item-num">6&nbsp;&nbsp;</span>See values</a></span></li></ul></div>

In [None]:
import pandas as pd 
import numpy as np 
import plotly.graph_objects as go 
import seaborn as sns 
import matplotlib.pyplot as plt 
from tqdm import tqdm 

## Read in Data

---
### Read in excel files combining ticker symbols with the IQID 

In [None]:
## Read the five ticker-symbol workbooks, keeping only the identifier columns,
## and stack them into a single frame.
id_frames = [
    pd.read_excel('capiq_data/in_process_ids/ids {}.xlsx'.format(file_number),
                  engine='openpyxl')[['ID', 'IQID', 'IQ Name']]
    for file_number in range(1, 6)
]
ids = pd.concat(id_frames)

## Sanity checks: number of fully duplicated rows, then nulls per column
print(ids.duplicated().sum())
print(ids.isna().sum())
ids.head()

### Read in independent variables, join tickers 

In [None]:
## Attach the ticker (ID) to the independent variables via IQID / IQ Name,
## then drop the bookkeeping columns that are not model inputs.
## The data is quarterly, but we need to look at it annually, so each
## (year, ticker) pair is collapsed to the mean over its quarters.
ind_df = (
    pd.read_csv('small_df.csv')
    .merge(ids, on = ['IQID', 'IQ Name'])
    .drop(columns = ['Unnamed: 0', 'IQ Name', 'IQID', 'quarter'])
    .groupby(['year', 'ID'])
    .mean()
    .reset_index()
)

ind_df.head()

### Join in the credit rating data

We use an inner join so that we keep only the companies for which we have a credit rating.

In [None]:
## Load the credit ratings, keeping the year, ticker and rating columns,
## and give the rating column a short name.
credit_ratings = (
    pd.read_csv('credit.csv')[['Year', 'TickerSymbol', 'DomesticLTICRSPMthlyAvg']]
    .rename(columns = {'DomesticLTICRSPMthlyAvg': 'rating'})
)

## Inner-join the ratings onto the independent variables by (year, ticker),
## then drop the now-redundant join keys from the ratings side.
tot_df = (
    credit_ratings
    .merge(ind_df,
           how = 'inner',
           left_on = ['Year', 'TickerSymbol'],
           right_on = ['year', 'ID'])
    .drop(columns = ['Year', 'TickerSymbol'])
)

tot_df.head()

In [None]:
## Build the year-over-year differenced data set, one ticker at a time.
## Frames are collected in a list and concatenated once at the end
## (growing a DataFrame with concat inside the loop is quadratic).
clean_frames = []

for ticker in tqdm(tot_df['ID'].unique()):

    ## small df is all the rows with the ticker, sorted by year
    small_df = tot_df[tot_df['ID'] == ticker].sort_values(by = 'year',
                                                          ascending = True)

    ## Insert a lead rating column. This is the column we predict: the
    ## credit rating for the NEXT year. shift(-1) moves next year's rating
    ## onto the current row (shift(1) would give the previous year's rating,
    ## i.e. a lag, not a lead).
    ## NOTE(review): this assumes each ticker has one row per consecutive
    ## year; gaps in the years are not detected here.
    small_df.insert(loc = 0, column = 'lead_rating',
                    value = small_df.rating.shift(-1))

    ## Set the index as the year and the ticker so diff() only touches
    ## the value columns.
    small_df.set_index(['year', 'ID'], inplace = True)

    ## Take the difference between consecutive rows. We are looking for
    ## changes in credit rating, so we compare them to changes in the
    ## independent variables. The first row (no previous year) and the
    ## last row (no next-year rating) become null and are dropped.
    small_df = small_df.diff().dropna()

    clean_frames.append(small_df)

## Keep the original behaviour of an empty frame when there are no tickers
tot_df_clean = pd.concat(clean_frames, axis = 0) if clean_frames else pd.DataFrame()

tot_df_clean.head()

## Generate Model Data

In [None]:
## Continuous change in the lead rating. Take an explicit copy: to_numpy()
## may return a view of the frame's data, in which case binning in place
## would overwrite the continuous values before they are stored in 'rating'
## (or raise on a read-only array under pandas Copy-on-Write).
lead_rating_raw = tot_df_clean['lead_rating'].to_numpy(copy = True)

## Bin the change into five classes:
##   0 -> 0, (0, 1] -> 1, [-1, 0) -> -1, > 1 -> 2, < -1 -> -2
## Anything matching no condition (i.e. NaN) stays NaN.
lead_rating = np.select(
    [lead_rating_raw == 0,
     (lead_rating_raw > 0) & (lead_rating_raw <= 1),
     (lead_rating_raw < 0) & (lead_rating_raw >= -1),
     lead_rating_raw > 1,
     lead_rating_raw < -1],
    [0.0, 1.0, -1.0, 2.0, -2.0],
    default = np.nan)

## 'rating' now holds the continuous target, 'lead_rating' the binned class
tot_df_clean['rating'] = lead_rating_raw
tot_df_clean['lead_rating'] = lead_rating

## Keep a handle on the full (un-resampled) data for the plots at the end
full_df = tot_df_clean

In [None]:
## Show how many rows fall in each lead_rating class, with in-cell bars
tot_df_clean.lead_rating.value_counts().to_frame().style.bar()

In [None]:
from sklearn.utils import resample 

## Resample the data: draw 150 rows (with replacement) from the
## "no change" class (lead_rating == 0) and leave every other class as is.
## NOTE(review): this runs before the train/test split, so a duplicated
## row can land in both train and test -- confirm this is acceptable.
is_no_change = tot_df_clean.lead_rating == 0

tot_df_clean_sampled = tot_df_clean[~is_no_change]
rating_0 = resample(tot_df_clean[is_no_change],
                    replace = True,
                    n_samples = 150,
                    random_state = 123)

tot_df_clean = pd.concat([rating_0, tot_df_clean_sampled], axis = 0)

In [None]:
from sklearn.model_selection import train_test_split 

## Split into x and y.
## Both target columns must be removed from the features: 'lead_rating' is
## the class label and 'rating' is the continuous version of the same
## target, so leaving either one in x leaks the answer to the models.
x = tot_df_clean.drop(['lead_rating', 'rating'], axis = 1)

y_numeric = tot_df_clean['rating']   ## continuous target (regression)
y = tot_df_clean['lead_rating']      ## binned target (classification)


## Normalize the features; the targets do not need to be normalized.
## NOTE(review): the mean/std are computed on all rows, including the
## future test rows; fit them on the training split if strict separation
## is required.
x = (x - x.mean()) / (x.std())


## We'll set aside 10% of the data for testing
train_x, test_x, train_y, test_y = train_test_split(x,
                                                    y,
                                                    train_size = 0.9,
                                                    random_state = 5)

## Same random_state, so the numeric split uses the same rows as above
train_x_num, test_x_num, train_y_num, test_y_num = train_test_split(x, y_numeric, train_size = 0.9, random_state = 5)

## Generate Models 
### Support Vector Machines

In [None]:
from sklearn.model_selection import RepeatedStratifiedKFold 
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score


def SVM_Fit(train_x, train_y, kernel,
            params = [10**x for x in np.arange(-1,3,0.9)],
            cv = 2):
    '''Grid-search an SVC with the given kernel on the training data.

    Parameters
    ----------
    train_x, train_y : training features and class labels.
    kernel : kernel name passed to SVC (e.g. 'linear', 'rbf', 'sigmoid').
    params : candidate values, used for C and (for non-linear kernels)
        also for gamma.
    cv : cross-validation setting forwarded to GridSearchCV
        (default 2, the value this function always used).

    Returns
    -------
    The fitted GridSearchCV object.
    '''

    ## gamma has no effect on the linear kernel, so only search C there
    if kernel == 'linear':
        parameters = {'C': params}
    else:
        parameters = {'C': params,
                      'gamma': params}

    model = GridSearchCV(estimator = SVC(kernel = kernel),
                         param_grid = parameters,
                         cv = cv,
                         verbose = 1)

    ## Fit on the data that was passed in -- not on the notebook-level
    ## x / y, which also contain the held-out test rows.
    model.fit(train_x, train_y)
    return model

### Random Forest Classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV

rf = RandomForestClassifier(random_state = 200)


# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split.
# 'auto' is not listed: for classifiers it was only an alias of 'sqrt',
# and it has been deprecated and then removed in recent scikit-learn.
max_features = ['sqrt', 'log2']
# Maximum number of levels in tree (None = grow until leaves are pure)
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}


# Randomized search: 100 sampled parameter combinations, 5-fold CV each,
# run on all available cores. It is only defined here; fitting happens later.
rf_random = RandomizedSearchCV(estimator = rf,
                               param_distributions = random_grid,
                               n_iter = 100, cv = 5, verbose = 2,
                               random_state = 200, n_jobs = -1)


## Prediction Function

In [None]:
def Predict(fitted_model, test_x, test_y, name):
    '''Predict with a fitted classifier and report its accuracy.

    Returns a one-column DataFrame of predictions (column named
    "prediction_<name>") and the accuracy score, and prints the score.
    '''
    predicted_labels = fitted_model.predict(test_x)
    score = accuracy_score(predicted_labels, test_y)

    column_name = 'prediction_{}'.format(name)
    prediction = pd.DataFrame({column_name: predicted_labels})

    print('The {} Model Score is: {}'.format(name, score))
    return prediction, score

In [None]:
## Fit the classifiers on the training split.
## The sigmoid, rbf and linear SVMs are tuned with SVM_Fit's grid search.
sigmoid = SVM_Fit(train_x, train_y, 'sigmoid') 
rbf = SVM_Fit(train_x, train_y, 'rbf') 
linear = SVM_Fit(train_x, train_y,'linear')
## The polynomial SVM is fitted with default hyper-parameters instead of
## the grid search (the grid-search call is left disabled below).
#poly = SVM_Fit(train_x, train_y, 'poly') 
poly = SVC(kernel = 'poly').fit(train_x, train_y)

## Run the randomized hyper-parameter search for the random forest
rf_random.fit(train_x, train_y)

In [None]:
## Score every fitted classifier on the held-out test split.
## Each call prints the accuracy and returns (predictions, accuracy).
sigmoid_predict, sigmoid_score = Predict(sigmoid, test_x, test_y, 'sigmoid') 
lin_predict, lin_score = Predict(linear, test_x, test_y, 'linear') 
poly_predict, poly_score = Predict(poly, test_x, test_y, 'poly') 
rbf_predict, rbf_score = Predict(rbf, test_x, test_y, 'radial') 


random_predict, random_score = Predict(rf_random, test_x, test_y, 
                                      'Random Forest')

## Plot Accuracy

In [None]:
## Bar chart comparing the test accuracy of every classifier.
## The two lists below must stay in the same order.
model_names = ['Sigmoid SVC', 'Radial SVC', 'Linear SVC', 'Polynomial SVC',
 'Random Forests']
model_accuracy = [sigmoid_score, rbf_score, lin_score, poly_score, random_score]

accuracy_bars = go.Bar(x = model_names,
                       y = model_accuracy,
                       text = model_accuracy,
                       textposition = 'auto')

fig = go.Figure()
fig.add_trace(accuracy_bars)
fig.update_layout(title = 'Model Accuracy Scores')
fig.update_xaxes(title_text = "Model")
fig.update_yaxes(title_text = 'Accuracy Score')
fig.show()

In [None]:
from sklearn.linear_model import Lasso, Ridge, LinearRegression
from sklearn.metrics import mean_squared_error
## OLS Linear Regression on the continuous target
ols = LinearRegression().fit(train_x_num, train_y_num)
ols_prediction = ols.predict(test_x_num)
## NOTE: despite its name, ols_r2 holds the test mean squared error, not R^2
ols_r2 = mean_squared_error( test_y_num,ols_prediction) 
ols_r2

In [None]:
from sklearn.metrics import mean_squared_error

def ContinuousPrediction(model, train_x, train_y, test_x, test_y):
    '''Tune a regularized linear model's alpha and score it on the test data.

    Runs a 5-fold grid search over alpha for the given estimator, predicts
    on test_x and returns (fitted search, predictions, test mean squared
    error).
    '''
    alpha_grid = {'alpha': [10**exponent for exponent in np.arange(-1,3,0.9)]}

    search = GridSearchCV(estimator = model,
                          param_grid = alpha_grid,
                          cv = 5,
                          verbose = 1)
    search.fit(train_x, train_y)

    test_prediction = search.predict(test_x)
    test_mse = mean_squared_error(test_y, test_prediction)

    return search, test_prediction, test_mse

## Tune and score Lasso and Ridge on the continuous target.
## NOTE: the *_r2 variables receive the mean squared error returned by
## ContinuousPrediction, not an R^2 value.
lasso, lasso_predict, lasso_r2 = ContinuousPrediction(Lasso(), train_x_num, train_y_num, test_x_num, test_y_num)
ridge, ridge_predict, ridge_r2 = ContinuousPrediction(Ridge(), train_x_num, train_y_num, test_x_num, test_y_num)

In [None]:
## These values are test mean squared errors (lower is better); the
## variables keep their historical *_r2 names but do not hold R^2.
print('The Lasso Prediction Mean Squared Error is: {}'.format(lasso_r2) )
print('The Ridge Prediction Mean Squared Error is: {}'.format(ridge_r2) )
print('The OLS Prediction Mean Squared Error is: {}'.format(ols_r2) )

In [None]:
from sklearn.svm import SVR


def SVM_Fit_Num(train_x, train_y, kernel,
            params = [10**x for x in np.arange(-1,3,0.9)],
            cv = 2):
    '''Grid-search an SVR with the given kernel on the training data.

    Parameters
    ----------
    train_x, train_y : training features and continuous target.
    kernel : kernel name passed to SVR (e.g. 'linear', 'rbf', 'sigmoid').
    params : candidate values, used for C and (for non-linear kernels)
        also for gamma.
    cv : cross-validation setting forwarded to GridSearchCV
        (default 2, the value this function always used).

    Returns
    -------
    The fitted GridSearchCV object.
    '''

    ## gamma has no effect on the linear kernel, so only search C there
    if kernel == 'linear':
        parameters = {'C': params}
    else:
        parameters = {'C': params,
                      'gamma': params}

    ## No stratified splitter here: stratification is for class labels
    ## and the target of this model is continuous.
    model = GridSearchCV(estimator = SVR(kernel = kernel),
                         param_grid = parameters,
                         cv = cv,
                         verbose = 1)

    model.fit(train_x, train_y)

    return model

def PredictNum(fitted_model, test_x, test_y, name):
    '''Predict with a fitted regressor and report its mean squared error.

    Returns a one-column DataFrame of predictions (column named
    "prediction_<name>") and the test MSE, and prints the MSE.
    '''
    predicted_values = fitted_model.predict(test_x)
    score = mean_squared_error(test_y, predicted_values)

    column_name = 'prediction_{}'.format(name)
    prediction = pd.DataFrame({column_name: predicted_values})

    print('The {} Model Score is: {}'.format(name, score))
    return prediction, score

In [None]:
## Fit the support vector regressors on the continuous target.
linear_num = SVM_Fit_Num(train_x_num, train_y_num, 'linear')
## The polynomial model is fitted with default hyper-parameters rather than
## the grid search. The kernel must be given explicitly: SVR() on its own
## uses the RBF kernel, which would not match the "Polynomial" label.
poly_num = SVR(kernel = 'poly').fit(train_x_num, train_y_num)
rbf_num = SVM_Fit_Num(train_x_num, train_y_num, 'rbf')
sigmoid_num = SVM_Fit_Num(train_x_num, train_y_num,'sigmoid')

In [None]:
## Score every fitted regressor on the held-out test split.
## Each call prints the mean squared error and returns (predictions, MSE).
sigmoid_predict_num, sigmoid_score_num = PredictNum(sigmoid_num, test_x_num, test_y_num, 'Sigmoid') 
rbf_predict_num, rbf_score_num = PredictNum(rbf_num, test_x_num, test_y_num, 'Radial') 
poly_predict_num, poly_score_num = PredictNum(poly_num, test_x_num, test_y_num, 'Polynomial')
lin_predict_num, lin_score_num = PredictNum(linear_num, test_x_num, test_y_num, 'Linear') 

In [None]:
## Bar chart comparing the test mean squared error of every regressor.
## The two lists below must stay in the same order.
fig = go.Figure()
model_names = ['Sigmoid SVM', 'Radial SVM', 'Linear SVM', 'Polynomial SVM', 'OLS', 'Lasso', 'Ridge']

## Name kept for continuity with the classification plot; these values are
## mean squared errors (lower is better), not accuracies.
model_accuracy = [sigmoid_score_num, rbf_score_num, lin_score_num, poly_score_num, ols_r2, lasso_r2, ridge_r2]

fig.add_trace(go.Bar(x = model_names,
                     y = model_accuracy,
                     text = model_accuracy,
                     textposition = 'auto'))
## The title now matches the plotted metric (it used to say "Accuracy")
fig.update_layout(title = 'Model Mean Squared Error, Numeric Prediction')

fig.update_yaxes(title_text = 'Mean Squared Error')
fig.update_xaxes(title_text = "Model")
fig.show()

## See values

In [None]:
import plotly.io as pio
## Open plotly figures in a browser tab instead of inline in the notebook
pio.renderers.default = 'browser'

## NOTE(review): the star import hides where PlotReg (and anything else)
## comes from; presumably PlotReg is defined in cleaning_plots -- confirm
## and import it by name.
from cleaning_plots import * 
## Build the figure and metrics from the full (un-resampled) data,
## passing both target columns as dep_cols.
fig, metrics = PlotReg(full_df,
        dep_cols = ['lead_rating', 'rating']).Plot()

In [None]:
## Preview the first rows of the data passed to PlotReg
full_df.head() 

In [None]:
## Display the figure returned by PlotReg(...).Plot()
fig.show() 