<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Read-in-Data" data-toc-modified-id="Read-in-Data-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Read in Data</a></span><ul class="toc-item"><li><span><a href="#Read-in-excel-files-combining-ticker-symbols-with-the-IQID" data-toc-modified-id="Read-in-excel-files-combining-ticker-symbols-with-the-IQID-1.1"><span class="toc-item-num">1.1&nbsp;&nbsp;</span>Read in excel files combining ticker symbols with the IQID</a></span></li><li><span><a href="#Read-in-independent-variables,-join-tickers" data-toc-modified-id="Read-in-independent-variables,-join-tickers-1.2"><span class="toc-item-num">1.2&nbsp;&nbsp;</span>Read in independent variables, join tickers</a></span></li><li><span><a href="#Join-in-the-credit-rating-data" data-toc-modified-id="Join-in-the-credit-rating-data-1.3"><span class="toc-item-num">1.3&nbsp;&nbsp;</span>Join in the credit rating data</a></span></li></ul></li><li><span><a href="#Generate-Model-Data" data-toc-modified-id="Generate-Model-Data-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Generate Model Data</a></span></li><li><span><a href="#Generate-Models" data-toc-modified-id="Generate-Models-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Generate Models</a></span><ul class="toc-item"><li><span><a href="#Support-Vector-Machines" data-toc-modified-id="Support-Vector-Machines-3.1"><span class="toc-item-num">3.1&nbsp;&nbsp;</span>Support Vector Machines</a></span></li><li><span><a href="#Random-Forest-Classifier" data-toc-modified-id="Random-Forest-Classifier-3.2"><span class="toc-item-num">3.2&nbsp;&nbsp;</span>Random Forest Classifier</a></span></li></ul></li><li><span><a href="#Prediction-Function" data-toc-modified-id="Prediction-Function-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Prediction Function</a></span></li><li><span><a href="#Plot-Accuracy" data-toc-modified-id="Plot-Accuracy-5"><span class="toc-item-num">5&nbsp;&nbsp;</span>Plot Accuracy</a></span></li></ul></div>

In [2]:
import pandas as pd 
import numpy as np 
import plotly.graph_objects as go 
import seaborn as sns 
import matplotlib.pyplot as plt 
from tqdm import tqdm 

## Read in Data

---
### Read in excel files combining ticker symbols with the IQID 

In [3]:
ids= pd.DataFrame() 
## Read in ticker symbols
for i in range(1, 6): 
    df = pd.read_excel('capiq_data/in_process_ids/ids {}.xlsx'.format(i),
                       engine='openpyxl')[['ID', 'IQID', 'IQ Name']]
    ids = pd.concat([ids, df]) 

## See if there are any duplicates 
print(ids.duplicated().sum()) 
## See if there are any nulls 
print(ids.isna().sum()) 
ids.head() 

0
ID         0
IQID       0
IQ Name    0
dtype: int64


Unnamed: 0,ID,IQID,IQ Name
0,MMM,IQ289194,3M Company
1,ABT,IQ247483,Abbott Laboratories
2,ABBV,IQ141885706,AbbVie Inc.
3,ABMD,IQ247589,"Abiomed, Inc."
4,ACN,IQ972190,Accenture plc


### Read in independent variables, join tickers 

In [4]:
## Join IQID for inds so we can see the Ticker and the name 

ind_df = pd.read_csv('small_df.csv') 

ind_df = ind_df.merge(ids, on = ['IQID', 'IQ Name']) 
ind_df.drop(['Unnamed: 0', 'IQ Name', 'IQID', 
            'quarter'], axis = 1, inplace = True) 

## The data is quarterly, but we need to lok at it annually. 
## This means we have to take the mean of the data for all the quarters 
ind_df = ind_df.groupby(['year', 'ID']).mean().reset_index()
                      
ind_df.head() 

Unnamed: 0,year,ID,IQ_TOTAL_REV,IQ_NI_CF,IQ_AR,IQ_GP,IQ_TOTAL_ASSETS,IQ_AP,IQ_TOTAL_LIAB,IQ_TOTAL_DEBT,IQ_CASH_FINAN,IQ_TOTAL_EQUITY,IQ_CASH_EQUIV
0,1995,CZR,36.3805,5.9835,0.96275,16.722,53.898,0.81975,34.04225,30.9075,6.802,19.85575,1.5305
1,1995,HII,455.5,7.5,165.0,33.5,690.0,83.0,554.0,180.0,-13.5,136.0,1.0
2,1996,CZR,37.6625,4.61575,3.07875,17.35075,171.045,2.952,104.33325,124.62325,-2.3005,86.5675,4.146
3,1996,HII,467.5,13.75,210.75,35.25,740.5,64.0,599.0,352.75,43.25,209.5,0.5
4,1996,HLT,994.333333,15.0,133.333333,255.666667,4543.3,270.466667,2559.0,1655.966667,65.333333,1984.3,296.6


### Join in the credit rating data

We want an inner join so we only keep the companies that we have the credit rating for

In [5]:
## Join in the credit Ratings Data
credit_ratings = pd.read_csv('credit.csv')[['Year', 'TickerSymbol',
                                            'DomesticLTICRSPMthlyAvg']]
credit_ratings.rename({'DomesticLTICRSPMthlyAvg': 'rating'}, inplace = True, axis = 1)  

## Add credit ratings to df  
tot_df = credit_ratings.merge(ind_df, how = 'inner', 
                             left_on = ['Year', 'TickerSymbol'], 
                             right_on = ['year', 'ID'])
tot_df.drop(['Year', 'TickerSymbol'], axis = 1, inplace = True) 

tot_df.head() 

Unnamed: 0,rating,year,ID,IQ_TOTAL_REV,IQ_NI_CF,IQ_AR,IQ_GP,IQ_TOTAL_ASSETS,IQ_AP,IQ_TOTAL_LIAB,IQ_TOTAL_DEBT,IQ_CASH_FINAN,IQ_TOTAL_EQUITY,IQ_CASH_EQUIV
0,12.0,2000,AAL,4859.0,47.0,1303.0,1489.0,26213.0,1267.0,19037.0,6270.0,385.0,7176.0,89.0
1,12.666667,2001,AAL,4740.75,-440.5,1511.0,1018.5,30054.25,1538.5,23660.0,8458.25,1052.75,6394.25,152.0
2,14.583333,2002,AAL,4355.0,-877.75,1471.5,1175.5,31230.25,1394.75,28539.0,11930.75,626.5,2691.25,139.0
3,18.25,2003,AAL,4360.0,-307.0,856.75,1018.25,29394.75,1048.5,29664.5,13561.0,15.0,-269.75,148.0
4,18.0,2004,AAL,4661.25,-190.25,905.75,1158.0,29496.25,1050.25,29784.25,14385.5,82.75,-288.0,146.25


In [6]:
tot_df_clean = pd.DataFrame() 



for ticker in tqdm(tot_df['ID'].unique()): 
    
    
    ## small df is all the rows with the ticker, sort by year 
    small_df = tot_df[tot_df['ID'] == ticker].sort_values(by = 'year',
                                                             ascending = True)
    
    
    ## Insert a lead rating column. This is the predictor column, 
    ## as we are trying to predict the credit rating for the next year
    small_df.insert(loc = 0, column = 'lead_rating', 
               value = small_df.rating.shift(-1)) 
    
    
    ## Set the index as the year and the ticker. 
    small_df.set_index(['year', 'ID'], inplace = True) 
    
    ## Take the difference between rows. We are looking to find 
    ## differences in credit rating, so we are going to compare it to 
    ## differences in dependent variables. 
    ## We can then drop the nulls. 
    small_df = small_df.diff().dropna()  
    
    tot_df_clean = pd.concat([tot_df_clean, small_df], axis = 0) 

tot_df_clean.head() 

100%|██████████| 312/312 [00:01<00:00, 201.42it/s]


Unnamed: 0_level_0,Unnamed: 1_level_0,lead_rating,rating,IQ_TOTAL_REV,IQ_NI_CF,IQ_AR,IQ_GP,IQ_TOTAL_ASSETS,IQ_AP,IQ_TOTAL_LIAB,IQ_TOTAL_DEBT,IQ_CASH_FINAN,IQ_TOTAL_EQUITY,IQ_CASH_EQUIV
year,ID,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
2001,AAL,1.916667,0.666667,-118.25,-487.5,208.0,-470.5,3841.25,271.5,4623.0,2188.25,667.75,-781.75,63.0
2002,AAL,3.666667,1.916667,-385.75,-437.25,-39.5,157.0,1176.0,-143.75,4879.0,3472.5,-426.25,-3703.0,-13.0
2003,AAL,-0.25,3.666667,5.0,570.75,-614.75,-157.25,-1835.5,-346.25,1125.5,1630.25,-611.5,-2961.0,9.0
2004,AAL,0.0,-0.25,301.25,116.75,49.0,139.75,101.5,1.75,119.75,824.5,67.75,-18.25,-1.75
2005,AAL,-0.583333,0.0,516.75,-24.0,141.0,96.25,-98.25,77.75,481.5,-42.5,50.5,-579.75,-1.5


## Generate Model Data

In [7]:
lead_rating = tot_df_clean['lead_rating'].to_numpy()


lead_rating[lead_rating == 0] = 0
lead_rating[(lead_rating > 0) & (lead_rating <= 1)] = 1 
lead_rating[(lead_rating < 0) & (lead_rating >= -1)] = -1 
lead_rating[lead_rating > 1] = 2 
lead_rating[lead_rating < -1] = -2
tot_df_clean['rating'] = tot_df_clean['lead_rating']
tot_df_clean['lead_rating'] = lead_rating

full_df = tot_df_clean 

In [8]:
tot_df_clean.lead_rating.value_counts().to_frame().style.bar()

Unnamed: 0,lead_rating
0.0,2050
-1.0,557
1.0,411
2.0,114
-2.0,85


In [9]:
from sklearn.utils import resample 

## Resample the data

rating_0 = tot_df_clean[tot_df_clean.lead_rating ==0 ]
tot_df_clean_sampled = tot_df_clean[tot_df_clean.lead_rating !=0 ]
rating_0 = resample(rating_0, 
                   replace = True, 
                   n_samples = 150,
                    random_state = 123) 

tot_df_clean = pd.concat([rating_0, 
                          tot_df_clean_sampled], axis = 0) 

In [10]:
from sklearn.model_selection import train_test_split 

## Split into x and y
x = tot_df_clean.drop(['lead_rating'], axis = 1) 

y_numeric = tot_df_clean['rating'] 
y = tot_df_clean['lead_rating'] 


## Normalize the data, but we don't need to normalize the 
## dependent variable
x = (x - x.mean()) / (x.std())


## We'll set aside 10% of the data for testing
train_x, test_x, train_y, test_y = train_test_split(x, 
                                                    y, 
                                                    train_size = 0.9, 
                                                    random_state = 5)

train_x_num, test_x_num, train_y_num, test_y_num = train_test_split(x, y_numeric, train_size = 0.9, random_state = 5) 

## Generate Models 
### Support Vector Machines

In [11]:
from sklearn.model_selection import RepeatedStratifiedKFold 
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score


def SVM_Fit(train_x, train_y, kernel,
            params = [10**x for x in np.arange(-1,3,0.9)]): 
    '''Fit the SVM Machine given the kernel type, parameters, 
    data''' 
    
    if kernel == 'linear': 
        parameters = {'C': params} 
    else: 
        parameters = {'C': params, 
                     'gamma': params} 
    
    cv = RepeatedStratifiedKFold(n_splits = 5, 
                                n_repeats = 5) 
    
    model = GridSearchCV(estimator = SVC(kernel = kernel), 
                        param_grid = parameters, 
                        cv = 2, 
                        verbose = 1) 
    
    model.fit(x, y) 
    return model

### Random Forest Classifier

In [12]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV

rf = RandomForestClassifier(random_state = 200) 
rf.get_params()


# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}


rf_random = RandomizedSearchCV(estimator = rf, 
                              param_distributions = random_grid, 
                              n_iter = 100, cv = 5, verbose = 2, 
                              random_state = 200, n_jobs = -1)


## Prediction Function

In [13]:
def Predict(fitted_model, test_x, test_y, name):
    prediction = fitted_model.predict(test_x) 
    score = accuracy_score(prediction, test_y) 
    prediction = pd.DataFrame({'prediction_{}'.format(name): prediction})
    print('The {} Model Score is: {}'.format(name, score)) 
    return prediction, score

In [14]:
## Make predictions
sigmoid = SVM_Fit(train_x, train_y, 'sigmoid') 
rbf = SVM_Fit(train_x, train_y, 'rbf') 
linear = SVM_Fit(train_x, train_y,'linear')
#poly = SVM_Fit(train_x, train_y, 'poly') 
poly = SVC(kernel = 'poly').fit(train_x, train_y)

rf_random.fit(train_x, train_y)

Fitting 2 folds for each of 25 candidates, totalling 50 fits
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:    0.7s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
Fitting 2 folds for each of 25 candidates, totalling 50 fits
[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:    1.1s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.1s finished
Fitting 2 folds for each of 5 candidates, totalling 10 fits
Fitting 5 folds for each of 100 candidates, totalling 500 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:   16.2s
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done 349 tasks      | elapsed:  3.3min
[Parallel(n_jobs=-1)]: Done 500 out of 5

RandomizedSearchCV(cv=5, estimator=RandomForestClassifier(random_state=200),
                   n_iter=100, n_jobs=-1,
                   param_distributions={'bootstrap': [True, False],
                                        'max_depth': [10, 20, 30, 40, 50, 60,
                                                      70, 80, 90, 100, 110,
                                                      None],
                                        'max_features': ['auto', 'sqrt'],
                                        'min_samples_leaf': [1, 2, 4],
                                        'min_samples_split': [2, 5, 10],
                                        'n_estimators': [200, 400, 600, 800,
                                                         1000, 1200, 1400, 1600,
                                                         1800, 2000]},
                   random_state=200, verbose=2)

In [15]:
sigmoid_predict, sigmoid_score = Predict(sigmoid, test_x, test_y, 'sigmoid') 
lin_predict, lin_score = Predict(linear, test_x, test_y, 'linear') 
poly_predict, poly_score = Predict(poly, test_x, test_y, 'poly') 
rbf_predict, rbf_score = Predict(rbf, test_x, test_y, 'radial') 


random_predict, random_score = Predict(rf_random, test_x, test_y, 
                                      'Random Forest')

The sigmoid Model Score is: 0.8863636363636364
The linear Model Score is: 1.0
The poly Model Score is: 0.5606060606060606
The radial Model Score is: 1.0
The Random Forest Model Score is: 1.0


## Plot Accuracy

In [16]:
fig = go.Figure() 
model_names = ['Sigmoid SVC', 'Radial SVC', 'Linear SVC', 'Polynomial SVC',
 'Random Forests']

model_accuracy = [sigmoid_score, rbf_score, lin_score, poly_score, random_score]

fig.add_trace(go.Bar(x = model_names, 
                    y = model_accuracy, 
                    text = model_accuracy, 
                    textposition = 'auto'))
fig.update_layout(title = 'Model Accuracy Scores')

fig.update_yaxes(title_text = 'Accuracy Score') 
fig.update_xaxes(title_text = "Model")
fig.show()

In [17]:
from sklearn.svm import SVR


def SVM_Fit_Num(train_x, train_y, kernel,
            params = [10**x for x in np.arange(-1,3,0.9)]): 
    '''Fit the SVM Machine given the kernel type, parameters, 
    data''' 
    
    if kernel == 'linear': 
        parameters = {'C': params} 
    else: 
        parameters = {'C': params, 
                     'gamma': params} 
    
    cv = RepeatedStratifiedKFold(n_splits = 5, 
                                n_repeats = 5) 
    
    model = GridSearchCV(estimator = SVR(kernel = kernel), 
                        param_grid = parameters, 
                        cv = 2, 
                        verbose = 1) 
    
    model.fit(train_x, train_y) 

    return model

def PredictNum(fitted_model, test_x, test_y, name):
    prediction = fitted_model.predict(test_x) 
    score = mean_squared_error(test_y, prediction) 
    prediction = pd.DataFrame({'prediction_{}'.format(name): prediction})
    print('The {} Model Score is: {}'.format(name, score)) 
    return prediction, score

In [18]:
linear_num = SVM_Fit_Num(train_x_num, train_y_num, 'linear') 
#poly_num, poly_predict_num, poly_score_num = SVM_Fit_Num(train_x_num, train_y_num, test_x_num, test_y_num, 'poly') 
poly_num = SVR().fit(train_x_num, train_y_num) 
rbf_num = SVM_Fit_Num(train_x_num, train_y_num, 'rbf') 
sigmoid_num = SVM_Fit_Num(train_x_num, train_y_num,'sigmoid') 

Fitting 2 folds for each of 5 candidates, totalling 10 fits
Fitting 2 folds for each of 25 candidates, totalling 50 fits
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.1s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:    0.8s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
Fitting 2 folds for each of 25 candidates, totalling 50 fits
[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:    1.2s finished


In [19]:
fig = go.Figure() 
model_names = ['Sigmoid SVM', 'Radial SVM', 'Linear SVM', 'Polynomial SVM']

model_accuracy = [sigmoid_score_num, rbf_score_num, lin_score_num, poly_score_num]

fig.add_trace(go.Bar(x = model_names, 
                    y = model_accuracy, 
                    text = model_accuracy, 
                    textposition = 'auto'))
fig.update_layout(title = 'Model Accuracy Scores Numeric Prediction')

fig.update_yaxes(title_text = 'Mean Squared Error') 
fig.update_xaxes(title_text = "Model")
fig.show()

NameError: name 'sigmoid_score_num' is not defined